In [2]:
from sklearn.preprocessing import StandardScaler

In [5]:
from sklearn.cluster import KMeans

In [7]:
import pandas as pd
# Load the feature table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/processed/nepal_features.csv")

In [8]:
import numpy as np

In [9]:
# Take the four columns used as clustering inputs, then standardise them with
# StandardScaler (fitted and applied on the same rows).
cluster_features = df.loc[
    :, ['age', 'BMI', 'lifestyle_risk_score', 'ses_risk_score']
]
X_scaled = StandardScaler().fit(cluster_features).transform(cluster_features)

In [10]:
# Wrap the scaled array in a DataFrame that reuses the input column labels so
# the standardised values can be inspected by name.
X_scaled_df = pd.DataFrame(
    data=X_scaled,
    columns=cluster_features.columns,
)

In [11]:
X_scaled_df.head()

Unnamed: 0,age,BMI,lifestyle_risk_score,ses_risk_score
0,-1.636188,-0.095347,0.100504,-0.383893
1,0.853363,0.358687,2.613098,-0.383893
2,0.438437,-1.457451,-1.155793,0.745204
3,-0.391413,-1.457451,-1.155793,-1.51299
4,-0.391413,-1.457451,-1.155793,-1.51299


In [12]:
# Split the standardised feature matrix into three k-means clusters and store
# each row's cluster index on df as 'segment_id'.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit lets the fitted clusters (and
# every downstream segment label) differ between library versions even though
# random_state is fixed.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['segment_id'] = kmeans.fit_predict(X_scaled)



In [13]:
df.head()

Unnamed: 0,person_id,age,gender,province,urban_rural,education_level,wealth_index,smoker,alcohol_use,physical_activity,BMI,systolic_bp,diastolic_bp,diabetes_flag,hypertension_flag,BMI_category,lifestyle_risk_score,ses_risk_score,segment_id
0,1,22,Male,6,1,Primary,Medium,1,0,1,26,157,60,0,1,Overweight,1,1,1
1,2,58,Male,2,1,Higher,Low,1,1,0,28,108,72,1,0,Overweight,3,1,0
2,3,52,Male,7,0,,Low,0,0,1,20,105,80,0,0,Normal,0,2,0
3,4,40,Male,3,1,Higher,Medium,0,0,1,20,156,96,0,1,Normal,0,0,2
4,5,40,Female,4,1,Secondary,High,0,0,1,20,133,89,0,0,Normal,0,0,2


In [14]:
# One row per segment_id: the mean of each clustering input plus the number of
# non-null person_id values in that segment (reported as 'population').
segment_profile = df.groupby('segment_id').agg(
    age=('age', 'mean'),
    BMI=('BMI', 'mean'),
    lifestyle_risk_score=('lifestyle_risk_score', 'mean'),
    ses_risk_score=('ses_risk_score', 'mean'),
    population=('person_id', 'count'),
)

In [15]:
segment_profile.head()

Unnamed: 0_level_0,age,BMI,lifestyle_risk_score,ses_risk_score,population
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,54.888889,24.055556,1.444444,2.055556,18
1,31.0,29.294118,0.764706,1.294118,17
2,51.2,26.0,0.466667,0.533333,15


In [16]:
def label_segment(row):
    """Return a human-readable segment name for one profile row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide 'age' and 'lifestyle_risk_score'.

    Returns
    -------
    str
        "Young Healthy Low Risk"   -- age < 35 and lifestyle_risk_score < 1
        "Young Elevated Risk"      -- age < 35 otherwise
        "Middle Age Emerging Risk" -- 35 <= age < 55
        "Elderly High Risk"        -- age >= 55
    """
    age = row['age']

    # Decide the age band first. Previously a young row whose lifestyle score
    # was >= 1 failed the first test, skipped the middle-age test, and fell
    # through to the final branch, so it was labelled "Elderly High Risk".
    if age < 35:
        if row['lifestyle_risk_score'] < 1:
            return "Young Healthy Low Risk"
        return "Young Elevated Risk"
    if age < 55:
        return "Middle Age Emerging Risk"
    return "Elderly High Risk"

In [17]:
# Name each segment from its aggregated profile: label_segment is called once
# per row of segment_profile, i.e. on per-segment means, not on individuals.
segment_profile['segment_name'] = segment_profile.apply(label_segment, axis=1)

In [19]:
segment_profile.head()

Unnamed: 0_level_0,age,BMI,lifestyle_risk_score,ses_risk_score,population,segment_name
segment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,54.888889,24.055556,1.444444,2.055556,18,Middle Age Emerging Risk
1,31.0,29.294118,0.764706,1.294118,17,Young Healthy Low Risk
2,51.2,26.0,0.466667,0.533333,15,Middle Age Emerging Risk


In [20]:
# Lookup from segment_profile's index (segment_id) to its segment_name.
segment_map = segment_profile['segment_name'].to_dict()

In [21]:
# Attach the segment name to every row through its segment_id; any id that is
# not a key of segment_map maps to NaN.
df['segment_name'] = df['segment_id'].map(segment_map)

In [22]:
df.head()

Unnamed: 0,person_id,age,gender,province,urban_rural,education_level,wealth_index,smoker,alcohol_use,physical_activity,BMI,systolic_bp,diastolic_bp,diabetes_flag,hypertension_flag,BMI_category,lifestyle_risk_score,ses_risk_score,segment_id,segment_name
0,1,22,Male,6,1,Primary,Medium,1,0,1,26,157,60,0,1,Overweight,1,1,1,Young Healthy Low Risk
1,2,58,Male,2,1,Higher,Low,1,1,0,28,108,72,1,0,Overweight,3,1,0,Middle Age Emerging Risk
2,3,52,Male,7,0,,Low,0,0,1,20,105,80,0,0,Normal,0,2,0,Middle Age Emerging Risk
3,4,40,Male,3,1,Higher,Medium,0,0,1,20,156,96,0,1,Normal,0,0,2,Middle Age Emerging Risk
4,5,40,Female,4,1,Secondary,High,0,0,1,20,133,89,0,0,Normal,0,0,2,Middle Age Emerging Risk


In [23]:
# Write the row-level table, including the segment columns; index=False leaves
# the row index out of the file.
df.to_csv("../data/processed/nepal_segmented.csv", index=False)
# The profile is written with its index, so segment_id is the first CSV column.
segment_profile.to_csv("../data/processed/nepal_segment_profile.csv")

In [24]:
df

Unnamed: 0,person_id,age,gender,province,urban_rural,education_level,wealth_index,smoker,alcohol_use,physical_activity,BMI,systolic_bp,diastolic_bp,diabetes_flag,hypertension_flag,BMI_category,lifestyle_risk_score,ses_risk_score,segment_id,segment_name
0,1,22,Male,6,1,Primary,Medium,1,0,1,26,157,60,0,1,Overweight,1,1,1,Young Healthy Low Risk
1,2,58,Male,2,1,Higher,Low,1,1,0,28,108,72,1,0,Overweight,3,1,0,Middle Age Emerging Risk
2,3,52,Male,7,0,,Low,0,0,1,20,105,80,0,0,Normal,0,2,0,Middle Age Emerging Risk
3,4,40,Male,3,1,Higher,Medium,0,0,1,20,156,96,0,1,Normal,0,0,2,Middle Age Emerging Risk
4,5,40,Female,4,1,Secondary,High,0,0,1,20,133,89,0,0,Normal,0,0,2,Middle Age Emerging Risk
5,6,62,Female,5,1,,High,0,1,1,27,126,91,0,0,Overweight,1,0,2,Middle Age Emerging Risk
6,7,22,Female,1,0,Primary,Low,0,1,0,18,144,81,0,1,Underweight,2,3,0,Middle Age Emerging Risk
7,8,54,Male,4,0,Secondary,High,0,0,1,27,122,64,0,0,Overweight,0,1,2,Middle Age Emerging Risk
8,9,28,Male,4,0,Primary,Low,0,0,0,26,107,75,0,0,Overweight,1,3,1,Young Healthy Low Risk
9,10,22,Male,6,1,,High,0,0,0,25,143,99,0,1,Overweight,1,0,1,Young Healthy Low Risk
