### 2. Data Pre-processing


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets


In [2]:
df = pd.read_csv("health_lifestyle_classification.csv")

In [3]:
df.describe()

Unnamed: 0,survey_code,age,height,weight,bmi,bmi_estimated,bmi_scaled,bmi_corrected,waist_size,blood_pressure,...,water_intake,screen_time,stress_level,mental_health_score,income,meals_per_day,electrolyte_level,gene_marker_flag,environmental_risk_score,daily_supplement_dosage
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,92331.0,...,100000.0,100000.0,100000.0,100000.0,91530.0,100000.0,100000.0,89526.0,100000.0,100000.0
mean,50000.5,48.52599,170.023707,70.064862,24.493876,24.493876,73.481627,24.49414,84.933043,119.980149,...,2.006373,6.021525,4.9916,5.00468,4038.127284,2.99872,0.0,1.0,5.5,0.015726
std,28867.657797,17.886768,9.982798,14.693667,5.951069,5.951069,17.853206,5.954184,12.040314,15.015503,...,0.688868,2.933835,3.154997,3.164228,1930.025678,1.414786,0.0,0.0,0.0,5.764489
min,1.0,18.0,140.0,40.0,9.988495,9.988495,29.965484,9.893845,34.093185,59.128168,...,0.5,0.0,0.0,0.0,500.0,1.0,0.0,1.0,5.5,-9.999895
25%,25000.75,33.0,163.306615,59.856938,20.271405,20.271405,60.814215,20.271059,76.795185,109.81206,...,1.532011,3.971318,2.0,2.0,2665.402843,2.0,0.0,1.0,5.5,-4.980501
50%,50000.5,48.0,170.016778,69.924141,24.156734,24.156734,72.470201,24.151699,84.957139,119.951794,...,2.000659,5.991171,5.0,5.0,4004.601345,3.0,0.0,1.0,5.5,0.015589
75%,75000.25,64.0,176.72892,80.027418,28.258696,28.258696,84.776088,28.247648,93.018713,130.120621,...,2.473047,8.02447,8.0,8.0,5360.012694,4.0,0.0,1.0,5.5,5.008424
max,100000.0,79.0,210.0,139.250894,59.234792,59.234792,177.704377,59.142646,133.153631,184.439195,...,5.0,16.0,10.0,10.0,12029.409353,5.0,0.0,1.0,5.5,9.999966


In [4]:
print(df.dtypes)
print(df.isnull().sum())

survey_code                   int64
age                           int64
gender                       object
height                      float64
weight                      float64
bmi                         float64
bmi_estimated               float64
bmi_scaled                  float64
bmi_corrected               float64
waist_size                  float64
blood_pressure              float64
heart_rate                  float64
cholesterol                 float64
glucose                     float64
insulin                     float64
sleep_hours                 float64
sleep_quality                object
work_hours                  float64
physical_activity           float64
daily_steps                 float64
calorie_intake              float64
sugar_intake                float64
alcohol_consumption          object
smoking_level                object
water_intake                float64
screen_time                 float64
stress_level                  int64
mental_health_score         

In [5]:
numerical_cols_to_impute = [
    'blood_pressure',
    'heart_rate',
    'insulin',
    'daily_steps',
    'income',
    'gene_marker_flag'
]

# List of categorical columns with missing values
categorical_cols_to_impute = [
    'alcohol_consumption',
    'exercise_type',
    'smoking_level',
    'caffeine_intake'
]

# Impute numerical columns with the mean
for col in numerical_cols_to_impute:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mean())

# Impute categorical columns with the mode
for col in categorical_cols_to_impute:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mode()[0])

redundant_cols = ['bmi_estimate', 'bmi_scaled','bmi_category','bmi_estimated', 'bmi_corrected']

df.drop(columns=redundant_cols, errors='ignore', inplace=True)



In [6]:
def create_bmi_category(bmi):
    if bmi < 18.5:
        return 'underweight'
    elif 18.5 <= bmi < 24.9:
        return 'normal'
    elif 24.9 <= bmi < 29.9:
        return 'overweight'
    else:
        return 'obese'

df['bmi_category'] = df['bmi'].apply(create_bmi_category)

df['work_life_balance'] = df['sleep_hours'] - df['work_hours']

In [7]:
df.head()

Unnamed: 0,survey_code,age,gender,height,weight,bmi,waist_size,blood_pressure,heart_rate,cholesterol,...,caffeine_intake,family_history,pet_owner,electrolyte_level,gene_marker_flag,environmental_risk_score,daily_supplement_dosage,target,bmi_category,work_life_balance
0,1,56,Male,173.416872,56.88664,18.915925,72.16513,118.264254,60.749825,214.580523,...,Moderate,No,Yes,0,1.0,5.5,-2.275502,healthy,normal,-1.195429
1,2,69,Female,163.20738,97.799859,36.716278,85.598889,117.917986,66.463696,115.794002,...,High,Yes,No,0,1.0,5.5,6.23934,healthy,obese,-1.086788
2,3,46,Male,177.281966,80.687562,25.67305,90.29503,123.073698,76.043212,138.134787,...,Moderate,No,No,0,1.0,5.5,5.423737,healthy,overweight,-0.127689
3,4,32,Female,172.101255,63.142868,21.31848,100.504211,148.173453,68.781981,203.017447,...,Moderate,No,Yes,0,1.0,5.5,8.388611,healthy,normal,-4.301377
4,5,60,Female,163.608816,40.0,14.943302,69.02115,150.613181,92.335358,200.412439,...,High,Yes,Yes,0,1.0,5.5,0.332622,healthy,underweight,0.637064


In [8]:
numerical_df = df.select_dtypes(include=['int64', 'float64'])

# Calculate the variance for each numerical column
variances = numerical_df.var()

print(variances)

survey_code                 8.333417e+08
age                         3.199365e+02
height                      9.965626e+01
weight                      2.159038e+02
bmi                         3.541522e+01
waist_size                  1.449692e+02
blood_pressure              2.081742e+02
heart_rate                  8.499652e+01
cholesterol                 8.989164e+02
glucose                     3.993134e+02
insulin                     2.106439e+01
sleep_hours                 2.240473e+00
work_hours                  3.978920e+00
physical_activity           3.551247e+00
daily_steps                 5.679076e+06
calorie_intake              1.604133e+05
sugar_intake                3.986751e+02
water_intake                4.745397e-01
screen_time                 8.607390e+00
stress_level                9.954009e+00
mental_health_score         1.001234e+01
income                      3.409489e+06
meals_per_day               2.001618e+00
electrolyte_level           0.000000e+00
gene_marker_flag

In [9]:
#remove electrolyte_level, gene_marker_flag, environmental_risk_score
threshold = 0.1

# Identify features with variance below the threshold
low_variance_features = variances[variances < threshold].index.tolist()

df.drop(columns=low_variance_features, inplace=True, errors='ignore')
df.shape

(100000, 44)

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# --- 4. Separate Features (X) and Target (y) ---
X = df.drop('target', axis=1)
y = df['target'].map({'healthy': 0, 'diseased': 1})

categorical_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# --- 6. Split the Data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- 7. Scale Numerical Features ---
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# The variables X_train, X_test, y_train, y_test are now ready to be used for model training.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Use SelectFromModel to select features based on importance
selector = SelectFromModel(model, prefit=True, threshold='mean')
X_train_reduced = selector.transform(X_train)
X_test_reduced = selector.transform(X_test)

print(f"Original number of features: {X_train.shape[1]}")
print(f"Number of features after selection: {X_train_reduced.shape[1]}")

Original number of features: 64
Number of features after selection: 24




### 3. Regression Models: Linear Regression, Regularized Regression (Ridge)


In [11]:

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

logreg_cw = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
logreg_cw.fit(X_train, y_train)
y_pred_logreg_cw = logreg_cw.predict(X_test)
print("\nModel: Logistic Regression (Class Weights)")
print("Classification Report:")
print(classification_report(y_test, y_pred_logreg_cw))


Model: Logistic Regression (Class Weights)
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.50      0.58     14019
           1       0.30      0.49      0.37      5981

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.48     20000
weighted avg       0.58      0.50      0.52     20000



In [12]:

print("\n--- L2 Regularization (Ridge) ---")
logreg_l2 = LogisticRegression(penalty='l2', solver='liblinear', random_state=42, class_weight='balanced')
logreg_l2.fit(X_train, y_train)
y_pred_l2 = logreg_l2.predict(X_test)
print(classification_report(y_test, y_pred_l2))


--- L2 Regularization (Ridge) ---
              precision    recall  f1-score   support

           0       0.70      0.50      0.58     14019
           1       0.30      0.49      0.37      5981

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.48     20000
weighted avg       0.58      0.50      0.52     20000



### 4. Classification Models:  Decision Trees, Random Forests, SVMs, K-Nearest Neighbors


In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initialize and train the Decision Tree model
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = dtree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Accuracy: {accuracy:.4f}")


Decision Tree Accuracy: 0.5776


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest model
rforest = RandomForestClassifier(n_estimators=100, random_state=42)
rforest.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = rforest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")

Random Forest Accuracy: 0.7006


In [15]:
from sklearn.svm import SVC

# Initialize and train the SVM model
# Using a Radial Basis Function (RBF) kernel is a common choice
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Accuracy: {accuracy:.4f}")

SVM Accuracy: 0.7009


In [16]:
from sklearn.neighbors import KNeighborsClassifier

# A common starting point for k is the square root of the number of samples
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"K-Nearest Neighbors Accuracy: {accuracy:.4f}")

K-Nearest Neighbors Accuracy: 0.6355


#### It seems SVM accuracy is the highest having 0.7009 accuracy while second being Random Forest having accuracy of 0.7006 and the third being K-Nearest Neighbors having accuracy of 0.6355