In [1]:
### Complete Training Code with Proper Model Saving

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# Load and preprocess data
df = pd.read_csv("health_lifestyle_classification.csv")

print("Initial dataset shape:", df.shape)
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())

# Handle missing values
numerical_cols_to_impute = [
    'blood_pressure',
    'heart_rate',
    'insulin',
    'daily_steps',
    'income',
    'gene_marker_flag'
]

categorical_cols_to_impute = [
    'alcohol_consumption',
    'exercise_type',
    'smoking_level',
    'caffeine_intake'
]

# Impute numerical columns with the mean
for col in numerical_cols_to_impute:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mean())

# Impute categorical columns with the mode
for col in categorical_cols_to_impute:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mode()[0])

# Remove redundant columns
redundant_cols = ['bmi_estimate', 'bmi_scaled', 'bmi_category', 'bmi_estimated', 'bmi_corrected']
df.drop(columns=redundant_cols, errors='ignore', inplace=True)

# Create BMI category
def create_bmi_category(bmi):
    if bmi < 18.5:
        return 'underweight'
    elif 18.5 <= bmi < 24.9:
        return 'normal'
    elif 24.9 <= bmi < 29.9:
        return 'overweight'
    else:
        return 'obese'

df['bmi_category'] = df['bmi'].apply(create_bmi_category)

# Create work-life balance feature
df['work_life_balance'] = df['sleep_hours'] - df['work_hours']

# Remove low variance features
numerical_df = df.select_dtypes(include=['int64', 'float64'])
variances = numerical_df.var()
threshold = 0.1
low_variance_features = variances[variances < threshold].index.tolist()
df.drop(columns=low_variance_features, inplace=True, errors='ignore')

print(f"Dataset shape after preprocessing: {df.shape}")


Initial dataset shape: (100000, 48)

Data types:
survey_code                   int64
age                           int64
gender                       object
height                      float64
weight                      float64
bmi                         float64
bmi_estimated               float64
bmi_scaled                  float64
bmi_corrected               float64
waist_size                  float64
blood_pressure              float64
heart_rate                  float64
cholesterol                 float64
glucose                     float64
insulin                     float64
sleep_hours                 float64
sleep_quality                object
work_hours                  float64
physical_activity           float64
daily_steps                 float64
calorie_intake              float64
sugar_intake                float64
alcohol_consumption          object
smoking_level                object
water_intake                float64
screen_time                 float64
stress_level   

In [2]:

# --- CORRECTED MODEL TRAINING PIPELINE ---

# 1. Separate Features (X) and Target (y)
X = df.drop('target', axis=1)
y = df['target'].map({'healthy': 0, 'diseased': 1})

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

# 2. Handle categorical variables
categorical_cols = X.select_dtypes(include=['object']).columns
print(f"Categorical columns: {list(categorical_cols)}")

X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
print(f"Shape after encoding: {X_encoded.shape}")

# 3. Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


Features shape: (100000, 43)
Target distribution:
target
0    70097
1    29903
Name: count, dtype: int64
Categorical columns: ['gender', 'sleep_quality', 'alcohol_consumption', 'smoking_level', 'mental_health_support', 'education_level', 'job_type', 'occupation', 'diet_type', 'exercise_type', 'device_usage', 'healthcare_access', 'insurance', 'sunlight_exposure', 'caffeine_intake', 'family_history', 'pet_owner', 'bmi_category']
Shape after encoding: (100000, 64)
Training set shape: (80000, 64)
Test set shape: (20000, 64)


In [3]:

# 4. Scale numerical features PROPERLY
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
print(f"Numerical features to scale: {list(numerical_features)}")

# Create scaler and fit on training data only
scaler = StandardScaler()

# IMPORTANT: Create copies to avoid modifying original data
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Scale only numerical features
X_train_scaled[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])

print("✅ Scaling completed")

# 5. Train model for feature selection
print("Training initial model for feature selection...")
initial_model = RandomForestClassifier(n_estimators=100, random_state=42)
initial_model.fit(X_train_scaled, y_train)

# 6. Feature selection
selector = SelectFromModel(initial_model, prefit=True, threshold='mean')
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

print(f"Original features: {X_train_scaled.shape[1]}")
print(f"Selected features: {X_train_selected.shape[1]}")

# Get selected feature names
selected_features_mask = selector.get_support()
selected_feature_names = X_train_scaled.columns[selected_features_mask]
print(f"Selected features: {list(selected_feature_names)}")

# 7. Train final models
print("\n=== TRAINING FINAL MODELS ===")

# Random Forest (final model)
print("\nTraining Random Forest...")
rforest_final = RandomForestClassifier(n_estimators=100, random_state=42)
rforest_final.fit(X_train_selected, y_train)
y_pred_rf = rforest_final.predict(X_test_selected)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")


Numerical features to scale: ['survey_code', 'age', 'height', 'weight', 'bmi', 'waist_size', 'blood_pressure', 'heart_rate', 'cholesterol', 'glucose', 'insulin', 'sleep_hours', 'work_hours', 'physical_activity', 'daily_steps', 'calorie_intake', 'sugar_intake', 'water_intake', 'screen_time', 'stress_level', 'mental_health_score', 'income', 'meals_per_day', 'daily_supplement_dosage', 'work_life_balance']
✅ Scaling completed
Training initial model for feature selection...




Original features: 64
Selected features: 24
Selected features: ['survey_code', 'age', 'height', 'weight', 'bmi', 'waist_size', 'blood_pressure', 'heart_rate', 'cholesterol', 'glucose', 'insulin', 'sleep_hours', 'work_hours', 'physical_activity', 'daily_steps', 'calorie_intake', 'sugar_intake', 'water_intake', 'screen_time', 'stress_level', 'mental_health_score', 'income', 'daily_supplement_dosage', 'work_life_balance']

=== TRAINING FINAL MODELS ===

Training Random Forest...
Random Forest Accuracy: 0.7005


In [4]:

# SVM
print("\nTraining SVM...")
svm = SVC(kernel='rbf', random_state=42, probability=True)  # Enable probability
svm.fit(X_train_selected, y_train)
y_pred_svm = svm.predict(X_test_selected)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy:.4f}")


Training SVM...
SVM Accuracy: 0.7009


In [5]:

# K-Nearest Neighbors
print("\nTraining KNN...")
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_selected, y_train)
y_pred_knn = knn.predict(X_test_selected)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {knn_accuracy:.4f}")


Training KNN...
KNN Accuracy: 0.6417


In [6]:

# 8. Choose best model (based on your results, SVM was best)
best_model = svm if svm_accuracy >= rf_accuracy else rforest_final
best_accuracy = max(svm_accuracy, rf_accuracy)
model_name = "SVM" if svm_accuracy >= rf_accuracy else "Random Forest"


In [None]:

# 11. Model evaluation
print(f"\n=== FINAL MODEL EVALUATION ===")
print(f"Best Model: {model_name}")
print(f"Accuracy: {best_accuracy:.4f}")
print(f"\nClassification Report:")
if model_name == "SVM":
    print(classification_report(y_test, y_pred_svm, target_names=['Healthy', 'Diseased']))
else:
    print(classification_report(y_test, y_pred_rf, target_names=['Healthy', 'Diseased']))

# Feature importance (if Random Forest)
if model_name == "Random Forest":
    print(f"\nTop 10 Most Important Features:")
    feature_importance = pd.DataFrame({
        'feature': selected_feature_names,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print(feature_importance.head(10))

SyntaxError: '(' was never closed (784107380.py, line 18)