In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [3]:
# Load Data
df = pd.read_csv('HealthSurvey.csv')


In [4]:
# Drop irrelevant or high-missing columns
df.drop(['comments', 'state', 'Timestamp'], axis=1, inplace=True)

In [5]:
# Clean Age
df = df[(df['Age'] >= 18) & (df['Age'] <= 65)]


In [7]:
# Fill missing values
df.fillna({'self_employed': 'No', 'work_interfere': "Don't know"}, inplace=True)

In [8]:
selected_features = [
    'Age',
    'family_history',
    'work_interfere',
    'benefits',
    'care_options',
    'leave',
    'supervisor'
]

In [9]:
selected_features.append('treatment')
df = df[selected_features]

In [10]:
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

In [11]:
X = df.drop('treatment', axis=1)
y = df['treatment']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'XGBoost': XGBClassifier()
}

In [25]:
model_performance = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    model_performance[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred)
    }

In [26]:
print(y.value_counts(normalize=True))

treatment
1    0.5048
0    0.4952
Name: proportion, dtype: float64


In [19]:
print("Model Performance Comparison:")
for model_name, metrics in model_performance.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

Model Performance Comparison:

Logistic Regression:
Accuracy: 0.8200
Precision: 0.8309
Recall: 0.8370
F1 Score: 0.8339

Random Forest:
Accuracy: 0.7960
Precision: 0.8182
Recall: 0.8000
F1 Score: 0.8090

XGBoost:
Accuracy: 0.8040
Precision: 0.8116
Recall: 0.8296
F1 Score: 0.8205


In [23]:
# Save best model (assume XGBoost is best)
best_model = models['XGBoost']
with open('app/model.pkl', 'wb') as f:
    pickle.dump(best_model, f)


In [24]:
# Save encoders for inference
with open('app/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)