In [None]:
# Colab-only: mount the user's Google Drive at /content/gdrive so files stored
# there can be accessed through the local filesystem.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Make the project folder on the mounted Drive the working directory; the CSV
# files are later opened by bare (relative) file names, so they resolve here.
import os
os.chdir('/content/gdrive/MyDrive/health')

In [None]:
import pandas as pd
# File names of the two training tables (resolved against the working directory).
features_path, labels_path = 'training_set_features.csv', 'training_set_labels.csv'

# Load each CSV into its own DataFrame, features first and labels second.
features_df, labels_df = (
    pd.read_csv(csv_path) for csv_path in (features_path, labels_path)
)

In [None]:
# Attach the labels to the feature rows, matching on respondent_id.
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated in either table, instead of silently multiplying rows in `df`.
df = pd.merge(features_df, labels_df, on='respondent_id', validate='one_to_one')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Model inputs: every column except the two targets and respondent_id.
# respondent_id is only the key used to join features and labels; leaving it in
# would hand the models a row identifier as if it were a predictor (presumably
# an integer id, in which case the dtype-based column selection would impute
# and scale it as a numeric feature -- verify against the CSV).
features = df.drop(columns=['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'])

# One binary target per vaccine; each is modelled separately.
h1n1_target = df['h1n1_vaccine']
seasonal_target = df['seasonal_vaccine']

In [None]:
# Hold out 20% of the rows for validation. Both targets are split with the
# same feature frame and the same seed.
_split_kwargs = {'test_size': 0.2, 'random_state': 42}

X_train_h1n1, X_val_h1n1, y_train_h1n1, y_val_h1n1 = train_test_split(
    features, h1n1_target, **_split_kwargs
)
X_train_seasonal, X_val_seasonal, y_train_seasonal, y_val_seasonal = train_test_split(
    features, seasonal_target, **_split_kwargs
)

In [None]:
# Split the feature columns by dtype: int64/float64 are treated as numerical,
# object/bool as categorical.
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = features.select_dtypes(include=['object', 'bool']).columns

# Numerical branch: fill missing values with the column mean, then standardise.
_numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=_numerical_steps)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
_categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=_categorical_steps)

# Route each column group through its own branch.
_column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=_column_routes)

In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator with a stochastic component is given an explicit seed so a
# Restart & Run All reproduces the reported metrics; the value matches the seed
# already used for the train/validation split.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # probability=True is required because evaluation calls predict_proba.
    "SVM": SVC(probability=True, random_state=RANDOM_STATE),
    # NOTE(review): use_label_encoder is kept as in the original; it appears to
    # be deprecated in recent XGBoost releases -- confirm against the installed
    # version before removing it.
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                             random_state=RANDOM_STATE)
}

In [None]:
# Function to train and evaluate models
def evaluate_models(models, X_train, y_train, X_val, y_val):
    """Fit every candidate model on the training split and score it on validation.

    Each model is wrapped in a pipeline behind the module-level ``preprocessor``
    so that imputation, scaling and encoding are learned from ``X_train`` only.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted scikit-learn compatible classifier
        that implements ``predict`` and ``predict_proba``.
    X_train, y_train
        Training features and labels.
    X_val, y_val
        Validation features and labels.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Accuracy`` and
        ``ROC AUC``, in the iteration order of ``models``.
    """
    # Local import keeps this cell self-contained; scikit-learn is already a
    # dependency of this notebook.
    from sklearn.base import clone

    results = []
    for name, model in models.items():
        # Fit fresh copies: without cloning, the shared `preprocessor` and the
        # estimator objects held in `models` would be refitted in place on every
        # call, leaking fitted state between the H1N1 and seasonal runs.
        pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                   ('model', clone(model))])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)
        # Column 1 of predict_proba is used as the score for ROC AUC
        # (assumes a binary target whose second class is the positive one).
        y_score = pipeline.predict_proba(X_val)[:, 1]

        results.append({
            "Model": name,
            "Accuracy": accuracy_score(y_val, y_pred),
            "ROC AUC": roc_auc_score(y_val, y_score),
        })
    return pd.DataFrame(results)

In [None]:
# Score every candidate model on the H1N1 target and print the comparison table.
h1n1_results = evaluate_models(
    models,
    X_train_h1n1, y_train_h1n1,
    X_val_h1n1, y_val_h1n1,
)
print("H1N1 Vaccine Predictions:", h1n1_results, sep="\n")

H1N1 Vaccine Predictions:
                 Model  Accuracy   ROC AUC
0  Logistic Regression  0.840509  0.834358
1         RandomForest  0.850431  0.863636
2     GradientBoosting  0.854362  0.869895
3                  SVM  0.845376  0.844666
4              XGBoost  0.850805  0.855948


In [None]:
# Score every candidate model on the seasonal-flu target and print the
# comparison table (the header starts with a blank line).
seasonal_results = evaluate_models(
    models,
    X_train_seasonal, y_train_seasonal,
    X_val_seasonal, y_val_seasonal,
)
print("\nSeasonal Flu Vaccine Predictions:", seasonal_results, sep="\n")


Seasonal Flu Vaccine Predictions:
                 Model  Accuracy   ROC AUC
0  Logistic Regression  0.785474  0.856446
1         RandomForest  0.778547  0.853965
2     GradientBoosting  0.791838  0.863505
3                  SVM  0.784725  0.856926
4              XGBoost  0.783040  0.857340
