In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
os.chdir('/content/gdrive/MyDrive/health')

In [3]:
import pandas as pd

features_path = 'training_set_features.csv'
labels_path = 'training_set_labels.csv'
features_df = pd.read_csv(features_path)
labels_df = pd.read_csv(labels_path)

In [4]:
data_df = pd.merge(features_df, labels_df, on='respondent_id')

In [5]:
data_df.shape

(26707, 38)

In [6]:
# Calculate the correlation matrix for the entire dataset including the target variables
full_corr_matrix = data_df.corr()

# Extract the correlations with the target variables and sort them in descending order
h1n1_corr = full_corr_matrix['h1n1_vaccine'].sort_values(ascending=False)
seasonal_corr = full_corr_matrix['seasonal_vaccine'].sort_values(ascending=False)

# Display the sorted correlations for each target variable
print("Correlation with H1N1 Vaccine:")
print(h1n1_corr)
print("\nCorrelation with Seasonal Vaccine:")
print(seasonal_corr)

Correlation with H1N1 Vaccine:
h1n1_vaccine                   1.000000
doctor_recc_h1n1               0.393890
seasonal_vaccine               0.377143
opinion_h1n1_risk              0.323265
opinion_h1n1_vacc_effective    0.269347
opinion_seas_risk              0.258571
doctor_recc_seasonal           0.209864
opinion_seas_vacc_effective    0.179272
health_worker                  0.169768
h1n1_concern                   0.121929
health_insurance               0.121170
h1n1_knowledge                 0.117951
chronic_med_condition          0.095207
opinion_h1n1_sick_from_vacc    0.075091
behavioral_wash_hands          0.074712
behavioral_touch_face          0.071648
behavioral_face_mask           0.070498
child_under_6_months           0.066962
behavioral_avoidance           0.047690
behavioral_antiviral_meds      0.040608
behavioral_outside_home        0.021768
behavioral_large_gatherings    0.017822
opinion_seas_sick_from_vacc    0.008360
household_adults               0.007545
responden

  full_corr_matrix = data_df.corr()


## H1N1: Feature Focused - With Mean and Most Frequent Imputation

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score

# Requirements - machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# selected features based on correlation
features = [
    'doctor_recc_h1n1', 'seasonal_vaccine', 'opinion_h1n1_risk',
    'opinion_h1n1_vacc_effective', 'opinion_seas_risk', 'doctor_recc_seasonal',
    'opinion_seas_vacc_effective', 'health_worker', 'h1n1_concern',
    'health_insurance', 'h1n1_knowledge'
]
X = data_df[features]
y = data_df['h1n1_vaccine']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# missing values for categorical data - most frequent value
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# numerical data - mean value
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results = {}
for name, model in models.items():
    # Create and fit the pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    pipeline.fit(X_train, y_train)

    # Predictions
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1] # For ROC AUC score

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    results[name] = {"Accuracy": accuracy, "ROC AUC": roc_auc}

for model_name, metrics in results.items():
    print(f"{model_name} - Accuracy: {metrics['Accuracy']:.4f}, ROC AUC: {metrics['ROC AUC']:.4f}")

Logistic Regression - Accuracy: 0.8478, ROC AUC: 0.8639
Random Forest - Accuracy: 0.8424, ROC AUC: 0.8552
Gradient Boosting - Accuracy: 0.8575, ROC AUC: 0.8880
SVM - Accuracy: 0.8598, ROC AUC: 0.8451
XGBoost - Accuracy: 0.8545, ROC AUC: 0.8788


In [9]:
import joblib

# Define the Gradient Boosting model
gradient_boosting_model = GradientBoostingClassifier()

# Create and fit the pipeline with Gradient Boosting
pipeline_gb = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', gradient_boosting_model)])
pipeline_gb.fit(X_train, y_train)

# Evaluate the Gradient Boosting model
y_pred_gb = pipeline_gb.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
roc_auc_gb = roc_auc_score(y_test, pipeline_gb.predict_proba(X_test)[:, 1])
print(f"Accuracy with Gradient Boosting: {accuracy_gb}, ROC AUC: {roc_auc_gb}")

# Save the Gradient Boosting model's weights
model_filename_gb = 'gradient_boosting_ff_h1n1_vaccine_model.joblib'
joblib.dump(pipeline_gb, model_filename_gb)
print(f"Gradient Boosting model saved to {model_filename_gb}")

Accuracy with Gradient Boosting: 0.8575439910146013, ROC AUC: 0.887986761560587
Gradient Boosting model saved to gradient_boosting_ff_h1n1_vaccine_model.joblib


## Seasonal Vaccine: Feature Focused - With Mean and Most Frequent Imputation

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score

# requirements - machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# selected features based on correlation for seasonal vaccine
features_seasonal = [
    'opinion_seas_risk', 'h1n1_vaccine', 'doctor_recc_seasonal',
    'opinion_seas_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_vacc_effective',
    'health_insurance', 'doctor_recc_h1n1', 'chronic_med_condition',
    'h1n1_concern', 'health_worker', 'behavioral_touch_face'
]
X = data_df[features_seasonal]
y = data_df['seasonal_vaccine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# categorical data - most frequent value
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# numerical data - mean value
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results_seasonal = {}
for name, model in models.items():
    # Create and fit the pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    pipeline.fit(X_train, y_train)

    # Predictions
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1] # For ROC AUC score

    # Calculate accuracy and ROC AUC score
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    results_seasonal[name] = {"Accuracy": accuracy, "ROC AUC": roc_auc}

# Display results for seasonal vaccine prediction
for model_name, metrics in results_seasonal.items():
    print(f"{model_name} - Accuracy: {metrics['Accuracy']:.4f}, ROC AUC: {metrics['ROC AUC']:.4f}")

Logistic Regression - Accuracy: 0.7849, ROC AUC: 0.8568
Random Forest - Accuracy: 0.7628, ROC AUC: 0.8272
Gradient Boosting - Accuracy: 0.7939, ROC AUC: 0.8680
SVM - Accuracy: 0.7853, ROC AUC: 0.8470
XGBoost - Accuracy: 0.7849, ROC AUC: 0.8571


In [11]:
gradient_boosting_model = GradientBoostingClassifier()

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', gradient_boosting_model)])
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])
print(f"Accuracy: {accuracy}, ROC AUC: {roc_auc}")

model_filename = 'gradient_boosting_ff_seasonal_vaccine_model.joblib'
joblib.dump(pipeline, model_filename)

print(f"Model saved to {model_filename}")

Accuracy: 0.793897416697866, ROC AUC: 0.8680182551701291
Model saved to gradient_boosting_ff_seasonal_vaccine_model.joblib
