In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Read the raw training inputs: one CSV with the survey features and one
# with the two vaccine labels.
features_df = pd.read_csv('training_set_features.csv')
labels_df = pd.read_csv('training_set_labels.csv')

# Join the two tables row-by-row on the shared respondent_id key.
data = features_df.merge(labels_df, on='respondent_id')

# The two label columns form the multi-label target; everything else
# except the identifier is used as a predictor.
target = data.loc[:, ['xyz_vaccine', 'seasonal_vaccine']]
features = data.drop(
    ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'],
    axis=1,
)

# Split the predictor columns by dtype: object columns are treated as
# categorical, every other dtype as numerical.
categorical_cols = features.select_dtypes(include='object').columns
numerical_cols = features.select_dtypes(exclude='object').columns

# Numerical branch: fill gaps with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Gradient-boosted classifier evaluated internally with log loss.
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Full estimator: preprocessing followed by the classifier. The step name
# 'classifier' is what hyperparameter grids must use as a prefix.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Train-Test Split: hold out 20% of the rows for evaluation (fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Train the Model on both labels at once (multi-label target).
clf.fit(X_train, y_train)

# Make Predictions.
# For a multi-label target, predict_proba returns one column per label, in
# the same order as the target columns: column 0 is xyz_vaccine and column 1
# is seasonal_vaccine. Previously both variables read column 1, so the
# xyz_vaccine AUC was computed from the seasonal_vaccine probabilities.
# predict_proba is also called only once now instead of twice.
test_pred_prob = clf.predict_proba(X_test)
xyz_pred_prob = test_pred_prob[:, 0]
seasonal_pred_prob = test_pred_prob[:, 1]

# Evaluate the Model: per-label ROC AUC, then their unweighted mean.
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], xyz_pred_prob)
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], seasonal_pred_prob)
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

print('Mean ROC AUC Score:', mean_roc_auc)

# Hyperparameter Tuning (Optional)
# Keys use the '<step name>__<parameter>' form so they reach the
# 'classifier' step inside the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5]
}

# 3-fold cross-validated search over the grid, scored by ROC AUC, using all
# available cores.
grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best model on the held-out split.
best_clf = grid_search.best_estimator_

# predict_proba returns one column per label in target-column order:
# column 0 is xyz_vaccine, column 1 is seasonal_vaccine. Previously both
# variables read column 1, so the xyz_vaccine AUC was scored against the
# seasonal_vaccine probabilities. A single call now serves both labels.
best_test_pred_prob = best_clf.predict_proba(X_test)
best_xyz_pred_prob = best_test_pred_prob[:, 0]
best_seasonal_pred_prob = best_test_pred_prob[:, 1]

best_roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], best_xyz_pred_prob)
best_roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], best_seasonal_pred_prob)
best_mean_roc_auc = np.mean([best_roc_auc_xyz, best_roc_auc_seasonal])

print('Best Mean ROC AUC Score:', best_mean_roc_auc)

# Score every row of `features` with the tuned pipeline. The result has one
# probability column per label (column 0: xyz_vaccine, column 1:
# seasonal_vaccine).
# NOTE(review): `features` is the same data the model was trained on; a
# competition submission presumably needs predictions for a separate
# test-set feature file -- confirm before submitting.
final_predictions = best_clf.predict_proba(features)

# Build the submission table column by column, starting from the
# respondent identifiers and clamping each probability into [0, 1].
submission = pd.DataFrame({'respondent_id': data['respondent_id']})
submission['xyz_vaccine'] = np.clip(final_predictions[:, 0], 0, 1)
submission['seasonal_vaccine'] = np.clip(final_predictions[:, 1], 0, 1)

# Write the submission without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

Mean ROC AUC Score: 0.7818408920358839
Best parameters found:  {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}
Best Mean ROC AUC Score: 0.7900819474747551
