In [None]:

# --- Imports -----------------------------------------------------------------
# Everything this notebook uses is imported here so that a fresh kernel
# (Restart Kernel -> Run All) does not rely on leftover state.
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

# --- Configuration -----------------------------------------------------------
ID_COL = 'respondent_id'
# Order matters: the model's probability columns follow this label order.
LABEL_COLS = ['xyz_vaccine', 'seasonal_vaccine']
SEED = 42
TEST_SIZE = 0.2

# --- Load data ---------------------------------------------------------------
features_df = pd.read_csv('training_set_features.csv')
labels_df = pd.read_csv('training_set_labels.csv')
# Pair every feature row with its labels via the respondent key
# (pd.merge defaults to an inner join).
data = pd.merge(features_df, labels_df, on=ID_COL)

# Model inputs: everything except the key and the two labels.
features = data.drop(columns=[ID_COL] + LABEL_COLS)
# Two-column (multilabel) target.
target = data[LABEL_COLS]

# --- Preprocessing -----------------------------------------------------------
# Object-dtype columns are treated as categorical, all others as numerical.
categorical_cols = features.select_dtypes(include=['object']).columns
numerical_cols = features.select_dtypes(exclude=['object']).columns

# Numerical: fill gaps with the median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical: fill gaps with the mode, then one-hot encode. Categories not
# seen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

# --- Model -------------------------------------------------------------------
# `use_label_encoder` is intentionally not passed: it has no effect in the
# XGBoost releases that accept a two-column target, and newer releases warn
# that the parameter is unused.
model = XGBClassifier(eval_metric='logloss')

# Preprocessing lives inside the pipeline so it is fitted on training data only
# (including inside each cross-validation fold).
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

# --- Hold-out split ----------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=TEST_SIZE, random_state=SEED)

# Fit preprocessing + classifier on the training split.
clf.fit(X_train, y_train)

# The target has two label columns, so predict_proba yields one probability
# column per label, in the same order as the target columns. Predict once and
# pick each column by its label position; taking [:, 1] for both labels would
# score xyz_vaccine with the seasonal_vaccine probabilities.
test_pred_prob = clf.predict_proba(X_test)
xyz_pred_prob = test_pred_prob[:, y_test.columns.get_loc('xyz_vaccine')]
seasonal_pred_prob = test_pred_prob[:, y_test.columns.get_loc('seasonal_vaccine')]

# Per-label ROC AUC on the hold-out split, then their unweighted mean.
roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], xyz_pred_prob)
roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], seasonal_pred_prob)
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

print('Mean ROC AUC Score:', mean_roc_auc)

# Hyper-parameter grid for the pipeline's 'classifier' step
# (2 x 2 x 2 = 8 candidates).
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5],
}

# 3-fold cross-validated search on the training split only; the hold-out split
# stays untouched for the final comparison below.
# NOTE(review): with a two-column target, 'roc_auc' is presumably averaged over
# the labels -- confirm against the scikit-learn version in use.
grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

# best_estimator_ is the winning pipeline refitted on all of X_train.
best_clf = grid_search.best_estimator_

# One probability column per label, ordered like the target columns. Select
# each by label position; taking [:, 1] for both labels would score
# xyz_vaccine with the seasonal_vaccine probabilities.
best_test_pred_prob = best_clf.predict_proba(X_test)
best_xyz_pred_prob = best_test_pred_prob[:, y_test.columns.get_loc('xyz_vaccine')]
best_seasonal_pred_prob = best_test_pred_prob[:, y_test.columns.get_loc('seasonal_vaccine')]

best_roc_auc_xyz = roc_auc_score(y_test['xyz_vaccine'], best_xyz_pred_prob)
best_roc_auc_seasonal = roc_auc_score(y_test['seasonal_vaccine'], best_seasonal_pred_prob)
best_mean_roc_auc = np.mean([best_roc_auc_xyz, best_roc_auc_seasonal])

print('Best Mean ROC AUC Score:', best_mean_roc_auc)

# Score every row of `features` with the tuned pipeline.
# NOTE(review): `features` is derived from the training CSVs, so the file
# written below holds in-sample predictions. If a separate, unlabeled feature
# file is meant to be scored for submission, load and predict on it instead.
final_predictions = best_clf.predict_proba(features)

# Keep the values inside [0, 1] before writing them out.
bounded_predictions = np.clip(final_predictions, 0, 1)

# Column 0 is written as xyz_vaccine and column 1 as seasonal_vaccine,
# alongside the respondent key.
submission = data[['respondent_id']].assign(
    xyz_vaccine=bounded_predictions[:, 0],
    seasonal_vaccine=bounded_predictions[:, 1],
)

submission.to_csv('submission.csv', index=False)

Mean ROC AUC Score: 0.7818408920358839
Best parameters found:  {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}
Best Mean ROC AUC Score: 0.7900819474747551
