In [2]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier


In [24]:
# Load the four input CSV files (paths are relative to the working directory).
# The file names are listed in the same order as the names they are bound to.
train_features, train_labels, test_features, submission_format = (
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
        'submission_format.csv',
    )
)


In [26]:
# Attach the label columns to the feature rows by respondent_id (inner join).
# validate='one_to_one' makes pandas raise MergeError if an id is duplicated
# in either frame, instead of silently multiplying rows in the training data.
train_data = train_features.merge(
    train_labels, on='respondent_id', validate='one_to_one'
)

In [28]:
# Feature columns are every column of train_features except the id key.
# Dropping the key by name (rather than slicing off position 0) stays correct
# even if respondent_id is not the first column, and raises KeyError if the
# key column is missing altogether.
feature_cols = train_features.columns.drop('respondent_id')


In [30]:
# Model inputs: only the feature columns of the merged training frame.
X = train_data.loc[:, feature_cols]

# One target series per vaccine, taken from the merged label columns.
y_xyz, y_seasonal = (
    train_data[label_col] for label_col in ('xyz_vaccine', 'seasonal_vaccine')
)


In [32]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Numerical columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

In [34]:
# Models: two separate XGBClassifier instances, one per target, constructed
# with the same keyword arguments.
model_xyz, model_seasonal = (
    XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    for _ in range(2)
)

In [36]:
# Hold out 20% of the rows for validation. Splitting X and both targets in a
# single call guarantees that the features and the two label series share
# exactly the same train/validation partition, instead of relying on two
# separate calls agreeing through a repeated random_state. It also avoids
# assigning to `_`, which IPython uses for the previous cell output.
(X_train, X_val,
 y_train_xyz, y_val_xyz,
 y_train_seasonal, y_val_seasonal) = train_test_split(
    X, y_xyz, y_seasonal, test_size=0.2, random_state=42
)

In [38]:
# Training pipelines: preprocessing followed by a classifier, one per target.
# Pipeline stores the step objects it is given without copying them, so using
# the same `preprocessor` instance in both pipelines would make them share one
# fitted transformer (fitting the second pipeline would refit the first one's
# preprocessing). The seasonal pipeline therefore gets an unfitted clone; the
# original `preprocessor` is still fitted through pipeline_xyz.
pipeline_xyz = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model_xyz)])

pipeline_seasonal = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                    ('model', model_seasonal)])

# Train models: both pipelines see the same training rows, each with its own
# target series.
pipeline_xyz.fit(X_train, y_train_xyz)
pipeline_seasonal.fit(X_train, y_train_seasonal)

In [40]:
# Validation: take the second column of predict_proba (class index 1) as the
# score for each held-out row and evaluate it against the held-out labels
# with ROC AUC, one target at a time.
val_preds_xyz = pipeline_xyz.predict_proba(X_val)[:, 1]
roc_auc_xyz = roc_auc_score(y_true=y_val_xyz, y_score=val_preds_xyz)

val_preds_seasonal = pipeline_seasonal.predict_proba(X_val)[:, 1]
roc_auc_seasonal = roc_auc_score(y_true=y_val_seasonal, y_score=val_preds_seasonal)

# Unweighted average of the two per-target AUCs.
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

print(
    f'ROC AUC for XYZ vaccine: {roc_auc_xyz}',
    f'ROC AUC for Seasonal vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)

ROC AUC for XYZ vaccine: 0.8595842893040532
ROC AUC for Seasonal vaccine: 0.8582123843874001
Mean ROC AUC: 0.8588983368457266


In [42]:
# Predict on test set: select the same feature columns used for training once,
# then take the class-index-1 probability from each fitted pipeline.
X_test = test_features[feature_cols]
test_preds_xyz = pipeline_xyz.predict_proba(X_test)[:, 1]
test_preds_seasonal = pipeline_seasonal.predict_proba(X_test)[:, 1]


In [44]:
# Create submission: start from the test respondent ids and add one
# predicted-probability column per target, in the order id, xyz, seasonal.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=test_preds_xyz,
    seasonal_vaccine=test_preds_seasonal,
)

# Save submission without the DataFrame index column.
submission.to_csv('submission.csv', index=False)