In [33]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
Note: you may need to restart the kernel to use updated packages.


In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Single seed shared by the train/validation split and both models so that a
# fresh "Restart & Run All" reproduces the same numbers.
SEED = 42

# Load datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')
# Loaded for reference only; nothing below reads it.
submission_format = pd.read_csv('submission_format.csv')

# Merge training features and labels on the respondent identifier.
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated on either side instead of silently multiplying rows.
train_data = train_features.merge(
    train_labels, on='respondent_id', validate='one_to_one'
)

# Feature columns: every column except the identifier, selected by name so the
# result does not depend on respondent_id being the first column.
feature_cols = train_features.columns.drop('respondent_id')

# Separate features and the two binary targets
X = train_data[feature_cols]
y_xyz = train_data['xyz_vaccine']
y_seasonal = train_data['seasonal_vaccine']

# Preprocessing: scale non-object columns, one-hot encode object columns.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # handle_unknown='ignore' encodes categories unseen during fit as all
        # zeros instead of raising at prediction time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

# Models: one classifier per target.
# `use_label_encoder` is intentionally not passed: it is deprecated and no
# longer used by the xgboost 2.x release this notebook installs, where it only
# triggers an "unused parameter" warning.
model_xyz = XGBClassifier(eval_metric='logloss', random_state=SEED)
model_seasonal = XGBClassifier(eval_metric='logloss', random_state=SEED)

# Split the data for validation. A single call splits X and both label vectors
# with the same row partition, so the targets are guaranteed to stay aligned
# with X_train / X_val (rather than relying on two calls sharing a seed).
(X_train, X_val,
 y_train_xyz, y_val_xyz,
 y_train_seasonal, y_val_seasonal) = train_test_split(
    X, y_xyz, y_seasonal, test_size=0.2, random_state=SEED
)

# Training pipelines.
# NOTE: both pipelines reference the same `preprocessor` object. That is safe
# here only because both are fitted on the same X_train, so the second fit
# re-learns identical preprocessing state.
pipeline_xyz = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model_xyz)])

pipeline_seasonal = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('model', model_seasonal)])

# Train models
pipeline_xyz.fit(X_train, y_train_xyz)
pipeline_seasonal.fit(X_train, y_train_seasonal)

# Predict and evaluate: column 1 of predict_proba is the positive-class probability
val_preds_xyz = pipeline_xyz.predict_proba(X_val)[:, 1]
val_preds_seasonal = pipeline_seasonal.predict_proba(X_val)[:, 1]

roc_auc_xyz = roc_auc_score(y_val_xyz, val_preds_xyz)
roc_auc_seasonal = roc_auc_score(y_val_seasonal, val_preds_seasonal)
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

print(f'ROC AUC for XYZ vaccine: {roc_auc_xyz}')
print(f'ROC AUC for Seasonal vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

# Predict on test set (slice the feature columns once and reuse)
X_test = test_features[feature_cols]
test_preds_xyz = pipeline_xyz.predict_proba(X_test)[:, 1]
test_preds_seasonal = pipeline_seasonal.predict_proba(X_test)[:, 1]

# Create submission: one row per test respondent with both predicted probabilities
submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_preds_xyz,
    'seasonal_vaccine': test_preds_seasonal
})

# Save submission
submission.to_csv('submission.csv', index=False)


ModuleNotFoundError: No module named 'xgboost'