In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Read the three CSV inputs: training features, training labels, and test features
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Columns that are routed to the one-hot encoder
categorical_features = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

# Columns that are routed to the imputer: every int64/float64 column of the
# training features, except the respondent identifier
numeric_features = [
    col
    for col in train_features.columns
    if col != 'respondent_id' and train_features[col].dtype in ['int64', 'float64']
]

# Per-column-group preprocessing steps
imputer = SimpleImputer(strategy='most_frequent')  # fill gaps with the column's mode
encoder = OneHotEncoder(handle_unknown='ignore')   # unseen categories do not raise at transform time

# Columns named in neither list are dropped (ColumnTransformer's default remainder).
# NOTE(review): missing values in the categorical columns reach the encoder
# un-imputed; how they are handled depends on the scikit-learn version — confirm.
preprocessor = ColumnTransformer(transformers=[
    ('num', imputer, numeric_features),
    ('cat', encoder, categorical_features),
])

# Full pipeline: preprocessing followed by MultinomialNB wrapped in
# MultiOutputClassifier so it can be fit on a multi-column target
classifier = MultiOutputClassifier(MultinomialNB())
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Prepare training data
# NOTE(review): features and labels are paired by row position, which assumes
# both CSVs list respondents in the same order — confirm, or join on respondent_id.
X_train = train_features.drop(columns=['respondent_id'])
y_train = train_labels[['xyz_vaccine', 'seasonal_vaccine']]


def _second_class_proba(per_target_proba):
    """Stack column 1 of each per-target probability array side by side.

    `per_target_proba` is the list returned by the pipeline's `predict_proba`
    (one 2-D array per target column). The result has one row per sample and
    one column per target, holding the probability of each target's second
    class (presumably label 1 — verify against the fitted `classes_`).
    """
    return np.column_stack([proba[:, 1] for proba in per_target_proba])


# Held-out evaluation: scoring the model on the rows it was fit on overstates
# how well it generalises, so first fit on a seeded split and score the
# remaining rows that the model has not seen.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42  # fixed so the split (and the validation scores) are reproducible
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=VALIDATION_FRACTION, random_state=SPLIT_SEED
)
model.fit(X_fit, y_fit)
y_val_pred_proba = _second_class_proba(model.predict_proba(X_val))

val_roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_val_pred_proba[:, 0])
val_roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_val_pred_proba[:, 1])
val_mean_roc_auc = np.mean([val_roc_auc_xyz, val_roc_auc_seasonal])

print(f'Validation ROC AUC for xyz vaccine: {val_roc_auc_xyz}')
print(f'Validation ROC AUC for seasonal vaccine: {val_roc_auc_seasonal}')
print(f'Validation mean ROC AUC: {val_mean_roc_auc}')

# Train the model on every labelled row; this refit replaces the split-only fit,
# so anything that uses `model` afterwards sees a model trained on all the data
model.fit(X_train, y_train)

# Predict on training data for evaluation (in-sample scores; compare them with
# the validation scores above to gauge overfitting)
y_train_pred = model.predict_proba(X_train)
y_train_pred_proba = _second_class_proba(y_train_pred)

# Calculate ROC AUC scores
roc_auc_xyz = roc_auc_score(y_train['xyz_vaccine'], y_train_pred_proba[:, 0])
roc_auc_seasonal = roc_auc_score(y_train['seasonal_vaccine'], y_train_pred_proba[:, 1])
mean_roc_auc = np.mean([roc_auc_xyz, roc_auc_seasonal])

print(f'ROC AUC for xyz vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')

# Score the test set with the fitted pipeline (the id column is not a feature)
X_test = test_features.drop(columns=['respondent_id'])
final_predictions = model.predict_proba(X_test)

# predict_proba yields one 2-D array per target; keep column 1 of the first two
# targets and place them side by side -> one row per respondent, two columns
final_pred_proba = np.column_stack(
    [final_predictions[target_idx][:, 1] for target_idx in (0, 1)]
)

# Build the submission table: respondent id followed by one probability column per target
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=final_pred_proba[:, 0],
    seasonal_vaccine=final_pred_proba[:, 1],
)

# Persist without the DataFrame index, then echo the results
submission.to_csv('submission_1.csv', index=False)
print(final_pred_proba)
print(submission)

ROC AUC for xyz vaccine: 0.7829674003225139
ROC AUC for seasonal vaccine: 0.7918871010420103
Mean ROC AUC: 0.7874272506822622
[[0.03365079 0.07739073]
 [0.01516127 0.01323429]
 [0.22937406 0.27589797]
 ...
 [0.10167946 0.26534063]
 [0.25555113 0.75082858]
 [0.25440945 0.4626989 ]]
       respondent_id  xyz_vaccine  seasonal_vaccine
0              26707     0.033651          0.077391
1              26708     0.015161          0.013234
2              26709     0.229374          0.275898
3              26710     0.564942          0.978360
4              26711     0.211314          0.431995
...              ...          ...               ...
26703          53410     0.166031          0.349265
26704          53411     0.141344          0.049243
26705          53412     0.101679          0.265341
26706          53413     0.255551          0.750829
26707          53414     0.254409          0.462699

[26708 rows x 3 columns]
