In [10]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the training labels and the training features from disk.
labels = pd.read_csv('/content/training_set_labels.csv')
features = pd.read_csv('/content/training_set_features.csv')

# Join the two tables on their shared respondent_id key (default inner join).
data = labels.merge(features, on='respondent_id')

# Pull out the two target series, then build the feature matrix by dropping
# both targets and the respondent_id key from the merged table.
y_xyz, y_seasonal = data['xyz_vaccine'], data['seasonal_vaccine']
X = data.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis='columns')

# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every remaining column as numerical.
object_dtypes = ['object']
num_features = X.select_dtypes(exclude=object_dtypes).columns
cat_features = X.select_dtypes(include=object_dtypes).columns

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform from raising on categories
# that were not present when the encoder was fitted.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill gaps with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own branch; the numerical block comes
# first in the transformed output, followed by the one-hot block.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Train and validate the xyz_vaccine model: logistic regression with an
# elastic-net penalty (l1_ratio=0.5 weights the L1 and L2 terms equally),
# fitted with the 'saga' solver.
# Hold out 20% of the rows for validation; random_state pins the split.
X_train, X_val, y_train, y_val = train_test_split(X, y_xyz, test_size=0.2, random_state=42)
model_xyz = Pipeline([
    # Give this pipeline its own copy of the preprocessor. Pipeline fits the
    # step objects it is handed in place, so a preprocessor instance shared
    # between pipelines would be refit by whichever pipeline is fitted last,
    # silently changing the transformers this model predicts with.
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=42, max_iter=10000))
])
model_xyz.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second entry in the
# classifier's classes_ (the positive class, assuming 0/1 labels).
y_pred_xyz = model_xyz.predict_proba(X_val)[:, 1]
roc_auc_xyz = roc_auc_score(y_val, y_pred_xyz)
print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')

# Train and validate the seasonal_vaccine model: logistic regression with an
# elastic-net penalty (l1_ratio=0.5 weights the L1 and L2 terms equally),
# fitted with the 'saga' solver.
# Hold out 20% of the rows for validation; random_state pins the split.
# Note that this rebinds X_train / X_val / y_train / y_val.
X_train, X_val, y_train, y_val = train_test_split(X, y_seasonal, test_size=0.2, random_state=42)
model_seasonal = Pipeline([
    # Give this pipeline its own copy of the preprocessor. Pipeline fits the
    # step objects it is handed in place, so fitting this model with a shared
    # preprocessor instance would also refit the transformers of any other
    # pipeline that holds the same object.
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=42, max_iter=10000))
])
model_seasonal.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the second entry in the
# classifier's classes_ (the positive class, assuming 0/1 labels).
y_pred_seasonal = model_seasonal.predict_proba(X_val)[:, 1]
roc_auc_seasonal = roc_auc_score(y_val, y_pred_seasonal)
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')

# Headline score: the unweighted mean of the two validation ROC AUC values.
auc_total = roc_auc_xyz + roc_auc_seasonal
average_roc_auc = auc_total / 2
print(f'Average ROC AUC: {average_roc_auc}')

# Score the test-set features with both fitted models, keeping column 1 of
# predict_proba for each target.
# NOTE(review): test_data is passed with respondent_id still present;
# presumably the fitted ColumnTransformer selects only its configured columns
# by name, so the id is ignored - confirm for the scikit-learn version in use.
test_data = pd.read_csv('/content/test_set_features.csv')
test_preds_xyz = model_xyz.predict_proba(test_data)[:, 1]
test_preds_seasonal = model_seasonal.predict_proba(test_data)[:, 1]

# Assemble one row per respondent: the id followed by the two predicted
# probabilities, then write the table without the DataFrame index.
submission = test_data[['respondent_id']].assign(
    xyz_vaccine=test_preds_xyz,
    seasonal_vaccine=test_preds_seasonal,
)
submission.to_csv('submission.csv', index=False)


ROC AUC for xyz_vaccine: 0.831444293169957
ROC AUC for seasonal_vaccine: 0.8560803156604843
Average ROC AUC: 0.8437623044152207
