In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Input
from tensorflow.keras.metrics import AUC
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [2]:


# Step 1: Load the Data
# Read the three CSV files in order: training features, training labels, test features
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

# 'respondent_id' is dropped everywhere because it is not a feature for prediction
X = train_features.drop('respondent_id', axis=1)
y = train_labels.drop('respondent_id', axis=1)
X_test = test_features.drop('respondent_id', axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Route columns by dtype: int64/float64 columns are numerical, object columns are categorical.
# Columns of any other dtype are not listed and are therefore not passed to either pipeline.
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Numerical pipeline: fill gaps with the column median, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Single preprocessor that applies each pipeline to its own column group
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])



In [3]:
# Apply preprocessing to the training, validation and test sets
def _to_dense_array(matrix):
    """Return `matrix` as a dense numpy array.

    ColumnTransformer stacks its outputs as a SciPy sparse matrix when the
    overall density falls below its `sparse_threshold`. Calling `np.array()`
    on a sparse matrix does not densify it (it produces a 0-d object array),
    so sparse results are converted explicitly with `.toarray()`.
    """
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return np.asarray(matrix)


# The preprocessor is fitted on the training split only; validation and test
# data are transformed with the statistics learned from the training split.
# NOTE: these assignments overwrite the raw frames, so the cell that creates
# X_train / X_val / X_test must be re-run before this cell is executed again.
X_train = _to_dense_array(preprocessor.fit_transform(X_train))
X_val = _to_dense_array(preprocessor.transform(X_val))
X_test = _to_dense_array(preprocessor.transform(X_test))

# Labels become plain numpy arrays so they can be indexed positionally later
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)

In [4]:
# Step 3: Build and Train the Deep Learning Model
def build_sequential_model(input_shape, n_outputs=2):
    """Build the (uncompiled) feed-forward network used for the vaccine labels.

    Architecture: three Dense/BatchNormalization/Dropout stages of width
    256, 128 and 64 (dropout 0.5, 0.4, 0.3), a 32-unit Dense layer, and a
    sigmoid output layer with one independent probability per label.

    Args:
        input_shape: Number of input features (an int, e.g. ``X.shape[1]``).
        n_outputs: Number of sigmoid output units. Defaults to 2, matching
            the two label columns predicted in this notebook.

    Returns:
        A ``Sequential`` model that still has to be compiled by the caller.
    """
    # An explicit Input object replaces the `input_shape=` layer argument,
    # which recent Keras releases flag as deprecated for Sequential models.
    return Sequential([
        Input(shape=(input_shape,)),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(n_outputs, activation='sigmoid'),
    ])

# Compile the model
model = build_sequential_model(X_train.shape[1])

# The AUC metric is given an explicit name. An unnamed metric receives an
# auto-generated name that can change when this cell is re-run in the same
# kernel (e.g. 'auc_1'), which would leave the callbacks below monitoring a
# 'val_auc' key that no longer exists in the training logs.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[AUC(name='auc')],
)

# Early stopping and learning rate reduction callbacks.
# Both watch validation AUC, where higher is better (mode='max').
early_stopping = EarlyStopping(
    monitor='val_auc',
    patience=10,
    restore_best_weights=True,  # roll back to the best epoch when stopping
    mode='max',
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_auc',
    factor=0.5,   # halve the learning rate after `patience` epochs without improvement
    patience=5,
    min_lr=1e-6,
    mode='max',
)

# Train the model
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


In [8]:
# Step 4: Validate the Model
# Score the held-out validation rows; the model returns one probability per output unit
y_val_pred_proba = model.predict(X_val)

# Split the prediction matrix by output column.
# NOTE(review): assumes the label columns are ordered (xyz_vaccine, seasonal_vaccine)
# — confirm against training_set_labels.csv.
y_val_pred_proba_xyz, y_val_pred_proba_seasonal = (
    y_val_pred_proba[:, 0],
    y_val_pred_proba[:, 1],
)

# ROC AUC per label, then the unweighted mean of the two scores
roc_auc_xyz, roc_auc_seasonal = (
    roc_auc_score(y_val[:, column], predicted)
    for column, predicted in enumerate(
        (y_val_pred_proba_xyz, y_val_pred_proba_seasonal)
    )
)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}',
    f'Mean ROC AUC: {mean_roc_auc}',
    sep='\n',
)

ROC AUC for xyz_vaccine: 0.8381033120708636
ROC AUC for seasonal_vaccine: 0.8612858374891561
Mean ROC AUC: 0.8496945747800099


In [9]:
# Step 5: Make Predictions on Test Set
# Run the trained model on the preprocessed test features
y_test_pred_proba = model.predict(X_test)

# Pull out the two output columns: index 0 -> xyz, index 1 -> seasonal
y_test_pred_proba_xyz, y_test_pred_proba_seasonal = (
    y_test_pred_proba[:, column] for column in (0, 1)
)



In [10]:
# Step 6: Prepare Submission
# Start from the respondent IDs of the test file and attach one column of
# predicted probabilities per vaccine label
submission = test_features[['respondent_id']].copy()
submission['xyz_vaccine'] = y_test_pred_proba_xyz
submission['seasonal_vaccine'] = y_test_pred_proba_seasonal

# Write the predictions to CSV without the DataFrame index
submission.to_csv('test_set_labels.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
