In [5]:
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Directory that holds the four competition CSV files. It is defined once so
# the notebook can be pointed at another location by editing a single line.
DATA_DIR = Path(r'C:\Users\User\Desktop\New folder\Py Files')

# Load the datasets (pandas accepts pathlib.Path objects directly)
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv')
test_set_features = pd.read_csv(DATA_DIR / 'test_set_features.csv')
training_set_features = pd.read_csv(DATA_DIR / 'training_set_features.csv')
training_set_labels = pd.read_csv(DATA_DIR / 'training_set_labels.csv')

# Display the first few rows of each dataset to understand their structure
print("Submission Format:")
print(submission_format.head())
print("\nTest Set Features:")
print(test_set_features.head())
print("\nTraining Set Features:")
print(training_set_features.head())
print("\nTraining Set Labels:")
print(training_set_labels.head())


Submission Format:
   respondent_id  h1n1_vaccine  seasonal_vaccine
0          26707           0.5               0.7
1          26708           0.5               0.7
2          26709           0.5               0.7
3          26710           0.5               0.7
4          26711           0.5               0.7

Test Set Features:
   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0          26707          2.0            2.0                        0.0   
1          26708          1.0            1.0                        0.0   
2          26709          2.0            2.0                        0.0   
3          26710          1.0            1.0                        0.0   
4          26711          3.0            1.0                        1.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   1.0                   0.0                    1.0   
1                   0.0                   0.0                    0.0   


In [7]:
# Inspect the shape of the datasets
print(f"Submission Format Shape: {submission_format.shape}")
print(f"Test Set Features Shape: {test_set_features.shape}")
print(f"Training Set Features Shape: {training_set_features.shape}")
print(f"Training Set Labels Shape: {training_set_labels.shape}")

# Inspect the data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nTraining Set Features Info:")
training_set_features.info()
print("\nTraining Set Labels Info:")
training_set_labels.info()

# Summary statistics of the training set features
print("\nTraining Set Features Description:")
print(training_set_features.describe())


Submission Format Shape: (26708, 3)
Test Set Features Shape: (26708, 36)
Training Set Features Shape: (26707, 36)
Training Set Labels Shape: (26707, 3)

Training Set Features Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   xyz_concern                  26615 non-null  float64
 2   xyz_knowledge                26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_re

In [13]:
from sklearn.impute import SimpleImputer

# Derive the column groups from the raw training features so this cell does
# not depend on variables that are only assigned by a later cell.
feature_frame = training_set_features.drop(columns=['respondent_id'])
categorical_features = feature_frame.select_dtypes(include=['object']).columns
numerical_features = feature_frame.select_dtypes(exclude=['object']).columns

# ColumnTransformer applies its transformers side by side (each one sees the
# original columns) and concatenates the results. Imputation followed by
# one-hot encoding of the same columns therefore has to be chained inside a
# nested Pipeline; listing them as two separate entries would emit the raw
# imputed strings next to the encoded columns.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define preprocessing pipeline with imputation
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Update the model pipeline: one random forest is fitted per target column
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])


In [25]:
from sklearn.impute import SimpleImputer  # Import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier  # Use an estimator that handles NaN values natively

# Merge training features and labels on the shared respondent identifier
training_data = pd.merge(training_set_features, training_set_labels, on='respondent_id')

# Separate features and targets
features = training_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
targets = training_data[['xyz_vaccine', 'seasonal_vaccine']]

# Identify categorical and numerical features
categorical_features = features.select_dtypes(include=['object']).columns
numerical_features = features.select_dtypes(exclude=['object']).columns

# Preprocessing pipeline.
# sparse_threshold=0 forces a dense output matrix: OneHotEncoder produces
# sparse columns, and HistGradientBoostingClassifier only accepts dense input,
# so relying on the default density heuristic would be fragile.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numerical_features),  # Impute missing values with median
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    sparse_threshold=0)

# Create the model pipeline with HistGradientBoostingClassifier
# (MultiOutputClassifier fits one independent classifier per target column)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(HistGradientBoostingClassifier(random_state=42)))
])

# Split the data (80% train / 20% validation, fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(features, targets, test_size=0.2, random_state=42)

# Fit the model (including preprocessing)
model.fit(X_train, y_train)

# Predict probabilities for the validation set.
# The result is a list with one (n_samples, n_classes) array per target,
# in the same order as the columns of `targets`.
y_val_pred_proba = model.predict_proba(X_val)

# Keep only the probability of the positive class (column 1) for each target
y_val_pred_proba_formatted = pd.DataFrame({
    'xyz_vaccine': y_val_pred_proba[0][:, 1],
    'seasonal_vaccine': y_val_pred_proba[1][:, 1]
})

# Evaluate the model using ROC AUC, per target and averaged
roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_val_pred_proba_formatted['xyz_vaccine'])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_val_pred_proba_formatted['seasonal_vaccine'])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz:.4f}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal:.4f}')
print(f'Mean ROC AUC: {mean_roc_auc:.4f}')


ROC AUC for xyz_vaccine: 0.8406
ROC AUC for seasonal_vaccine: 0.8608
Mean ROC AUC: 0.8507


In [27]:
# Prepare the test set features (the identifier is not a model input)
X_test = test_set_features.drop(columns=['respondent_id'])

# Predict probabilities for the test set.
# The result is a list with one (n_samples, n_classes) array per target,
# ordered like the training targets: ['xyz_vaccine', 'seasonal_vaccine'].
y_test_pred_proba = model.predict_proba(X_test)

# Take the output column names from the provided submission template so the
# file matches it exactly. The template names the first target `h1n1_vaccine`,
# whereas the training labels call it `xyz_vaccine`; the columns are matched
# by position (first target, then seasonal).
id_col, first_target_col, seasonal_col = submission_format.columns

# Convert predictions to the correct format, keeping only the probability of
# the positive class (column 1) for each target
y_test_pred_proba_formatted = pd.DataFrame({
    id_col: test_set_features['respondent_id'],
    first_target_col: y_test_pred_proba[0][:, 1],
    seasonal_col: y_test_pred_proba[1][:, 1]
})


In [29]:
# Write the formatted test-set predictions to disk as CSV, leaving out the
# DataFrame index so only the prediction columns are stored.
submission_file_path = 'submission.csv'
y_test_pred_proba_formatted.to_csv(
    path_or_buf=submission_file_path,
    index=False,
)
print(f'Submission file saved to {submission_file_path}')


Submission file saved to submission.csv
