# In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

# Load the competition CSVs (paths are relative to the working directory).
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Attach the labels to their feature rows. pd.merge defaults to an inner join,
# so only respondents present in both files are kept.
train_data = pd.merge(train_features, train_labels, on='respondent_id')

# Fill missing values with a placeholder that matches each column's dtype:
# numeric columns get the number -1, every other column gets the string '-1'.
# Filling everything with the string '-1' would turn numeric columns into
# mixed float/str object columns that only work because downstream code
# happens to re-parse them as numbers.
for frame in (train_data, test_features):
    placeholders = {
        column: -1 if pd.api.types.is_numeric_dtype(dtype) else '-1'
        for column, dtype in frame.dtypes.items()
    }
    frame.fillna(value=placeholders, inplace=True)

# Columns that hold categorical answers and must be integer-encoded.
categorical_features = ['age_group', 'education', 'race', 'sex', 'income_poverty',
                        'marital_status', 'rent_or_own', 'employment_status',
                        'hhs_geo_region', 'census_msa', 'employment_industry',
                        'employment_occupation']

# Encode categorical features; the fitted encoder for each column is kept in
# label_encoders so the integer codes can be mapped back later.
label_encoders = {}
for feature in categorical_features:
    # Cast to str so every value in the column has one comparable type.
    train_data[feature] = train_data[feature].astype(str)
    test_features[feature] = test_features[feature].astype(str)

    # Fit on the union of train and test values. LabelEncoder.transform raises
    # ValueError for labels it was not fitted on, so a category that appears
    # only in the test set would otherwise abort the run. When the test set
    # introduces no new categories, the fitted classes (and therefore the
    # integer codes) are identical to fitting on the training column alone.
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_data[feature], test_features[feature]],
                          ignore_index=True))
    train_data[feature] = encoder.transform(train_data[feature])
    test_features[feature] = encoder.transform(test_features[feature])
    label_encoders[feature] = encoder

# Ordinal survey columns that are rescaled to zero mean and unit variance.
numerical_features = [
    'xyz_concern',
    'xyz_knowledge',
    'opinion_xyz_vacc_effective',
    'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test frame.
scaler = StandardScaler().fit(train_data[numerical_features])
for frame in (train_data, test_features):
    frame[numerical_features] = scaler.transform(frame[numerical_features])

# Split the merged training frame into the two target columns and the
# predictors; respondent_id is an identifier, not a predictor, so it is
# dropped from both feature matrices.
y_train = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X_train = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
X_test = test_features.drop(columns=['respondent_id'])

# One random forest per target: MultiOutputClassifier fits a copy of the base
# estimator for each label column, using all available cores (n_jobs=-1).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(rf, n_jobs=-1).fit(X_train, y_train)

# Make predictions. For a MultiOutputClassifier, predict_proba returns one
# (n_samples, n_classes) array per target, in the column order of y_train
# (xyz_vaccine first, seasonal_vaccine second).
predictions = model.predict_proba(X_test)

# Prepare submission: take the second probability column of each target with
# a vectorized slice instead of a per-row Python loop.
# NOTE(review): assumes the labels are binary 0/1, so column 1 is the
# probability of the positive class — confirm against the label file.
respondent_ids = test_features['respondent_id']
xyz_preds = predictions[0][:, 1]
seasonal_preds = predictions[1][:, 1]

submission = pd.DataFrame({
    'respondent_id': respondent_ids,
    'xyz_vaccine': xyz_preds,
    'seasonal_vaccine': seasonal_preds
})

# Save to CSV without the pandas index column.
# NOTE(review): if a provided template is also named 'submission_format.csv'
# this overwrites it — confirm that is intended.
submission.to_csv('submission_format.csv', index=False)

# Estimate out-of-sample ROC AUC for each target separately with 5-fold
# cross-validation of the base random forest on the training data.
cv_scores_xyz, cv_scores_seasonal = (
    cross_val_score(rf, X_train, y_train[target], cv=5, scoring='roc_auc')
    for target in ('xyz_vaccine', 'seasonal_vaccine')
)

# Report the mean score across the folds.
print(f'XYZ Vaccine ROC AUC: {cv_scores_xyz.mean()}')
print(f'Seasonal Vaccine ROC AUC: {cv_scores_seasonal.mean()}')

""" This code imports the necessary libraries (pandas, NumPy, and scikit-learn) and merges the training features and labels into one dataframe (train_data), using 'respondent_id' as the merge key. Numerical features are standardized with StandardScaler() to have zero mean and unit variance. The code follows an algorithm designed to process categorical and numerical data, train random forest models for multiple labels, make predictions, and evaluate performance with the ROC AUC metric, ensuring the data is consistent and suitable for submission to machine learning competitions or the like."""

# Output:
# XYZ Vaccine ROC AUC: 0.8577214810963284
# Seasonal Vaccine ROC AUC: 0.8521805736207897
