In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
import zipfile
import os

zip_file_path = r'C:\Users\ADITYA UPADHYAY\Downloads\dataset and all.zip'
extracted_folder_path = r'C:\Users\ADITYA UPADHYAY\Downloads\dataset and all'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder_path)

train_features = pd.read_csv(os.path.join(extracted_folder_path, 'training_set_features.csv'))
train_labels = pd.read_csv(os.path.join(extracted_folder_path, 'training_set_labels.csv'))
test_features = pd.read_csv(os.path.join(extracted_folder_path, 'test_set_features.csv'))


data = pd.merge(train_features, train_labels, on='respondent_id')

X = data.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)
y = data[['xyz_vaccine', 'seasonal_vaccine']]

categorical_features = [col for col in X.columns if X[col].dtype == 'object']
numerical_features = [col for col in X.columns if X[col].dtype != 'object']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numerical_features),
        ('cat', Pipeline(steps=[
            ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])


model = MultiOutputClassifier(RandomForestClassifier())


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', model)
])


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\ADITYA UPADHYAY\\Downloads\\dataset and all\\submission_format.csv'

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'classifier__estimator__n_estimators': [100, 200],
    'classifier__estimator__max_depth': [5, 10, 20]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='roc_auc', verbose=1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_val)

roc_auc_xyz = roc_auc_score(y_val['xyz_vaccine'], y_pred_proba[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_val['seasonal_vaccine'], y_pred_proba[1][:, 1])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2
print('Mean ROC AUC Score:', mean_roc_auc)


In [None]:
test_pred_proba = best_model.predict_proba(test_features.drop('respondent_id', axis=1))

submission = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_proba[0][:, 1],
    'seasonal_vaccine': test_pred_proba[1][:, 1]
})
submission.to_csv(os.path.join(extracted_folder_path, 'submission.csv'), index=False)


In [None]:
# Just checking if submission file was created
import os

submission_file_path = os.path.join(extracted_folder_path, 'submission.csv')

if os.path.exists(submission_file_path):
    print('Submission file created successfully!')
    # Display the contents of the submission file
    submission = pd.read_csv(submission_file_path)
    print(submission.head())
else:
    print('Submission file not found.')
