In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

In [2]:
# Loading the data
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')
submission_format = pd.read_csv('submission_format.csv')

In [3]:
train_features

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [4]:
train_labels


Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0
...,...,...,...
26702,26702,0,0
26703,26703,0,0
26704,26704,0,1
26705,26705,0,0


In [5]:
test_features

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26703,53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,,,,,dqpwygqj,"MSA, Principle City",1.0,1.0,,
26704,53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,Below Poverty,Married,Rent,Employed,qufhixun,Non-MSA,1.0,3.0,fcxhlnwr,vlluhbov
26705,53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,qufhixun,"MSA, Not Principle City",1.0,0.0,,
26706,53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,bhuqouqj,"MSA, Not Principle City",1.0,0.0,,


In [6]:
submission_format

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7
...,...,...,...
26703,53410,0.5,0.7
26704,53411,0.5,0.7
26705,53412,0.5,0.7
26706,53413,0.5,0.7


In [7]:
# Merge train features and labels
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [33]:
train_data

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,xyz_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,,0,0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,,0,1
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg,0,0


In [9]:
# Identify feature columns
binary_features = [
    'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask',
    'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home',
    'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
    'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance'
]
categorical_features = [
    'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
    'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
    'employment_industry', 'employment_occupation'
]
ordinal_features = [
    'xyz_concern', 'xyz_knowledge', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk',
    'opinion_seas_sick_from_vacc'
]
numerical_features = ['household_adults', 'household_children']

In [10]:
# Preprocessing pipeline for numerical features
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


In [11]:
# Preprocessing pipeline for categorical features
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [12]:
# Preprocessing pipeline for ordinal features
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

In [13]:
# Combine preprocessing pipelines
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
    ('ord', ordinal_pipeline, ordinal_features),
    ('bin', 'passthrough', binary_features)
])

In [14]:
# Prepare features and target variables
X = train_data.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y = train_data[['xyz_vaccine', 'seasonal_vaccine']]
X_test = test_features.drop(columns=['respondent_id'])

In [15]:
# Split the training data for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.multioutput import MultiOutputClassifier

In [23]:
# Define the model
model = MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss'))

In [24]:
# Create pipeline with preprocessing and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [25]:
# Train the model
pipeline.fit(X_train, y_train)


In [26]:
# Predict probabilities for validation set
y_valid_pred = pipeline.predict_proba(X_valid)
y_test_pred = pipeline.predict_proba(X_test)

In [27]:
# Calculate ROC AUC score for validation
y_valid_pred_xyz = y_valid_pred[0][:, 1]
y_valid_pred_seasonal = y_valid_pred[1][:, 1]

roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_valid_pred_xyz)
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_valid_pred_seasonal)
roc_auc_avg = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'Validation ROC AUC Score - XYZ Vaccine: {roc_auc_xyz:.4f}')
print(f'Validation ROC AUC Score - Seasonal Vaccine: {roc_auc_seasonal:.4f}')
print(f'Validation ROC AUC Score - Average: {roc_auc_avg:.4f}')


Validation ROC AUC Score - XYZ Vaccine: 0.8584
Validation ROC AUC Score - Seasonal Vaccine: 0.8565
Validation ROC AUC Score - Average: 0.8574


In [31]:
# Prepare submission
submission_format['xyz_vaccine'] = y_test_pred[0][:, 1]
submission_format['seasonal_vaccine'] = y_test_pred[1][:, 1]

submission = submission_format[['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']]

# Save the submission file
submission.to_csv('submission.csv', index=False)


In [32]:
# Verify the submission file
submission = pd.read_csv('submission.csv')
print(submission.head())


   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707     0.212634          0.092494
1          26708     0.017186          0.018541
2          26709     0.062300          0.628198
3          26710     0.734843          0.935032
4          26711     0.121209          0.367034


In [34]:
import os

# Save the submission file
submission.to_csv('submission.csv', index=False)

# Get the current working directory
current_directory = os.getcwd()
print(f"Current Directory: {current_directory}")

# List all files in the current directory to confirm the presence of the submission file
files = os.listdir(current_directory)
print("Files in the current directory:")
print(files)

# Check specifically for 'submission.csv'
if 'submission.csv' in files:
    print("submission.csv is successfully saved in the current directory.")
else:
    print("submission.csv is not found in the current directory.")


Current Directory: C:\Users\user
Files in the current directory:
['.atom', '.bash_history', '.conda', '.condarc', '.continuum', '.gitconfig', '.idlerc', '.ipynb_checkpoints', '.ipython', '.jupyter', '.keras', '.lesshst', '.matplotlib', '.ms-ad', '.VirtualBox', '.vscode', '3D Objects', '7_hologram.png', '7_hologram_reconstructed.png', '7_hologram_spectrum.png', 'ABHISHEK IITG HACKATHON.ipynb', 'AI PROJECT.ipynb', 'AirQualityUCI.xlsx', 'anaconda3', 'ANOVA.py', 'ansel', 'AppData', 'Application Data', 'array.ipynb', 'ass1.c', 'Attention Mediation Prediction.ipynb', 'BN assignment.txt', 'bn.c', 'code till now-Copy1.ipynb', 'code till now.ipynb', 'Contacts', 'Cookies', 'copy of final project.ipynb', 'cvae.gif', 'data_structures.ipynb', 'Desktop', 'Documents', 'Downloads', 'energydata_complete.csv', 'energydata_complete.ipynb', 'example-app', 'example.png', 'Favorites', 'FINAL PROJECT -Copy1.ipynb', 'FINAL PROJECT .ipynb', 'FInal PRoject-Copy1.ipynb', 'FInal PRoject.ipynb', 'Final_code.ipynb'

In [35]:
# Specify the path where you want to save the file
path_to_save = os.path.join(current_directory, 'submission.csv')
submission.to_csv(path_to_save, index=False)
