In [64]:
#importing the modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [65]:
# Load the raw training features and take a first look at them.
train_set = pd.read_csv("training_set_features.csv")
print(train_set.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train_set.info()

## Pre-processing

# Missing values: fill every column with its most frequent value
# (the first row of DataFrame.mode()).
train_set = train_set.fillna(train_set.mode().iloc[0])

# Categorical labels need encoding: one-hot encode them. drop_first=True keeps
# k-1 indicator columns for a feature with k levels.
train_set = pd.get_dummies(train_set, drop_first=True)

print(train_set)
train_set.info()

       respondent_id   xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
count   26707.000000  26615.000000   26591.000000               26636.000000   
mean    13353.000000      1.618486       1.262532                   0.048844   
std      7709.791156      0.910311       0.618149                   0.215545   
min         0.000000      0.000000       0.000000                   0.000000   
25%      6676.500000      1.000000       1.000000                   0.000000   
50%     13353.000000      2.000000       1.000000                   0.000000   
75%     20029.500000      2.000000       2.000000                   0.000000   
max     26706.000000      3.000000       2.000000                   1.000000   

       behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
count          26499.000000          26688.000000           26665.000000   
mean               0.725612              0.068982               0.825614   
std                0.446214              0.253429  

In [66]:
def bool_to_binary(data):
    """Cast every column of ``data`` to float64, in place.

    Boolean indicator columns (such as those produced by one-hot encoding)
    become 0.0 / 1.0; numeric columns keep their values as floats.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all convertible to float. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenience.

    Raises
    ------
    ValueError or TypeError
        Whatever ``Series.astype`` raises for a column that cannot be
        converted; columns processed before the failure stay converted.
    """
    for label in data.columns:
        column_as_float = data[label].astype("float64")
        data[label] = column_as_float
    return data

# bool_to_binary converts the frame in place and hands back the same object;
# rebinding the name only makes the data flow explicit.
train_set = bool_to_binary(train_set)

print(train_set)

       respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0                0.0          1.0            0.0                        0.0   
1                1.0          3.0            2.0                        0.0   
2                2.0          1.0            1.0                        0.0   
3                3.0          1.0            1.0                        0.0   
4                4.0          2.0            1.0                        0.0   
...              ...          ...            ...                        ...   
26702        26702.0          2.0            0.0                        0.0   
26703        26703.0          1.0            2.0                        0.0   
26704        26704.0          2.0            2.0                        0.0   
26705        26705.0          1.0            1.0                        0.0   
26706        26706.0          0.0            0.0                        0.0   

       behavioral_avoidance  behavioral_face_mask  

In [67]:
# Load the training labels and the raw test features.
labels = pd.read_csv("training_set_labels.csv")

test_set = pd.read_csv("test_set_features.csv")

## Pre-processing (same steps as applied to the training features)

# Missing values: fill every column with its most frequent value
# (the first row of DataFrame.mode()).
test_set = test_set.fillna(test_set.mode().iloc[0])

# Categorical labels need encoding: one-hot encode them. drop_first=True keeps
# k-1 indicator columns for a feature with k levels.
test_set = pd.get_dummies(test_set, drop_first=True)

# get_dummies derives its indicator columns from the categories present in the
# frame it is given, so encoding train and test separately can produce
# different column sets or a different column order. Align the test frame to
# the training layout: indicators missing from the test data are added as
# all-zero columns, indicators unknown to the training data are dropped.
test_set = test_set.reindex(columns=train_set.columns, fill_value=0)

print(test_set)
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit a stray "None").
test_set.info()

# Converts every column to float in place.
bool_to_binary(test_set)

print(test_set)

       respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              26707          2.0            2.0                        0.0   
1              26708          1.0            1.0                        0.0   
2              26709          2.0            2.0                        0.0   
3              26710          1.0            1.0                        0.0   
4              26711          3.0            1.0                        1.0   
...              ...          ...            ...                        ...   
26703          53410          1.0            1.0                        0.0   
26704          53411          3.0            1.0                        0.0   
26705          53412          0.0            1.0                        0.0   
26706          53413          3.0            1.0                        0.0   
26707          53414          2.0            1.0                        0.0   

       behavioral_avoidance  behavioral_face_mask  

In [68]:
# Pre-processing is complete; modelling starts here.

# Baseline model: Gaussian naive Bayes for the xyz_vaccine target.
target_xyz = labels['xyz_vaccine']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train_xyz, X_test_xyz, y_train_xyz, y_test_xyz = train_test_split(
    train_set,
    target_xyz,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted estimator, so construction and fitting are chained.
nb_classifier_xyz = GaussianNB().fit(X_train_xyz, y_train_xyz)

# Column 1 of predict_proba is the probability of the second entry of
# classes_ (presumably label 1, i.e. vaccinated -- confirm against the labels).
pred_probs_xyz = nb_classifier_xyz.predict_proba(X_test_xyz)[:, 1]

roc_auc_xyz = roc_auc_score(y_true=y_test_xyz, y_score=pred_probs_xyz)

print(f"ROC AUC for XYZ Vaccine: {roc_auc_xyz}")


ROC AUC for XYZ Vaccine: 0.7962486028120247


In [69]:
# The naive Bayes baseline reached a ROC AUC of about 0.79; try a random
# forest on the same split to improve on it.
# RandomForestClassifier and roc_auc_score come from the notebook's import
# cell, so nothing is imported here.

rf_classifier_xyz = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier_xyz.fit(X_train_xyz, y_train_xyz)

# NOTE: despite the "_train" suffix, these are the predictions and the score on
# the held-out split (X_test_xyz / y_test_xyz). The names are kept because
# later cells refer to them.
pred_probs_xyz_rf_train = rf_classifier_xyz.predict_proba(X_test_xyz)[:, 1]

roc_auc_xyz_rf_train = roc_auc_score(y_test_xyz, pred_probs_xyz_rf_train)

print(f"ROC AUC for XYZ Vaccine (Random Forest): {roc_auc_xyz_rf_train}")


ROC AUC for XYZ Vaccine (Random Forest): 0.831091424417383


In [70]:
# The random forest improved the ROC AUC for xyz_vaccine, so the same model is
# now applied to the seasonal_vaccine target.

target_seasonal = labels['seasonal_vaccine']

# Same 80/20 split settings (and seed) as used for the xyz_vaccine target.
X_train_seasonal, X_test_seasonal, y_train_seasonal, y_test_seasonal = train_test_split(
    train_set, target_seasonal, test_size=0.2, random_state=42
)

rf_classifier_seasonal = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier_seasonal.fit(X_train_seasonal, y_train_seasonal)

# NOTE: despite the "_train" suffix, these are the predictions and the score on
# the held-out split. The names are kept because later cells refer to them.
pred_probs_seasonal_rf_train = rf_classifier_seasonal.predict_proba(X_test_seasonal)[:, 1]

roc_auc_seasonal_rf_train = roc_auc_score(y_test_seasonal, pred_probs_seasonal_rf_train)

# Print the score computed just above. The original referenced
# `roc_auc_seasonal_rf`, a name never assigned in this notebook, which raises
# NameError on a fresh kernel.
print(f"ROC AUC for seasonal vaccine (Random Forest): {roc_auc_seasonal_rf_train}")



ROC AUC for seasonal vaccine (Random Forest): 0.8520934212325679


In [71]:
# Average the held-out ROC AUC of the two random-forest models
# (xyz_vaccine and seasonal_vaccine).
mean_roc_auc = np.asarray([roc_auc_xyz_rf_train, roc_auc_seasonal_rf_train]).mean()

print('mean ROC AUC', mean_roc_auc)

# A mean ROC AUC of about 0.84 is considered a good result here.

mean ROC AUC 0.8415924228249755


In [72]:

# rf_classifier_xyz = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_classifier_xyz.fit(X_test_xyz, y_test_xyz)

# pred_probs_xyz_rf_test = rf_classifier_xyz.predict_proba(X_train_xyz)[:, 1]

# roc_auc_xyz_rf_test = roc_auc_score(y_train_xyz, pred_probs_xyz_rf_test)

# print(f"ROC AUC for XYZ Vaccine (Random Forest with train set): {roc_auc_xyz_rf_test}")


In [73]:
# rf_classifier_seasonal = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_classifier_seasonal.fit(X_test_seasonal, y_test_seasonal)

# pred_probs_seasonal_rf_test = rf_classifier_seasonal.predict_proba(X_train_seasonal)[:, 1]

# roc_auc_seasonal_rf_test = roc_auc_score(y_train_seasonal, pred_probs_seasonal_rf_test)

# print(f"ROC AUC for seasonal vaccine (Random Forest with train set): {roc_auc_seasonal_rf}")

In [74]:
# mean_roc_auc_test = np.mean([roc_auc_xyz_rf_test, roc_auc_seasonal_rf_test])

# print('mean ROC AUC', mean_roc_auc_test)

# #an roc auc of 0.82 is obtained

In [77]:
# Build the submission file.

# The submission_format.csv template supplies the respondent_id column.
submission1 = pd.read_csv("submission_format.csv")

# Predictions are paired with the template rows purely by position, so verify
# that both frames list the same respondents in the same order first.
# np.array_equal also returns False when the lengths differ.
assert np.array_equal(
    submission1['respondent_id'].to_numpy(),
    test_set['respondent_id'].to_numpy(),
), "submission_format.csv and test_set do not list the same respondents in the same order"

# Column 1 of predict_proba is the probability of the second entry of classes_
# (presumably label 1 -- confirm against the labels).
test_pred_probs_xyz = rf_classifier_xyz.predict_proba(test_set)[:, 1]
test_pred_probs_seasonal = rf_classifier_seasonal.predict_proba(test_set)[:, 1]

submission = pd.DataFrame({
    'respondent_id': submission1['respondent_id'],
    'xyz_vaccine': test_pred_probs_xyz,
    'seasonal_vaccine': test_pred_probs_seasonal
})

# Write next to the notebook instead of to a machine-specific absolute path
# (the original "C:/Users/.../Downloads/..." location exists on one computer
# only). A distinct file name keeps the submission_format.csv template intact.
submission.to_csv("submission.csv", index=False)

