Importing Dependencies

In [23]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_auc_score

Importing Training Data's Features

In [24]:
train_features = pd.read_csv('training_set_features.csv', index_col='respondent_id')
train_features.index.name = None
train_features.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


Data Cleaning and Standardising

In [25]:
train_features["income_poverty"] = train_features["income_poverty"].map({"Below Poverty": 0, "<= $75,000, Above Poverty": 1, "> $75,000": 2})
train_features["rent_or_own"] = train_features["rent_or_own"].map({"Rent": 0, "Own": 1})
train_features["employment_status"] = train_features["employment_status"].map({"Not in Labor Force": 0, "Unemployed": 1, "Employed": 2})
train_features["hhs_geo_region"] = train_features["hhs_geo_region"].map({"atmpeygn": 0, "bhuqouqj": 1, "dqpwygqj": 2, "fpwskwrf": 3, "kbazzjca": 4, "lrircsnp": 5, "lzgpxyit": 6, "mlyzmhmf": 7, "oxchjgsf": 8, "qufhixun": 9})
train_features["marital_status"] = train_features["marital_status"].map({"Not Married": 0, "Married": 1})
train_features["age_group"] = train_features["age_group"].map({"18 - 34 Years": 0, "35 - 44 Years": 1, "45 - 54 Years": 2, "55 - 64 Years": 3, "65+ Years": 4})
train_features["education"] = train_features["education"].map({"< 12 Years": 0, "12 Years": 1, "Some College": 2, "College Graduate": 3})
train_features["race"] = train_features["race"].map({"White": 0, "Black": 1, "Hispanic": 2, "Other or Multiple": 3})
train_features["sex"] = train_features["sex"].map({"Female": 0, "Male": 1})

train_features.drop(columns=['employment_occupation', 'health_insurance', 'employment_industry', 'census_msa'], inplace=True)
train_features.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,household_adults,household_children
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0,0,0.0,0.0,1.0,0.0,8,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0,1,0.0,0.0,0.0,2.0,1,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,3.0,0,1,1.0,0.0,1.0,2.0,9,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0,0,0.0,0.0,0.0,0.0,5,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,2.0,0,0,1.0,1.0,1.0,2.0,9,1.0,0.0


In [26]:
print(train_features.isnull().sum().head())
print(train_features.shape)

train_features = train_features.apply(lambda x: x.fillna(x.mode()[0]))          # Filling with mode is best for categorical data
# train_features = train_features.apply(lambda x: x.fillna(x.mean()))

train_features.head()

xyz_concern                   92
xyz_knowledge                116
behavioral_antiviral_meds     71
behavioral_avoidance         208
behavioral_face_mask          19
dtype: int64
(26707, 31)


Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,household_adults,household_children
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0,0,0.0,0.0,1.0,0.0,8,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0,1,0.0,0.0,0.0,2.0,1,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0,1,1.0,0.0,1.0,2.0,9,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0,0,0.0,0.0,0.0,0.0,5,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,2.0,0,0,1.0,1.0,1.0,2.0,9,1.0,0.0


Feature Selection

In [27]:
x = train_features
y = pd.read_csv('training_set_labels.csv', index_col='respondent_id')
y.index.name = None
y1 = y["xyz_vaccine"]
y2 = y["seasonal_vaccine"]

print(y.head())
print(y1.head())
print(y2.head())

   xyz_vaccine  seasonal_vaccine
0            0                 0
1            0                 1
2            0                 0
3            0                 1
4            0                 0
0    0
1    0
2    0
3    0
4    0
Name: xyz_vaccine, dtype: int64
0    0
1    1
2    0
3    1
4    0
Name: seasonal_vaccine, dtype: int64


In [28]:
selector = SelectKBest(score_func=chi2, k=23)
fit = selector.fit(x, y)

selected_indices = selector.get_support(indices=True)
selected_features = x.columns[selected_indices]

print("Dropped Features :")
for i, feature in enumerate(x.columns):
    if i not in selected_indices:
        print(f"{i+1}. {feature}")

x = x[selected_features]
train_features.head()

Dropped Features :
3. behavioral_antiviral_meds
4. behavioral_avoidance
8. behavioral_outside_home
25. income_poverty
26. marital_status
27. rent_or_own
29. hhs_geo_region
30. household_adults


Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,household_adults,household_children
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0,0,0.0,0.0,1.0,0.0,8,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0,1,0.0,0.0,0.0,2.0,1,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0,1,1.0,0.0,1.0,2.0,9,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0,0,0.0,0.0,0.0,0.0,5,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,2.0,0,0,1.0,1.0,1.0,2.0,9,1.0,0.0


Modelling

In [37]:
# xyz_vaccine model (y1)
x1_train, x1_test, y1_train, y1_test = train_test_split(x, y1, test_size=0.2, random_state=42)

model1 = LogisticRegression(random_state=1)
model1.fit(x1_train, y1_train.values)

print(model1.score(x1_train, y1_train))

0.8344956704891178


In [30]:
# seasonal_vaccine model (y2)
x2_train, x2_test, y2_train, y2_test = train_test_split(x, y2, test_size=0.2, random_state=42)

model2 = LogisticRegression(random_state=1)
model2.fit(x2_train, y2_train.values)

print(model2.score(x2_train, y2_train))

0.7715422419845541


Evaluation

In [31]:
print("xyz_vaccine model accuracy :", model1.score(x1_test, y1_test))
print("seasonal_vaccine model accuracy :", model2.score(x2_test, y2_test))
print()
print("xyz_vaccine model roc-auc score :", roc_auc_score(y1_test, model1.predict_proba(x1_test)[:, 1]))
print("seasonal_vaccine model roc-auc score :", roc_auc_score(y2_test, model2.predict_proba(x2_test)[:, 1]))

xyz_vaccine model accuracy : 0.8391988019468364
seasonal_vaccine model accuracy : 0.7815424934481467

xyz_vaccine model roc-auc score : 0.8272624990545344
seasonal_vaccine model roc-auc score : 0.8495090420459619


Final Test

In [32]:
test = pd.read_csv('test_set_features.csv', index_col='respondent_id')
test.index.name = None

test["income_poverty"] = test["income_poverty"].map({"Below Poverty": 0, "<= $75,000, Above Poverty": 1, "> $75,000": 2})
test["rent_or_own"] = test["rent_or_own"].map({"Rent": 0, "Own": 1})
test["employment_status"] = test["employment_status"].map({"Not in Labor Force": 0, "Unemployed": 1, "Employed": 2})
test["hhs_geo_region"] = test["hhs_geo_region"].map({"atmpeygn": 0, "bhuqouqj": 1, "dqpwygqj": 2, "fpwskwrf": 3, "kbazzjca": 4, "lrircsnp": 5, "lzgpxyit": 6, "mlyzmhmf": 7, "oxchjgsf": 8, "qufhixun": 9})
test["marital_status"] = test["marital_status"].map({"Not Married": 0, "Married": 1})
test["age_group"] = test["age_group"].map({"18 - 34 Years": 0, "35 - 44 Years": 1, "45 - 54 Years": 2, "55 - 64 Years": 3, "65+ Years": 4})
test["education"] = test["education"].map({"< 12 Years": 0, "12 Years": 1, "Some College": 2, "College Graduate": 3})
test["race"] = test["race"].map({"White": 0, "Black": 1, "Hispanic": 2, "Other or Multiple": 3})
test["sex"] = test["sex"].map({"Female": 0, "Male": 1})

test.drop(columns=['employment_occupation', 'health_insurance', 'employment_industry', 'census_msa'], inplace=True)
test.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,household_adults,household_children
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,3.0,2,0,2.0,0.0,0.0,2.0,7,1.0,0.0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0,1,0.0,0.0,0.0,2.0,1,3.0,0.0
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,3.0,0,1,2.0,1.0,1.0,2.0,5,1.0,0.0
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0,0,1.0,1.0,1.0,0.0,5,1.0,0.0
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1,0,1.0,0.0,1.0,2.0,6,0.0,1.0


In [33]:
test = test.apply(lambda x: x.fillna(x.mode()[0]))

In [34]:
y1_pred = model1.predict_proba(test[selected_features])[:, 1]
y1_pred_rounded = np.around(y1_pred, 1)
y2_pred = model2.predict_proba(test[selected_features])[:, 1]
y2_pred_rounded = np.around(y2_pred, 1)

df = pd.DataFrame({'h1n1_vaccine': y1_pred_rounded, 'seasonal_vaccine': y2_pred_rounded}, index=test.index)
df.index.name = 'respondent_id'
df.to_csv('submission.csv')
df

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.1,0.3
26708,0.0,0.1
26709,0.4,0.7
26710,0.4,0.9
26711,0.2,0.5
...,...,...
53410,0.3,0.5
53411,0.1,0.3
53412,0.1,0.2
53413,0.1,0.4
