## Flu Shot Learning: Predict H1N1 and Seasonal Flu Vaccines - 11/2020 - Balazs Balogh
https://www.drivendata.org/competitions/66/flu-shot-learning/page/211/

Your goal is to predict how likely individuals are to receive their H1N1 and seasonal flu vaccines. Specifically, you'll be predicting two probabilities: one for h1n1_vaccine and one for seasonal_vaccine.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.experimental import enable_hist_gradient_boosting  # explicitly require this experimental feature
from sklearn.ensemble import HistGradientBoostingClassifier # now you can import normally from ensemble

from sklearn import tree, ensemble, metrics, svm
from xgboost import *
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.utils import class_weight

In [2]:
X_train = pd.read_csv(r"c:\Users\BalazsBalogh\Anaconda\bppy\flu_shot_learning\training_set_features.csv")
y_train = pd.read_csv(r"c:\Users\BalazsBalogh\Anaconda\bppy\flu_shot_learning\training_set_labels.csv")

data = X_train.merge(y_train, on='respondent_id')

In [3]:
data.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,0.0,0.0,,3.0,1.0,1.0,4.0,1.0,2.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,3.0,3.0,5.0,5.0,4.0,1.0,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,2.0,3.0,1.0,4.0,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [4]:
"""
Imbalanced the data on the h1n1_vaccine. The seasonal_vaccine is balanced.

h1n1_vaccine
0    21033
1     5674

seasonal_vaccine
0    14272
1    12435

"""

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [5]:
data.describe()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
count,26707.0,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,24547.0,24547.0,25736.0,25887.0,25903.0,14433.0,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0,26707.0,26707.0
mean,13353.0,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,0.220312,0.329735,0.283261,0.08259,0.111918,0.87972,3.850623,2.342566,2.35767,4.025986,2.719162,2.118112,0.886499,0.534583,0.212454,0.465608
std,7709.791156,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,0.414466,0.470126,0.450591,0.275266,0.315271,0.3253,1.007436,1.285539,1.362766,1.086565,1.385055,1.33295,0.753422,0.928173,0.409052,0.498825
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,6676.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0,0.0,0.0
50%,13353.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0,0.0,0.0
75%,20029.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0,0.0,1.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0,1.0,1.0


In [6]:
# employment_occupation / employment_industry / health_insurance / income_poverty -> maybe next time

nulls = round(data.isnull().mean() * 100, 2)
print(nulls.sort_values(ascending=False))

employment_occupation          50.44
employment_industry            49.91
health_insurance               45.96
income_poverty                 16.56
doctor_recc_h1n1                8.09
doctor_recc_seasonal            8.09
rent_or_own                     7.65
employment_status               5.48
marital_status                  5.27
education                       5.27
chronic_med_condition           3.64
child_under_6_months            3.07
health_worker                   3.01
opinion_seas_sick_from_vacc     2.01
opinion_seas_risk               1.92
opinion_seas_vacc_effective     1.73
opinion_h1n1_sick_from_vacc     1.48
opinion_h1n1_vacc_effective     1.46
opinion_h1n1_risk               1.45
household_adults                0.93
household_children              0.93
behavioral_avoidance            0.78
behavioral_touch_face           0.48
h1n1_knowledge                  0.43
h1n1_concern                    0.34
behavioral_large_gatherings     0.33
behavioral_outside_home         0.31
b

In [7]:
# Distinct values for each column, and again, the NULL %

for col, colname in zip(data, data.columns):
    print(colname +  ':', data[col].unique())
    print('NULL % ', round(data[col].isnull().mean() * 100, 2), '\n')

respondent_id: [    0     1     2 ... 26704 26705 26706]
NULL %  0.0 

h1n1_concern: [ 1.  3.  2.  0. nan]
NULL %  0.34 

h1n1_knowledge: [ 0.  2.  1. nan]
NULL %  0.43 

behavioral_antiviral_meds: [ 0.  1. nan]
NULL %  0.27 

behavioral_avoidance: [ 0.  1. nan]
NULL %  0.78 

behavioral_face_mask: [ 0.  1. nan]
NULL %  0.07 

behavioral_wash_hands: [ 0.  1. nan]
NULL %  0.16 

behavioral_large_gatherings: [ 0.  1. nan]
NULL %  0.33 

behavioral_outside_home: [ 1.  0. nan]
NULL %  0.31 

behavioral_touch_face: [ 1.  0. nan]
NULL %  0.48 

doctor_recc_h1n1: [ 0. nan  1.]
NULL %  8.09 

doctor_recc_seasonal: [ 0. nan  1.]
NULL %  8.09 

chronic_med_condition: [ 0.  1. nan]
NULL %  3.64 

child_under_6_months: [ 0.  1. nan]
NULL %  3.07 

health_worker: [ 0.  1. nan]
NULL %  3.01 

health_insurance: [ 1. nan  0.]
NULL %  45.96 

opinion_h1n1_vacc_effective: [ 3.  5.  4.  2.  1. nan]
NULL %  1.46 

opinion_h1n1_risk: [ 1.  4.  3.  2.  5. nan]
NULL %  1.45 

opinion_h1n1_sick_from_vacc: [ 2

In [8]:
# More than 6% of NaN cols + others I checked, for the distinct values, and NULL count

print(data['income_poverty'].value_counts(dropna=False).head(10), '\n')
print(data['doctor_recc_h1n1'].value_counts(dropna=False).head(10), '\n')
print(data['doctor_recc_seasonal'].value_counts(dropna=False).head(10), '\n')
print(data['rent_or_own'].value_counts(dropna=False).head(10), '\n')

print(data['education'].value_counts(dropna=False).head(10), '\n')
print(data['marital_status'].value_counts(dropna=False).head(10), '\n')
print(data['employment_status'].value_counts(dropna=False).head(10), '\n')

print(data['chronic_med_condition'].value_counts(dropna=False).head(10), '\n')
print(data['child_under_6_months'].value_counts(dropna=False).head(10), '\n')
print(data['health_worker'].value_counts(dropna=False).head(10), '\n')
print(data['opinion_h1n1_vacc_effective'].value_counts(dropna=False).head(10), '\n')
print(data['opinion_h1n1_risk'].value_counts(dropna=False).head(10), '\n')
print(data['opinion_h1n1_sick_from_vacc'].value_counts(dropna=False).head(10), '\n')
print(data['opinion_seas_vacc_effective'].value_counts(dropna=False).head(10), '\n')
print(data['opinion_seas_risk'].value_counts(dropna=False).head(10), '\n')
print(data['opinion_seas_sick_from_vacc'].value_counts(dropna=False).head(10), '\n')

print(data['h1n1_vaccine'].value_counts(dropna=False).head(10), '\n')
print(data['seasonal_vaccine'].value_counts(dropna=False).head(10), '\n')

<= $75,000, Above Poverty    12777
> $75,000                     6810
NaN                           4423
Below Poverty                 2697
Name: income_poverty, dtype: int64 

0.0    19139
1.0     5408
NaN     2160
Name: doctor_recc_h1n1, dtype: int64 

0.0    16453
1.0     8094
NaN     2160
Name: doctor_recc_seasonal, dtype: int64 

Own     18736
Rent     5929
NaN      2042
Name: rent_or_own, dtype: int64 

College Graduate    10097
Some College         7043
12 Years             5797
< 12 Years           2363
NaN                  1407
Name: education, dtype: int64 

Married        13555
Not Married    11744
NaN             1408
Name: marital_status, dtype: int64 

Employed              13560
Not in Labor Force    10231
NaN                    1463
Unemployed             1453
Name: employment_status, dtype: int64 

0.0    18446
1.0     7290
NaN      971
Name: chronic_med_condition, dtype: int64 

0.0    23749
1.0     2138
NaN      820
Name: child_under_6_months, dtype: int64 

0.0    2

## Visualisations

## Feature engineering

In [9]:
# First encode the object columns
# Most of the columnd have <6% NaNs, and first these will be filled with mode. When it needs, ordinal encoding, and one-hot
# encoding will be applied.

data['sex']               = data['sex'].map({'Female' : 0, 'Male' : 1})
data['age_group']         = data['age_group'].map({'18 - 34 Years' : 0, '35 - 44 Years' : 1, '45 - 54 Years' : 2, 
                                                   '55 - 64 Years' : 3, '65+ Years' : 4})

data['education']         = data['education'].fillna(data['education'].mode().iloc[0])
data['education']         = data['education'].map({'< 12 Years' : 0, '12 Years' : 1, 
                                                   'Some College' : 2, 'College Graduate' : 3})

data['census_msa']        = data['census_msa'].map({'Non-MSA' : 0, 'MSA, Not Principle  City' : 1, 'MSA, Principle City' : 2})
# data['hhs_geo_region']    = pd.get_dummies(data['hhs_geo_region'], prefix='hhs')

data['marital_status']    = data['marital_status'].fillna(data['marital_status'].mode().iloc[0])
data['marital_status']    = data['marital_status'].map({'Not Married' : 0, 'Married' : 1})

data['employment_status'] = data['employment_status'].fillna(data['employment_status'].mode())

data['doctor_recc_h1n1']        = data['doctor_recc_h1n1'].fillna(data['doctor_recc_h1n1'].median())
data['doctor_recc_seasonal']    = data['doctor_recc_seasonal'].fillna(data['doctor_recc_seasonal'].median())
data['chronic_med_condition']   = data['chronic_med_condition'].fillna(data['chronic_med_condition'].median())
data['child_under_6_months']    = data['child_under_6_months'].fillna(data['child_under_6_months'].median())
data['health_worker']           = data['health_worker'].fillna(data['health_worker'].median())

data['h1n1_knowledge']          = data['h1n1_knowledge'].fillna(data['h1n1_knowledge'].median())
data['h1n1_concern']            = data['h1n1_concern'].fillna(data['h1n1_concern'].median())


data['opinion_h1n1_vacc_effective'] = data['opinion_h1n1_vacc_effective'].fillna(
                                                                data['opinion_h1n1_vacc_effective'].median())
data['opinion_h1n1_risk']           = data['opinion_h1n1_risk'].fillna(data['opinion_h1n1_risk'].median())
data['opinion_h1n1_sick_from_vacc'] = data['opinion_h1n1_sick_from_vacc'].fillna(
                                                                data['opinion_h1n1_sick_from_vacc'].median())
data['opinion_seas_vacc_effective'] = data['opinion_seas_vacc_effective'].fillna(
                                                                data['opinion_seas_vacc_effective'].median())
data['opinion_seas_risk']           = data['opinion_seas_risk'].fillna(data['opinion_seas_risk'].median())
data['opinion_seas_sick_from_vacc'] = data['opinion_seas_sick_from_vacc'].fillna(
                                                                data['opinion_seas_sick_from_vacc'].median())

data['behavioral_antiviral_meds']   = data['behavioral_antiviral_meds'].fillna(data['behavioral_antiviral_meds'].median())
data['behavioral_avoidance']        = data['behavioral_avoidance'].fillna(data['behavioral_avoidance'].median())
data['behavioral_face_mask']        = data['behavioral_face_mask'].fillna(data['behavioral_face_mask'].median())
data['behavioral_wash_hands']       = data['behavioral_wash_hands'].fillna(data['behavioral_wash_hands'].median())
data['behavioral_large_gatherings'] = data['behavioral_large_gatherings'].fillna(data['behavioral_large_gatherings'].median())
data['behavioral_outside_home']     = data['behavioral_outside_home'].fillna(data['behavioral_outside_home'].median())
data['behavioral_touch_face']       = data['behavioral_touch_face'].fillna(data['behavioral_touch_face'].median())

data['household_adults']            = data['household_adults'].fillna(data['household_adults'].median())
data['household_children']          = data['household_children'].fillna(data['household_children'].median())

data = pd.get_dummies(data, columns=['race', 'employment_status'])

data.drop(['employment_occupation', 'employment_industry', 'health_insurance',
          'income_poverty', 'rent_or_own', 'hhs_geo_region'], axis=1, inplace=True)

# TODO: Try MICE with NaN handling: https://towardsdatascience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc

In [10]:
# Create the target / features df. Create another two, with the h1n1 or seasonal data only.

features = data.drop(['h1n1_vaccine', 'seasonal_vaccine', 'respondent_id'], axis=1)
target = data[['h1n1_vaccine', 'seasonal_vaccine']]

print(features.shape)
print(target.shape)

(26707, 34)
(26707, 2)


In [11]:
# Make sure there are no NaN values amongst the features columns.

print('Column containing NaN values:', features.isna().any().sum())

Column containing NaN values: 0


In [12]:
# Read the test data

test = pd.read_csv(r"c:\Users\BalazsBalogh\Anaconda\bppy\flu_shot_learning\test_set_features.csv")

In [13]:
test['sex']               = test['sex'].map({'Female' : 0, 'Male' : 1})
test['age_group']         = test['age_group'].map({'18 - 34 Years' : 0, '35 - 44 Years' : 1, '45 - 54 Years' : 2, 
                                                   '55 - 64 Years' : 3, '65+ Years' : 4})

test['education']         = test['education'].fillna(test['education'].mode().iloc[0])
test['education']         = test['education'].map({'< 12 Years' : 0, '12 Years' : 1, 
                                                   'Some College' : 2, 'College Graduate' : 3})

test['census_msa']        = test['census_msa'].map({'Non-MSA' : 0, 'MSA, Not Principle  City' : 1, 'MSA, Principle City' : 2})
# test['hhs_geo_region']    = pd.get_dummies(test['hhs_geo_region'], prefix='hhs')

test['marital_status']    = test['marital_status'].fillna(test['marital_status'].mode().iloc[0])
test['marital_status']    = test['marital_status'].map({'Not Married' : 0, 'Married' : 1})

test['employment_status'] = test['employment_status'].fillna(test['employment_status'].mode())

test['doctor_recc_h1n1']        = test['doctor_recc_h1n1'].fillna(test['doctor_recc_h1n1'].median())
test['doctor_recc_seasonal']    = test['doctor_recc_seasonal'].fillna(test['doctor_recc_seasonal'].median())
test['chronic_med_condition']   = test['chronic_med_condition'].fillna(test['chronic_med_condition'].median())
test['child_under_6_months']    = test['child_under_6_months'].fillna(test['child_under_6_months'].median())
test['health_worker']           = test['health_worker'].fillna(test['health_worker'].median())

test['h1n1_knowledge']          = test['h1n1_knowledge'].fillna(test['h1n1_knowledge'].median())
test['h1n1_concern']            = test['h1n1_concern'].fillna(test['h1n1_concern'].median())


test['opinion_h1n1_vacc_effective'] = test['opinion_h1n1_vacc_effective'].fillna(
                                                                test['opinion_h1n1_vacc_effective'].median())
test['opinion_h1n1_risk']           = test['opinion_h1n1_risk'].fillna(test['opinion_h1n1_risk'].median())
test['opinion_h1n1_sick_from_vacc'] = test['opinion_h1n1_sick_from_vacc'].fillna(
                                                                test['opinion_h1n1_sick_from_vacc'].median())
test['opinion_seas_vacc_effective'] = test['opinion_seas_vacc_effective'].fillna(
                                                                test['opinion_seas_vacc_effective'].median())
test['opinion_seas_risk']           = test['opinion_seas_risk'].fillna(test['opinion_seas_risk'].median())
test['opinion_seas_sick_from_vacc'] = test['opinion_seas_sick_from_vacc'].fillna(
                                                                test['opinion_seas_sick_from_vacc'].median())

test['behavioral_antiviral_meds']   = test['behavioral_antiviral_meds'].fillna(test['behavioral_antiviral_meds'].median())
test['behavioral_avoidance']        = test['behavioral_avoidance'].fillna(test['behavioral_avoidance'].median())
test['behavioral_face_mask']        = test['behavioral_face_mask'].fillna(test['behavioral_face_mask'].median())
test['behavioral_wash_hands']       = test['behavioral_wash_hands'].fillna(test['behavioral_wash_hands'].median())
test['behavioral_large_gatherings'] = test['behavioral_large_gatherings'].fillna(test['behavioral_large_gatherings'].median())
test['behavioral_outside_home']     = test['behavioral_outside_home'].fillna(test['behavioral_outside_home'].median())
test['behavioral_touch_face']       = test['behavioral_touch_face'].fillna(test['behavioral_touch_face'].median())

test['household_adults']            = test['household_adults'].fillna(test['household_adults'].median())
test['household_children']          = test['household_children'].fillna(test['household_children'].median())

test = pd.get_dummies(test, columns=['race', 'employment_status'])

test.drop(['employment_occupation', 'employment_industry', 'health_insurance',
          'income_poverty', 'rent_or_own', 'hhs_geo_region', 'respondent_id'], axis=1, inplace=True)

# TODO: Try MICE with NaN handling: https://towardstestscience.com/whats-the-best-way-to-handle-nan-values-62d50f738fc

In [14]:
# Make sure that test column are the same as features'.

features.columns.difference(test.columns)

Index([], dtype='object')

In [15]:
# Train test split 

X_train, X_test, y_train, y_test = train_test_split(features, target, train_size=0.8, 
                                                    stratify=target, random_state=123)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('\n')
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (21365, 34)
X_test shape: (5342, 34)


y_train shape: (21365, 2)
y_test shape: (5342, 2)


In [16]:
expected = y_test # Because it's easier to read expected than y_test

## Modeling

### Logistic Regression

In [None]:
logreg = LogisticRegression()
model = MultiOutputClassifier(estimator=logreg, n_jobs=-1)

model.fit(X_train, y_train)
predicted_logreg = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_logreg = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_logreg[0][:, 1],
        "seasonal_vaccine": predicted_logreg[1][:, 1],
    },
    index = expected.index
)
print("predictions_logreg.shape:", predictions_logreg.shape)
predictions_logreg.head()

In [None]:
roc_auc_score(expected, predictions_logreg).round(6)

### Random Forest Classifier

In [None]:
clf = RandomForestClassifier(n_estimators=1500, n_jobs=-1)
model = MultiOutputClassifier(estimator=clf, n_jobs=-1)

model.fit(X_train, y_train)
predicted_clf = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_clf = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_clf[0][:, 1],
        "seasonal_vaccine": predicted_clf[1][:, 1],
    },
    index = expected.index
)
print("predictions_clf.shape:", predictions_clf.shape)
predictions_clf.head()

In [None]:
roc_auc_score(expected, predictions_clf).round(6)

## HistGradientBoosting

In [None]:
hgbc = HistGradientBoostingClassifier(learning_rate=0.1, max_iter=70, max_leaf_nodes=30, random_state=123)
model = MultiOutputClassifier(estimator=hgbc, n_jobs=-1)

model.fit(X_train, y_train)
predicted_hgbc = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_hgbc = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_hgbc[0][:, 1],
        "seasonal_vaccine": predicted_hgbc[1][:, 1],
    },
    index = expected.index
)
print("predictions_clf.shape:", predictions_hgbc.shape)
predictions_hgbc.head()

In [None]:
roc_auc_score(expected, predictions_hgbc).round(6)

## AdaBoost Classifier

In [None]:
ada = AdaBoostClassifier(n_estimators=150, learning_rate=1, random_state=123)
model = MultiOutputClassifier(estimator=ada, n_jobs=-1)

model.fit(X_train, y_train)
predicted_ada = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_ada = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_ada[0][:, 1],
        "seasonal_vaccine": predicted_ada[1][:, 1],
    },
    index = expected.index
)
print("predictions_clf.shape:", predictions_ada.shape)
predictions_ada.head()

In [None]:
roc_auc_score(expected, predictions_ada).round(6)

### LightGBM

In [None]:
lgbm = LGBMClassifier(learning_rate=0.01, num_iterations=1020, num_leaves=33, max_depth=10, n_jobs=8, random_state=123)
model = MultiOutputClassifier(estimator=lgbm, n_jobs=-1)

model.fit(X_train, y_train)
predicted_lgbm = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_lgbm = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_lgbm[0][:, 1],
        "seasonal_vaccine": predicted_lgbm[1][:, 1],
    },
    index = expected.index
)
print("predictions_clf.shape:", predictions_lgbm.shape)
predictions_lgbm.head()

In [None]:
roc_auc_score(expected, predictions_lgbm).round(6)

### XGBoost

In [None]:
xgb = XGBClassifier(eta=0.09, max_depth=5, nthread=8, random_state=123)
model = MultiOutputClassifier(estimator=xgb, n_jobs=-1)

model.fit(X_train, y_train)
predicted_xgb = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_xgb = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_xgb[0][:, 1],
        "seasonal_vaccine": predicted_xgb[1][:, 1],
    },
    index = expected.index
)
print("predictions_xgb.shape:", predictions_xgb.shape)
predictions_xgb.head()

In [None]:
roc_auc_score(expected, predictions_xgb).round(6)

### Gradient Boosting

In [26]:
gradient = GradientBoostingClassifier(n_estimators=373, learning_rate=0.1, random_state=123)
model = MultiOutputClassifier(estimator=gradient, n_jobs=-1)

model.fit(X_train, y_train)
predicted_gradient = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_gradient = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_gradient[0][:, 1],
        "seasonal_vaccine": predicted_gradient[1][:, 1],
    },
    index = expected.index
)
print("predictions_gradient.shape:", predictions_gradient.shape)
predictions_gradient.head()

predictions_gradient.shape: (5342, 2)


Unnamed: 0,h1n1_vaccine,seasonal_vaccine
24846,0.093598,0.420696
24354,0.025227,0.094697
1508,0.188062,0.871313
8170,0.024678,0.029215
13210,0.111645,0.761485


In [27]:
roc_auc_score(expected, predictions_gradient).round(6)

0.850734

### GridSearchCV

In [23]:
# For MultiOutputClassifier it's a bit different to GridSearch - 'estimator__' is needed.

param_grid = {'estimator__n_estimators':np.arange(367, 377),
              'estimator__learning_rate':[0.09, 0.1, 0.11]}

gb = GradientBoostingClassifier(random_state=123) # random_state should be the same as the main model!
gs = GridSearchCV(MultiOutputClassifier(gb), param_grid=param_grid, scoring='roc_auc', verbose=7, n_jobs=-1)

gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 17.3min finished


0.8440276126332604
{'estimator__learning_rate': 0.09, 'estimator__n_estimators': 367}


## VotingClassifier

In [None]:
"""
VotingClassifier unifies multiple models to makes the predictions better.
"""

estimators = [('xgboost',  XGBClassifier(eta=0.09, max_depth=5, random_state=123)), 
              ('gradient', GradientBoostingClassifier(n_estimators=387, learning_rate=0.1, random_state=123)), 
              ('lgbm',     LGBMClassifier(learning_rate=0.01, num_iterations=1020, num_leaves=33, 
                                          max_depth=10, random_state=123))]

voting_classifier = VotingClassifier(estimators, voting='soft', n_jobs=-1)

model = MultiOutputClassifier(estimator=voting_classifier, n_jobs=-1)

model.fit(X_train, y_train)
predicted_voting_classifier = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_voting_classifier = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_voting_classifier[0][:, 1],
        "seasonal_vaccine": predicted_voting_classifier[1][:, 1],
    },
    index = expected.index
)
print("predictions_gradient.shape:", predictions_voting_classifier.shape)
predictions_voting_classifier.head()

In [None]:
roc_auc_score(expected, predictions_voting_classifier).round(6)

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=123, verbose=10)
mlp.out_activation_ = 'softmax'
model = MultiOutputClassifier(estimator=mlp, n_jobs=-1)

model.fit(X_train, y_train)

predicted_mlp = model.predict_proba(X_test)

# Create a dataframe from the predictions
predictions_mlp = pd.DataFrame(
    {
        "h1n1_vaccine": predicted_mlp[0][:, 1],
        "seasonal_vaccine": predicted_mlp[1][:, 1],
    },
    index = expected.index
)
print("predictions_mlp.shape:", predictions_mlp.shape)
predictions_mlp.head()

In [None]:
roc_auc_score(expected, predictions_mlp).round(6)

## Create the submission

In [None]:
submission = pd.read_csv(r"c:\Users\BalazsBalogh\Anaconda\bppy\flu_shot_learning\submission_format.csv")

for col in submission.columns[1:]:
    submission[col].values[:] = 0
    
submission['h1n1_vaccine']  = model.predict_proba(test)[0][:, 1]
submission['seasonal_vaccine']  = model.predict_proba(test)[1][:, 1]

submission.to_csv('flushot.csv', index=False)

submission.info()