### Imports

In [51]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

from imblearn.pipeline import Pipeline as ImPipeline

### Problem Description

My goal is to predict how likely individuals are to receive their H1N1 and seasonal flu vaccines. Specifically, I'll be predicting two probabilities: one for h1n1_vaccine and one for seasonal_vaccine.

### Data Exploration

In [2]:
submission_format = pd.read_csv('../Data/submission_format.csv')
submission_format.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   respondent_id     26708 non-null  int64  
 1   h1n1_vaccine      26708 non-null  float64
 2   seasonal_vaccine  26708 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 626.1 KB


The submission_format.csv is an example of how my submission should be submitted.

In [3]:
test_set_features = pd.read_csv('../Data/test_set_features.csv')
test_set_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  int64  
 1   h1n1_concern                 26623 non-null  float64
 2   h1n1_knowledge               26586 non-null  float64
 3   behavioral_antiviral_meds    26629 non-null  float64
 4   behavioral_avoidance         26495 non-null  float64
 5   behavioral_face_mask         26689 non-null  float64
 6   behavioral_wash_hands        26668 non-null  float64
 7   behavioral_large_gatherings  26636 non-null  float64
 8   behavioral_outside_home      26626 non-null  float64
 9   behavioral_touch_face        26580 non-null  float64
 10  doctor_recc_h1n1             24548 non-null  float64
 11  doctor_recc_seasonal         24548 non-null  float64
 12  chronic_med_condition        25776 non-null  float64
 13  child_under_6_mo

The test_set_features.csv will be used to run my model and produce a predictive csv in the format of submission_format. 

In [4]:
training_set_features = pd.read_csv('../Data/training_set_features.csv')
training_set_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [5]:
training_set_labels = pd.read_csv('../Data/training_set_labels.csv')
training_set_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   respondent_id     26707 non-null  int64
 1   h1n1_vaccine      26707 non-null  int64
 2   seasonal_vaccine  26707 non-null  int64
dtypes: int64(3)
memory usage: 626.1 KB


The training_set_features and is the dataset used to predict the actual training_set_labels. They are already split, however I will be rejoining for any cleaning I find necessary and splitting again in a format that is more convenient since there is two target variables. 

In [6]:
complete_data = pd.merge(training_set_features, training_set_labels, on='respondent_id')

### Cleaning

In [7]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [8]:
complete_data.isna().sum().sort_values(ascending=False)

employment_occupation          13470
employment_industry            13330
health_insurance               12274
income_poverty                  4423
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
rent_or_own                     2042
employment_status               1463
marital_status                  1408
education                       1407
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
opinion_seas_sick_from_vacc      537
opinion_seas_risk                514
opinion_seas_vacc_effective      462
opinion_h1n1_sick_from_vacc      395
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
household_adults                 249
household_children               249
behavioral_avoidance             208
behavioral_touch_face            128
h1n1_knowledge                   116
h1n1_concern                      92
behavioral_large_gatherings       87
behavioral_outside_home           82
b

#### rent_or_own

In [9]:
complete_data['rent_or_own'].value_counts()

Own     18736
Rent     5929
Name: rent_or_own, dtype: int64

#### Education

In [10]:
complete_data['education'].value_counts()

College Graduate    10097
Some College         7043
12 Years             5797
< 12 Years           2363
Name: education, dtype: int64

In [11]:
complete_data['education'].fillna("Unknown", inplace=True)

#### Doctor Recommended

In [12]:
complete_data['doctor_recc_seasonal'].value_counts()

0.0    16453
1.0     8094
Name: doctor_recc_seasonal, dtype: int64

In [13]:
complete_data['doctor_recc_seasonal'].fillna(complete_data['doctor_recc_seasonal'].value_counts().index[0], inplace=True)

In [14]:
complete_data['doctor_recc_seasonal'].value_counts()

0.0    18613
1.0     8094
Name: doctor_recc_seasonal, dtype: int64

In [15]:
complete_data['doctor_recc_h1n1'].value_counts()

0.0    19139
1.0     5408
Name: doctor_recc_h1n1, dtype: int64

In [16]:
complete_data['doctor_recc_h1n1'].fillna(complete_data['doctor_recc_h1n1'].value_counts().index[0], inplace=True)

In [17]:
complete_data['doctor_recc_h1n1'].value_counts()

0.0    21299
1.0     5408
Name: doctor_recc_h1n1, dtype: int64

#### Health Insurance

In [18]:
complete_data['health_insurance'].isna().sum()

12274

The column health_insurance has 12,274 NA's. This is almost half of the data entries. Will need to replaced with a value or categorized as other.

In [19]:
complete_data['health_insurance'].fillna("Unknown", inplace=True)

#### Employment

In [20]:
employment_df = complete_data.filter(['health_worker', 'employment_status', 'employment_industry', 'employment_occupation', 'income_poverty'], axis=1)
employment_df.head()

Unnamed: 0,health_worker,employment_status,employment_industry,employment_occupation,income_poverty
0,0.0,Not in Labor Force,,,Below Poverty
1,0.0,Employed,pxcmvdjn,xgwztkwe,Below Poverty
2,0.0,Employed,rucpziij,xtkaffoo,"<= $75,000, Above Poverty"
3,0.0,Not in Labor Force,,,Below Poverty
4,0.0,Employed,wxleyezf,emcorrxb,"<= $75,000, Above Poverty"


In [21]:
employment_df['employment_status'].value_counts()

Employed              13560
Not in Labor Force    10231
Unemployed             1453
Name: employment_status, dtype: int64

In [22]:
employment_df['employment_industry'].value_counts()

fcxhlnwr    2468
wxleyezf    1804
ldnlellj    1231
pxcmvdjn    1037
atmlpfrs     926
arjwrbjb     871
xicduogh     851
mfikgejo     614
vjjrobsf     527
rucpziij     523
xqicxuve     511
saaquncn     338
cfqqtusy     325
nduyfdeo     286
mcubkhph     275
wlfvacwt     215
dotnnunm     201
haxffmxo     148
msuufmds     124
phxvnwax      89
qnlwzans      13
Name: employment_industry, dtype: int64

In [23]:
employment_df['income_poverty'].value_counts()

<= $75,000, Above Poverty    12777
> $75,000                     6810
Below Poverty                 2697
Name: income_poverty, dtype: int64

In [24]:
employment_df['income_poverty'].isna().sum()

4423

If the employment_status column has a value of Not in Labor Force, then columns employment_industry and employment_occupation will have NA's. 

Values in columns employment_industry and employment_occupation are represented as short random character strings.

Not in Labor Force = Unemployed?

In [25]:
employment_df['employment_status'].replace("Unemployed", "Not in Labor Force", inplace=True)

In [26]:
employment_df['employment_status'].value_counts()

Employed              13560
Not in Labor Force    11684
Name: employment_status, dtype: int64

In [27]:
complete_data['employment_status'] = employment_df['employment_status']

In [28]:
complete_data['employment_status'].value_counts()

Employed              13560
Not in Labor Force    11684
Name: employment_status, dtype: int64

In [29]:
employment_df.loc[employment_df['employment_status'] == "Not in Labor Force", 'employment_industry'] = "Not in Labor Force"
employment_df.loc[employment_df['employment_status'] == "Not in Labor Force", 'employment_occupation'] = "Not in Labor Force"

In [30]:
employment_df.isna().sum().sort_values(ascending=False)

income_poverty           4423
employment_occupation    1786
employment_industry      1646
employment_status        1463
health_worker             804
dtype: int64

In [31]:
complete_data['employment_occupation'] = employment_df['employment_occupation']
complete_data['employment_industry'] = employment_df['employment_industry']

In [36]:
complete_data.isna().sum().sort_values(ascending=False)

income_poverty                 4423
rent_or_own                    2042
employment_occupation          1786
employment_industry            1646
employment_status              1463
marital_status                 1408
chronic_med_condition           971
child_under_6_months            820
health_worker                   804
opinion_seas_sick_from_vacc     537
opinion_seas_risk               514
opinion_seas_vacc_effective     462
opinion_h1n1_sick_from_vacc     395
opinion_h1n1_vacc_effective     391
opinion_h1n1_risk               388
household_adults                249
household_children              249
behavioral_avoidance            208
behavioral_touch_face           128
h1n1_knowledge                  116
h1n1_concern                     92
behavioral_large_gatherings      87
behavioral_outside_home          82
behavioral_antiviral_meds        71
behavioral_wash_hands            42
behavioral_face_mask             19
doctor_recc_h1n1                  0
seasonal_vaccine            

### Correlations

In [32]:
complete_data.corr()['h1n1_vaccine'].sort_values(ascending=False)

h1n1_vaccine                   1.000000
doctor_recc_h1n1               0.394086
seasonal_vaccine               0.377143
opinion_h1n1_risk              0.323265
opinion_h1n1_vacc_effective    0.269347
opinion_seas_risk              0.258571
doctor_recc_seasonal           0.218976
opinion_seas_vacc_effective    0.179272
health_worker                  0.169768
h1n1_concern                   0.121929
h1n1_knowledge                 0.117951
chronic_med_condition          0.095207
opinion_h1n1_sick_from_vacc    0.075091
behavioral_wash_hands          0.074712
behavioral_touch_face          0.071648
behavioral_face_mask           0.070498
child_under_6_months           0.066962
behavioral_avoidance           0.047690
behavioral_antiviral_meds      0.040608
behavioral_outside_home        0.021768
behavioral_large_gatherings    0.017822
opinion_seas_sick_from_vacc    0.008360
household_adults               0.007545
respondent_id                 -0.003280
household_children            -0.003320


In [33]:
complete_data.corr()['seasonal_vaccine'].sort_values(ascending=False)

seasonal_vaccine               1.000000
opinion_seas_risk              0.390106
h1n1_vaccine                   0.377143
opinion_seas_vacc_effective    0.361875
doctor_recc_seasonal           0.360696
opinion_h1n1_risk              0.216625
opinion_h1n1_vacc_effective    0.205072
doctor_recc_h1n1               0.198560
chronic_med_condition          0.170174
h1n1_concern                   0.154828
health_worker                  0.127311
behavioral_touch_face          0.120228
h1n1_knowledge                 0.120152
behavioral_wash_hands          0.112414
behavioral_avoidance           0.076395
behavioral_large_gatherings    0.064025
behavioral_outside_home        0.053509
behavioral_face_mask           0.050083
opinion_h1n1_sick_from_vacc    0.027404
child_under_6_months           0.012097
behavioral_antiviral_meds      0.006277
respondent_id                 -0.004652
opinion_seas_sick_from_vacc   -0.061510
household_adults              -0.064840
household_children            -0.114614


In [34]:
# with sns.axes_style("darkgrid"):
#     f, ax = plt.subplots(figsize=(10, 8))
    
#     mask = np.triu(np.ones_like(complete_data.corr(), dtype=bool))
#     ax = sns.heatmap(abs(complete_data.corr()),mask=mask,annot=True)


### Split

In [40]:
X = complete_data.drop(['h1n1_vaccine', 'seasonal_vaccine'], axis=1)
y = complete_data.filter(['h1n1_vaccine', 'seasonal_vaccine'])

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Encoding

In [49]:
def grab_numeric(df):
    return df.select_dtypes(include=['float', 'int'])

In [52]:
subpipe_num = Pipeline(steps=[('get_num', GrabNumeric),
                              ('num_impute', SimpleImputer()),
                              ('ss', StandardScaler())])


subpipe_cat = Pipeline(steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
                              ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])


subpipe_ord = Pipeline(steps=[('ord_impute', SimpleImputer(strategy='most_frequent')),
                              ('ord', OrdinalEncoder())])

NameError: name 'Pipeline' is not defined

### Model

#### First Simple Model

In [47]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_train, y_train)

ValueError: could not convert string to float: 'Unknown'

#### Iterative Model

#### Iterative Model Hypertuning

### Fit On Full