In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

In [None]:
df_training_set = pd.read_csv("data/training_set_features.csv", na_values='?')

In [None]:
df_training_set['sex'].value_counts()

In [None]:
df_training_labels = pd.read_csv("data/training_set_labels.csv", na_values='?')

In [None]:
len(df_training_set)

In [None]:
len(df_training_labels)

In [None]:
df_training_labels.columns

In [None]:
df_training_set.columns

In [None]:
print(len(df_training_labels.columns))
print(len(df_training_set.columns))

In [None]:
df_training_set.head()

In [None]:
df_training_labels.head()

In [None]:
# Compare how the two target variables are distributed.
# The 'seasonal_vaccine' target is the more balanced of the two.

fig, ax = plt.subplots(2, 1, sharex=True, figsize=(8, 10))
n_obs = df_training_labels.shape[0]

# (label column, panel title) for each subplot, top to bottom
target_panels = [
    ("h1n1_vaccine", "Proportion of H1N1 Vaccine"),
    ("seasonal_vaccine", "Proportion of Seasonal Vaccine"),
]

for axis, (target, panel_title) in zip(ax, target_panels):
    # share of all respondents falling in each class of this target
    proportions = df_training_labels[target].value_counts().div(n_obs)
    proportions.plot.bar(title=panel_title, ax=axis, color='maroon')
    axis.set_ylabel(target)

fig.tight_layout()

In [None]:
# Phi Coefficient is the same as Pearson for two binary variables
(df_training_labels["h1n1_vaccine"]
     .corr(df_training_labels["seasonal_vaccine"], method="pearson")
)

In [None]:
# how many categories in the 'employment_industry' feature
len(df_training_set['employment_industry'].value_counts())

In [None]:
# how many distinct categories are in the 'employment_occupation' feature
# (value_counts() skips NaN by default, so missing values are not counted as a category)
len(df_training_set['employment_occupation'].value_counts())

In [None]:
# concatenate the 'training_labels' and 'training_set_features' datasets column-wise.
# NOTE(review): axis=1 aligns rows on the index, not on 'respondent_id' — this assumes
# both CSVs list respondents in the same order; TODO confirm (or merge on the id instead).
df = pd.concat([df_training_labels,df_training_set], axis = 1)

In [None]:
df.head()

In [None]:
df.shape[1]

In [None]:
df.shape[0]

In [None]:
# drop the id column
df.drop(['respondent_id'], axis = 1, inplace = True)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# is the 'seasonal_vaccine' column missing any values?
df['seasonal_vaccine'].isna().value_counts()

In [None]:
# is the 'h1n1_vaccine' column missing any values?
df['h1n1_vaccine'].isna().value_counts()

In [None]:
# Encode string-valued features as numeric codes.
# The result is assigned back to the column rather than using
# `df[col].replace(..., inplace=True)`: the in-place form mutates a temporary
# Series obtained through chained indexing, which recent pandas flags with a
# chained-assignment warning and which leaves `df` untouched once
# Copy-on-Write is active.
category_codes = {
    'age_group': {'18 - 34 Years' : 1, '35 - 44 Years' : 2, '45 - 54 Years' : 3,'55 - 64 Years' : 4,'65+ Years' : 5},
    'education': {'< 12 Years' : 1, '12 Years' : 2, 'Some College' : 3,'College Graduate' : 4},
    'race': {'Black' : 1, 'Hispanic' : 2, 'White' : 3,'Other or Multiple' : 4},
    'sex': {'Female' : 0, 'Male' : 1},
    'income_poverty': {'Below Poverty' : 1, '> $75,000' : 2,'<= $75,000, Above Poverty' : 3},
    'marital_status': {'Married' : 0, 'Not Married' : 1},
    'rent_or_own': {'Rent' : 0, 'Own' : 1},
    'employment_status': {'Unemployed' : 1, 'Not in Labor Force' : 2,'Employed' : 3},
    'hhs_geo_region': {'lzgpxyit' : 1, 'fpwskwrf' : 2, 'qufhixun' : 3,'oxchjgsf' : 4,'kbazzjca' : 5, 'bhuqouqj' : 6, 'mlyzmhmf' : 7, 'lrircsnp' : 8,'atmpeygn' : 9, 'dqpwygqj' : 10},
    'census_msa': {'Non-MSA' : 1, 'MSA, Not Principle  City' : 2,'MSA, Principle City' : 3},
}

for column, codes in category_codes.items():
    # values not listed in the mapping (including NaN) are left as they are
    df[column] = df[column].replace(codes)


In [None]:
df['employment_occupation'].isna().value_counts()

In [None]:
df['employment_industry'].isna().value_counts()

In [None]:
# drop the 'employment_occupation' and 'employment_industry' features because too much (~50 %)
# of their data is null.

In [None]:
df.drop(columns = ['employment_occupation','employment_industry'], inplace = True)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['race'].isna().value_counts()

In [None]:
df['behavioral_face_mask'].isna().sum()

In [None]:
# the number of null values in each column
df.isna().sum()

In [None]:
# NOTE(review): this module-level encoder is not referenced again in the visible
# notebook — feature_ohe() constructs its own OneHotEncoder. Likely leftover; TODO confirm.
ohe = OneHotEncoder(categories="auto", sparse=False, handle_unknown="ignore")

In [None]:
df.columns

In [None]:
df['household_children'].value_counts()

In [None]:
# 'health_insurance' column is missing 46 % of the values so we want to drop this column
df.drop('health_insurance', axis = 1, inplace = True)

In [None]:
# 'income_poverty' column is missing 16.5 % of the values so we will drop this column
df.drop('income_poverty', axis = 1, inplace = True)

In [None]:
df.shape

In [None]:
df_target = df[['h1n1_vaccine','seasonal_vaccine']]

In [None]:
df_target

In [None]:
df.columns

In [None]:
imputer = SimpleImputer(missing_values = np.nan,
                        strategy = "most_frequent")

In [None]:
# Learn the most frequent value of every column of `df`.
# NOTE(review): `df` still holds the two target columns and all rows, including those
# that later end up in the test split — TODO confirm that fitting on them is intended.
imputer = imputer.fit(df)

In [None]:
df_simple_imputer = imputer.transform(df)

In [None]:
# Wrap the imputed NumPy array back into a DataFrame.
# Column labels and the row index come from the frame that was handed to the
# imputer instead of a hand-typed list, so the labels cannot drift out of sync
# with the data when a column is added, dropped or reordered upstream.
df = pd.DataFrame(df_simple_imputer, columns = df.columns, index = df.index)

In [None]:
df.isna().sum()

In [None]:
categorical_nonbinary = ['h1n1_concern','h1n1_knowledge','opinion_h1n1_vacc_effective','opinion_h1n1_risk','opinion_seas_risk','opinion_seas_sick_from_vacc','age_group','education',
'race','marital_status','employment_status','hhs_geo_region','census_msa','household_adults','household_children','opinion_h1n1_sick_from_vacc','opinion_seas_vacc_effective']


In [None]:
df['h1n1_concern'].isna().sum()

In [None]:
df['h1n1_concern'].value_counts()

In [None]:
# Use OneHotEncoding for the categorical nonbinary features
def feature_ohe(df,my_list):
    """Append one-hot indicator columns for every feature named in ``my_list``.

    Each listed column is encoded on its own with a ``OneHotEncoder`` and the
    resulting indicator columns are appended to the right of the frame. The
    original (un-encoded) columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``my_list``.
    my_list : iterable of str
        Names of the columns to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df`` followed by the indicator columns; the
        frame passed in is not modified.
    """
    ohe = OneHotEncoder(categories='auto', sparse = False)
    encoded_frames = []
    for feature in my_list:
        column_encoded = ohe.fit_transform(df[[feature]])
        # `get_feature_names` is deprecated from scikit-learn 1.0 on; use its
        # replacement when the installed version has it, else fall back.
        get_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
        encoded_frames.append(
            pd.DataFrame(column_encoded, columns = get_names([feature]), index = df.index)
        )
    # Concatenate once rather than re-copying the growing frame on each iteration.
    return pd.concat([df] + encoded_frames, axis = 1)

In [None]:
df = feature_ohe(df,categorical_nonbinary)

In [None]:
# column= df[['h1n1_concern']]
# ohe = OneHotEncoder(categories='auto', sparse = False)
# column_encoded =ohe.fit_transform(column)
# column_pd = pd.DataFrame(column_encoded, columns = ohe.get_feature_names(['h1n1_concern']), index = df.index)
# column_pd

In [None]:
df.columns

In [None]:
df.isna().sum()

In [None]:
# Phi Coefficient is the same as Pearson for two binary variables
(df["h1n1_vaccine"]
     .corr(df["seasonal_vaccine"], method="pearson")
)

In [None]:
# list of binary features
binary_list = ['behavioral_antiviral_meds','behavioral_avoidance','behavioral_face_mask','behavioral_wash_hands' , 'behavioral_large_gatherings','behavioral_outside_home','behavioral_touch_face' , 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker']

In [None]:
def phi_coef(data,target_feature, list_features):
    """Correlate each listed feature with the target column.

    Computes the Pearson correlation between ``data[feature]`` and
    ``data[target_feature]`` for every feature in ``list_features`` (for two
    binary columns this is the phi coefficient).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and every listed feature.
    target_feature : str
        Name of the column to correlate against.
    list_features : iterable of str
        Names of the feature columns.

    Returns
    -------
    list of tuple
        ``(feature_name, correlation)`` pairs in the order of ``list_features``.
    """
    return [
        (name, data[name].corr(data[target_feature], method="pearson"))
        for name in list_features
    ]

In [None]:
phi_list_tuples = phi_coef(df,'seasonal_vaccine',binary_list)

In [None]:
phi_list_tuples.sort(key=lambda y: y[1], reverse = True)

print(phi_list_tuples)

In [None]:
# Scratch illustration of the `*zip(*pairs)` idiom used by the bar chart in the
# next cell: it splits a list of (x, y) tuples into separate x and y sequences.
# This cell used to be a pasted interactive-interpreter transcript (`>>>` prompts
# plus echoed output), which is not valid cell code and raised a SyntaxError.
from math import log

testList = [(0, 6.0705199999997801e-08), (1, 2.1015700100300739e-08),
            (2, 7.6280656623374823e-09), (3, 5.7348209304555086e-09),
            (4, 3.6812203579604238e-09), (5, 4.1572516753310418e-09)]

# natural log of every y value, x values unchanged
testList2 = [(elem1, log(elem2)) for elem1, elem2 in testList]

# zip(*testList2) yields all x values first, then all y values
plt.scatter(*zip(*testList2))
plt.show()

In [None]:
# Bar chart of the (feature, phi coefficient) pairs in `phi_list_tuples`.
# Uses the `plt` alias imported at the top of the notebook instead of a second,
# mid-notebook import of `figure`.
plt.figure(figsize=(12, 10), dpi=80)

# zip(*pairs) splits the tuples into the feature names (x) and coefficients (y)
plt.bar(*zip(*phi_list_tuples), color = 'darkorange')
plt.xlabel('predictive variable')
plt.ylabel('phi coefficient')
plt.title("phi coefficient of seasonal vaccine with binary input features")
# rotation is documented as a number or 'vertical'/'horizontal'; the string '75'
# is rejected with a ValueError by newer matplotlib releases.
plt.xticks(rotation=75);

In [None]:
df_target.columns

In [None]:
# Features: every column except the two vaccine labels. 'h1n1_vaccine' is the
# other target, not a predictor, so leaving it in X would leak label
# information into the seasonal-vaccine models.
X = df.drop(columns = ['h1n1_vaccine', 'seasonal_vaccine'])
y = df.seasonal_vaccine
# fixed seed so the same 67/33 split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.33)

# baseline model

In [None]:
# baseline model for h1n1_vaccine
# predict all to 0
# the accuracy of the baseline model would be 0.78
df_target['h1n1_vaccine'].value_counts(normalize = True)


In [None]:
# baseline model for seasonal_vaccine 
# predict all to 0
# the accuracy of the baseline model would be 0.53
df_target['seasonal_vaccine'].value_counts(normalize = True)


# logistic regression

In [None]:
logreg = LogisticRegression(random_state = 42)
logreg.fit(X_train,y_train)

In [None]:
plot_confusion_matrix(logreg,X_train,y_train, cmap = "Oranges")

In [None]:
# Accuracy
method_acc = accuracy_score(y_test, logreg.predict(X_test))
method_acc

In [None]:
# Precision
method_prec = precision_score(y_test, logreg.predict(X_test))
method_prec

In [None]:
# Recall
method_recall = recall_score(y_test, logreg.predict(X_test))
method_recall

In [None]:
# F1-Score
# Stored under its own name: assigning the result to `f1_score` would overwrite
# the sklearn function imported at the top of the notebook.
method_f1 = f1_score(y_test, logreg.predict(X_test))
method_f1

In [None]:
fig, ax = plt.subplots()
plot_roc_curve(logreg,X_test,y_test, ax = ax, color = 'darkorange')

In [None]:
y_score = logreg.predict_proba(X_test)[:,1]

In [None]:
roc_auc_score(y_test, y_score)

# Decision Tree

In [None]:
# Decision tree classifier with default, not yet optimized hyperparameters.
# Seeded with the same random_state as the logistic regression so that a
# re-run reproduces the same tree (tie-breaking between splits is random).
clf = DecisionTreeClassifier(random_state = 42)


In [None]:
first_decision_tree = clf.fit(X_train, y_train)

In [None]:
tree.plot_tree(first_decision_tree)

In [None]:
# confusion matrix on the training data
plot_confusion_matrix(clf, X_train, y_train, cmap = 'Oranges')

In [None]:
# plot confusion matrix on the testing data
plot_confusion_matrix(clf,X_test,y_test, cmap = 'Oranges')

In [None]:
fig, ax = plt.subplots()
plot_roc_curve(clf, X_test, y_test, ax=ax, color = 'darkorange')

In [None]:
y_pred=clf.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

# Random Forest

In [None]:
# Random forest with default hyperparameters; seeded so the bootstrap samples and
# feature subsets, and therefore the reported metrics, are reproducible.
rfc = RandomForestClassifier(random_state = 42)

In [None]:
rfc.fit(X_train,y_train)

y_pred=rfc.predict(X_test)

In [None]:
y_pred

In [None]:
metrics.accuracy_score(y_test, y_pred)

In [None]:
plot_confusion_matrix(rfc, X_train, y_train, cmap = 'Oranges')

In [None]:
plot_confusion_matrix(rfc, X_test, y_test, cmap = 'Oranges')

In [None]:
fig, ax = plt.subplots()
plot_roc_curve(rfc, X_test, y_test, ax=ax, color = 'green')

# k Nearest Neighbour

In [None]:
 knn = KNeighborsClassifier()

In [None]:
knn.fit(X_train,y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
knn.score(X_test,y_test)

In [None]:
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Overlay the test-set ROC curves of the four fitted models on a single axes.
fig, ax = plt.subplots(figsize = (12,10))

# (fitted estimator, curve colour), drawn in this order
roc_models = [(logreg, 'orange'), (rfc, 'green'), (clf, 'pink'), (knn, 'purple')]
for fitted_model, curve_color in roc_models:
    roc_display = plot_roc_curve(fitted_model, X_test, y_test, ax=ax, color=curve_color)

# keep the last display object as the cell's output value
roc_display

In [None]:
df.columns

In [None]:

counts = (df[['opinion_seas_risk', 'seasonal_vaccine']]
              .groupby(['opinion_seas_risk', 'seasonal_vaccine'])
              .size()
              .unstack('seasonal_vaccine')
         )
counts

In [None]:
1041/(4034+1041)

In [None]:
2937/(4838+2937)

In [None]:
266/(188+266)

In [None]:
4517/(2150+4517)

In [None]:
1968/(595+1968)

In [None]:
# Share of respondents who received the seasonal vaccine at each
# 'opinion_seas_risk' level, derived from the `counts` table
# (vaccinated / row total) instead of retyping its numbers by hand,
# so the values stay correct if the data or the imputation changes.
y = counts.div(counts.sum(axis=1), axis=0)[1].tolist()

In [None]:
x = ['very low','somewhat low',"don't know",'somewhat high',"very high"]

In [None]:
plt.plot(x,y)
plt.title("Flu Risk Opinion and Vaccine Rates")
plt.xlabel("Opinion About Flu Risk")
plt.ylabel("Vaccine Rates")

In [None]:
# Bar heights for the 'No' / 'Yes' doctor-recommendation chart plotted below.
# NOTE(review): these values are hard-coded and their derivation is not shown in
# the visible notebook — presumably vaccination rates in percent; TODO confirm.
y1 = [25,65]

In [None]:
x1 = ['No', 'Yes']

In [None]:
fig = plt.figure()

In [None]:
ax = fig.add_axes([0,0,1,1])

In [None]:
plt.bar(x1,y1, color = 'darkorange')
plt.title("Doctor Recommendation and Vaccine Rates")
plt.xlabel("Did doctor recommend?")
plt.ylabel("Vaccine Rates")

# Grid Search with Knn Classifier

In [None]:
knn_gridSearch = KNeighborsClassifier()

In [None]:
params = [{'n_neighbors' : [3,5,7,9],'weights' : ['uniform','distance'],'leaf_size' : [15,20]}]

In [None]:
gs_knn = GridSearchCV(knn_gridSearch, param_grid = params, scoring = 'accuracy', cv = 5)

In [None]:
gs_knn.fit(X_train,y_train)

In [None]:
gs_knn.best_params_

In [None]:
print("tuned hyperparameters :(best parameters) ",gs_knn.best_params_)
# Scoring the refit model on the rows it was trained on overstates performance
# (a distance-weighted KNN all but memorises its training set), so report the
# mean cross-validated score of the best candidate and the held-out test score.
print("cross-validated accuracy :",gs_knn.best_score_)
print("test accuracy :",gs_knn.score(X_test, y_test))

# Grid Search with Logistic Regression

In [None]:
logistic_gridSearch = LogisticRegression()

In [None]:
# 'l1' is not supported by LogisticRegression's default 'lbfgs' solver, so every
# 'l1' candidate would fail to fit and be scored NaN; 'liblinear' handles both penalties.
params = {"penalty":["l1","l2"], "solver":["liblinear"]}

In [None]:
gs_logistic = GridSearchCV(estimator = logistic_gridSearch,param_grid = params,cv=10)

In [None]:
gs_logistic.fit(X_train,y_train)

In [None]:
gs_logistic.best_params_

In [None]:
print("tuned hyperparameters :(best parameters) ",gs_logistic.best_params_)
# Report the mean cross-validated score of the best candidate and the held-out
# test score; accuracy on the training rows themselves is optimistically biased.
print("cross-validated accuracy :",gs_logistic.best_score_)
print("test accuracy :",gs_logistic.score(X_test,y_test))