# BIL 470 Project

In [47]:
%matplotlib inline

from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.cluster import DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [2]:
# All four CSV files are indexed by the same respondent identifier column.
ID_COLUMN = "respondent_id"

# Training data: features and labels live in separate files.
training_set_features_df = pd.read_csv("training_set_features.csv", index_col=ID_COLUMN)
training_set_labels_df = pd.read_csv("training_set_labels.csv", index_col=ID_COLUMN)

# Features to predict on, and the submission template whose columns are filled in later.
test_set_features_df = pd.read_csv("test_set_features.csv", index_col=ID_COLUMN)
submission_df = pd.read_csv("submission_format.csv", index_col=ID_COLUMN)

# Last expression of the cell: rendered as the (truncated) feature table.
training_set_features_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


Use ordinal encoding for hierarchical (ordered) categories. Binary categories are grouped with the hierarchical ones, because a two-valued column only needs a single integer-coded feature.

Use one-hot encoding for non-hierarchical categories.

In [3]:
# Columns treated as ordered ("hierarchical") or binary: each one becomes a single
# integer-coded feature.
# NOTE(review): OrdinalEncoder() with its default categories="auto" assigns codes in
# sorted (lexicographic) order, which is not necessarily the real-world rank -- e.g.
# "<= $75,000, Above Poverty" sorts before "Below Poverty". Consider passing explicit
# `categories=` lists once the full set of values per column has been confirmed.
hierarchical_categories = ["age_group", "education", "sex", "income_poverty", "marital_status", "rent_or_own"]
hierarchical_encoding = OrdinalEncoder()

# Unordered columns: one indicator feature per distinct value.
non_hierarchical_categories = ["employment_industry", "employment_occupation", "race", "employment_status", "hhs_geo_region", "census_msa"]
# handle_unknown="ignore": a value that appears only in a validation fold or in the
# test set is encoded as an all-zero row instead of raising during transform().
# sparse=False keeps the output dense (newer scikit-learn spells this `sparse_output`).
non_hierarchical_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

In [76]:
# One seed shared by every estimator that takes a random_state, for reproducible runs.
RANDOM_STATE = 25519


def _with_scaling(step_name, estimator):
    """Return a two-step Pipeline: standardize the features, then fit ``estimator``.

    The scaler step is always named "preprocess"; the estimator step is named
    ``step_name``.
    """
    return Pipeline(steps=[("preprocess", StandardScaler()), (step_name, estimator)])


# The random forest is fed the features as-is (no scaling step).
rf = RandomForestClassifier(
    criterion="gini",
    n_estimators=220,
    max_depth=15,
    random_state=RANDOM_STATE,
)

adaboost = _with_scaling(
    "adaboost",
    AdaBoostClassifier(n_estimators=70, learning_rate=0.5, random_state=RANDOM_STATE),
)

sgd = _with_scaling(
    "sgd",
    SGDClassifier(loss='log', penalty='elasticnet', max_iter=5500, random_state=RANDOM_STATE),
)

logistic = _with_scaling(
    "logistic",
    LogisticRegression(C=0.2, max_iter=5500),
)

# Ensemble members and their soft-voting weights, kept in matching order.
ensemble_members = [
    ("logistic", logistic),
    ("adaboost", adaboost),
    ("sgd", sgd),
    ("rf", rf),
]
ensemble_weights = [1, 7, 1, 8.99]

# Soft voting combines the members' predicted probabilities using the weights above;
# MultiOutputClassifier fits a separate copy of the ensemble for each label column.
multi_vote = MultiOutputClassifier(
    VotingClassifier(
        estimators=ensemble_members,
        voting='soft',
        weights=ensemble_weights,
    ),
    n_jobs=-1,
)

In [77]:
%%time

encoder = ColumnTransformer(
    transformers = [
        ("non_hierarchical", non_hierarchical_encoder, non_hierarchical_categories),
        ("hierarchical", hierarchical_encoding, hierarchical_categories),
    ], 
    remainder='passthrough'
)
    
complete_pipeline = Pipeline(
    steps=[
        ("encoding", encoder),
        ("replace_nan", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("classify", multi_vote),
    ]
)

score = cross_val_score(complete_pipeline, training_set_features_df, training_set_labels_df, cv=5, scoring='roc_auc')

pd.DataFrame(score, columns=["AUROC Value"])

CPU times: user 4.83 s, sys: 1.05 s, total: 5.87 s
Wall time: 50.1 s


Unnamed: 0,AUROC Value
0,0.860648
1,0.857887
2,0.862692
3,0.868081
4,0.856808


Now it is time to fit the pipeline on the full training set and create the submission file.

In [78]:
%%time

complete_pipeline.fit(training_set_features_df, training_set_labels_df)
test_set_predictions = complete_pipeline.predict_proba(test_set_features_df)

submission_df["h1n1_vaccine"] = test_set_predictions[0][:, 1]
submission_df["seasonal_vaccine"] = test_set_predictions[1][:, 1]

submission_df.to_csv(f"submission_{int(datetime.now().timestamp())}.csv", index=True)

submission_df.head()

CPU times: user 3.15 s, sys: 138 ms, total: 3.29 s
Wall time: 14.6 s


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.298555,0.370533
26708,0.205427,0.219923
26709,0.364314,0.612384
26710,0.527573,0.727688
26711,0.368406,0.485343
