In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.multioutput import *
from sklearn.preprocessing import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.model_selection import *

You are provided a dataset with 36 columns. The first column respondent_id is a unique and random identifier. The remaining 35 features are described below.

For all binary variables: 0 = No; 1 = Yes.

h1n1_concern - Level of concern about the H1N1 flu.
0 = Not at all concerned; 1 = Not very concerned; 2 = Somewhat concerned; 3 = Very concerned.

h1n1_knowledge - Level of knowledge about H1N1 flu.
0 = No knowledge; 1 = A little knowledge; 2 = A lot of knowledge.

behavioral_antiviral_meds - Has taken antiviral medications. (binary)

behavioral_avoidance - Has avoided close contact with others with flu-like symptoms. (binary)

behavioral_face_mask - Has bought a face mask. (binary)

behavioral_wash_hands - Has frequently washed hands or used hand sanitizer. (binary)

behavioral_large_gatherings - Has reduced time at large gatherings. (binary)

behavioral_outside_home - Has reduced contact with people outside of own household. (binary)

behavioral_touch_face - Has avoided touching eyes, nose, or mouth. (binary)

doctor_recc_h1n1 - H1N1 flu vaccine was recommended by doctor. (binary)

doctor_recc_seasonal - Seasonal flu vaccine was recommended by doctor. (binary)

chronic_med_condition - Has any of the following chronic medical conditions: asthma or another lung condition, diabetes, a heart condition, a kidney condition, sickle cell anemia or other anemia, a neurological or neuromuscular condition, a liver condition, or a weakened immune system caused by a chronic illness or by medicines taken for a chronic illness. (binary)

child_under_6_months - Has regular close contact with a child under the age of six months. (binary)

health_worker - Is a healthcare worker. (binary)

health_insurance - Has health insurance. (binary)

opinion_h1n1_vacc_effective - Respondent's opinion about H1N1 vaccine effectiveness.
1 = Not at all effective; 2 = Not very effective; 3 = Don't know; 4 = Somewhat effective; 5 = Very effective.

opinion_h1n1_risk - Respondent's opinion about risk of getting sick with H1N1 flu without vaccine.
1 = Very Low; 2 = Somewhat low; 3 = Don't know; 4 = Somewhat high; 5 = Very high.

opinion_h1n1_sick_from_vacc - Respondent's worry of getting sick from taking H1N1 vaccine.
1 = Not at all worried; 2 = Not very worried; 3 = Don't know; 4 = Somewhat worried; 5 = Very worried.

opinion_seas_vacc_effective - Respondent's opinion about seasonal flu vaccine effectiveness.
1 = Not at all effective; 2 = Not very effective; 3 = Don't know; 4 = Somewhat effective; 5 = Very effective.

opinion_seas_risk - Respondent's opinion about risk of getting sick with seasonal flu without vaccine.
1 = Very Low; 2 = Somewhat low; 3 = Don't know; 4 = Somewhat high; 5 = Very high.

opinion_seas_sick_from_vacc - Respondent's worry of getting sick from taking seasonal flu vaccine.
1 = Not at all worried; 2 = Not very worried; 3 = Don't know; 4 = Somewhat worried; 5 = Very worried.

age_group - Age group of respondent.

education - Self-reported education level.

race - Race of respondent.

sex - Sex of respondent.

income_poverty - Household annual income of respondent with respect to 2008 Census poverty thresholds.

marital_status - Marital status of respondent.

rent_or_own - Housing situation of respondent.

employment_status - Employment status of respondent.

hhs_geo_region - Respondent's residence using a 10-region geographic classification defined by the U.S. Dept. of Health and Human Services. Values are represented as short random character strings.

census_msa - Respondent's residence within metropolitan statistical areas (MSA) as defined by the U.S. Census.

household_adults - Number of other adults in household, top-coded to 3.

household_children - Number of children in household, top-coded to 3.

employment_industry - Type of industry respondent is employed in. Values are represented as short random character strings.

employment_occupation - Type of occupation of respondent. Values are represented as short random character strings.

In [198]:
def subtract_opinions_by_1(df):
    """Return a copy of ``df`` with the six opinion columns shifted down by 1.

    The opinion survey answers are coded 1..5; subtracting 1.0 makes them
    start at 0 like the other ordinal features. Missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all six ``opinion_*`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched so that re-running the
        calling cell (or calling this twice on the same frame) cannot shift
        the scale more than once.

    Raises
    ------
    KeyError
        If any of the opinion columns is absent from ``df``.
    """
    opinions = [
        "opinion_h1n1_vacc_effective",
        "opinion_h1n1_risk",
        "opinion_h1n1_sick_from_vacc",
        "opinion_seas_vacc_effective",
        "opinion_seas_risk",
        "opinion_seas_sick_from_vacc",
    ]
    shifted = df.copy()
    shifted[opinions] = shifted[opinions] - 1.0
    return shifted

In [199]:
# Labels are used as-is; both feature files get their opinion columns
# re-based to start at 0 immediately after loading.
train_output = pd.read_csv("training_set_labels.csv")
train_input = subtract_opinions_by_1(pd.read_csv("training_set_features.csv"))
test_input = subtract_opinions_by_1(pd.read_csv("test_set_features.csv"))

In [200]:
# Preview the raw training features after the opinion columns were shifted by 1.
train_input

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [207]:
# Imputation plan: each key names a fill strategy and maps to the columns
# whose nulls are handled that way. The keys are looked up by name in
# deal_with_null_values, so they must not be renamed.
replace_null_with_values = {

    # Constant fills: nulls become 0, 1 or 2 respectively.
    "0.0" : ["doctor_recc_h1n1","doctor_recc_seasonal","chronic_med_condition","health_worker","income_poverty","marital_status","rent_or_own","employment_status","household_children"],

    "1.0" : ["h1n1_knowledge","health_insurance","census_msa","household_adults"],

    "2.0" : ["h1n1_concern","opinion_h1n1_vacc_effective","opinion_h1n1_risk","opinion_h1n1_sick_from_vacc","opinion_seas_vacc_effective","opinion_seas_risk","opinion_seas_sick_from_vacc","age_group","education"],

    # If the person is highly concerned about the h1n1 virus (3.0), then 1.0 else 0.0
    "one_condition" : ["behavioral_avoidance","behavioral_face_mask","behavioral_large_gatherings","behavioral_outside_home","behavioral_touch_face"],

    # If the person is highly concerned about the h1n1 virus (3.0), then 0.0 else 1.0
    "one_inverse_condition" : ["child_under_6_months"],

    # 1.0 if the person is highly concerned (3.0) and has a lot of knowledge
    # about the virus (2.0), else 0.0
    "two_conditions" : ["behavioral_antiviral_meds"],

    # 1.0 if the person is highly concerned (3.0) and has some knowledge
    # about the virus (1.0 or 2.0), else 0.0
    "three_conditions" : ["behavioral_wash_hands"],

    # Nulls become the column median (computed on the integer-encoded values).
    # Each column is listed once; the original repeated the two employment columns.
    "median" : ["hhs_geo_region","employment_industry","employment_occupation"]
}

In [208]:
# Categorical columns with no hand-written order: their integer codes are
# derived from the sorted category labels at encoding time.
value_replace_features = [
    "age_group",
    "income_poverty",
    "race",
    "sex",
    "hhs_geo_region",
    "employment_industry",
    "employment_occupation",
    "census_msa",
]

# Categorical columns with an explicit order: each label's code is its
# position in the list (0, 1, 2, ...).
values_replace_dict_2 = {
    column: {label: code for code, label in enumerate(ordered_labels)}
    for column, ordered_labels in (
        ("education", ["< 12 Years", "12 Years", "Some College", "College Graduate"]),
        ("marital_status", ["Not Married", "Married"]),
        ("rent_or_own", ["Rent", "Own"]),
        ("employment_status", ["Unemployed", "Not in Labor Force", "Employed"]),
    )
}

In [209]:
def replace_elements_with_numbers(df,value_replace_features,values_replace_dict_2,reference_df=None):
    """Return a copy of ``df`` with categorical labels replaced by integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is not modified.
    value_replace_features : iterable of str
        Columns encoded automatically: the non-null unique labels found in
        ``reference_df`` are sorted and numbered 0, 1, 2, ...
    values_replace_dict_2 : dict[str, dict]
        Columns encoded with an explicit ``{label: code}`` mapping.
    reference_df : pandas.DataFrame, optional
        Frame whose categories define the automatic encoding. When omitted,
        the module-level ``train_input`` is used (the original behaviour), so
        train and test frames receive the same codes.

    Returns
    -------
    pandas.DataFrame
        Encoded copy. Nulls, and labels that have no code, come out as NaN.
    """
    if reference_df is None:
        # Keep existing three-argument calls working exactly as before:
        # the encoding is always derived from the training features.
        reference_df = train_input

    new_df = df.copy()

    for column in value_replace_features:
        # dropna() removes missing values directly, instead of deleting
        # entries from a list while iterating over it and comparing
        # str(value) to "nan".
        categories = sorted(reference_df[column].dropna().unique())
        codes = {category: code for code, category in enumerate(categories)}
        new_df[column] = new_df[column].map(codes)

    for column, mapping in values_replace_dict_2.items():
        new_df[column] = new_df[column].map(mapping)

    return new_df

In [212]:
def _fill_nulls_by_mask(frame, column, mask, value_if_true, value_otherwise):
    """Fill nulls of ``column`` in place: ``value_if_true`` where ``mask`` holds, else ``value_otherwise``; cast to int64."""
    missing = frame[column].isna()
    # .loc writes go straight to the frame. The previous chained form
    # frame[column][mask] = ... assigns through a temporary Series, which is
    # unreliable and is a silent no-op under pandas Copy-on-Write.
    frame.loc[missing & mask, column] = value_if_true
    frame.loc[missing & ~mask, column] = value_otherwise
    frame[column] = frame[column].astype("int64")


def deal_with_null_values(df,replace_null_with_values):
    """Return a copy of ``df`` with nulls imputed and the touched columns cast to int64.

    Parameters
    ----------
    df : pandas.DataFrame
        Numerically encoded features; must contain ``h1n1_concern`` and
        ``h1n1_knowledge`` plus every column named in the plan.
    replace_null_with_values : dict[str, list[str]]
        Imputation plan. Required keys and their meaning:

        * ``"0.0"``, ``"1.0"``, ``"2.0"`` - fill nulls with that constant.
        * ``"one_condition"`` - 1 if ``h1n1_concern == 3``, else 0.
        * ``"one_inverse_condition"`` - 0 if ``h1n1_concern == 3``, else 1.
        * ``"two_conditions"`` - 1 if ``h1n1_concern == 3`` and
          ``h1n1_knowledge == 2``, else 0.
        * ``"three_conditions"`` - 1 if ``h1n1_concern == 3`` and
          ``h1n1_knowledge`` is 1 or 2, else 0.
        * ``"median"`` - fill with the column median.

    Returns
    -------
    pandas.DataFrame
        Imputed copy; ``df`` itself is not modified.

    Notes
    -----
    The constant fills run first, so the conditional rules see
    ``h1n1_concern`` / ``h1n1_knowledge`` as they stand after those fills.
    """
    new_df = df.copy()

    # Constant fills.
    for fill_key, fill_value in (("0.0", 0.0), ("1.0", 1.0), ("2.0", 2.0)):
        for column in replace_null_with_values[fill_key]:
            new_df[column] = new_df[column].fillna(fill_value).astype("int64")

    # Conditions are computed once, after the constant fills above.
    highly_concerned = new_df["h1n1_concern"] == 3.0
    lot_of_knowledge = new_df["h1n1_knowledge"] == 2.0
    some_knowledge = lot_of_knowledge | (new_df["h1n1_knowledge"] == 1.0)

    for column in replace_null_with_values["one_condition"]:
        _fill_nulls_by_mask(new_df, column, highly_concerned, 1.0, 0.0)

    for column in replace_null_with_values["one_inverse_condition"]:
        _fill_nulls_by_mask(new_df, column, highly_concerned, 0.0, 1.0)

    for column in replace_null_with_values["two_conditions"]:
        _fill_nulls_by_mask(new_df, column, highly_concerned & lot_of_knowledge, 1.0, 0.0)

    for column in replace_null_with_values["three_conditions"]:
        _fill_nulls_by_mask(new_df, column, highly_concerned & some_knowledge, 1.0, 0.0)

    # The int64 cast truncates a fractional median.
    for column in replace_null_with_values["median"]:
        new_df[column] = new_df[column].fillna(new_df[column].median()).astype("int64")

    return new_df

In [213]:
# Encode categorical labels, then impute nulls, for the training features.
df_train_input = (
    train_input
    .pipe(replace_elements_with_numbers, value_replace_features, values_replace_dict_2)
    .pipe(deal_with_null_values, replace_null_with_values)
)

In [215]:
# Same two-step preprocessing (encode, then impute) for the test features.
df_test_input = (
    test_input
    .pipe(replace_elements_with_numbers, value_replace_features, values_replace_dict_2)
    .pipe(deal_with_null_values, replace_null_with_values)
)

In [216]:
# Preview the encoded, null-imputed training features.
df_train_input

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1,0,0,0,0,0,0,1,1,...,2,0,1,1,8,2,0,0,8,10
1,1,3,2,0,1,0,1,0,1,1,...,2,0,0,2,1,0,0,0,12,19
2,2,1,1,0,1,0,0,0,0,0,...,0,0,1,2,9,0,2,0,14,21
3,3,1,1,0,1,0,1,1,0,0,...,2,0,0,1,5,1,0,0,8,10
4,4,2,1,0,1,0,1,1,0,1,...,0,1,1,2,9,0,1,0,18,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2,0,0,1,0,0,0,1,0,...,0,0,1,1,9,2,0,0,8,10
26703,26703,1,2,0,1,0,1,0,0,0,...,0,0,0,2,6,1,1,0,4,2
26704,26704,2,2,0,1,1,1,1,0,1,...,0,0,1,0,6,0,0,0,8,10
26705,26705,1,1,0,0,0,0,0,0,0,...,0,1,0,2,5,2,1,0,4,6


In [231]:
# Drop the identifier column from the features and from the labels.
X_train = df_train_input.drop(columns=["respondent_id"])
X_test = df_test_input.drop(columns=["respondent_id"])
y_train = train_output.drop(columns=["respondent_id"])

# Take the column names from the frame that is actually scaled rather than
# relying on respondent_id being the first column of the raw frame.
col = X_train.columns.values

# Fit the scaler on the training features ONLY and reuse its min/max for the
# test set. Re-fitting on the test set (fit_transform) would scale the two
# sets with different ranges, so the same raw value would map to different
# scaled values in train and test.
mms = MinMaxScaler()
X_train = pd.DataFrame(mms.fit_transform(X_train),columns=col).round(2)
X_test = pd.DataFrame(mms.transform(X_test),columns=col).round(2)

In [232]:
# Preview the min-max scaled training features (rounded to 2 decimals).
X_train

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.5,0.89,1.0,0.00,0.0,0.4,0.45
1,1.00,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.11,0.0,0.00,0.0,0.6,0.86
2,0.33,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.00,0.0,0.67,0.0,0.7,0.95
3,0.33,0.5,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.5,0.56,0.5,0.00,0.0,0.4,0.45
4,0.67,0.5,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.00,0.0,0.33,0.0,0.9,0.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,0.67,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.5,1.00,1.0,0.00,0.0,0.4,0.45
26703,0.33,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.67,0.5,0.33,0.0,0.2,0.09
26704,0.67,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.67,0.0,0.00,0.0,0.4,0.45
26705,0.33,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.56,1.0,0.33,0.0,0.2,0.27


In [234]:
# Preview the min-max scaled test features (rounded to 2 decimals).
X_test

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0.67,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.5,0.0,0.0,1.0,0.78,0.0,0.33,0.00,0.05,0.32
1,0.33,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.11,1.0,1.00,0.00,0.05,0.91
2,0.67,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.5,1.0,1.0,1.0,0.56,1.0,0.33,0.00,0.50,0.55
3,0.33,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.5,0.56,0.0,0.33,0.00,0.40,0.45
4,1.00,0.5,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.67,1.0,0.00,0.33,0.20,0.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26703,0.33,0.5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.22,0.5,0.33,0.33,0.40,0.45
26704,1.00,0.5,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.00,1.0,0.33,1.00,0.20,0.82
26705,0.00,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.5,1.00,0.0,0.33,0.00,0.40,0.45
26706,1.00,0.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.5,0.11,0.0,0.33,0.00,0.40,0.45
