In [11]:
import os
import yaml
import numpy as np
import pandas as pd
import argparse

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline



In [2]:
def read_params(config_path):
    with open(config_path) as yaml_file:
        config = yaml.safe_load(yaml_file)

    return config


In [3]:
os.getcwd()
os.chdir('..')

In [4]:
config = read_params("params.yaml")
train_data_path = config["data"]["raw_data"]
train_labels_path = config["data"]["raw_data_labels"]
random_state = config["base"]["random_state"]
id_col = config["base"]["id_col"]
train_preprocessed_path = config["preprocess_data"]["train_path"]
target_1 = config["base"]["target_col_1"]
target_2 = config["base"]["target_col_2"]

In [5]:
train = pd.read_csv(train_data_path, index_col=id_col)
train_labels = pd.read_csv(train_labels_path, index_col=id_col)

data = train.join(train_labels)

data

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,,0,0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,,0,1
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg,0,0


In [8]:
from pandas_profiling import ProfileReport
profile = ProfileReport(data)
profile.to_file("eda.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
def drop_na(df: pd.DataFrame) -> pd.DataFrame:
    return(df.dropna())

In [15]:
data_without_na = drop_na(data)
data_without_na.to_csv(train_preprocessed_path)


In [31]:
columns_subset = ['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 
                 'chronic_med_condition', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 
                 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 
                 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'income_poverty']

train_subset = data.loc[:, columns_subset]

train_subset_numeric = train_subset.loc[:, ['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 
                 'chronic_med_condition', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 
                 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 
                 'opinion_seas_sick_from_vacc']]

train_subset_str = train_subset.loc[:, [ 'age_group', 'education', 'income_poverty']]

# Fill NA of numeric with median of column
imp_train = SimpleImputer(missing_values=np.nan, strategy='median')
train_subset_numeric_imputed = pd.DataFrame(imp_train.fit_transform(train_subset_numeric),
                                            columns = train_subset_numeric.columns,
                                            index = train_subset_numeric.index)

# Scale numeric columns
scaler_train = StandardScaler()
train_subset_numeric_imputed = pd.DataFrame(scaler_train.fit_transform(train_subset_numeric_imputed), 
                                            columns = train_subset_numeric.columns,
                                            index = train_subset_numeric.index)

# Fill NA of str columns with unknown
imp_train = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 'unknown')
train_subset_str_imputed = pd.DataFrame(imp_train.fit_transform(train_subset_str),
                                            columns = train_subset_str.columns,
                                            index = train_subset_str.index)

# Re-join data
imputed_train = pd.concat([train_subset_numeric_imputed, train_subset_str_imputed], axis=1).join(train_labels)




In [32]:
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(train_subset_str)
training_encoded = ordinal_encoder.transform(train_subset_str)
training_encoded

array([[ 3.,  1.,  2.],
       [ 1.,  0.,  2.],
       [ 0.,  2.,  0.],
       ...,
       [ 3.,  3., nan],
       [ 0.,  3.,  0.],
       [ 4.,  3.,  0.]])

In [34]:
encoded_categorical_df = pd.DataFrame(training_encoded, columns=train_subset_str.columns, index=train_subset_str.index)
encoded_categorical_df

Unnamed: 0_level_0,age_group,education,income_poverty
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3.0,1.0,2.0
1,1.0,0.0,2.0
2,0.0,2.0,0.0
3,4.0,0.0,2.0
4,2.0,3.0,0.0
...,...,...,...
26702,4.0,3.0,0.0
26703,0.0,2.0,0.0
26704,3.0,3.0,
26705,0.0,3.0,0.0


In [52]:
def onehot_pipeline(df: pd.DataFrame, is_test: str) -> pd.DataFrame:
    num_features = df.columns[df.dtypes != "object"].values
    cat_features = df.columns[df.dtypes == "object"].values

    num_transformer = Pipeline([
        ('scale', StandardScaler()),
        ('impute', KNNImputer(n_neighbors = 10))
    ])

    cat_transformer = Pipeline([
        ('impute', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
        ('encode', OneHotEncoder(drop = 'first'))
    ])

    preprocessor = ColumnTransformer([
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

    pipe = make_pipeline(preprocessor)
    df_preprocessed = pipe.fit_transform(df)
    df_preprocessed = pd.DataFrame(df_preprocessed, index = data.index)

    return(df_preprocessed)



In [53]:
prueba = onehot_pipeline(data.iloc[:, :data.shape[1]-2], "False")
prueba

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.679436,-2.042478,-0.22661,-1.626185,-0.272201,-2.175870,-0.747788,1.401639,0.690311,-0.531568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.517658,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,1.401639,0.690311,-0.531568,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.679436,-0.424715,-0.22661,0.614936,-0.272201,-2.175870,-0.747788,-0.713450,-1.448623,-0.290288,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.679436,-0.424715,-0.22661,0.614936,-0.272201,0.459586,1.337277,-0.713450,-1.448623,-0.531568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,1.337277,-0.713450,0.690311,-0.531568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,0.419111,-2.042478,-0.22661,0.614936,-0.272201,-2.175870,-0.747788,1.401639,-1.448623,-0.531568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26703,-0.679436,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.713450,-1.448623,1.881227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26704,0.419111,1.193048,-0.22661,0.614936,3.673754,0.459586,1.337277,-0.713450,0.690311,-0.531568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26705,-0.679436,-0.424715,-0.22661,-1.626185,-0.272201,-2.175870,-0.747788,-0.713450,-1.234729,-0.531568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
prueba.shape

(26707, 102)

In [45]:
data

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,,0,0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,0,0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,,0,1
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg,0,0


In [18]:
pruebadf.size

2724114

In [19]:
pruebadf.shape

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,income_poverty,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,-0.681849,-2.044279,-0.503893,-0.659437,-0.612735,-0.852660,-1.047610,-0.260383,-1.880528,-1.240062,-0.087709,55 - 64 Years,< 12 Years,Below Poverty,0,0
1,1.518373,1.197027,-0.503893,-0.659437,-0.612735,1.146988,1.302012,1.217471,-0.023708,-0.512890,1.427948,35 - 44 Years,12 Years,Below Poverty,0,1
2,-0.681849,-0.423626,-0.503893,-0.659437,1.632027,-0.852660,-1.047610,-0.999310,-0.023708,-1.240062,-0.087709,18 - 34 Years,College Graduate,"<= $75,000, Above Poverty",0,0
3,-0.681849,-0.423626,-0.503893,1.516445,1.632027,-0.852660,0.518805,1.956398,0.904702,0.941455,-0.845538,65+ Years,12 Years,Below Poverty,0,1
4,0.418262,-0.423626,-0.503893,-0.659437,-0.612735,-0.852660,0.518805,-0.260383,-0.952118,-1.240062,1.427948,45 - 54 Years,Some College,"<= $75,000, Above Poverty",0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,0.418262,-2.044279,-0.503893,-0.659437,-0.612735,-0.852660,-1.047610,-0.999310,0.904702,-0.512890,-0.087709,65+ Years,Some College,"<= $75,000, Above Poverty",0,0
26703,-0.681849,1.197027,1.984546,1.516445,-0.612735,0.147164,-0.264403,-0.260383,0.904702,-1.240062,-0.845538,18 - 34 Years,College Graduate,"<= $75,000, Above Poverty",0,0
26704,0.418262,1.197027,-0.503893,-0.659437,-0.612735,0.147164,1.302012,-0.260383,0.904702,0.941455,-0.087709,55 - 64 Years,Some College,unknown,0,1
26705,-0.681849,-0.423626,-0.503893,-0.659437,-0.612735,-0.852660,-1.047610,-0.260383,-1.880528,-1.240062,-0.087709,18 - 34 Years,Some College,"<= $75,000, Above Poverty",0,0
