In [17]:
# Pre-processing
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn import tree
from sklearn import linear_model

# file names
file_train = "training_set_features.csv"
file_labels = "training_set_labels.csv"
file_test = "test_set_features.csv"

# datasets
data_labels = pd.read_csv(file_labels)
data_train = pd.read_csv(file_train)
data_test = pd.read_csv(file_test)

data_train.shape

(26707, 36)

In [18]:
 ## Creating new column for variables that have more than 400 missing values
numer_of_null = data_train.isnull().sum()
names = list(numer_of_null.index)
to_take_out = [] # it's name to_take_out, but I actually don't take out any variables

## getting the names of all columns with more than 400 null values
for element in names:
    if numer_of_null[element] >=400:
        to_take_out.append(element)
        #print("Variable name:",element)                 # useful just to get a sense of which variables
        #print("Number of NA's",numer_of_null[element])
        
# for these values, create new column showing that it is null        
for element in to_take_out:
    name_column = 'is_null_'+ element
    data_train[name_column] = np.where(data_train[element].isnull(), 1, 0)   
        


In [19]:
## setting up the imputer
imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

## testing the imputer with training dataset
var_names = list(data_train)
imp.fit(data_train[var_names])
data_train[var_names] = imp.transform(data_train[var_names])
data_train = pd.DataFrame(data_train)


## testing the imputer with test dataset
var_names = list(data_test)
imp.fit(data_test[var_names])
data_test[var_names]= imp.transform(data_test[var_names])
data_test = pd.DataFrame(data_test)

## checking with variables have null values
data_train.isnull().sum()
data_test.isnull().sum()

respondent_id                  0
h1n1_concern                   0
h1n1_knowledge                 0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_h1n1               0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_h1n1_vacc_effective    0
opinion_h1n1_risk              0
opinion_h1n1_sick_from_vacc    0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [20]:
## on this chunk need to conver categorical variables into one-hot vectors

cat_variables = ['age_group','education','race','income_poverty','marital_status',
                'rent_or_own','employment_status','hhs_geo_region','census_msa',
                 'household_children','employment_industry','employment_occupation','sex']


data_train = pd.get_dummies(data_train, columns = cat_variables)
data_test = pd.get_dummies(data_test, columns = cat_variables)


In [21]:
## Training a tree classifier
X_train = data_train
Y_train = data_labels['h1n1_vaccine']
X_test = data_test

clf = tree.DecisionTreeClassifier(min_samples_leaf=16)
clf.fit(X_train, Y_train)
Y_predicted = clf.predict(X_test)

ValueError: could not convert string to float: 'Female'

In [11]:
for var in cat_variables:
    print(data_train[var].head())

0    55 - 64 Years
1    35 - 44 Years
2    18 - 34 Years
3        65+ Years
4    45 - 54 Years
Name: age_group, dtype: object
0          < 12 Years
1            12 Years
2    College Graduate
3            12 Years
4        Some College
Name: education, dtype: object
0    White
1    White
2    White
3    White
4    White
Name: race, dtype: object
0                Below Poverty
1                Below Poverty
2    <= $75,000, Above Poverty
3                Below Poverty
4    <= $75,000, Above Poverty
Name: income_poverty, dtype: object
0    Not Married
1    Not Married
2    Not Married
3    Not Married
4        Married
Name: marital_status, dtype: object
0     Own
1    Rent
2     Own
3    Rent
4     Own
Name: rent_or_own, dtype: object
0    Not in Labor Force
1              Employed
2              Employed
3    Not in Labor Force
4              Employed
Name: employment_status, dtype: object
0    oxchjgsf
1    bhuqouqj
2    qufhixun
3    lrircsnp
4    qufhixun
Name: hhs_geo_region, dtype:

In [16]:
data_train.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,1,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
