In [51]:
import numpy as np
import pandas as pd
from Util import LabelEncoderWithMissingValues
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

print("Import complete.")

Import complete.


## Import all the datasets

In [52]:
# Load the three raw CSV files: test features, training features and training targets.
original_test = pd.read_csv(r"test_set_features.csv")
original_train = pd.read_csv(r"training_set_features.csv")
training_labels = pd.read_csv(r"training_set_labels.csv")

# Report the dimensions of every frame before validating them.
print("train df => rows: %s, cols: %s" % (original_train.shape[0], original_train.shape[1]))
print("train labels df => rows: %s, cols: %s" % (training_labels.shape[0], training_labels.shape[1]))
print("test df => rows: %s, cols: %s" % (original_test.shape[0], original_test.shape[1]))

# Every training row needs exactly one label row; the targets are paired with the
# features by position further down, so a mismatch must fail loudly here.
assert original_train.shape[0] == training_labels.shape[0], \
    "training features and labels have different row counts"

# Compare column names rather than only the column count: two frames with the same
# number of differently named columns would otherwise pass and break the later concat.
assert set(original_train.columns) == set(original_test.columns), \
    "train and test feature files do not share the same columns"

train df => rows: 26707, cols: 36
train labels df => rows: 26707, cols: 3
test df => rows: 26708, cols: 36


## Analyse dataset

### Check for duplicates

In [53]:
# Count rows that are exact repeats of an earlier row, separately for each raw frame.
original_data_dup_count = original_train.duplicated().sum()
label_dup_count = training_labels.duplicated().sum()
original_test_dup_count = original_test.duplicated().sum()

print("duplicates in original training dataset: %s" % original_data_dup_count)
print("duplicates in label dataset: %s" % label_dup_count)
print("duplicates in original testing dataset: %s" % original_test_dup_count)

# The rest of the notebook never de-duplicates, so all three counts must be zero.
assert original_data_dup_count == 0
assert label_dup_count == 0
assert original_test_dup_count == 0

duplicates in original training dataset: 0
duplicates in label dataset: 0
duplicates in original testing dataset: 0


### Merging both train and test data sets

Before starting data cleaning, I need to merge them together for consistent results

In [54]:
# Tag each frame with its origin so the stacked frame can be split apart again later.
for frame, origin in ((original_train, "train"), (original_test, "test")):
    frame["type"] = origin

# Stack train on top of test with a fresh 0..N-1 positional index.
original_data = pd.concat([original_train, original_test], ignore_index=True)

n_rows, n_cols = original_data.shape
print("data => rows: %s, cols: %s" % (n_rows, n_cols))

# Rows must add up; columns must match the (now tagged) training frame,
# i.e. the raw feature columns plus the new `type` marker.
assert n_rows == original_train.shape[0] + original_test.shape[0]
assert n_cols == original_train.shape[1]

data => rows: 53415, cols: 37


Now the `original_data` df holds both data sets, distinguished by the value of the `type` column

Since there are no duplicates in the dataset, no need to drop duplicates

Before continuing, I'm setting `respondent_id` as index for the df

### Setting `respondent_id` as index for the df

In [55]:
# Key both the feature frame and the label frame by the respondent identifier.
index_column = "respondent_id"
data = original_data.set_index(index_column)
training_labels = training_labels.set_index(index_column)

# Preview the first two rows of the re-indexed features.
data.head(2)

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,type
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,train
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,train


### Look for columns with a significant amount of null values

In [56]:
# Per-column non-null counts and dtypes for the training rows only.
data[data["type"].eq("train")].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26615 non-null  float64
 1   h1n1_knowledge               26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_h1n1             24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  float64
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         25887 non-null  float64
 13  health_worker   

I can see that the following columns in the training data have a large share of missing values:
1. health_insurance
2. employment_industry
3. employment_occupation

I'm doing the same for test data to see if they match

In [57]:
# Per-column non-null counts and dtypes for the test rows only.
data[data["type"].eq("test")].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26708 entries, 26707 to 53414
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26623 non-null  float64
 1   h1n1_knowledge               26586 non-null  float64
 2   behavioral_antiviral_meds    26629 non-null  float64
 3   behavioral_avoidance         26495 non-null  float64
 4   behavioral_face_mask         26689 non-null  float64
 5   behavioral_wash_hands        26668 non-null  float64
 6   behavioral_large_gatherings  26636 non-null  float64
 7   behavioral_outside_home      26626 non-null  float64
 8   behavioral_touch_face        26580 non-null  float64
 9   doctor_recc_h1n1             24548 non-null  float64
 10  doctor_recc_seasonal         24548 non-null  float64
 11  chronic_med_condition        25776 non-null  float64
 12  child_under_6_months         25895 non-null  float64
 13  health_worke

The same columns have a large share of missing values in the test data as well:
1. health_insurance
2. employment_industry
3. employment_occupation

In [58]:
# Disabled: dropping the three sparsely populated columns is left commented out,
# so this cell changes nothing and the columns stay in `data`.
# NOTE(review): `DataFrame.drop` returns a new frame; as written, the calls below
# would not modify `data` even if re-enabled unless the result is assigned back
# (e.g. `data = data.drop(columns=[...])`).
#
# data.drop("health_insurance", axis=1)
# data.drop("employment_industry", axis=1)
# data.drop("employment_occupation", axis=1)

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,type
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,train
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,train
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,train
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,train
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,,,,,dqpwygqj,"MSA, Principle City",1.0,1.0,,test
53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,Below Poverty,Married,Rent,Employed,qufhixun,Non-MSA,1.0,3.0,fcxhlnwr,test
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,qufhixun,"MSA, Not Principle City",1.0,0.0,,test
53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,bhuqouqj,"MSA, Not Principle City",1.0,0.0,,test


### Encoding categorical data

This was not done in `attempt_1_170020C`. I am going with LabelEncoder to encode categorical features.
For that, I need to know what are the categorical features.

#### Identifying categorical columns and numerical columns

I need to identify categorical columns and unique values for each column.

In [59]:
# Positions of the columns stored with the generic `object` dtype (i.e. text values).
is_object_column = (data.dtypes == object).to_numpy()
categorical_columns = np.nonzero(is_object_column)[0]

# Translate the positions into names; every remaining column counts as numerical.
categorical_features = data.columns[categorical_columns].tolist()
categorical_lookup = set(categorical_features)
numerical_features = [name for name in data.columns if name not in categorical_lookup]

# Show the distinct values of each categorical column (NaN included).
for feature in categorical_features:
    print("%s:\t%s" % (feature, str(data[feature].unique())))

# The two groups together must account for every column exactly once.
assert len(categorical_features) + len(numerical_features) == len(data.columns)

age_group:	['55 - 64 Years' '35 - 44 Years' '18 - 34 Years' '65+ Years'
 '45 - 54 Years']
education:	['< 12 Years' '12 Years' 'College Graduate' 'Some College' nan]
race:	['White' 'Black' 'Other or Multiple' 'Hispanic']
sex:	['Female' 'Male']
income_poverty:	['Below Poverty' '<= $75,000, Above Poverty' '> $75,000' nan]
marital_status:	['Not Married' 'Married' nan]
rent_or_own:	['Own' 'Rent' nan]
employment_status:	['Not in Labor Force' 'Employed' 'Unemployed' nan]
hhs_geo_region:	['oxchjgsf' 'bhuqouqj' 'qufhixun' 'lrircsnp' 'atmpeygn' 'lzgpxyit'
 'fpwskwrf' 'mlyzmhmf' 'dqpwygqj' 'kbazzjca']
census_msa:	['Non-MSA' 'MSA, Not Principle  City' 'MSA, Principle City']
employment_industry:	[nan 'pxcmvdjn' 'rucpziij' 'wxleyezf' 'saaquncn' 'xicduogh' 'ldnlellj'
 'wlfvacwt' 'nduyfdeo' 'fcxhlnwr' 'vjjrobsf' 'arjwrbjb' 'atmlpfrs'
 'msuufmds' 'xqicxuve' 'phxvnwax' 'dotnnunm' 'mfikgejo' 'cfqqtusy'
 'mcubkhph' 'haxffmxo' 'qnlwzans']
employment_occupation:	[nan 'xgwztkwe' 'xtkaffoo' 'emcorrxb' 'vlluhb

As you can see, `employment_industry`, `hhs_geo_region` and `employment_occupation` are scrambled for privacy protection.
However, I can still use those columns because, for my use case, knowing actual occupation or industry is not necessary.

Now I can apply LabelEncoding to those columns, because `NaN` values are depicted as `NaN`

#### Applying Label Encoding to categorical columns

The problem with the native `LabelEncoder` is that it also encodes `NaN` values.
I have to make sure `NaN` values are left untouched, as I'll be imputing them later

In [60]:
# Convert the text columns to numeric codes with the project helper; the `type`
# marker is excluded from encoding because it is needed later to split the frame.
label_encoder = LabelEncoderWithMissingValues()
encoder, data = label_encoder.categorical_to_numeric(data, ignored=["type"])

# `respondent_id` became the index earlier, so one column fewer than the merged raw frame.
expected_column_count = original_data.shape[1] - 1
assert data.shape[1] == expected_column_count

# Preview the last two rows of the encoded frame.
data.tail(2)

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,type
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,,,test
53414,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,3.0,2.0,0.0,0.0,,2.0,test


As you can see `data` is now encoded with the exception of `type` column.
The type column will be later used to divide `data` into `train` and
`test`

### Filling missing values

I want to use Nearest Neighbor Imputation to fill `NaN` values

#### Normalizing data

KNN Imputer is distance based, and hence data should be normalized beforehand to avoid
bias.

In [61]:
# Scale every feature to [0, 1] so that no single column dominates the distances
# computed by the KNN imputer in the next step.
temp = data.drop("type", axis=1)

scalar = MinMaxScaler()
# `fit_transform` returns a bare ndarray. Rebuild the frame with BOTH the column
# names and the original `respondent_id` index; without `index=` the frame would get
# a fresh 0..N-1 RangeIndex and later label-based alignment against `data` would
# only be correct by coincidence.
temp = pd.DataFrame(scalar.fit_transform(temp), columns=temp.columns, index=temp.index)

# note that `temp` has column `type` missing
temp.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.5,0.111111,0.5,0.0,0.0,0.0,0.0
2,0.333333,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,0.5,0.0,0.0,0.5,0.222222,0.5,0.666667,0.0,0.05,0.045455
3,0.333333,0.5,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.333333,1.0,0.0,0.0,,
4,0.666667,0.5,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.5,1.0,0.0,0.5,0.222222,0.5,0.333333,0.0,0.1,0.090909


#### Utilizing KNN Imputer

In [62]:
# Fill every remaining NaN with the value from the single nearest row.
# got the neighbour value `n_neighbors` from `attempt_1_170020C`
imputer = KNNImputer(n_neighbors=1)

# `fit_transform` returns a bare ndarray whose rows are in the same order as `data`
# (`temp` was derived from `data` row-for-row). Re-attach the `respondent_id` index so
# the `type` assignment below is aligned on real row labels instead of relying on the
# ids happening to equal the positions 0..N-1.
temp = pd.DataFrame(imputer.fit_transform(temp), columns=temp.columns, index=data.index)

# Restore the train/test marker that was removed before scaling.
temp["type"] = data["type"]
data = temp

# No NaN may survive imputation, and a misaligned `type` column would also show up here.
assert not data.isna().any().any()

data.tail()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,type
53410,0.333333,0.5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.5,0.888889,1.0,0.333333,0.333333,0.9,0.363636,test
53411,1.0,0.5,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,0.5,0.222222,0.0,0.333333,1.0,0.4,0.136364,test
53412,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.222222,0.5,0.333333,0.0,0.1,0.409091,test
53413,1.0,0.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.111111,0.5,0.333333,0.0,0.8,0.363636,test
53414,0.666667,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.5,0.333333,1.0,0.0,0.0,0.0,0.045455,test


## Attempting to classify with Random Forest Classifier

This is a new attempt, following on from `attempt_2_170020C`

### Define targets and train, test data

In [63]:
# One label array per prediction task, taken in the row order of the label file.
target_h1n1_vaccine = training_labels["h1n1_vaccine"].to_numpy()
target_seasonal_vaccine = training_labels["seasonal_vaccine"].to_numpy()

# Split the combined frame back into its two origins and discard the marker column.
train_data = data.loc[data["type"] == "train"].drop(columns="type")
test_data = data.loc[data["type"] == "test"].drop(columns="type")


### Split train data into train and test

In [64]:
# Hold out 20% of the labelled rows for validation, once per target.
# Each target gets its own stratified split, hence the p/q and r/s names
# instead of the usual X_train, X_test, ...
split_options = {"test_size": 0.2, "random_state": 1}

p_train, p_test, q_train, q_test = train_test_split(
    train_data,
    target_h1n1_vaccine,
    stratify=target_h1n1_vaccine,
    **split_options,
)
r_train, r_test, s_train, s_test = train_test_split(
    train_data,
    target_seasonal_vaccine,
    stratify=target_seasonal_vaccine,
    **split_options,
)

### Use Random Search CV

I'll be commenting out the search itself because I have already run it once and
don't want to spend the running time again

In [65]:
# Use RandomizedSearchCV to pick the random-forest hyper-parameters for each target.

# Candidate values sampled by the search. Stepped ranges yield exactly the same
# candidates as filtering a dense range for multiples of 10 / even numbers.
random_grid = {
    'n_estimators': list(range(10, 1000, 10)),
    'max_features': ['sqrt'],
    'max_depth': list(range(50, 150, 10)) + [None],
    'min_samples_split': list(range(2, 40, 2)),
    'min_samples_leaf': list(range(2, 20, 2)),
    'bootstrap': [False]
}

rscv = RandomizedSearchCV(
    # Seed the forest as well: `random_state` on the search only fixes which
    # parameter combinations are drawn, not the randomness inside each forest,
    # so without this the reported best score changes from run to run.
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=random_grid,
    n_iter=100,      # number of parameter combinations to try
    cv=None,         # None selects scikit-learn's default cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1        # use every available core
)

print("Ready")

Ready


In [66]:
# Hyper-parameter search for the h1n1 vaccine target.
# The search is expensive and has already been run once (results are recorded
# below), so it is skipped by default; set the flag to True to run it again.
RUN_H1N1_SEARCH = False

if RUN_H1N1_SEARCH:
    rscv.fit(p_train, q_train)

    print(rscv.best_params_)
    print(rscv.best_score_)

    print("Finished")

The optimal parameters for "h1n1" are:
1. 'n_estimators': 775
2. 'max_features': sqrt
3. 'max_depth': 90
4. 'min_samples_split': 14
5. 'min_samples_leaf': 2
6. 'bootstrap': False

{'n_estimators': 770, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 140, 'bootstrap': False}
0.8325299604752726

{'n_estimators': 790, 'min_samples_split': 28, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
0.8329044331425012

{'n_estimators': 775, 'min_samples_split': 14, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 90, 'bootstrap': False}
0.8346828925813246 => This seems to be the best overall

In [67]:
# Hyper-parameter search for the seasonal vaccine target.
# The search is expensive and has already been run once (results are recorded
# below), so it is skipped by default; set the flag to True to run it again.
RUN_SEASONAL_SEARCH = False

if RUN_SEASONAL_SEARCH:
    rscv.fit(r_train, s_train)

    print(rscv.best_params_)
    print(rscv.best_score_)

    print("Finished")

The optimal parameters for "seasonal" are:
1. 'n_estimators': 650
2. 'max_features': sqrt
3. 'max_depth': 120
4. 'min_samples_split': 34
5. 'min_samples_leaf': 2
6. 'bootstrap': False

{'n_estimators': 650, 'min_samples_split': 34, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 120, 'bootstrap': False}
0.7750058506903815

### Train with RandomForestClassifier

In [68]:
# 1. Do for h1n1 vaccine

# Forest configured with the parameters recorded from the randomized search.
rfc_h1n1 = RandomForestClassifier(
    n_estimators=775,
    max_features="sqrt",
    max_depth=90,
    min_samples_split=14,
    min_samples_leaf=2,
    bootstrap=False,
    # Fixed seed: the per-split feature sampling is random, so without it the
    # fitted model and its validation score differ on every run.
    random_state=42,
    # Fit the trees on all cores; this does not affect the fitted model.
    n_jobs=-1
)
rfc_h1n1.fit(p_train, q_train)
print("H1N1 Fit complete")

H1N1 Fit complete


In [69]:
# Mean accuracy of the h1n1 model on the held-out 20% validation split.
rfc_h1n1.score(X=p_test, y=q_test)

0.8384500187195807

In [70]:
# 2. Do for seasonal vaccine

# Forest configured with the parameters recorded from the randomized search.
rfc_seasonal = RandomForestClassifier(
    n_estimators=650,
    max_features='sqrt',
    max_depth=120,
    min_samples_split=34,
    min_samples_leaf=2,
    bootstrap=False,
    # Fixed seed: the per-split feature sampling is random, so without it the
    # fitted model and its validation score differ on every run.
    random_state=42,
    # Fit the trees on all cores; this does not affect the fitted model.
    n_jobs=-1
)
rfc_seasonal.fit(r_train, s_train)
print("Seasonal Fit complete")

Seasonal Fit complete


In [71]:
# Mean accuracy of the seasonal model on the held-out 20% validation split.
rfc_seasonal.score(X=r_test, y=s_test)

0.7854736053912392

### Execute model on given test data

In [72]:
# Assemble the submission frame: one row per test respondent, one column per target.
# `.copy()` gives an independent frame so that adding columns never touches `original_test`.
build = original_test[["respondent_id"]].copy()

# `test_data` rows are in the same order as `original_test`, so the prediction
# arrays can be attached positionally.
# NOTE(review): `predict` emits hard 0/1 class labels. If the submission is meant to
# carry probabilities, `predict_proba(test_data)[:, 1]` would be needed instead —
# confirm against the submission rules.
result_h1n1 = rfc_h1n1.predict(test_data)
build["h1n1_vaccine"] = result_h1n1

result_seasonal = rfc_seasonal.predict(test_data)
build["seasonal_vaccine"] = result_seasonal

# One prediction row per test respondent; derived from the data rather than a
# hard-coded row count so the check still holds for a different test file.
assert build.shape == (original_test.shape[0], 3)

build.head(3)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0,0
1,26708,0,0
2,26709,0,1


## 4. Format the table and save it as a csv file

In [73]:
# Write the predictions to disk as a CSV file.

# The expected format wants floating-point values, so cast both prediction columns.
for column in ("h1n1_vaccine", "seasonal_vaccine"):
    build[column] = build[column].astype(float)

# `index=False` leaves the positional row index out of the file.
build.to_csv("output.csv", index=False)
