In [275]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

### Load all the data files

In [276]:
# submission format file and the test-set features
submission_format = pd.read_csv("submission_format.csv")
original_test = pd.read_csv("test_set_features.csv")

# training data: features and their targets
original_train = pd.read_csv("training_set_features.csv")
training_labels = pd.read_csv("training_set_labels.csv")

# eyeball the shapes: training features and labels should have equal row counts
for frame in (original_train, training_labels, original_test):
    print(frame.shape)

(26707, 36)
(26707, 3)
(26708, 36)


### Sanitize the dataset

In [277]:
# Sanity check before cleaning: count fully duplicated rows in each raw frame.
# Both counts are printed explicitly -- a notebook cell only displays its last
# bare expression, so an unprinted train count would be silently dropped.
train_duplicates = int(original_train.duplicated().sum())
test_duplicates = int(original_test.duplicated().sum())
print("duplicate rows\ttrain=%d\ttest=%d" % (train_duplicates, test_duplicates))

0

In [278]:
# Index every frame by `respondent_id` so the id is not treated as a feature.
train = original_train.set_index("respondent_id")
test = original_test.set_index("respondent_id")

# `training_labels` is re-bound to its own indexed version, so guard the call:
# once the column has become the index, a second `set_index` would raise
# KeyError and the cell could not be re-run without reloading the CSV.
if "respondent_id" in training_labels.columns:
    training_labels = training_labels.set_index("respondent_id")

train.head()

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [279]:
# Find numeric columns with an explicit numeric-dtype test rather than
# "anything that is not object", so string/categorical/datetime dtypes can
# never be mistaken for numeric features.
numeric_mask = train.dtypes.map(pd.api.types.is_numeric_dtype).to_numpy(dtype=bool)

# positional indices of the numeric columns
numeric_columns = np.flatnonzero(numeric_mask)

# I assume both the train df and the test df have identical column names and data types
numeric_features = train.columns[numeric_columns].tolist()

# visualize the unique values in each column
for feature in numeric_features:
    # np.sort always places NaN last; the built-in sorted() yields an
    # arbitrary order when NaN is present because NaN comparisons are False
    distinct_values = np.sort(train[feature].unique()).tolist()
    print("%s\t%s" % (feature, str(distinct_values)))

h1n1_concern	[0.0, 1.0, 2.0, 3.0, nan]
h1n1_knowledge	[0.0, 1.0, 2.0, nan]
behavioral_antiviral_meds	[0.0, 1.0, nan]
behavioral_avoidance	[0.0, 1.0, nan]
behavioral_face_mask	[0.0, 1.0, nan]
behavioral_wash_hands	[0.0, 1.0, nan]
behavioral_large_gatherings	[0.0, 1.0, nan]
behavioral_outside_home	[0.0, 1.0, nan]
behavioral_touch_face	[0.0, 1.0, nan]
doctor_recc_h1n1	[0.0, nan, 1.0]
doctor_recc_seasonal	[0.0, nan, 1.0]
chronic_med_condition	[0.0, 1.0, nan]
child_under_6_months	[0.0, 1.0, nan]
health_worker	[0.0, 1.0, nan]
health_insurance	[1.0, nan, 0.0]
opinion_h1n1_vacc_effective	[1.0, 2.0, 3.0, 4.0, 5.0, nan]
opinion_h1n1_risk	[1.0, 2.0, 3.0, 4.0, 5.0, nan]
opinion_h1n1_sick_from_vacc	[1.0, 2.0, 3.0, 4.0, 5.0, nan]
opinion_seas_vacc_effective	[1.0, 2.0, 3.0, 4.0, 5.0, nan]
opinion_seas_risk	[1.0, 2.0, 3.0, 4.0, 5.0, nan]
opinion_seas_sick_from_vacc	[1.0, 2.0, 3.0, 4.0, 5.0, nan]
household_adults	[0.0, 1.0, 2.0, 3.0, nan]
household_children	[0.0, 1.0, 2.0, 3.0, nan]


In [280]:
# The numeric columns hold small sets of coded values, so filling `nan` with a
# column mean would produce a number that is not a valid code and would defeat
# the purpose of the column.
# Missing entries are replaced with a large negative sentinel instead, so they
# stand apart as outliers.
# NOTE(review): KNN is distance-based, so a sentinel this far from the valid
# codes will dominate neighbor distances -- confirm this is intended.
print(numeric_features)

# keep only the numeric features (the only ones used from here on) and fill
# every missing value with the sentinel
train_numeric = train[numeric_features].fillna(-9999)
test_numeric = test[numeric_features].fillna(-9999)

['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 'household_children']


In [281]:
# These are going to be the targets.
# Labels are selected by `respondent_id` in the row order of `train`, so each
# label lines up with its feature row even if the two CSV files are ordered
# differently; a respondent with no label raises KeyError here instead of
# silently shifting the targets.
h1n1_vaccine = training_labels.loc[train.index, "h1n1_vaccine"].values
seasonal_vaccine = training_labels.loc[train.index, "seasonal_vaccine"].values

In [282]:
# hold out 20% of the training rows for evaluation (80:20 split), stratified on
# the h1n1 target; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    train_numeric,
    h1n1_vaccine,
    test_size=0.2,
    random_state=1,
    stratify=h1n1_vaccine,
)

In [283]:
# to choose a good n_neighbors for KNN, I'm going to use grid search

# I'm commenting out the code because I have already run it once and found the answer
# running it again would be time consuming

# knn2 = KNeighborsClassifier()
# param_grid = {
#     "n_neighbors": np.arange(1, 30)
# }
# knngscv = GridSearchCV(knn2, param_grid, cv=5)
# knngscv.fit(x_train, y_train)
#
# print(knngscv.best_params_)
# print(knngscv.best_score_)

In [284]:
# build the h1n1 model: KNN with 17 neighbors, fitted on the training split
# (fit returns the estimator itself, so construction and fitting can be chained)
knn_h1n1 = KNeighborsClassifier(n_neighbors=17).fit(x_train, y_train)

# preview the model's predictions for the first 100 held-out rows
h1n1_holdout_predictions = knn_h1n1.predict(x_test)
h1n1_holdout_predictions[:100]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [285]:
# accuracy of the h1n1_vaccine model on the held-out split
h1n1_accuracy = knn_h1n1.score(x_test, y_test)
h1n1_accuracy

0.8339573193560464

In [286]:
# Now do another KNN, this time for seasonal_vaccine.

# Split the training data 80:20, stratified on the seasonal target.
# The split gets its own variable names: reusing x_train/x_test/y_train/y_test
# would overwrite the h1n1 split, and re-running the h1n1 fit/score cells
# afterwards would then train and score that model against seasonal labels.
(
    x_train_seasonal,
    x_test_seasonal,
    y_train_seasonal,
    y_test_seasonal,
) = train_test_split(
    train_numeric,
    seasonal_vaccine,
    test_size=0.2,
    random_state=1,
    stratify=seasonal_vaccine,
)

# NOTE(review): n_neighbors=17 is the same value used for the h1n1 model; the
# commented-out grid search only references the h1n1 split -- confirm it is
# also a good choice for this target.
knn_seasonal = KNeighborsClassifier(n_neighbors=17)
knn_seasonal.fit(x_train_seasonal, y_train_seasonal)

# accuracy of the seasonal_vaccine model on its held-out split
knn_seasonal.score(x_test_seasonal, y_test_seasonal)

0.7338075627105953

In [287]:
# Both models are fitted, so predict for the real test set.
# NOTE(review): these are hard class labels from `predict`; `submission_format`
# is loaded but never consulted -- confirm the expected output is labels and
# not probabilities.
output_for_h1n1_vaccine = knn_h1n1.predict(test_numeric).tolist()
output_for_seasonal_vaccine = knn_seasonal.predict(test_numeric).tolist()

# one row per test respondent: the id followed by the two predicted targets
build_df = original_test[["respondent_id"]].assign(
    h1n1_vaccine=output_for_h1n1_vaccine,
    seasonal_vaccine=output_for_seasonal_vaccine,
)

build_df.head(100)

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0,0
1,26708,0,0
2,26709,0,1
3,26710,1,1
4,26711,0,0
...,...,...,...
95,26802,0,0
96,26803,0,0
97,26804,0,0
98,26805,0,0


In [288]:
# write the predictions to disk as CSV, leaving out the positional row index
build_df.to_csv(path_or_buf="output.csv", index=False)
