In [40]:
from copy import deepcopy

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

print("Import complete")

Import complete


## Import all the datasets

In [41]:
# test data
test = pd.read_csv(r"test_set_features.csv")

# train data
train = pd.read_csv(r"training_set_features.csv")

# target
training_labels = pd.read_csv(r"training_set_labels.csv")

# check whether rows are equal
print("train df => rows: %s, cols: %s" % (train.shape[0], train.shape[1]))
print("train labels df => rows: %s, cols: %s" % (training_labels.shape[0], training_labels.shape[1]))
print("test df => rows: %s, cols: %s" % (test.shape[0], test.shape[1]))

assert(train.shape[1] == test.shape[1])

train df => rows: 26707, cols: 36
train labels df => rows: 26707, cols: 3
test df => rows: 26708, cols: 36


## Pre-processing

From previous work, it is known that the dataset does not have any
duplicates

### Setting `respodent_id` as index for the df

In [42]:
train = train.set_index("respondent_id")
test = test.set_index("respondent_id")
training_labels = training_labels.set_index("respondent_id")

### Create a grouped dataset

In [43]:
train["type"] = "train"
test["type"] = "test"

data_original = pd.concat([train, test], ignore_index=True)

# the row count should be total of both df and column count should be incremented by one
print("data => rows: %s, cols: %s" % (data_original.shape[0], data_original.shape[1]))

assert(data_original.shape[0] == train.shape[0] + test.shape[0])
assert(data_original.shape[1] == train.shape[1])

del train, test

data => rows: 53415, cols: 36


### Look for columns with significant amt of null values

I can see that following in training data have severely missing values:
1. health_insurance
2. employment_industry
3. employment_occupation

I'm doing the same for test data to see if they match

Here in test data also following have missing values
1. health_insurance
2. employment_industry
3. employment_occupation

### Select important columns

In [44]:
# selected columns
# "type" column is omitted here, when necessary, it will be brought back
selected_features = ['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
                 'chronic_med_condition', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
                 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk',
                 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'income_poverty']

selected_data = data_original[selected_features]

selected_data.shape

(53415, 14)

In [45]:
# divide into numeric columns and categorical columns
categorical_columns = np.where(data_original.dtypes == object)[0]
categorical_features = [data_original.columns[feature] for feature in categorical_columns]
categorical_features = [column for column in categorical_features if column in selected_features]

numerical_features = [feat for feat in selected_features if feat not in categorical_features]

print(categorical_features)
print(numerical_features)

assert (len(numerical_features) + len(categorical_features) == len(selected_features))
del categorical_columns

['age_group', 'education', 'income_poverty']
['h1n1_concern', 'h1n1_knowledge', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc']


### Extract columns from data

In [46]:
data_subset_numeric = data_original.loc[:, numerical_features]
data_subset_categorical = data_original.loc[:, categorical_features]

### Do scaling with Standard scaler

In [47]:
scalar = StandardScaler()
data_subset_numeric = pd.DataFrame(scalar.fit_transform(data_subset_numeric), columns = data_subset_numeric.columns)

del scalar

### Impute with KNNImputer

In [48]:
imputer = KNNImputer(n_neighbors=1, missing_values=np.nan)
data_subset_numeric = pd.DataFrame(imputer.fit_transform(data_subset_numeric), columns = data_subset_numeric.columns)

del imputer

For categorical data, fill values as "unknown"

In [49]:
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 'unknown')
data_subset_categorical = pd.DataFrame(imputer.fit_transform(data_subset_categorical), columns = data_subset_categorical.columns)

### Rejoining data back into one df

In [50]:
data = pd.concat([data_subset_numeric, data_subset_categorical], axis = 1)
data["type"] = data_original["type"]

### Encoding with LabelEncoding

Categorical columns: (except `type`)
age_group                       object
education                       object
income_poverty                  object

In [51]:
# change object to category
data['age_group'] = data['age_group'].astype('category')
data['education'] = data['education'].astype('category')
data['income_poverty'] = data['income_poverty'].astype('category')

`age group: `
['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years']

age group in order, hence can be directly label encoded

`education: `
['12 Years', '< 12 Years', 'College Graduate', 'Some College', 'unknown']

education not in order

`income_poverty: `
['<= $75,000, Above Poverty', '> $75,000', 'Below Poverty', 'unknown']

income_poverty not in order

In [52]:
data['age_group'] = data['age_group'].cat.codes

data['education'].cat.reorder_categories(['unknown','< 12 Years', '12 Years', 'Some College', 'College Graduate'], ordered = True, inplace = True)
data['education'] = data['education'].cat.codes

data['income_poverty'].cat.reorder_categories(['unknown','Below Poverty', '<= $75,000, Above Poverty', '> $75,000'], ordered = True, inplace = True)
data['income_poverty'] = data['income_poverty'].cat.codes

  res = method(*args, **kwargs)
  res = method(*args, **kwargs)


data types of converted columns are in `int8`,
convert that to `float64` for consistency

In [53]:
for col in data.columns:
    if data[col].dtype == 'int8':
        data[col] = data[col].astype('float64')

assert (not any(data.isna().any()))

## Attempting to sort with Random Forest Classifier

### Split train data into train and test

In [54]:
# split in 80:20 ratio

target_h1n1_vaccine = training_labels["h1n1_vaccine"].values
target_seasonal_vaccine = training_labels["seasonal_vaccine"].values

train = data[data.type.eq("train")].drop("type", axis=1)
test = data[data.type.eq("test")].drop("type", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    train,
    training_labels[['h1n1_vaccine', 'seasonal_vaccine']],
    test_size = 0.2,
    shuffle = True,
    stratify = training_labels[['h1n1_vaccine', 'seasonal_vaccine']],
    random_state = 42
)

### Use feature engineering with correlation to remove unnecessary features

https://stackoverflow.com/a/60223949/10582056

In [55]:
# setting threshold
threshold = 0.84

corr = X_train.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

removed_cols = [column for column in upper.columns if any(upper[column] > threshold)]
removed_cols

[]

no such columns found here

### Prune columns which have above threshold co-relation

In [56]:
# X_train_transform = X_train_transform.drop(columns = removed_cols)
# X_test_transform = X_test_transform.drop(columns = removed_cols)
#
# train_transform = train_transform.drop(columns = removed_cols)
# test_transform = test_transform.drop(columns = removed_cols)

## Use Randomized Search CV with Random Forest Classifier

In [57]:
# Use RandomSearchCV to determine K value for each

# random_grid = {
#     # 'n_estimators': [n for n in range(1, max_val) if n % 10 == 0],
#     # 'max_features': ['auto', 'sqrt'],
#     # 'max_depth': [n for n in range(0, max_val) if n % 10 == 0] + [None],
#     # 'min_samples_split': [n for n in range(0, 30) if n % 2 == 0],
#     # 'min_samples_leaf': [n for n in range(0, 30) if n % 2 == 0],
#     # 'bootstrap': [True, False]
#
#     'n_estimators': [n for n in range(200, 2000) if n % 100 == 0],
#     'max_features': ['auto', 'sqrt'],
#     'max_depth': [n for n in range(10, 120) if n % 10 == 0] + [None],
#     'min_samples_split': [n for n in range(1, 30) if n % 2 == 0],
#     'min_samples_leaf': [n for n in range(1, 30) if n % 2 == 0],
#     'bootstrap': [True, False]
# }
#
# rscv_h1n1 = RandomizedSearchCV(
#     estimator = RandomForestClassifier(),
#     param_distributions = random_grid,
#     scoring='roc_auc',
#     n_iter=10,
#     cv=None,
#     verbose=2,
#     random_state=42,
#     n_jobs=-1
# )
#
# rscv_seasonal = deepcopy(rscv_h1n1)
#
# print("Ready!")

In [58]:
# for h1n1

# rscv_h1n1.fit(X_train, y_train['h1n1_vaccine'])
#
# print(rscv_h1n1.best_params_)
# print(rscv_h1n1.best_score_)
# print(rscv_h1n1.best_estimator_)
# print("Finished")

# already done. see below for results

{'n_estimators': 1780, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}
0.8646527299947001

'n_estimators': 200, 'min_samples_split': 8, 'min_samples_leaf': 24, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': False}
0.8199652595257858

In [59]:
# for seasonal

# rscv_seasonal.fit(X_train, y_train['seasonal_vaccine'])
#
# print(rscv_seasonal.best_params_)
# print(rscv_seasonal.best_score_)
# print(rscv_seasonal.best_estimator_)
# print("Finished")

# already done. see below for results

{'n_estimators': 1780, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}
0.8646527299947001

{'n_estimators': 600, 'min_samples_split': 24, 'min_samples_leaf': 16, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}
0.8443021661049906

### Fine tune with Grid Search CV

In [60]:
# grid = {
#     # 'n_estimators': [n for n in range(1, max_val) if n % 10 == 0],
#     # 'max_features': ['auto', 'sqrt'],
#     # 'max_depth': [n for n in range(0, max_val) if n % 10 == 0] + [None],
#     # 'min_samples_split': [n for n in range(0, 30) if n % 2 == 0],
#     # 'min_samples_leaf': [n for n in range(0, 30) if n % 2 == 0],
#     # 'bootstrap': [True, False]
#
#     'n_estimators': [1700, 1780, 1900],
#     'max_features': ['auto'],
#     'max_depth': [90, 100, 110] + [None],
#     'min_samples_split': [12, 16, 20],
#     'min_samples_leaf': [3, 4, 5],
#     'bootstrap': [False]
# }
#
# gscv = GridSearchCV(
#     estimator=RandomForestClassifier(),
#     param_grid=grid,
#     scoring='roc_auc',
#     verbose=10,
#     n_jobs=4
# )
#
# print("Ready")

In [61]:
# gscv.fit(X_train_transform, y_train.values.ravel())
#
# print(grid.best_estimator_)
# print(grid.best_params_)
# print(grid.best_score_)

# print("Finished")

### Train with RandomForestClassifier

In [65]:
rfc_h1n1 = RandomForestClassifier(
    n_estimators=1780,
    max_features="auto",
    max_depth=100,
    min_samples_split=16,
    min_samples_leaf=4,
    bootstrap=False
)

rfc_seasonal = RandomForestClassifier(
    n_estimators=1780,
    max_features="auto",
    max_depth=100,
    min_samples_split=16,
    min_samples_leaf=4,
    bootstrap=False
)

In [66]:
rfc_h1n1.fit(X_train, y_train['h1n1_vaccine'])
print("H1N1 Fit complete")

predicted = rfc_h1n1.predict_proba(X_test)

h1n1_predicted = pd.DataFrame( {
    "h1n1_vaccine": predicted[:, 1],
    },
    index = y_test.index
)

assert (h1n1_predicted.shape[1] == 1)

roc_auc_score(y_test["h1n1_vaccine"], h1n1_predicted)

H1N1 Fit complete


0.821942765832905

In [64]:
rfc_seasonal.fit(X_train, y_train['seasonal_vaccine'])
print("Seasonal Fit complete")

predicted = rfc_h1n1.predict_proba(X_test)

seasonal_predicted = pd.DataFrame( {
    "seasonal_vaccine": predicted[:, 1],
    },
    index = y_test.index
)

assert (seasonal_predicted.shape[1] == 1)

roc_auc_score(y_test["seasonal_vaccine"], seasonal_predicted)

Seasonal Fit complete


0.7209534990096823

### Execute model on given test data

In [20]:
rfc.fit(train_transform, label_h1n1.values.ravel())
h1n1_result = rfc.predict_proba(test_transform)

print("Done")

Done


In [21]:
build = pd.DataFrame(test.index)

build["h1n1_vaccine"] = h1n1_result[:, 1]

print(build.shape)

assert (build.shape[0] == len(test.index) and build.shape[1] == 2)

build.head()

(26708, 2)


Unnamed: 0,respondent_id,h1n1_vaccine
0,26707,0.173791
1,26708,0.045505
2,26709,0.288724
3,26710,0.55885
4,26711,0.319752


## 4. Format the table and save it as a csv file

In [22]:
# convert to a csv file

build.to_csv("output.csv", index=False)
