In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

print("Import complete")

Import complete


The following approach is modeled after this notebook:
https://github.com/Rashmini/Flu-Shot-Learning/blob/master/Random_Forest_Classification/flushot_H1N1_rf.ipynb

## Import all the datasets

In [2]:
# Load the three CSV files: held-out features, training features, training labels.
test = pd.read_csv(r"test_set_features.csv")

train = pd.read_csv(r"training_set_features.csv")

training_labels = pd.read_csv(r"training_set_labels.csv")
# H1N1 target column used for modelling below.
label_h1n1 = training_labels[["h1n1_vaccine"]]

# Report the shape of every frame.
print("train df => rows: %s, cols: %s" % (train.shape[0], train.shape[1]))
print("train labels df => rows: %s, cols: %s" % (training_labels.shape[0], training_labels.shape[1]))
print("test df => rows: %s, cols: %s" % (test.shape[0], test.shape[1]))

# Every training row needs exactly one label row ...
assert train.shape[0] == training_labels.shape[0], "train features and labels differ in row count"
# ... and the train and test feature sets must have the same number of columns.
assert train.shape[1] == test.shape[1], "train and test differ in column count"

train df => rows: 26707, cols: 36
train labels df => rows: 26707, cols: 3
test df => rows: 26708, cols: 36


## Pre-processing

### Check for duplicates

In [3]:
# Count fully duplicated rows in each frame; the notebook requires that there are none.
original_data_dup_count = train.duplicated().sum()
label_dup_count = training_labels.duplicated().sum()
original_test_dup_count = test.duplicated().sum()

print("duplicates in original training dataset: %s" % original_data_dup_count)
print("duplicates in label dataset: %s" % label_dup_count)
print("duplicates in original testing dataset: %s" % original_test_dup_count)

assert all(
    count == 0
    for count in (original_data_dup_count, label_dup_count, original_test_dup_count)
)

duplicates in original training dataset: 0
duplicates in label dataset: 0
duplicates in original testing dataset: 0


### Setting `respondent_id` as the index of each DataFrame

In [4]:
# Move respondent_id out of the columns and into the index of every frame.
train = train.set_index("respondent_id")
test = test.set_index("respondent_id")
training_labels = training_labels.set_index("respondent_id")

# Re-derive the H1N1 target from the indexed labels so that it carries the same
# respondent_id index as `train` instead of a default positional index.
label_h1n1 = training_labels[["h1n1_vaccine"]]

# train_test_split pairs features with labels by position, so both frames must
# list the respondents in the same order.
assert train.index.equals(training_labels.index), "features and labels are not row-aligned"

### Look for columns with a significant amount of null values

In [5]:
# Summarise the dtype and non-null count of every training column.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26615 non-null  float64
 1   h1n1_knowledge               26591 non-null  float64
 2   behavioral_antiviral_meds    26636 non-null  float64
 3   behavioral_avoidance         26499 non-null  float64
 4   behavioral_face_mask         26688 non-null  float64
 5   behavioral_wash_hands        26665 non-null  float64
 6   behavioral_large_gatherings  26620 non-null  float64
 7   behavioral_outside_home      26625 non-null  float64
 8   behavioral_touch_face        26579 non-null  float64
 9   doctor_recc_h1n1             24547 non-null  float64
 10  doctor_recc_seasonal         24547 non-null  float64
 11  chronic_med_condition        25736 non-null  float64
 12  child_under_6_months         25887 non-null  float64
 13  health_worker   

I can see that the following columns in the training data have a large number of missing values:
1. health_insurance
2. employment_industry
3. employment_occupation

I'm doing the same for test data to see if they match

In [6]:
# Summarise the dtype and non-null count of every test column.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26708 entries, 26707 to 53414
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   h1n1_concern                 26623 non-null  float64
 1   h1n1_knowledge               26586 non-null  float64
 2   behavioral_antiviral_meds    26629 non-null  float64
 3   behavioral_avoidance         26495 non-null  float64
 4   behavioral_face_mask         26689 non-null  float64
 5   behavioral_wash_hands        26668 non-null  float64
 6   behavioral_large_gatherings  26636 non-null  float64
 7   behavioral_outside_home      26626 non-null  float64
 8   behavioral_touch_face        26580 non-null  float64
 9   doctor_recc_h1n1             24548 non-null  float64
 10  doctor_recc_seasonal         24548 non-null  float64
 11  chronic_med_condition        25776 non-null  float64
 12  child_under_6_months         25895 non-null  float64
 13  health_worke

In the test data, the same columns have a large number of missing values:
1. health_insurance
2. employment_industry
3. employment_occupation

In [7]:
# Remove household_children from both frames; it is judged not worthwhile for the H1N1 target.
train = train.drop(columns="household_children")
test = test.drop(columns="household_children")

# Confirm that the column is gone from both frames.
assert "household_children" not in train.columns
assert "household_children" not in test.columns

### Scaling and Encoding data

I'm using this answer:
https://stackoverflow.com/a/64907828/10582056

#### Identifying categorical columns and numerical columns

I need to identify categorical columns and unique values for each column.

In [8]:
# Positional indices of the object-dtype (text) columns and of all remaining columns.
is_object_dtype = (train.dtypes == object).to_numpy()
categorical_columns = np.flatnonzero(is_object_dtype)
numerical_columns = np.flatnonzero(~is_object_dtype)

# Every column must fall into exactly one of the two groups.
assert len(categorical_columns) + len(numerical_columns) == len(train.columns)

As you can see, `employment_industry`, `hhs_geo_region` and `employment_occupation` are scrambled for privacy protection.
However, I can still use those columns because, for my use case, knowing actual occupation or industry is not necessary.

#### Applying Scaler and Encoding into a pipeline

I'm going to use one hot encoding as opposed to label encoding with this one
And I'm using Standard scaler as opposed to Min Max Scaler

In [9]:
# Numeric columns: standardise, then fill the gaps with the column mean.
# StandardScaler disregards NaNs while fitting and leaves them in place when
# transforming, so the imputer averages the already-scaled values.
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='mean'))
])

# Categorical columns: treat a missing value as its own 'missing' category and
# one-hot encode. handle_unknown='ignore' encodes a category that was not seen
# during fit as all zeros instead of raising when validation/test rows are
# transformed.
non_numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route columns (given as positional indices) to the matching pipeline.
# sparse_threshold=0 forces a dense array: the results are wrapped in
# pd.DataFrame(...) later on, which needs dense input, and the default
# density-based switch could otherwise return a sparse matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_preprocessing_steps, numerical_columns),
        ('non_numeric', non_numeric_preprocessing_steps, categorical_columns)
    ],
    remainder="drop",
    sparse_threshold=0
)

## Attempting to classify with Random Forest Classifier

### Split train data into train and test

In [10]:
# Hold out 20% of the labelled rows for validation, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    label_h1n1,
    test_size=0.2,
    random_state=42,
    stratify=label_h1n1,
)

# Fit the preprocessor on the training part only, then apply it to the held-out part.
X_train_transform = pd.DataFrame(preprocessor.fit_transform(X_train))
X_test_transform = pd.DataFrame(preprocessor.transform(X_test))

# Both parts must end up with the same number of encoded columns.
assert X_train_transform.shape[1] == X_test_transform.shape[1]

X_train_transform.shape

(21365, 111)

In [11]:
# Re-fit the preprocessor on every labelled row and apply it to the competition
# test set. This replaces the fit obtained from X_train above.
train_transform = pd.DataFrame(preprocessor.fit_transform(train))
test_transform = pd.DataFrame(preprocessor.transform(test))

# Column positions chosen on X_train_transform are dropped from these frames as
# well, so both fits must produce the same number of columns.
assert train_transform.shape[1] == test_transform.shape[1] == X_train_transform.shape[1], \
    "full-data and split-data preprocessing produced different column counts"

### Use feature engineering with correlation to remove unnecessary features

As the shape above shows, there are now `111` features,
so I need to cut them down.

https://stackoverflow.com/a/60223949/10582056

In [12]:
# Absolute-correlation cut-off above which a column is marked for removal.
threshold = 0.84

# Pairwise absolute correlations between the transformed training columns
# (a square, symmetric matrix).
corr = X_train_transform.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# pair of columns is examined once.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(upper_mask)

# Flag every column whose correlation with any earlier column exceeds the threshold.
exceeds_threshold = (upper > threshold).any().to_numpy()
removed_cols = upper.columns[exceeds_threshold].tolist()

removed_cols

[37, 43, 44, 51, 74, 90, 97]

### Prune columns whose correlation exceeds the threshold

In [13]:
# Drop the flagged columns from all four transformed frames in one pass.
X_train_transform, X_test_transform, train_transform, test_transform = (
    frame.drop(columns=removed_cols)
    for frame in (X_train_transform, X_test_transform, train_transform, test_transform)
)

## Use Randomized Search CV with Random Forest Classifier

In [14]:
# Use RandomizedSearchCV to find promising hyperparameters for the random forest

# random_grid = {
#     # 'n_estimators': [n for n in range(1, max_val) if n % 10 == 0],
#     # 'max_features': ['auto', 'sqrt'],
#     # 'max_depth': [n for n in range(0, max_val) if n % 10 == 0] + [None],
#     # 'min_samples_split': [n for n in range(0, 30) if n % 2 == 0],
#     # 'min_samples_leaf': [n for n in range(0, 30) if n % 2 == 0],
#     # 'bootstrap': [True, False]
#
#     'n_estimators': [n for n in range(1500, 2000) if n % 10 == 0],
#     'max_features': ['sqrt', 'auto'],
#     'max_depth': [n for n in range(50, 120) if n % 10 == 0] + [None],
#     'min_samples_split': [n for n in range(12, 24) if n % 2 == 0],
#     'min_samples_leaf': [n for n in range(1, 10)],
#     'bootstrap': [False]
# }
#
# rscv = RandomizedSearchCV(
#     estimator = RandomForestClassifier(),
#     param_distributions = random_grid,
#     scoring='roc_auc',
#     n_iter=10,
#     cv=None,
#     verbose=10,
#     random_state=42,
#     n_jobs=4
# )
#
# print("Ready!")

In [15]:
# for h1n1

# rscv.fit(X_train_transform, y_train.values.ravel())
#
# print(rscv.best_params_)
# print(rscv.best_score_)
# print(rscv.best_estimator_)
# print("Finished")

# already done. see below for results

{'n_estimators': 1780, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}
0.8646527299947001

### Fine tune with Grid Search CV

In [16]:
# grid = {
#     # 'n_estimators': [n for n in range(1, max_val) if n % 10 == 0],
#     # 'max_features': ['auto', 'sqrt'],
#     # 'max_depth': [n for n in range(0, max_val) if n % 10 == 0] + [None],
#     # 'min_samples_split': [n for n in range(0, 30) if n % 2 == 0],
#     # 'min_samples_leaf': [n for n in range(0, 30) if n % 2 == 0],
#     # 'bootstrap': [True, False]
#
#     'n_estimators': [1700, 1780, 1900],
#     'max_features': ['auto'],
#     'max_depth': [90, 100, 110] + [None],
#     'min_samples_split': [12, 16, 20],
#     'min_samples_leaf': [3, 4, 5],
#     'bootstrap': [False]
# }
#
# gscv = GridSearchCV(
#     estimator=RandomForestClassifier(),
#     param_grid=grid,
#     scoring='roc_auc',
#     verbose=10,
#     n_jobs=4
# )
#
# print("Ready")

In [17]:
# gscv.fit(X_train_transform, y_train.values.ravel())
#
# print(gscv.best_estimator_)
# print(gscv.best_params_)
# print(gscv.best_score_)

# print("Finished")

### Train with RandomForestClassifier

In [18]:
# Random forest configured with the best parameters reported by the randomized
# search above.
rfc = RandomForestClassifier(
    n_estimators=1780,
    # "auto" meant sqrt(n_features) for classifiers and is rejected by newer
    # scikit-learn releases; "sqrt" is the equivalent, still-supported spelling.
    max_features="sqrt",
    max_depth=100,
    min_samples_split=16,
    min_samples_leaf=4,
    bootstrap=False,
    # Fixed seed (same value as the train/test split) so that scores and the
    # submission are reproducible between runs.
    random_state=42
)

In [19]:
rfc.fit(X_train_transform, y_train.to_numpy().ravel())
print("H1N1 Fit complete")

# The competition requires probabilities rather than class labels,
# so predict_proba is used instead of predict.
predicted = rfc.predict_proba(X_test_transform)

# Column 1 of predict_proba corresponds to rfc.classes_[1]; keep it in a
# one-column frame that shares the index of y_test.
h1n1_predicted = pd.DataFrame(
    predicted[:, 1],
    index=y_test.index,
    columns=["h1n1_vaccine"],
)

assert h1n1_predicted.shape[1] == 1

# ROC AUC on the 20% hold-out.
roc_auc_score(y_test, h1n1_predicted)

H1N1 Fit complete


0.8575340658374075

### Execute model on given test data

In [20]:
# Re-train the same estimator on every labelled row, then compute class
# probabilities for the competition test set.
rfc.fit(train_transform, label_h1n1.to_numpy().ravel())
h1n1_result = rfc.predict_proba(test_transform)

print("Done")

Done


In [21]:
# Start from the test index as an ordinary column, then attach the predicted
# probability taken from column 1 of the predict_proba output.
build = test.index.to_frame(index=False).assign(h1n1_vaccine=h1n1_result[:, 1])

print(build.shape)

# One row per test respondent and exactly two columns.
assert build.shape == (len(test.index), 2)

build.head()

(26708, 2)


Unnamed: 0,respondent_id,h1n1_vaccine
0,26707,0.173791
1,26708,0.045505
2,26709,0.288724
3,26710,0.55885
4,26711,0.319752


## 4. Format the table and save it as a csv file

In [22]:
# Write the submission frame to output.csv in the working directory.
# index=False leaves the DataFrame's own row index out of the file.

build.to_csv("output.csv", index=False)
