In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

print("Import complete")

Import complete


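This notebook is a continuation of `attempt_4_170020C_h1n1.ipynb`, so most of
the exploratory checks done there are omitted here.

Run that notebook before this one: this notebook reads an existing `output.csv`
(containing the `h1n1_vaccine` predictions) and adds the `seasonal_vaccine` column to it.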

## Import all the datasets

In [2]:
# --- Feature tables, both keyed by respondent_id ---------------------------
test = pd.read_csv("test_set_features.csv", index_col="respondent_id")
train = pd.read_csv("training_set_features.csv", index_col="respondent_id")

# --- Targets: only the seasonal vaccine label is modelled in this notebook --
training_labels = pd.read_csv("training_set_labels.csv", index_col="respondent_id")
label_seasonal = training_labels.loc[:, ["seasonal_vaccine"]]

# --- Existing submission file; respondent_id stays a regular column here ----
output = pd.read_csv("output.csv")

# Report the dimensions of each table (DataFrame.shape is a (rows, cols) tuple)
print("train df => rows: %s, cols: %s" % train.shape)
print("train labels df => rows: %s, cols: %s" % training_labels.shape)
print("test df => rows: %s, cols: %s" % test.shape)

# Train and test must expose the same number of feature columns
assert train.shape[1] == test.shape[1]

# This notebook writes back to output.csv, so a previous run may already have
# added a seasonal_vaccine column; discard it so the cell can be re-run.
output = output.drop(columns=["seasonal_vaccine"], errors="ignore")

assert output.columns.tolist() == ["respondent_id", "h1n1_vaccine"]

train df => rows: 26707, cols: 35
train labels df => rows: 26707, cols: 2
test df => rows: 26708, cols: 35


## Pre-processing

### Scaling and Encoding data

#### Identifying categorical columns and numerical columns

I need to identify categorical columns and unique values for each column.

In [3]:
# Positional (integer) indices of the object-dtype columns and of all other
# columns; these positions are what the ColumnTransformer below selects by.
is_object_dtype = (train.dtypes == object).to_numpy()

categorical_columns = np.flatnonzero(is_object_dtype)
numerical_columns = np.flatnonzero(~is_object_dtype)

# Every column must land in exactly one of the two groups
assert categorical_columns.size + numerical_columns.size == train.shape[1]

#### Applying Scaler and Encoding into a pipeline

I'm using one-hot encoding rather than label encoding for the categorical columns,
and `StandardScaler` rather than `MinMaxScaler` for the numerical columns.

In [4]:
# Numeric columns: standardise, then fill missing values with the column mean.
# NOTE(review): the scaler runs *before* the imputer. scikit-learn documents
# StandardScaler as disregarding NaNs in fit and keeping them in transform, so
# this order works; it is kept as-is so the numeric output is unchanged.
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='mean'))
])

# Categorical columns: treat a missing value as its own 'missing' category,
# then one-hot encode.
# handle_unknown='ignore' encodes a category that was not present at fit time
# as all zeros instead of raising; the preprocessor is fitted on one table and
# then applied to held-out / test tables that may contain unseen categories.
non_numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
])


# Route each column group (selected by integer position) through its pipeline.
# sparse_threshold=0 forces a dense ndarray: the default lets the transformer
# return a scipy sparse matrix when the stacked output is sparse enough, and
# the result is wrapped directly in pd.DataFrame(...) by the cells below.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_preprocessing_steps, numerical_columns),
        ('non_numeric', non_numeric_preprocessing_steps, categorical_columns)
    ],
    remainder="drop",
    sparse_threshold=0
)

## Attempting to classify with Random Forest Classifier

### Split train data into train and test

In [5]:
# Hold out 20% of the training rows for validation (seasonal target only).
# The split is stratified on the label and seeded so it is repeatable.
P_train, P_test, q_train, q_test = train_test_split(
    train,
    label_seasonal,
    test_size=0.2,
    stratify=label_seasonal,
    random_state=42,
)

# Fit the preprocessor on the training part only, then reuse that fit for the
# held-out part. Both results are wrapped in DataFrames with default integer
# column labels.
train_part_array = preprocessor.fit_transform(P_train)
P_train_transform = pd.DataFrame(train_part_array)

held_out_array = preprocessor.transform(P_test)
P_test_transform = pd.DataFrame(held_out_array)

# Both parts must end up with the same number of encoded features
assert P_train_transform.shape[1] == P_test_transform.shape[1]

P_train_transform.shape

(21365, 112)

In [6]:
# Refit the preprocessor on the complete training table and apply that fit to
# the competition test table. Note this replaces the fit made on P_train.
full_train_array = preprocessor.fit_transform(train)
train_transform = pd.DataFrame(full_train_array)

competition_test_array = preprocessor.transform(test)
test_transform = pd.DataFrame(competition_test_array)

### Use feature engineering with correlation to remove unnecessary features

As the shape above shows, there are now `112` features after encoding,
so I drop the ones that are highly correlated with another feature.

https://stackoverflow.com/a/60223949/10582056

In [7]:
# Absolute correlation above which a feature is considered redundant
threshold = 0.84

# Absolute pairwise correlation of the transformed training features.
# The matrix is square: (number of features) x (number of features).
corr = P_train_transform.corr().abs()

# The matrix is symmetric, so only the strict upper triangle is inspected
# (k=1 also excludes the diagonal). Masked-out cells become NaN, and NaN never
# compares greater than the threshold.
strict_upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
upper = corr.where(strict_upper_mask)

# Flag every column that exceeds the threshold against at least one of the
# columns before it.
exceeds_threshold = (upper > threshold).any(axis=0).to_numpy()
removed_cols = upper.columns[exceeds_threshold].tolist()

removed_cols

[38, 44, 45, 52, 75, 91, 98]

### Prune columns whose correlation is above the threshold

In [8]:
# Remove the highly correlated columns from every transformed frame.
#
# removed_cols holds integer column labels derived from P_train_transform, but
# train_transform / test_transform come from a separate fit of the preprocessor
# on the full training table. Dropping by position is only meaningful if all
# four frames have the same width, so check that before touching anything.
frame_widths = {
    P_train_transform.shape[1],
    P_test_transform.shape[1],
    train_transform.shape[1],
    test_transform.shape[1],
}
assert len(frame_widths) == 1, (
    "transformed frames have different column counts %s; "
    "removed_cols cannot be applied by position" % sorted(frame_widths)
)

# errors="ignore" makes the cell safe to re-run: on a second execution the
# columns are already gone and the drop becomes a no-op instead of a KeyError.
P_train_transform = P_train_transform.drop(columns=removed_cols, errors="ignore")
P_test_transform = P_test_transform.drop(columns=removed_cols, errors="ignore")

train_transform = train_transform.drop(columns=removed_cols, errors="ignore")
test_transform = test_transform.drop(columns=removed_cols, errors="ignore")

## Use Randomized Search CV with Random Forest Classifier

In [9]:
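# Use RandomizedSearchCV to tune the RandomForestClassifier hyperparameters
# (the search has already been run; the code is kept commented out for reference)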

# random_grid = {
#     # 'n_estimators': [n for n in range(1500, 2000) if n % 10 == 0],
#     # 'max_features': ['sqrt', 'auto'],
#     # 'max_depth': [n for n in range(50, 120) if n % 10 == 0] + [None],
#     # 'min_samples_split': [n for n in range(12, 24) if n % 2 == 0],
#     # 'min_samples_leaf': [n for n in range(1, 10)],
#     # 'bootstrap': [False]
# 
#     'n_estimators': [1500, 1600, 1700],
#     'max_features': ['sqrt', 'auto'],
#     'max_depth': [30, 40, 50, 60] + [None],
#     'min_samples_split': [20, 22, 24],
#     'min_samples_leaf': [1, 3, 5],
#     'bootstrap': [True, False]
# }
# 
# rscv = RandomizedSearchCV(
#     estimator = RandomForestClassifier(),
#     param_distributions = random_grid,
#     scoring='roc_auc',
#     n_iter=10,
#     cv=None,
#     verbose=10,
#     random_state=42,
#     n_jobs=4
# )
# 
# print("Ready")

Ready


In [10]:
# for seasonal

# rscv.fit(P_train_transform, q_train.values.ravel())
# 
# print(rscv.best_params_)
# print(rscv.best_score_)
# print(rscv.best_estimator_)
# print("Finished")

# already done. see below for results

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 1700, 'min_samples_split': 24, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'bootstrap': False}
0.8572862026073617
RandomForestClassifier(bootstrap=False, min_samples_split=24, n_estimators=1700)
Finished


{'n_estimators': 1780, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}
0.8566579544610041

{'n_estimators': 1600, 'min_samples_split': 22, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
0.8570867758631667

{'n_estimators': 1700, 'min_samples_split': 24, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'bootstrap': False}
0.8572862026073617

hence it seems I don't have to separately process these two columns

### Train with RandomForestClassifier

In [11]:
# Random forest configured with the best parameters reported by the
# randomized search above.
rfc = RandomForestClassifier(
    n_estimators=1700,
    # "auto" was an alias of "sqrt" for classifiers; it has been deprecated and
    # later removed from scikit-learn, so the explicit spelling is used.
    max_features="sqrt",
    max_depth=None,
    min_samples_split=24,
    min_samples_leaf=1,
    bootstrap=False,
    # Fixed seed so the fitted forest (and therefore the reported AUC and the
    # submitted probabilities) is reproducible across runs.
    random_state=42
)

In [12]:
# Fit on the 80% training part; the single-column label frame is flattened to
# the 1-D array the classifier expects.
train_part_labels = q_train.to_numpy().ravel()
rfc.fit(P_train_transform, train_part_labels)
print("Seasonal Fit complete")

# Class probabilities for the held-out part, one column per class
predicted = rfc.predict_proba(P_test_transform)

# Keep only the second probability column (index 1), presumably the
# probability of seasonal_vaccine == 1, and index it like the held-out labels.
seasonal_predicted = pd.Series(
    predicted[:, 1],
    index=q_test.index,
    name="seasonal_vaccine",
).to_frame()

assert len(seasonal_predicted.columns) == 1

# Validation ROC AUC on the held-out part (displayed as the cell output)
roc_auc_score(q_test, seasonal_predicted)

Seasonal Fit complete


0.8593434018014517

### Execute model on given test data

In [13]:
# Refit the same classifier on the complete (pruned) training table, then
# compute class probabilities for the competition test table.
full_train_labels = label_seasonal.to_numpy().ravel()
rfc.fit(train_transform, full_train_labels)

seasonal_result = rfc.predict_proba(test_transform)

print("Done")

Done


In [14]:
# Assemble the submission table: one row per respondent in the test set.
build = pd.DataFrame({"respondent_id": test.index})

# Align the existing h1n1 predictions on respondent_id rather than on row
# position, so a differently ordered output.csv cannot silently attach a
# prediction to the wrong respondent.
h1n1_by_respondent = output.set_index("respondent_id")["h1n1_vaccine"]
build["h1n1_vaccine"] = h1n1_by_respondent.reindex(test.index).to_numpy()

# Second probability column (index 1) of the seasonal model, in test-row order
build["seasonal_vaccine"] = seasonal_result[:, 1]

print(build.shape)

assert (build.shape[0] == len(test.index) and build.shape[1] == 3)

# A missing value here means output.csv lacks a prediction for some respondent
assert build["h1n1_vaccine"].notna().all(), \
    "output.csv has no h1n1_vaccine value for some respondent_id in the test set"

build.head(2)

(26708, 3)


Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.173791,0.340573
1,26708,0.045505,0.085706


## 4. Format the table and save it as a csv file

In [15]:
# Write the submission table to disk without the row index.
# This overwrites the same output.csv that was read at the top of the notebook.
build.to_csv(path_or_buf="output.csv", index=False)
