In [20]:
from copy import deepcopy

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from catboost.utils import get_gpu_device_count

print("Import complete")

Import complete


## Import all the datasets

In [21]:
# test data
test = pd.read_csv(r"test_set_features.csv")

# train data
train = pd.read_csv(r"training_set_features.csv")

# target
training_labels = pd.read_csv(r"training_set_labels.csv")

# check whether rows are equal
print("train df => rows: %s, cols: %s" % (train.shape[0], train.shape[1]))
print("train labels df => rows: %s, cols: %s" % (training_labels.shape[0], training_labels.shape[1]))
print("test df => rows: %s, cols: %s" % (test.shape[0], test.shape[1]))

assert(train.shape[1] == test.shape[1])

train df => rows: 26707, cols: 36
train labels df => rows: 26707, cols: 3
test df => rows: 26708, cols: 36


## Pre-processing

From previous work, it is known that the dataset does not have any
duplicates

### Setting `respodent_id` as index for the df

In [22]:
train = train.set_index("respondent_id")
test = test.set_index("respondent_id")
training_labels = training_labels.set_index("respondent_id")

### Create a grouped dataset

In [23]:
train["type"] = "train"
test["type"] = "test"

data_original = pd.concat([train, test], ignore_index=True)

# the row count should be total of both df and column count should be incremented by one
print("data => rows: %s, cols: %s" % (data_original.shape[0], data_original.shape[1]))

assert(data_original.shape[0] == train.shape[0] + test.shape[0])
assert(data_original.shape[1] == train.shape[1])

del train, test

data => rows: 53415, cols: 36


### Select Categorical and Numerical columns

In [24]:
# divide into numeric columns and categorical columns
categorical_columns = np.where(data_original.dtypes == object)[0]
categorical_features = [data_original.columns[feature] for feature in categorical_columns]

numerical_features = [feat for feat in data_original.columns if feat not in categorical_features]

print(categorical_features)
print(numerical_features)

assert (len(numerical_features) + len(categorical_features) == len(data_original.columns))
del categorical_columns

['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation', 'type']
['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 'household_children']


### Extract columns from data

In [25]:
data_subset_numeric = data_original.loc[:, numerical_features]

data_subset_categorical = data_original.loc[:, categorical_features]

### Do scaling with Standard scalar

In [26]:
scalar = StandardScaler()
data_subset_numeric = pd.DataFrame(scalar.fit_transform(data_subset_numeric), columns = data_subset_numeric.columns)

del scalar

### Impute with SimpleImputer

In [27]:
# imputer = KNNImputer(n_neighbors=1, missing_values=np.nan)
imputer = SimpleImputer(strategy="mean")
data_subset_numeric = pd.DataFrame(imputer.fit_transform(data_subset_numeric), columns = data_subset_numeric.columns)

del imputer

For categorical data, fill values as "unknown"

In [28]:
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 'unknown')
data_subset_categorical = pd.DataFrame(imputer.fit_transform(data_subset_categorical), columns = data_subset_categorical.columns)

# change object to category
for cat in categorical_features:
    data_subset_categorical[cat] = data_subset_categorical[cat].astype('category')

In [29]:
for cat in categorical_features:
    print("%s: %s" % (cat, list(data_subset_categorical[cat].cat.categories)))

age_group: ['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years']
education: ['12 Years', '< 12 Years', 'College Graduate', 'Some College', 'unknown']
race: ['Black', 'Hispanic', 'Other or Multiple', 'White']
sex: ['Female', 'Male']
income_poverty: ['<= $75,000, Above Poverty', '> $75,000', 'Below Poverty', 'unknown']
marital_status: ['Married', 'Not Married', 'unknown']
rent_or_own: ['Own', 'Rent', 'unknown']
employment_status: ['Employed', 'Not in Labor Force', 'Unemployed', 'unknown']
hhs_geo_region: ['atmpeygn', 'bhuqouqj', 'dqpwygqj', 'fpwskwrf', 'kbazzjca', 'lrircsnp', 'lzgpxyit', 'mlyzmhmf', 'oxchjgsf', 'qufhixun']
census_msa: ['MSA, Not Principle  City', 'MSA, Principle City', 'Non-MSA']
employment_industry: ['arjwrbjb', 'atmlpfrs', 'cfqqtusy', 'dotnnunm', 'fcxhlnwr', 'haxffmxo', 'ldnlellj', 'mcubkhph', 'mfikgejo', 'msuufmds', 'nduyfdeo', 'phxvnwax', 'pxcmvdjn', 'qnlwzans', 'rucpziij', 'saaquncn', 'unknown', 'vjjrobsf', 'wlfvacwt', 'wxleyezf', 'xicduo

### Encoding with LabelEncoding and OneHotEncoding

Label encoding:
1. age_group            -> in order
2. education            -> not in order
3. income_poverty       -> not in order
4. sex                  -> only binary values present

One Hot Encoding:
1. race
2. marital_status
3. rent_or_own
4. employment_status
5. hhs_geo_region
6. census_msa
7. employment_industry
8. employment_occupation

Leave as it is:
2. type

So I have to divide `data_subset_categorical` into 2:
1. `data_subset_categorical_label`
2. `data_subset_categorical_onehot`

In [30]:
label_features = ['age_group', 'education', 'income_poverty', 'sex']
onehot_features = ['race', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation']

# `type` is excluded
assert (len(label_features) + len(onehot_features) == len(categorical_features) - 1)

data_subset_categorical_label = data_subset_categorical.loc[:, label_features]
data_subset_categorical_onehot = data_subset_categorical.loc[:, onehot_features]

assert (not any(data_subset_categorical_label.isna().any()))
assert (not any(data_subset_categorical_onehot.isna().any()))
assert (data_subset_categorical_label.shape[0] == data_subset_categorical_onehot.shape[0])

In [31]:
# label encoding

data_subset_categorical_label['age_group'] = data_subset_categorical_label['age_group'].cat.codes

data_subset_categorical_label['education'].cat.reorder_categories(['unknown','< 12 Years', '12 Years', 'Some College', 'College Graduate'], ordered = True, inplace = True)
data_subset_categorical_label['education'] = data_subset_categorical_label['education'].cat.codes

data_subset_categorical_label['income_poverty'].cat.reorder_categories(['unknown','Below Poverty', '<= $75,000, Above Poverty', '> $75,000'], ordered = True, inplace = True)
data_subset_categorical_label['income_poverty'] = data_subset_categorical_label['income_poverty'].cat.codes

data_subset_categorical_label['sex'] = data_subset_categorical_label['sex'].cat.codes

  res = method(*args, **kwargs)
  res = method(*args, **kwargs)


In [32]:
# one hot encoding
for feature in onehot_features:
    data_subset_categorical_onehot = pd.concat([data_subset_categorical_onehot.drop(feature, axis=1), pd.get_dummies(data_subset_categorical_onehot[feature], prefix=feature)], axis=1)


### Rejoining everything back into one df

In [33]:
data = pd.concat([data_subset_numeric, data_subset_categorical_label, data_subset_categorical_onehot], axis = 1)
data["type"] = data_original["type"]

assert (data.shape[0] == data_original.shape[0])

data.shape

(53415, 101)

In [34]:
data.dtypes

h1n1_concern                      float64
h1n1_knowledge                    float64
behavioral_antiviral_meds         float64
behavioral_avoidance              float64
behavioral_face_mask              float64
                                   ...   
employment_occupation_xgwztkwe      uint8
employment_occupation_xqwwgdyp      uint8
employment_occupation_xtkaffoo      uint8
employment_occupation_xzmlyyjv      uint8
type                               object
Length: 101, dtype: object

data types of converted columns are in `int8` and `uint8`,
convert that to `float64` for consistency

In [35]:
for col in data.columns:
    if data[col].dtype == 'int8' or data[col].dtype == 'uint8':
        data[col] = data[col].astype('float64')

assert (not any(data.isna().any()))

### Use feature engineering with correlation to remove unnecessary features

https://stackoverflow.com/a/60223949/10582056

In [36]:
# setting threshold
threshold = 0.85

corr = data.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

removed_cols = [column for column in upper.columns if any(upper[column] > threshold)]
removed_cols

['marital_status_Not Married',
 'employment_status_unknown',
 'employment_industry_unknown',
 'employment_occupation_dcjcmpih',
 'employment_occupation_unknown']

### Prune columns which have above threshold co-relation

In [37]:
data = data.drop(columns = removed_cols)
data.shape

(53415, 96)

## Attempting to sort with CatBoostClassifier

### Split train data into train and test

In [38]:
# split in 80:20 ratio

target_h1n1_vaccine = training_labels["h1n1_vaccine"].values
target_seasonal_vaccine = training_labels["seasonal_vaccine"].values

train = data[data.type.eq("train")].drop("type", axis=1)
test = data[data.type.eq("test")].drop("type", axis=1)

size = 1000
X_train, X_test, y_train, y_test = train_test_split(
    train.head(size),
    training_labels[['h1n1_vaccine', 'seasonal_vaccine']].head(size),
    test_size = 0.2,
    shuffle = True,
    stratify = training_labels[['h1n1_vaccine', 'seasonal_vaccine']].head(size),
    random_state = 42
)

## Use Randomized Search CV with CatBoostClassifier

In [39]:
# Use RandomSearchCV to determine K value for each

random_grid = {
    # 'n_estimators': [n for n in range(200, 2000) if n % 100 == 0],
    # 'max_features': ['auto', 'sqrt'],
    # 'max_depth': [n for n in range(10, 120) if n % 10 == 0] + [None],
    # 'min_samples_split': [n for n in range(1, 30) if n % 2 == 0],
    # 'min_samples_leaf': [n for n in range(1, 30) if n % 2 == 0],
    # 'bootstrap': [True, False]

    "n_estimators":[500, 600, 700, 800],
    "max_depth": [10, 30, 50, 80, 100],
    "learning_rate": [0.1, 0.15, 0.3, 0.5, 0.8, 1],
    "loss_function": ['Logloss'],
    "l2_leaf_reg": [1,2,3]
}

rscv_h1n1 = RandomizedSearchCV(
    estimator = CatBoostClassifier(),
    param_distributions = random_grid,
    scoring='roc_auc',
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=4
)

rscv_seasonal = deepcopy(rscv_h1n1)

print("Ready!")

Ready!


In [40]:
# for h1n1

rscv_h1n1.fit(X_train, y_train['h1n1_vaccine'])

Fitting 3 folds for each of 10 candidates, totalling 30 fits
0:	learn: 0.5736930	total: 156ms	remaining: 1m 33s
1:	learn: 0.5052994	total: 165ms	remaining: 49.4s
2:	learn: 0.4622084	total: 166ms	remaining: 33.1s
3:	learn: 0.4019966	total: 175ms	remaining: 26.1s
4:	learn: 0.3610813	total: 184ms	remaining: 21.9s
5:	learn: 0.3239784	total: 193ms	remaining: 19.1s
6:	learn: 0.2994197	total: 202ms	remaining: 17.1s
7:	learn: 0.2779857	total: 216ms	remaining: 15.9s
8:	learn: 0.2582015	total: 225ms	remaining: 14.8s
9:	learn: 0.2425799	total: 234ms	remaining: 13.8s
10:	learn: 0.2262804	total: 243ms	remaining: 13s
11:	learn: 0.2053854	total: 252ms	remaining: 12.3s
12:	learn: 0.1957183	total: 261ms	remaining: 11.8s
13:	learn: 0.1813659	total: 288ms	remaining: 12.1s
14:	learn: 0.1703963	total: 300ms	remaining: 11.7s
15:	learn: 0.1606032	total: 309ms	remaining: 11.3s
16:	learn: 0.1558875	total: 320ms	remaining: 11s
17:	learn: 0.1475996	total: 333ms	remaining: 10.8s
18:	learn: 0.1412327	total: 364ms	

 0.78509866        nan        nan 0.82710049]


RandomizedSearchCV(cv=3,
                   estimator=<catboost.core.CatBoostClassifier object at 0x0000026F780F5948>,
                   n_jobs=4,
                   param_distributions={'l2_leaf_reg': [1, 2, 3],
                                        'learning_rate': [0.15, 0.3, 0.5, 0.8,
                                                          1],
                                        'loss_function': ['Logloss'],
                                        'max_depth': [10, 30, 50, 80, 100],
                                        'n_estimators': [300, 600, 1000, 1500,
                                                         2000]},
                   random_state=42, scoring='roc_auc', verbose=2)

In [41]:
print(rscv_h1n1.best_params_)
print(rscv_h1n1.best_score_)
print(rscv_h1n1.best_estimator_)
print("Finished")

# already done. see below for results

{'n_estimators': 600, 'max_depth': 10, 'loss_function': 'Logloss', 'learning_rate': 0.15, 'l2_leaf_reg': 2}
0.8271004893185343
<catboost.core.CatBoostClassifier object at 0x0000026F7801A4C8>
Finished


{'n_estimators': 1780, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}
0.8646527299947001

{'n_estimators': 300, 'max_depth': 10, 'loss_function': 'Logloss', 'learning_rate': 0.15, 'l2_leaf_reg': 1}
0.6828042328042327

{'n_estimators': 300, 'max_depth': 10, 'loss_function': 'Logloss', 'learning_rate': 0.15, 'l2_leaf_reg': 1}
0.6711538461538462

In [None]:
# for seasonal

rscv_seasonal.fit(X_train, y_train['seasonal_vaccine'])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
print(rscv_seasonal.best_params_)
print(rscv_seasonal.best_score_)
print(rscv_seasonal.best_estimator_)
print("Finished")

# already done. see below for results

{'n_estimators': 1780, 'min_samples_split': 16, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}
0.8646527299947001

### Fine tune with Grid Search CV

In [60]:
# grid = {
#     # 'n_estimators': [n for n in range(1, max_val) if n % 10 == 0],
#     # 'max_features': ['auto', 'sqrt'],
#     # 'max_depth': [n for n in range(0, max_val) if n % 10 == 0] + [None],
#     # 'min_samples_split': [n for n in range(0, 30) if n % 2 == 0],
#     # 'min_samples_leaf': [n for n in range(0, 30) if n % 2 == 0],
#     # 'bootstrap': [True, False]
#
#     'n_estimators': [1700, 1780, 1900],
#     'max_features': ['auto'],
#     'max_depth': [90, 100, 110] + [None],
#     'min_samples_split': [12, 16, 20],
#     'min_samples_leaf': [3, 4, 5],
#     'bootstrap': [False]
# }
#
# gscv = GridSearchCV(
#     estimator=RandomForestClassifier(),
#     param_grid=grid,
#     scoring='roc_auc',
#     verbose=10,
#     n_jobs=4
# )
#
# print("Ready")

In [61]:
# gscv.fit(X_train_transform, y_train.values.ravel())
#
# print(grid.best_estimator_)
# print(grid.best_params_)
# print(grid.best_score_)

# print("Finished")

### Train with RandomForestClassifier

In [65]:
rfc_h1n1 = RandomForestClassifier(
    n_estimators=1780,
    max_features="auto",
    max_depth=100,
    min_samples_split=16,
    min_samples_leaf=4,
    bootstrap=False
)

rfc_seasonal = RandomForestClassifier(
    n_estimators=1780,
    max_features="auto",
    max_depth=100,
    min_samples_split=16,
    min_samples_leaf=4,
    bootstrap=False
)

In [63]:
rfc_h1n1.fit(X_train, y_train['h1n1_vaccine'])
print("H1N1 Fit complete")

predicted = rfc_h1n1.predict_proba(X_test)

h1n1_predicted = pd.DataFrame( {
    "h1n1_vaccine": predicted[:, 1],
    },
    index = y_test.index
)

assert (h1n1_predicted.shape[1] == 1)

roc_auc_score(y_test["h1n1_vaccine"], h1n1_predicted)

H1N1 Fit complete


0.8262119249541096

In [64]:
rfc_seasonal.fit(X_train, y_train['seasonal_vaccine'])
print("Seasonal Fit complete")

predicted = rfc_h1n1.predict_proba(X_test)

seasonal_predicted = pd.DataFrame( {
    "seasonal_vaccine": predicted[:, 1],
    },
    index = y_test.index
)

assert (seasonal_predicted.shape[1] == 1)

roc_auc_score(y_test["seasonal_vaccine"], seasonal_predicted)

Seasonal Fit complete


0.7209534990096823

### Execute model on given test data

In [20]:
rfc.fit(train_transform, label_h1n1.values.ravel())
h1n1_result = rfc.predict_proba(test_transform)

print("Done")

Done


In [21]:
build = pd.DataFrame(test.index)

build["h1n1_vaccine"] = h1n1_result[:, 1]

print(build.shape)

assert (build.shape[0] == len(test.index) and build.shape[1] == 2)

build.head()

(26708, 2)


Unnamed: 0,respondent_id,h1n1_vaccine
0,26707,0.173791
1,26708,0.045505
2,26709,0.288724
3,26710,0.55885
4,26711,0.319752


## 4. Format the table and save it as a csv file

In [22]:
# convert to a csv file

build.to_csv("output.csv", index=False)
