In [34]:
import copy

import pandas as pd
# Display settings: show every column and up to 200 rows when rendering frames.
# Option keys are fully qualified: the abbreviated "max_rows" pattern matches
# more than one registered option on newer pandas releases (e.g.
# "styler.render.max_rows") and raises OptionError there.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)

from sklearn.experimental import enable_halving_search_cv
from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV

from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, roc_auc_score

# Marker output: this line is only reached if every import above succeeded.
print("import success")

import success


In [35]:
# Load the pre-processed inputs; both files are indexed by respondent_id.

# feature table (the later cells split it on its "type" column)
data = pd.read_csv("../files/for_train.csv", index_col="respondent_id")

# targets: one single-column frame per vaccine label
labels = pd.read_csv("../files/training_set_labels.csv", index_col="respondent_id")
label_h1n1, label_seasonal = labels[["h1n1_vaccine"]], labels[["seasonal_vaccine"]]

# report the dimensions of both tables so the row counts can be compared
print("data => rows: {}, cols: {}".format(*data.shape))
print("labels => rows: {}, cols: {}".format(*labels.shape))

data => rows: 53415, cols: 113
labels => rows: 26707, cols: 2


In [36]:
# Split the combined table into its train / test parts using the "type" flag,
# dropping the flag so it is not fed to the model as a feature.
train = data[data.type.eq("train")].drop("type", axis=1)
test = data[data.type.eq("test")].drop("type", axis=1)

# Pair labels with training rows by respondent_id instead of relying on both
# CSV files happening to share the same row order. `.loc` raises KeyError if a
# training respondent has no label, so a mismatch cannot pass silently.
target = label_h1n1.loc[train.index].values.ravel()
assert len(target) == len(train), "h1n1 labels do not line up with training rows"

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size = 0.2,
    shuffle = True,
    stratify = target,
    random_state = 42
)

# Subset used by the hyper-parameter search; currently the full training split.
# size = 1000
size = len(y_train)
X_train_pruned = X_train.head(size).copy()
y_train_pruned = y_train[:size].copy()

In [37]:
# Base estimator for the search; allow_writing_files=False keeps CatBoost from
# writing its training artefacts to disk.
catboost = CatBoostClassifier(allow_writing_files=False)

# Search space: 1200..2800 trees in steps of 400, and four tree depths.
grid = {
    'n_estimators': list(range(1200, 3000, 400)),
    'max_depth': [2, 6, 8, 12],
}

# Successive-halving grid search with 3-fold CV.
# scoring="roc_auc" ranks candidates on continuous scores (predict_proba /
# decision_function). make_scorer(roc_auc_score) would instead score the hard
# 0/1 output of predict(), which is not the AUC computed on probabilities in
# the evaluation cells further down.
# refit=False: the best estimator is not refitted on the whole data.
hrscv = HalvingGridSearchCV(
    estimator=catboost,
    param_grid=grid,
    scoring="roc_auc",
    cv=3,
    error_score='raise',
    refit=False,
    verbose=10,
    random_state=42,
    n_jobs=4
)

print("Ready!")

Ready!


In [38]:
# hrscv.fit(X_train_pruned, y_train_pruned)

In [39]:
# print(hrscv.best_params_)
# print(hrscv.best_score_)
# print(hrscv.best_estimator_)

{'max_depth': 6, 'n_estimators': 1600}

In [40]:
# Train the h1n1 model on the training split with a fixed configuration
# (depth 6, 1600 trees), silent logging and no files written to disk.
catboost = CatBoostClassifier(
    n_estimators=1600,
    max_depth=6,
    allow_writing_files=False,
    verbose=0,
).fit(X_train, y_train)
print("Fit complete for h1n1")

Fit complete for h1n1


In [41]:
# Hold-out evaluation: ROC AUC computed from the second predict_proba column.
predicted_vals = catboost.predict_proba(X_test)[:, 1]
score = roc_auc_score(y_true=y_test, y_score=predicted_vals)
print(score)

0.8637800016544693


In [42]:
# Start the submission frame: the test index as a column, plus the h1n1
# probabilities taken from the second predict_proba column.
build = pd.DataFrame(test.index).assign(
    h1n1_vaccine=catboost.predict_proba(test)[:, 1]
)

print(build.shape)

# one row per test respondent, two columns so far
assert build.shape == (len(test.index), 2)

(26708, 2)


In [43]:
# Seasonal-vaccine target, paired with the training rows by respondent_id
# rather than by row position; `.loc` raises KeyError on any missing label.
target = label_seasonal.loc[train.index].values.ravel()
assert len(target) == len(train), "seasonal labels do not line up with training rows"

# Same stratified 80/20 hold-out split and seed as used for the h1n1 target.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size = 0.2,
    shuffle = True,
    stratify = target,
    random_state = 42
)

In [44]:
# Train the seasonal model on the training split with a fixed configuration
# (depth 6, 1600 trees), silent logging and no files written to disk.
catboost = CatBoostClassifier(
    n_estimators=1600,
    max_depth=6,
    allow_writing_files=False,
    verbose=0,
).fit(X_train, y_train)
print("Fit complete for seasonal")

Fit complete for seasonal


In [45]:
# Hold-out evaluation: ROC AUC computed from the second predict_proba column.
predicted_vals = catboost.predict_proba(X_test)[:, 1]
score = roc_auc_score(y_true=y_test, y_score=predicted_vals)

print(score)

0.866970875522947


In [46]:
# Append the seasonal probabilities (second predict_proba column) as the
# third column of the submission frame.
build = build.assign(seasonal_vaccine=catboost.predict_proba(test)[:, 1])

print(build.shape)

# one row per test respondent, three columns in total
assert build.shape == (len(test.index), 3)

build.head(2)

(26708, 3)


Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.142957,0.365155
1,26708,0.026167,0.019542


In [47]:
# Write the submission file without the positional index column.
build.to_csv(path_or_buf="../files/output.csv", index=False)
print("success")


success
