# HELP

https://practicaldatascience.co.uk/machine-learning/how-to-use-category-encoders-to-transform-categorical-variables
https://xgboost.readthedocs.io/en/latest/parameter.html

## Load modules and packages

In [None]:
import numpy as np
import pandas as pd
import datetime
from tqdm.notebook import tqdm
from random import sample
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import tree
from sklearn.metrics import roc_auc_score
import random
from sklearn.pipeline import Pipeline
import category_encoders as ce
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_halving_search_cv # noqa
# now you can import normally from model_selection
from sklearn.model_selection import HalvingRandomSearchCV

## Load data

In [None]:
random.seed(10) 

from functions import *
# Load data - set index column, decimal point, separator
data = pd.read_csv('hw1_devsample.csv', sep=',',
                   decimal='.', index_col='SK_ID_CURR')

# print time of data being loaded - use strftime
print(f'Data loaded on:   {datetime.datetime.now().strftime(format="%Y-%m-%d %H:%M:%S")}')
data_xgb = data.copy()

data_test = pd.read_csv('hw1_outofsample.csv', sep=',',decimal='.', index_col='SK_ID_CURR')

## Small changes

In [None]:
target = data_xgb['TARGET']
data_xgb = data_xgb.drop(['MONTH','TIME','DAY','BASE','TARGET'], axis=1)
data_xgb.info()

In [None]:
data_test = data_test.drop(['MONTH','TIME','DAY','BASE'], axis=1)

In [None]:
cols_pred = data_xgb.columns
cols_pred_cat = [col for col in cols_pred if data_xgb[col].dtype == 'O']
cols_pred_cat

## Split data train and test

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_xgb, target, test_size=0.2, random_state=123)

# Test manually

### Test target encoder

In [None]:
target_encoder = ce.target_encoder.TargetEncoder(verbose=0, 
cols=cols_pred_cat,
#drop_invariant=False, 
return_df=True, handle_missing='value', 
handle_unknown='value',
#, min_samples_leaf=1
#smoothing=1.0
)

In [None]:
X_train = target_encoder.fit_transform(X_train, y_train)
X_train.info()

In [None]:
# find columns with infinity values
cols_with_inf = []
for col in X_train.columns:
    if np.any(np.isinf(X_train[col])):
        cols_with_inf.append(col)
        print(f'Column {col} includes infinity values.')

# find columns with negative infinity values
cols_with_neginf = []
for col in X_train.columns:
    if np.any(np.isneginf(X_train[col])):
        cols_with_neginf.append(col)
        print(f'Column {col} includes negative infinity values.')

for col in cols_with_inf:
    X_train[col].replace(np.inf, 9999999, inplace = True)

### Test manually XGBOOST

In [None]:
model = xgb.XGBClassifier(objective='binary:logistic', booster='gbtree', max_depth = 4, eta = 0.4, gamma = 3)

In [None]:
print(model)

In [None]:
model.fit(X_train,y_train)

In [None]:
X_test = target_encoder.fit_transform(X_test, y_test)
X_test.info()

In [None]:
pred_test = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, pred_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Pipeline

In [None]:
target_encoder = ce.target_encoder.TargetEncoder( 
cols=cols_pred_cat,
#drop_invariant=False, 
return_df=True, handle_missing='value', 
handle_unknown='value'
#, min_samples_leaf=1
#, smoothing=1.0
)

In [None]:
model = xgb.XGBClassifier(objective='binary:logistic', booster='gbtree', eval_metric = 'auc')

In [None]:
pipe = Pipeline([
    ('Mean_Target_Encoding', target_encoder),
    ('model', model)
])

## Halving Random Search CV

In [None]:
param_grid = {
    'Mean_Target_Encoding__smoothing': [0.5, 1.0, 1.5,2.0],
    'model__max_depth': [2, 4, 6, 8],
    'model__eta' : [0.2,0.3,0.4],
    'model__gamma': [0,1,2,3,4,5]
}


## Apply pipeline to data

In [None]:
FACTOR = 2
MAX_RESOURCE_DIVISOR = 4

n_samples = len(X_train)
min_ressources = n_samples/MAX_RESOURCE_DIVISOR
min_ressources

In [None]:
min_ressources = int(min_ressources)

In [None]:
search = HalvingRandomSearchCV(pipe, param_grid, 
aggressive_elimination=True,
resource='n_samples',min_resources=min_ressources,factor=FACTOR,).fit(X_train, y_train)

In [None]:
hrs = HalvingRandomSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    n_candidates=10,
    factor=2,
    n_jobs=-1
).fit(X_train, y_train)

In [60]:
hrs.best_params_ 

{'model__max_depth': 8,
 'model__gamma': 1,
 'model__eta': 0.3,
 'Mean_Target_Encoding__smoothing': 0.5}

{'model__max_depth': 2,
 'model__gamma': 5,
 'model__eta': 0.2,
 'Mean_Target_Encoding__smoothing': 1.5}

{'model__max_depth': 2,
 'model__gamma': 4,
 'model__eta': 0.4,
 'Mean_Target_Encoding__smoothing': 1.0}

In [59]:
hrs.best_score_

0.93125

In [58]:
hrs.best_estimator_

Pipeline(steps=[('Mean_Target_Encoding',
                 TargetEncoder(cols=['NAME_CONTRACT_TYPE', 'CODE_GENDER',
                                     'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
                                     'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
                                     'NAME_EDUCATION_TYPE',
                                     'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
                                     'OCCUPATION_TYPE',
                                     'WEEKDAY_APPR_PROCESS_START',
                                     'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
                                     'HOUSETYPE_MODE', 'WALLSMATERIAL_MO...
                               eval_metric='auc', gamma=1, gpu_id=-1,
                               grow_policy='depthwise', importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_t

In [None]:
hrs.fit(X_train,y_train)

In [None]:
pred_test = hrs.predict(X_test)

In [None]:
hrs.score

In [None]:
print('AUC',roc_auc_score(y_test, pred_test))

## Calculate AUC

In [None]:
pred_test = search.predict(X_test)

In [None]:
sum(pred_test)

In [None]:
sum(y_test)

In [None]:
accuracy = accuracy_score(y_test, pred_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
print('AUC',roc_auc_score(y_test, pred_test))

# Predict on the second dataset

In [None]:
prediction = search.predict(data_test)

In [None]:
data_saving = pd.DataFrame({'SK_ID_CURR' : data_test.index ,'prediction' : prediction})
data_saving.to_csv('DS2_22_HW3_CADIOU&PAIN.csv',index=False,sep=',')
data_saving.info()