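# Help / references

- Category Encoders tutorial (transforming categorical variables): https://practicaldatascience.co.uk/machine-learning/how-to-use-category-encoders-to-transform-categorical-variables
- XGBoost parameter reference: https://xgboost.readthedocs.io/en/latest/parameter.html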

## Load modules and packages

In [None]:
import numpy as np
import pandas as pd
import datetime
from tqdm.notebook import tqdm
from random import sample
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import tree
from sklearn.metrics import roc_auc_score
import random
from sklearn.pipeline import Pipeline
import category_encoders as ce
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_halving_search_cv # noqa
# now you can import normally from model_selection
from sklearn.model_selection import HalvingRandomSearchCV

## Load data

In [None]:
# Seed Python's built-in `random` module so anything drawing from it is
# repeatable between runs.
random.seed(10)

from functions import *

# Both CSV files are read with the same layout: comma separator, '.' as the
# decimal mark and SK_ID_CURR as the row index.
csv_options = dict(sep=',', decimal='.', index_col='SK_ID_CURR')

# Development sample.
data = pd.read_csv('hw1_devsample.csv', **csv_options)

# Report when the development sample finished loading.
loaded_at = datetime.datetime.now().strftime(format="%Y-%m-%d %H:%M:%S")
print(f'Data loaded on:   {loaded_at}')

# Work on a copy so `data` keeps the frame exactly as it was read.
data_xgb = data.copy()

# Out-of-sample file, read with the same options.
data_test = pd.read_csv('hw1_outofsample.csv', **csv_options)

## Separate the target and drop unused columns

In [None]:
# Keep the label as its own series, then remove it (together with the
# MONTH/TIME/DAY/BASE columns) from the predictor frame.
unused_columns = ['MONTH', 'TIME', 'DAY', 'BASE', 'TARGET']
target = data_xgb['TARGET']
data_xgb = data_xgb.drop(columns=unused_columns)

# Summary of the remaining columns and their dtypes.
data_xgb.info()

In [None]:
# Every remaining column is a predictor.
cols_pred = data_xgb.columns

# Categorical predictors are the ones stored with the object ('O') dtype.
cols_pred_cat = [
    name for name, dtype in data_xgb.dtypes.items() if dtype == 'O'
]
cols_pred_cat

## Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
split_parts = train_test_split(
    data_xgb,
    target,
    test_size=0.2,
    random_state=123,
)
X_train, X_test, y_train, y_test = split_parts

# Test target encoder

In [None]:
# Target encoder applied to the categorical predictors only.
# drop_invariant, min_samples_leaf and smoothing are not set here.
encoder_options = dict(
    verbose=0,
    cols=cols_pred_cat,
    return_df=True,
    handle_missing='value',
    handle_unknown='value',
)
target_encoder = ce.target_encoder.TargetEncoder(**encoder_options)

In [None]:
# Fit the encoder on the training split and overwrite X_train with its
# encoded version.
encoded_train = target_encoder.fit_transform(X_train, y_train)
X_train = encoded_train

# Column/dtype summary of the encoded frame.
X_train.info()

In [None]:
# Preview the first rows of the encoded training frame.
X_train.head()

In [None]:
# Number of missing values in each column of the un-encoded predictor frame.
data_xgb.isna().sum()

In [None]:
# Columns holding any infinite value. np.isinf is True for +inf and -inf
# alike, so this list covers both signs.
cols_with_inf = [
    col for col in X_train.columns if np.isinf(X_train[col]).any()
]
for col in cols_with_inf:
    print(f'Column {col} includes infinity values.')

# Columns holding negative infinity specifically (a subset of the list above).
cols_with_neginf = [
    col for col in X_train.columns if np.isneginf(X_train[col]).any()
]
for col in cols_with_neginf:
    print(f'Column {col} includes negative infinity values.')

# Swap infinities for large finite sentinels.
# The result is assigned back to the column: calling
# `X_train[col].replace(..., inplace=True)` modifies a temporary column
# object and, with pandas Copy-on-Write, leaves X_train untouched.
# -inf is handled as well, since it was detected above but never replaced.
# NOTE(review): -9999999 mirrors the existing +inf sentinel — confirm it is
# the intended stand-in.
for col in cols_with_inf:
    X_train[col] = X_train[col].replace(
        [np.inf, -np.inf], [9999999, -9999999]
    )

# Test XGBOOST

In [None]:
# Baseline XGBoost classifier; no hyper-parameters are passed, so the
# library defaults apply.
model = xgb.XGBClassifier()

In [None]:
# Print the estimator's textual representation.
print(model)

In [None]:
# Train the baseline classifier on the target-encoded training split.
model.fit(X_train,y_train)

In [None]:
# X_test still holds the raw categorical values: only X_train was passed
# through the encoder. Encode it with the encoder fitted on the training
# split (transform only, so no test labels are involved) and apply the same
# infinity sentinels before predicting; the model was trained on encoded
# features.
X_test_encoded = target_encoder.transform(X_test).replace(
    [np.inf, -np.inf], [9999999, -9999999]
)

# Hard class predictions for the hold-out split.
pred_test = model.predict(X_test_encoded)

In [None]:
from sklearn.metrics import accuracy_score

# Share of hold-out rows whose predicted class matches the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_test, pred_test)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# ROC AUC measures how well a continuous score ranks positives above
# negatives, so feed it the positive-class probability instead of the hard
# 0/1 labels in pred_test (labels collapse the ROC curve to a single point).
# X_test is encoded here with the encoder fitted on the training split,
# because the model was trained on encoded features.
# Column 1 of predict_proba is taken as the positive class — assumes a
# binary 0/1 TARGET; TODO confirm.
proba_test = model.predict_proba(
    target_encoder.transform(X_test).replace(
        [np.inf, -np.inf], [9999999, -9999999]
    )
)[:, 1]
print('AUC', roc_auc_score(y_test, proba_test))

# Pipeline

In [None]:
# Fresh, unfitted target encoder for use as the first pipeline step.
# drop_invariant, min_samples_leaf and smoothing are not set here.
target_encoder = ce.target_encoder.TargetEncoder(
    cols=cols_pred_cat,
    return_df=True,
    handle_missing='value',
    handle_unknown='value',
)

In [None]:
# Fresh, unfitted XGBoost classifier (no hyper-parameters passed) for use as
# the final pipeline step.
model = xgb.XGBClassifier()

In [None]:
# Two-stage pipeline: target-encode the categorical columns, then classify.
# The step names are the prefixes used to address parameters in a search
# (e.g. 'model__max_depth').
pipeline_steps = [
    ('Mean_Target_Encoding', target_encoder),
    ('model', model),
]
pipe = Pipeline(steps=pipeline_steps)

## Halving Random Search CV

In [None]:
# Search space, keyed by '<pipeline step>__<parameter>'.
param_grid = {}

# NOTE(review): `verbose` looks like a logging switch rather than a
# modelling hyper-parameter — confirm it belongs in the search space.
param_grid['Mean_Target_Encoding__verbose'] = [0, 1, 2]

# Tree depths tried for the classifier.
param_grid['model__max_depth'] = [2, 4, 6, 8]


## Apply pipeline to data

In [None]:
# Run the search on the raw (un-encoded) training rows. X_train was already
# target-encoded with the labels of every training row, so cross-validating
# on it would let validation-fold targets leak into the features. Feeding
# the raw rows lets the pipeline's own encoder be fitted on the training
# part of each split only.
# Rows are recovered through the index X_train shares with data_xgb —
# assumes SK_ID_CURR labels are unique; TODO confirm.
X_train_raw = data_xgb.loc[X_train.index].replace(
    [np.inf, -np.inf], [9999999, -9999999]
)

# scoring='roc_auc': candidates are ranked by AUC, the metric this notebook
# reports, rather than the estimator's default score.
# random_state=10: makes candidate sampling repeatable (same value as the
# random.seed call at load time).
search = HalvingRandomSearchCV(
    pipe,
    param_grid,
    scoring='roc_auc',
    random_state=10,
).fit(X_train_raw, y_train)

In [None]:
# Parameter combination selected by the search.
search.best_params_

## Calculate AUC

# Predict on the second dataset

In [None]:
from sklearn.base import clone

# `prediction` used to be the `...` placeholder, so the submission file
# contained no model output. Build the final model here instead.
#
# Take an unfitted copy of the pipeline, apply the hyper-parameters chosen
# by the search and fit it on the whole raw development sample. Fitting on
# the raw `data_xgb` frame guarantees the pipeline's encoder learns from
# original category values, which is what `data_test` provides.
final_pipe = clone(pipe).set_params(**search.best_params_)
final_pipe.fit(
    data_xgb.replace([np.inf, -np.inf], [9999999, -9999999]),
    target,
)

# Score the out-of-sample rows on the same predictor columns.
# Column 1 of predict_proba is taken as the positive-class probability —
# assumes a binary 0/1 TARGET. NOTE(review): confirm the submission expects
# probabilities rather than class labels.
prediction = final_pipe.predict_proba(
    data_test[cols_pred].replace([np.inf, -np.inf], [9999999, -9999999])
)[:, 1]

In [None]:
# Assemble the submission table: one row per out-of-sample id with its
# prediction.
submission_columns = {
    'SK_ID_CURR': data_test.index,
    'prediction': prediction,
}
data_saving = pd.DataFrame(submission_columns)

# Write it as a comma-separated file without the positional index, then
# show a summary of what was saved.
data_saving.to_csv('DS2_22_HW1_CADIOU&PAIN.csv', sep=',', index=False)
data_saving.info()