# HELP

https://practicaldatascience.co.uk/machine-learning/how-to-use-category-encoders-to-transform-categorical-variables

## Load modules and packages

In [None]:
import numpy as np
import pandas as pd
import datetime
from tqdm.notebook import tqdm
from random import sample
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import tree
from sklearn.metrics import roc_auc_score
import random
from sklearn.pipeline import Pipeline
import category_encoders as ce
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.experimental import enable_halving_search_cv # noqa
# now you can import normally from model_selection
from sklearn.model_selection import HalvingRandomSearchCV

## Load data

In [None]:
# Seed Python's built-in `random` module so code relying on it is repeatable
random.seed(10)

from functions import *

# Development sample: comma-separated, '.' as decimal mark, indexed by SK_ID_CURR
data = pd.read_csv(
    'hw1_devsample.csv',
    sep=',',
    decimal='.',
    index_col='SK_ID_CURR',
)

# Report the moment the development sample finished loading
loaded_at = datetime.datetime.now().strftime(format="%Y-%m-%d %H:%M:%S")
print(f'Data loaded on:   {loaded_at}')

# Work on a copy so the frame read from disk stays untouched
data_xgb = data.copy()

# Second dataset, read with the same CSV settings
data_test = pd.read_csv(
    'hw1_outofsample.csv',
    sep=',',
    decimal='.',
    index_col='SK_ID_CURR',
)

## Small changes

In [None]:
# Build both objects from the untouched `data` frame rather than from
# `data_xgb` itself: the previous self-referential form raised a KeyError when
# the cell was re-run, because 'TARGET' had already been dropped.
target = data['TARGET']

# Remove the target and the columns that are not used as predictors
data_xgb = data.drop(columns=['MONTH', 'TIME', 'DAY', 'BASE', 'TARGET'])
data_xgb.info()

In [None]:
# All remaining columns are predictors
cols_pred = data_xgb.columns

# Categorical predictors are the ones stored with object dtype
cols_pred_cat = [name for name, dtype in data_xgb.dtypes.items() if dtype == 'O']
cols_pred_cat

## Split data train and test

In [None]:
# `train_test_split` is already imported in the imports cell at the top.
# Hold out 20 % of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_xgb,
    target,
    test_size=0.2,
    random_state=123,
)

# Test target encoder

In [15]:
# Options for the mean-target encoder. drop_invariant, min_samples_leaf and
# smoothing are not passed, so they stay at the library defaults.
encoder_options = dict(
    verbose=0,
    cols=cols_pred_cat,        # encode only the object-dtype predictors
    return_df=True,            # return a DataFrame instead of an array
    handle_missing='value',
    handle_unknown='value',
)
target_encoder = ce.target_encoder.TargetEncoder(**encoder_options)

In [22]:
# Fit the target encoder on the training split and encode that same split
data_xgb2 = target_encoder.fit_transform(X_train, y_train)
# Summarise the encoded frame (index, number of columns, dtypes, memory use)
data_xgb2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64000 entries, 147643 to 291895
Columns: 191 entries, NAME_CONTRACT_TYPE to LAST_TRANSACTION_TIME_MONTHS
dtypes: float64(154), int64(37)
memory usage: 93.8 MB


In [20]:
# Preview the first rows of the encoded training frame
data_xgb2.head()

Unnamed: 0_level_0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,MEAN_AMTCR_1M_3M_TYPE_EQ_ACTIVE_DIV_MEAN_AMTCR_3M_12M_TYPE_EQ_ACTIVE,MEAN_AMTCR_1M_3M_TYPE_EQ_CLOSED_DIV_MEAN_AMTCR_3M_12M_TYPE_EQ_CLOSED,MEAN_AMTCR_OVERDUE_0M_INFM_DIV_MEAN_AMTCR_0M_INFM,MEAN_AMTCR_OVERDUE_0M_12M_DIV_MEAN_AMTCR_0M_12M,MEAN_AMTCR_OVERDUE_0M_INFM_TYPE_EQ_ACTIVE_DIV_MEAN_AMTCR_0M_INFM_TYPE_EQ_ACTIVE,MEAN_AMTCR_OVERDUE_0M_12M_TYPE_EQ_ACTIVE_DIV_MEAN_AMTCR_0M_12M_TYPE_EQ_ACTIVE,MEAN_AMTCR_OVERDUE_0M_INFM_TYPE_EQ_CLOSED_DIV_MEAN_AMTCR_0M_INFM_TYPE_EQ_CLOSED,MEAN_AMTCR_OVERDUE_0M_12M_TYPE_EQ_CLOSED_DIV_MEAN_AMTCR_0M_12M_TYPE_EQ_CLOSED,FIRST_TRANSACTION_TIME_MONTHS,LAST_TRANSACTION_TIME_MONTHS
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
215138,0.082553,0.099091,0.08407,0.080058,0.0,135000.0,646920.0,25195.5,540000.0,0.078102,...,,,0.0,,0.0,,0.0,,37.0,28.0
325963,0.082553,0.099091,0.072736,0.080058,0.0,135000.0,740218.5,40284.0,639000.0,0.081065,...,,,0.0,0.0,0.0,0.0,0.0,,3.0,18.0
206923,0.082553,0.070302,0.08407,0.080058,0.0,180000.0,339241.5,12919.5,238500.0,0.081065,...,,,0.0,,0.0,,0.0,,73.0,61.0
265855,0.082553,0.070302,0.08407,0.080058,0.0,180000.0,1264428.0,37098.0,990000.0,0.081065,...,,,0.0,0.0,0.0,0.0,,,36.0,4.0
359795,0.057899,0.070302,0.08407,0.080058,,76500.0,202500.0,10125.0,202500.0,0.081065,...,,,0.0,0.0,,,0.0,0.0,12.0,12.0


In [21]:
# Number of missing values per predictor column
data_xgb.isna().sum()

NAME_CONTRACT_TYPE                                                                     0
CODE_GENDER                                                                            0
FLAG_OWN_CAR                                                                           0
FLAG_OWN_REALTY                                                                        0
CNT_CHILDREN                                                                       24965
                                                                                   ...  
MEAN_AMTCR_OVERDUE_0M_12M_TYPE_EQ_ACTIVE_DIV_MEAN_AMTCR_0M_12M_TYPE_EQ_ACTIVE      65757
MEAN_AMTCR_OVERDUE_0M_INFM_TYPE_EQ_CLOSED_DIV_MEAN_AMTCR_0M_INFM_TYPE_EQ_CLOSED    42844
MEAN_AMTCR_OVERDUE_0M_12M_TYPE_EQ_CLOSED_DIV_MEAN_AMTCR_0M_12M_TYPE_EQ_CLOSED      76167
FIRST_TRANSACTION_TIME_MONTHS                                                      30353
LAST_TRANSACTION_TIME_MONTHS                                                       30353
Length: 191, dtype: i

In [None]:
# Detect and neutralise infinite values in the target-encoded training frame.
# The earlier version referenced `data_cart2`, which is never defined in this
# notebook; `data_xgb2` is the encoded frame built by the cells above.
INF_CAP = 9999999  # finite stand-in for +inf; its negation is used for -inf

# find columns with infinity values (np.isinf flags both +inf and -inf)
cols_with_inf = []
for col in data_xgb2.columns:
    if np.isinf(data_xgb2[col]).any():
        cols_with_inf.append(col)
        print(f'Column {col} includes infinity values.')

# find columns with negative infinity values
cols_with_neginf = []
for col in data_xgb2.columns:
    if np.isneginf(data_xgb2[col]).any():
        cols_with_neginf.append(col)
        print(f'Column {col} includes negative infinity values.')

# Replace through a single assignment on the frame instead of a chained
# `df[col].replace(..., inplace=True)`, which may act on a temporary copy.
# NOTE(review): -inf was detected but never replaced before; mapping it to
# -INF_CAP mirrors the +inf handling - confirm this is the intended value.
if cols_with_inf:
    data_xgb2[cols_with_inf] = data_xgb2[cols_with_inf].replace(
        {np.inf: INF_CAP, -np.inf: -INF_CAP}
    )

# Test XGBOOST

In [None]:
# A bare XGBClassifier cannot be fitted on X_train / X_test as they stand:
# both still contain the object-dtype columns listed in cols_pred_cat.
# Wrapping the classifier with a target encoder lets the later
# `model.fit(X_train, y_train)` and `model.predict(X_test)` calls work on the
# raw splits, and the encoder is fitted on training data only.
model = Pipeline([
    ('encoder', ce.target_encoder.TargetEncoder(cols=cols_pred_cat)),
    ('xgb', xgb.XGBClassifier()),
])

In [None]:
# Show the estimator and its configuration
print(model)

In [None]:
# Train the model on the training split.
# NOTE(review): X_train is the raw split, so `model` must be able to handle
# its object-dtype columns itself - confirm.
model.fit(X_train,y_train)

In [None]:
# Predict on the hold-out split, then round every prediction
pred_test = model.predict(X_test)
predictions = list(map(round, pred_test))

In [None]:
from sklearn.metrics import accuracy_score

# Share of hold-out rows whose rounded prediction matches the true label
accuracy = accuracy_score(y_test, predictions)
accuracy_percent = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_percent)

In [None]:
# AUC measures how well the model ranks observations, so it must be computed
# from predicted scores. Feeding it hard 0/1 labels (as before) collapses the
# ROC curve to a single point and understates the model.
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (assumed to be the positive TARGET class - confirm).
pred_proba_test = model.predict_proba(X_test)[:, 1]
print('AUC', roc_auc_score(y_test, pred_proba_test))

# Pipeline

In [None]:
# Two-step pipeline: target-encode the features, then fit the XGBoost classifier
pipeline_steps = [
    ('Mean Target Encoding', ce.target_encoder.TargetEncoder()),
    ('model', xgb.XGBClassifier()),
]
pipe = Pipeline(steps=pipeline_steps)

## Halving Random Search CV

In [None]:
from scipy.stats import randint

# Search space for the classifier step of `pipe`.
# Parameters of an estimator inside a Pipeline are addressed as
# "<step name>__<parameter>"; the un-prefixed keys used before are rejected
# by the pipeline as invalid parameters.
# `min_samples_split` belongs to scikit-learn's tree models, not to XGBoost;
# `min_child_weight` is used here as the closest XGBoost regulariser.
# NOTE(review): confirm min_child_weight is the intended substitute.
param_distributions = {
    "model__max_depth": [3, None],               # None keeps XGBoost's default depth
    "model__min_child_weight": randint(1, 11),   # integers 1..10
}

In [None]:
# Configure the successive-halving random search over the pipeline ...
halving_search = HalvingRandomSearchCV(
    pipe,
    param_distributions,
    random_state=0,
)
# ... and run it on the training split (fit returns the fitted search object)
search = halving_search.fit(X_train, y_train)

## Apply pipeline to data

search = HalvingRandomSearchCV(pipe, param_grid, cv=3).fit(X_train, y_train)

In [None]:
# Parameter combination selected by the search
search.best_params_

## Calculate AUC

# Predict on the second dataset

In [None]:
# Score the second dataset with the best pipeline found by the search.
# The previous placeholder (`...`) would have written the Ellipsis object
# into every row of the submission file.
# NOTE(review): assumes data_test contains every predictor in `cols_pred`, and
# that the submission expects the probability of the positive class rather
# than a hard label - confirm both.
prediction = search.predict_proba(data_test[cols_pred])[:, 1]

In [None]:
# Pair every SK_ID_CURR of the second dataset with its prediction
submission_columns = {
    'SK_ID_CURR': data_test.index,
    'prediction': prediction,
}
data_saving = pd.DataFrame(submission_columns)

# Write the submission file (comma-separated, without the DataFrame index)
data_saving.to_csv('DS2_22_HW1_CADIOU&PAIN.csv', sep=',', index=False)
data_saving.info()