In [251]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, roc_auc_score
import sklearn.metrics as metrics

from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder

from catboost import CatBoostClassifier

In [252]:
train = pd.read_csv('train_dataset.csv', low_memory=False)

In [253]:
def remove_repetitions(first_list, second_list):
    return [element for element in first_list if element not in second_list]

=================================================== ОБРАБОТКА КАТЕГОРИАЛЬНЫХ ДАННЫХ ===================================================

In [254]:
cat_cols = [col for col in train.columns if train[col].dtype == 'object']
cat_cols.pop(0)

train_cat = train[cat_cols]
train_cat = train_cat.fillna('')


train_cat_np = train_cat.to_numpy()

for i in range(len(train_cat_np)):
    for j in range(len(train_cat_np[i])):
        el = train_cat_np[i][j]
        if isinstance(el, int):
            continue

        if el != "":
            if "https" in el:
                new_i = el[el.find("//")+2:]
                new_i = new_i[:new_i.find("/")]
                train_cat_np[i][j] = new_i
train_cat = pd.DataFrame(train_cat_np, columns=cat_cols)

In [255]:
train_cat = train_cat.apply(LabelEncoder().fit_transform)

In [256]:
corr_matrix = train_cat.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(upper[column] > 0.75)]

train_cat.drop(to_drop, axis=1, inplace=True)

cat_cols = remove_repetitions(cat_cols, to_drop)

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ОБРАБОТКА КАТЕГОРИАЛЬНЫХ ДАННЫХ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

=================================================== ОБРАБОТКА ЧИСЛЕННЫХ ДАННЫХ ===================================================

In [257]:
num_cols = [col for col in train.columns if train[col].dtype != 'object']
num_cols.pop(1)
train_num = train[num_cols]
train_num = train_num.fillna(0)  # try mediana

In [258]:
corr_matrix = train_num.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(upper[column] > 0.75)]

train_num.drop(to_drop, axis=1, inplace=True)

num_cols = remove_repetitions(num_cols, to_drop)

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ОБРАБОТКА ЧИСЛЕННЫХ ДАННЫХ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [259]:
# СОЕДИНЕНИЕ ОБРАБОТАННЫХ ДАННЫХ
train_norm = train_num.join(train_cat)
train_norm

Unnamed: 0,client_id,col1,col2,col5,col8,col45,col46,col47,col48,col65,...,col1447,col1448,col1449,col1454,col1647,col1649,col2191,col2192,col2193,col2195
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,15,0,0,0,0,0,0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,33,0,0,0,0,0,0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,40,0,0,0,0,0,0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,60,0,0,0,0,0,0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,114,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14451,1241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,4742,0,0,0,0,0,0
14452,1969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,4751,0,0,0,0,0,0
14453,7116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,4763,0,0,0,0,0,0
14454,7117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,4769,0,0,0,0,0,0


In [260]:
stand_sc = StandardScaler()

# data_ohe = pd.DataFrame(onehotencoder.fit_transform(train[cat_cols]))
data_num_cols = stand_sc.fit_transform(train_norm[num_cols])
data_norm = pd.DataFrame(data_num_cols, columns=num_cols).join(train_norm[cat_cols])

y = train['target'].values
X = data_norm[data_norm.columns[:-1]].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=62, stratify=y)
print(np.unique(y_train), np.unique(y_test))

[0 1] [0 1]


In [261]:
# XGBOOST EXECUTION

import xgboost
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

DMatTrain = xgboost.DMatrix(X_train, y_train)
DMatTest  = xgboost.DMatrix(X_test,  y_test )

booster = xgboost.train(params={'max_depth' : 10, "objective" : "binary:logistic" }, dtrain=DMatTrain, evals=[ (DMatTrain, "train"), (DMatTest, "test") ])

print("Train RMSE: ", booster.eval(DMatTrain))
print("Test RMSE:  ", booster.eval(DMatTest ))

train_preds = [1 if pred>0.5 else 0 for pred in booster.predict(data=DMatTrain)]
test_preds  = [1 if pred>0.5 else 0 for pred in booster.predict(data=DMatTest )]

print("\nTest  Accuracy : %.2f"%accuracy_score(y_test, test_preds))
print("Train Accuracy : %.2f"%accuracy_score(y_train, train_preds))

print("\nConfusion Matrix : ")
print(confusion_matrix(y_test, test_preds))

print("\nClassification Report : ")
print(classification_report(y_test, test_preds))

print("\nRoc Auc Score: ")
print(roc_auc_score(y_test, test_preds))

[0]	train-logloss:0.46153	test-logloss:0.46457
[1]	train-logloss:0.33153	test-logloss:0.33824
[2]	train-logloss:0.24794	test-logloss:0.25914
[3]	train-logloss:0.19186	test-logloss:0.20714
[4]	train-logloss:0.15241	test-logloss:0.17092
[5]	train-logloss:0.12264	test-logloss:0.14401
[6]	train-logloss:0.09830	test-logloss:0.12493
[7]	train-logloss:0.08031	test-logloss:0.11089
[8]	train-logloss:0.06635	test-logloss:0.10038
[9]	train-logloss:0.05601	test-logloss:0.09271
Train RMSE:  [0]	eval-logloss:0.05601242841171460
Test RMSE:   [0]	eval-logloss:0.09271217412399038

Test  Accuracy : 0.97
Train Accuracy : 0.99

Confusion Matrix : 
[[2797    8]
 [  65   22]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2805
           1       0.73      0.25      0.38        87

    accuracy                           0.97      2892
   macro avg       0.86      0.63      0.68      2892
weighted avg       0.97      0.97      

In [250]:
import random
all_data: dict = {}
for _ in range(100):
    n_estimators = random.randint(100, 600)
    max_depth = random.randint(10, 50)
    learning_rate = random.uniform(0.1, 0.9)

    xgb_classifier = xgboost.XGBClassifier(n_estimators = n_estimators, max_depth = max_depth, learning_rate = learning_rate)

    xgb_classifier.fit(X_train, y_train, eval_set = [ (X_test, y_test) ], eval_metric = "auc", verbose = 3)

    print("Test  Accuracy Score : %.2f"%xgb_classifier.score(X_test, y_test))
    print("Train Accuracy Score : %.2f"%xgb_classifier.score(X_train, y_train))

    print("\nRoc Auc Score: ")
    test_preds = xgb_classifier.predict(X_test)
    roc = roc_auc_score(y_test, test_preds)

    all_data[str([n_estimators, max_depth, learning_rate])] = roc

    print(roc)

print(all_data)


`eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.



[0]	validation_0-auc:0.73664
[3]	validation_0-auc:0.84116
[6]	validation_0-auc:0.89072
[9]	validation_0-auc:0.92131
[12]	validation_0-auc:0.92131
[15]	validation_0-auc:0.92041
[18]	validation_0-auc:0.91762
[21]	validation_0-auc:0.91662
[24]	validation_0-auc:0.92090
[27]	validation_0-auc:0.91681
[30]	validation_0-auc:0.91549
[33]	validation_0-auc:0.91490
[36]	validation_0-auc:0.91425
[39]	validation_0-auc:0.91630
[42]	validation_0-auc:0.91582
[45]	validation_0-auc:0.91843
[48]	validation_0-auc:0.91665
[51]	validation_0-auc:0.91639
[54]	validation_0-auc:0.91693
[57]	validation_0-auc:0.91826


KeyboardInterrupt: 