In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, roc_auc_score
import sklearn.metrics as metrics

from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder

from catboost import CatBoostClassifier

In [3]:
train = pd.read_csv('train_dataset.csv', low_memory=False)

In [4]:
def remove_repetitions(first_list, second_list):
    return [element for element in first_list if element not in second_list]

=================================================== ОБРАБОТКА КАТЕГОРИАЛЬНЫХ ДАННЫХ ===================================================

In [5]:
cat_cols = [col for col in train.columns if train[col].dtype == 'object']
cat_cols.pop(0)

train_cat = train[cat_cols]
train_cat = train_cat.fillna('')


train_cat_np = train_cat.to_numpy()

for i in range(len(train_cat_np)):
    for j in range(len(train_cat_np[i])):
        el = train_cat_np[i][j]
        if isinstance(el, int):
            continue

        if el != "":
            if "https" in el:
                new_i = el[el.find("//")+2:]
                new_i = new_i[:new_i.find("/")]
                train_cat_np[i][j] = new_i
train_cat = pd.DataFrame(train_cat_np, columns=cat_cols)

In [6]:
train_cat = train_cat.apply(LabelEncoder().fit_transform)

In [7]:
corr_matrix = train_cat.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

train_cat.drop(to_drop, axis=1, inplace=True)

cat_cols = remove_repetitions(cat_cols, to_drop)

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ОБРАБОТКА КАТЕГОРИАЛЬНЫХ ДАННЫХ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

=================================================== ОБРАБОТКА ЧИСЛЕННЫХ ДАННЫХ ===================================================

In [8]:
num_cols = [col for col in train.columns if train[col].dtype != 'object']
num_cols.pop(1)
train_num = train[num_cols]
train_num = train_num.fillna(0)  # try mediana

In [9]:
corr_matrix = train_num.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

train_num.drop(to_drop, axis=1, inplace=True)

num_cols = remove_repetitions(num_cols, to_drop)

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ОБРАБОТКА ЧИСЛЕННЫХ ДАННЫХ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [10]:
# СОЕДИНЕНИЕ ОБРАБОТАННЫХ ДАННЫХ
train_norm = train_num.join(train_cat)
train_norm

Unnamed: 0,client_id,col1,col2,col4,col5,col7,col8,col45,col46,col47,...,col1647,col1649,col1650,col2191,col2192,col2193,col2194,col2195,col2196,col2197
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14451,1241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
14452,1969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
14453,7116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
14454,7117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
stand_sc = StandardScaler()

# data_ohe = pd.DataFrame(onehotencoder.fit_transform(train[cat_cols]))
data_num_cols = stand_sc.fit_transform(train_norm[num_cols])
data_norm = pd.DataFrame(data_num_cols, columns=num_cols).join(train_norm[cat_cols])

y = train['target'].values
X = data_norm[data_norm.columns[:-1]].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=62, stratify=y)
print(np.unique(y_train), np.unique(y_test))

[0 1] [0 1]


In [13]:
X_train.shape

(11564, 1256)

In [71]:
# TEST DATASET

test = pd.read_csv('test.csv', low_memory=False, sep=';')

# test_cat = test[cat_cols]
# test_cat = test_cat.fillna('')

new_col = ['client_id', 'col268', 'col365', 'col440', 'col494', 'col620', 'col640',
       'col755', 'col756', 'col947', 'col1080', 'col1099', 'col1100',
       'col1158', 'col1163', 'col1459', 'col1460', 'col1488', 'col1491',
       'col1516', 'col1694', 'col1718', 'col1843', 'col1971', 'col1972',
       'col1975', 'col1978', 'col2012', 'col2125', 'col2128', 'col2170',
       'col2172', 'col2173', 'col2174', 'col2180', 'col2182', 'col2203',
       'col2209', 'col2211', 'col2214', 'col2216', 'col2218', 'col2220',
       'col2221', 'col2222', 'col2229', 'col2235', 'col2236', 'col2238',
       'col2244', 'col2274', 'col2277', 'col2280', 'col2283', 'col2286',
       'col2292', 'col2298', 'col2304', 'col2310', 'col2311', 'col2316',
       'col2317', 'col2328', 'col2341', 'col2364', 'col2365', 'col2389',
       'col2400', 'col2403', 'col2406', 'col2427', 'col2430', 'col2436',
       'col2448', 'col2451', 'col2454', 'col2460', 'col2461', 'col2466',
       'col2468', 'col2470', 'col2484', 'col2486', 'col2558', 'col2588',
       'col2606', 'col2621', 'col2630', 'col2654', 'col2660', 'col2663',
       'col224', 'col243', 'col244', 'col546', 'col550', 'col567', 'col584',
       'col1276', 'col2191']

test = test[new_col]
test_cat_cols = []
test_num_cols = []

for col in test.columns:
    if test[col].dtype == 'object':
        test[col] = test[col].fillna('')
        test_cat_cols.append(col)

    if test[col].dtype != 'object':
        test[col] = test[col].fillna(0)
        test_num_cols.append(col)
#     median = train[col].mean(skipna=True)
#     train[col] = train[col].fillna(median)
# train[cat_cols] = train[cat_cols].fillna('')

# del_cols = ['col1280', 'col1279', 'col1278', 'col1277', 'col1276', 'col1275', 'col1274', 'col1273', 'col1180', 'col1172','col1171', 'col1170', 'col1076', 'col188',
#             'col189', 'col190', 'col191', 'col192', 'col194', 'col195', 'col819', 'col820', 'col823', 'col824']
# def remove_repetitions(first_list, second_list):
#     return [element for element in first_list if element not in second_list]

# cat_cols = remove_repetitions(cat_cols, del_cols)
test_cat = test[test_cat_cols]
test_cat = test_cat.apply(LabelEncoder().fit_transform)

# test_num = test[num_cols]
# test_num = test_num.fillna(0)

# test_norm = test_num.join(test_cat)

stand_sc = StandardScaler()
test_num = test[test_num_cols]
test_num = stand_sc.fit_transform(test_num[test_num_cols])
test_norm = pd.DataFrame(test_num, columns=test_num_cols).join(test_cat[test_cat_cols])

# columns_to_keep = [0,   86,  125,  143,  161,  169,  186,  189,  190,  253,  283,
#         288,  289,  299,  304,  394,  395,  408,  411,  430,  577,  584,
#         622,  703,  704,  706,  707,  723,  744,  745,  759,  761,  762,
#         763,  767,  769,  775,  779,  780,  782,  784,  786,  787,  788,
#         789,  794,  798,  799,  800,  804,  831,  832,  833,  834,  835,
#         837,  841,  843,  846,  847,  848,  849,  854,  857,  860,  861,
#         862,  866,  867,  868,  875,  876,  878,  882,  883,  884,  886,
#         887,  891,  893,  895,  907,  908,  956,  960,  966,  969,  972,
#         978,  980,  981, 1078, 1089, 1090, 1124, 1128, 1145, 1162, 1234,
#        1250]
# new_list = [x+1 for x in columns_to_keep]

# test_norm_new = test.iloc[:, test_norm]

# data_ohe = pd.DataFrame(onehotencoder.fit_transform(train[cat_cols]))
# test_num_cols = stand_sc.fit_transform(test_norm_new[num_cols])
# test_norm = pd.DataFrame(test_num_cols, columns=num_cols).join(test_norm_new[cat_cols])

# y = train['target'].values
test_X = test_norm[test_norm.columns[:-1]].values # 


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=62, stratify=y)
# print(np.unique(y_train), np.unique(y_test))

In [59]:
colname = test.columns[columns_to_keep]
colname

Index(['report_date', 'col85', 'col124', 'col142', 'col160', 'col168',
       'col185', 'col188', 'col189', 'col252', 'col282', 'col287', 'col288',
       'col298', 'col303', 'col393', 'col394', 'col407', 'col410', 'col429',
       'col576', 'col583', 'col621', 'col702', 'col703', 'col705', 'col706',
       'col722', 'col743', 'col744', 'col758', 'col760', 'col761', 'col762',
       'col766', 'col768', 'col774', 'col778', 'col779', 'col781', 'col783',
       'col785', 'col786', 'col787', 'col788', 'col793', 'col797', 'col798',
       'col799', 'col803', 'col830', 'col831', 'col832', 'col833', 'col834',
       'col836', 'col840', 'col842', 'col845', 'col846', 'col847', 'col848',
       'col853', 'col856', 'col859', 'col860', 'col861', 'col865', 'col866',
       'col867', 'col874', 'col875', 'col877', 'col881', 'col882', 'col883',
       'col885', 'col886', 'col890', 'col892', 'col894', 'col906', 'col907',
       'col955', 'col959', 'col965', 'col968', 'col971', 'col977', 'col979',
     

In [57]:
cat_cols

['col49',
 'col50',
 'col51',
 'col52',
 'col85',
 'col86',
 'col87',
 'col88',
 'col89',
 'col90',
 'col91',
 'col92',
 'col93',
 'col94',
 'col95',
 'col96',
 'col97',
 'col98',
 'col99',
 'col100',
 'col129',
 'col130',
 'col131',
 'col132',
 'col133',
 'col134',
 'col135',
 'col136',
 'col137',
 'col138',
 'col139',
 'col140',
 'col144',
 'col145',
 'col146',
 'col147',
 'col148',
 'col149',
 'col150',
 'col151',
 'col152',
 'col153',
 'col154',
 'col155',
 'col156',
 'col157',
 'col158',
 'col159',
 'col160',
 'col161',
 'col162',
 'col163',
 'col164',
 'col165',
 'col166',
 'col167',
 'col168',
 'col169',
 'col170',
 'col171',
 'col172',
 'col173',
 'col174',
 'col175',
 'col176',
 'col181',
 'col182',
 'col183',
 'col184',
 'col185',
 'col186',
 'col187',
 'col193',
 'col196',
 'col201',
 'col202',
 'col203',
 'col204',
 'col205',
 'col206',
 'col207',
 'col208',
 'col210',
 'col214',
 'col215',
 'col216',
 'col221',
 'col222',
 'col223',
 'col224',
 'col225',
 'col226',
 'col22

In [None]:
# XGBOOST EXECUTION

import xgboost
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

DMatTrain = xgboost.DMatrix(X_train, y_train)
DMatTest  = xgboost.DMatrix(X_test,  y_test )

booster = xgboost.train(params={'max_depth' : 10, "objective" : "binary:logistic" }, dtrain=DMatTrain, evals=[ (DMatTrain, "train"), (DMatTest, "test") ])

print("Train RMSE: ", booster.eval(DMatTrain))
print("Test RMSE:  ", booster.eval(DMatTest ))

train_preds = [1 if pred>0.5 else 0 for pred in booster.predict(data=DMatTrain)]
test_preds  = [1 if pred>0.5 else 0 for pred in booster.predict(data=DMatTest )]

print("\nTest  Accuracy : %.2f"%accuracy_score(y_test, test_preds))
print("Train Accuracy : %.2f"%accuracy_score(y_train, train_preds))

print("\nConfusion Matrix : ")
print(confusion_matrix(y_test, test_preds))

print("\nClassification Report : ")
print(classification_report(y_test, test_preds))

print("\nRoc Auc Score: ")
print(roc_auc_score(y_test, test_preds))

In [None]:
import random
all_data: dict = []
for _ in range(100):
    n_estimators = random.randint(100, 600)
    max_depth = random.randint(10, 50)
    learning_rate = random.uniform(0.1, 0.9)

    xgb_classifier = xgboost.XGBClassifier(n_estimators = n_estimators, max_depth = max_depth, learning_rate = learning_rate)

    xgb_classifier.fit(X_train, y_train, eval_set = [ (X_test, y_test) ], eval_metric = "auc", verbose = 3)

    print("Test  Accuracy Score : %.2f"%xgb_classifier.score(X_test, y_test))
    print("Train Accuracy Score : %.2f"%xgb_classifier.score(X_train, y_train))

    print("\nRoc Auc Score: ")
    test_preds = xgb_classifier.predict(X_test)
    roc = roc_auc_score(y_test, test_preds)

    all_data[str([n_estimators, max_depth, learning_rate])] = roc

    print(roc)

print(all_data)

In [68]:
from catboost import Pool, EShapCalcType, EFeaturesSelectionAlgorithm

feature_names = ['F{}'.format(i) for i in range(X_train.shape[1])]
train_pool = Pool(X_train, y_train, feature_names=feature_names)
test_pool = Pool(X_test, y_test, feature_names=feature_names)

model = CatBoostClassifier(iterations=1000, random_seed=0, task_type="GPU")
summary = model.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=feature_names,
    num_features_to_select=100,
    steps=3,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    logging_level='Verbose',
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.057831
Step #1 out of 3
0:	learn: 0.5988332	test: 0.5991156	best: 0.5991156 (0)	total: 92.9ms	remaining: 1m 32s
1:	learn: 0.5237573	test: 0.5240507	best: 0.5240507 (1)	total: 157ms	remaining: 1m 18s
2:	learn: 0.4557849	test: 0.4558314	best: 0.4558314 (2)	total: 219ms	remaining: 1m 12s
3:	learn: 0.4022886	test: 0.4024879	best: 0.4024879 (3)	total: 279ms	remaining: 1m 9s
4:	learn: 0.3575235	test: 0.3580012	best: 0.3580012 (4)	total: 339ms	remaining: 1m 7s
5:	learn: 0.3193628	test: 0.3202174	best: 0.3202174 (5)	total: 403ms	remaining: 1m 6s
6:	learn: 0.2886485	test: 0.2894004	best: 0.2894004 (6)	total: 459ms	remaining: 1m 5s
7:	learn: 0.2623266	test: 0.2631091	best: 0.2631091 (7)	total: 515ms	remaining: 1m 3s
8:	learn: 0.2397979	test: 0.2406232	best: 0.2406232 (8)	total: 572ms	remaining: 1m 2s
9:	learn: 0.2193793	test: 0.2204642	best: 0.2204642 (9)	total: 633ms	remaining: 1m 2s
10:	learn: 0.2045348	test: 0.2057845	best: 0.2057845 (10)	total: 692ms	remaining: 1m 2s
1

In [42]:
feature_importance = model.get_feature_importance(train_pool)
feature_importance
feature_importance.shape[0]

print(len(feature_names))
type(feature_importance)
print(feature_importance.shape[0])
indexes = np.where(feature_importance > 0)

# Выведите индексы элементов больше 0
print(indexes)

result = feature_importance[feature_importance > 0]

print(result)
type(result)

result.shape[0]


1260
1260
(array([   0,  143,  169,  289,  394,  395,  584,  704,  722,  782,  784,
        787,  788,  789,  804,  831,  832,  834,  837,  843,  846,  847,
        848,  849,  857,  859,  860,  861,  862,  865,  866,  867,  868,
        876,  882,  883,  884,  886,  887,  893,  895,  956,  960,  972,
        978,  980,  981, 1090, 1234, 1250]),)
[7.9239179  0.67769852 0.29894818 0.19301966 1.23994984 4.10778629
 0.15585332 0.72378712 0.66893771 2.62045966 2.62412722 3.80195929
 1.35348715 3.49956724 1.63959463 0.28908953 0.27732557 1.99107685
 3.934261   0.75702416 2.1363762  1.74984042 4.01491959 3.64488274
 4.66912147 0.84997454 0.51015802 0.89826065 2.80664992 0.3899543
 1.72083916 0.19056896 2.17806403 2.27758045 0.79876938 1.05944051
 1.7033582  1.54507664 2.43983856 2.13199792 2.69863367 0.97269597
 1.56405942 1.25632154 1.08897977 1.23228151 9.70700642 0.38118357
 0.16119242 4.44410321]


50

In [23]:
type(data_norm)
columns_to_keep = [0,   86,  125,  143,  161,  169,  186,  189,  190,  253,  283,
        288,  289,  299,  304,  394,  395,  408,  411,  430,  577,  584,
        622,  703,  704,  706,  707,  723,  744,  745,  759,  761,  762,
        763,  767,  769,  775,  779,  780,  782,  784,  786,  787,  788,
        789,  794,  798,  799,  800,  804,  831,  832,  833,  834,  835,
        837,  841,  843,  846,  847,  848,  849,  854,  857,  860,  861,
        862,  866,  867,  868,  875,  876,  878,  882,  883,  884,  886,
        887,  891,  893,  895,  907,  908,  956,  960,  966,  969,  972,
        978,  980,  981, 1078, 1089, 1090, 1124, 1128, 1145, 1162, 1234,
       1250]

#[ 0,   14,   46,   95,  103,  120,  125,  127,  131,  143,  152,
#        161,  166,  167,  169,  174,  176,  185,  186,  189,  191,  228,
#        230,  234,  253,  255,  259,  278,  283,  288,  289,  292,  295,
#        296,  299,  300,  304,  307,  337,  375,  393,  394,  395,  408,
#        410,  424,  430,  577,  584,  613,  621,  622,  623,  681,  691,
#        706,  707,  711,  712,  714,  715,  720,  722,  737,  745,  748,
#        755,  757,  759,  761,  762,  763,  767,  775,  779,  782,  784,
#        786,  787,  788,  789,  794,  798,  799,  800,  804,  805,  831,
#        832,  833,  834,  835,  837,  841,  843,  846,  847,  848,  849,
#        854,  857,  860,  861,  862,  866,  867,  868,  870,  876,  878,
#        882,  883,  884,  886,  887,  893,  895,  905,  907,  956,  960,
#        962,  966,  972,  978,  980,  981, 1020, 1077, 1078, 1089, 1090,
#       1115, 1124, 1128, 1130, 1142, 1145, 1150, 1154, 1162, 1163, 1174,
#       1189, 1206, 1227, 1234, 1250, 1256 ] 

#[   0,  143,  169,  289,  394,  395,  584,  704,  722,  782,  784,
#       787,  788,  789,  804,  831,  832,  834,  837,  843,  846,  847,
#       848,  849,  857,  859,  860,  861,  862,  865,  866,  867,  868,
#       876,  882,  883,  884,  886,  887,  893,  895,  956,  960,  972,
#       978,  980,  981, 1090, 1234, 1250]

print(data_norm.shape)
print(len(columns_to_keep))


data_norm_new = data_norm.iloc[:, columns_to_keep]

print(data_norm_new.shape)

(14456, 1257)
100
(14456, 100)


In [24]:
# stand_sc = StandardScaler()

# # data_ohe = pd.DataFrame(onehotencoder.fit_transform(train[cat_cols]))
# data_num_cols = stand_sc.fit_transform(train_norm[num_cols])
# data_norm = pd.DataFrame(data_num_cols, columns=num_cols).join(train_norm[cat_cols])

y = train['target'].values
X = data_norm_new[data_norm_new.columns[:-1]].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=62, stratify=y)
print(np.unique(y_train), np.unique(y_test))

[0 1] [0 1]


In [62]:
old_columns = data_norm_new.columns
old_columns

Index(['client_id', 'col268', 'col365', 'col440', 'col494', 'col620', 'col640',
       'col755', 'col756', 'col947', 'col1080', 'col1099', 'col1100',
       'col1158', 'col1163', 'col1459', 'col1460', 'col1488', 'col1491',
       'col1516', 'col1694', 'col1718', 'col1843', 'col1971', 'col1972',
       'col1975', 'col1978', 'col2012', 'col2125', 'col2128', 'col2170',
       'col2172', 'col2173', 'col2174', 'col2180', 'col2182', 'col2203',
       'col2209', 'col2211', 'col2214', 'col2216', 'col2218', 'col2220',
       'col2221', 'col2222', 'col2229', 'col2235', 'col2236', 'col2238',
       'col2244', 'col2274', 'col2277', 'col2280', 'col2283', 'col2286',
       'col2292', 'col2298', 'col2304', 'col2310', 'col2311', 'col2316',
       'col2317', 'col2328', 'col2341', 'col2364', 'col2365', 'col2389',
       'col2400', 'col2403', 'col2406', 'col2427', 'col2430', 'col2436',
       'col2448', 'col2451', 'col2454', 'col2460', 'col2461', 'col2466',
       'col2468', 'col2470', 'col2484', 'col248

In [36]:
import xgboost

xgb_classifier = xgboost.XGBClassifier(params = {'updater' : 'refresh'}, n_estimators = 300, max_depth = 11, learning_rate = 0.1, booster = "gbtree", tree_method = "hist", objective = "reg:squarederror",eval_metric = "auc")

# objective reg:squarederror exact tree method, booster gbtree 350, 20, 0.1

xgb_classifier.fit(X_train, y_train, eval_set = [ (X_test, y_test) ])

print("Test  Accuracy Score : %.2f"%xgb_classifier.score(X_test, y_test))
print("Train Accuracy Score : %.2f"%xgb_classifier.score(X_train, y_train))

print("\nRoc Auc Score: ")
test_preds = xgb_classifier.predict(X_test)
print(roc_auc_score(y_test, test_preds))

Parameters: { "params" } are not used.

[0]	validation_0-auc:0.77661
[1]	validation_0-auc:0.80737
[2]	validation_0-auc:0.80029
[3]	validation_0-auc:0.83274
[4]	validation_0-auc:0.86738
[5]	validation_0-auc:0.87126
[6]	validation_0-auc:0.87911
[7]	validation_0-auc:0.87521
[8]	validation_0-auc:0.88146
[9]	validation_0-auc:0.90213
[10]	validation_0-auc:0.90336
[11]	validation_0-auc:0.90376
[12]	validation_0-auc:0.91056
[13]	validation_0-auc:0.90877
[14]	validation_0-auc:0.91874
[15]	validation_0-auc:0.91811
[16]	validation_0-auc:0.91422
[17]	validation_0-auc:0.91302
[18]	validation_0-auc:0.91671
[19]	validation_0-auc:0.92016
[20]	validation_0-auc:0.92356
[21]	validation_0-auc:0.92159
[22]	validation_0-auc:0.93495
[23]	validation_0-auc:0.93651
[24]	validation_0-auc:0.93811
[25]	validation_0-auc:0.94073
[26]	validation_0-auc:0.94219
[27]	validation_0-auc:0.94223
[28]	validation_0-auc:0.94125
[29]	validation_0-auc:0.94238
[30]	validation_0-auc:0.94258
[31]	validation_0-auc:0.94172
[32]	valid

In [72]:
pred_test = xgb_classifier.predict(test_X)

In [73]:
pred_test

array([0, 1, 1, ..., 0, 0, 0])

In [98]:
submission = pd.read_csv('./submission.csv')
test_ = pd.read_csv('test.csv', low_memory=False, sep=';')
test_['score'] = pred_test
# print(len(pred_test))
# sample_submission = test_.loc[:, ['client_id', 'score']]
test_ = test_[['id', 'score']]
test_.to_csv('./my_submission.csv', index=False, sep=';')
test_

Unnamed: 0,id,score
0,0,0
1,1,1
2,2,1
3,3,0
4,4,0
...,...,...
3637,3637,0
3638,3638,0
3639,3639,0
3640,3640,0


In [30]:
# XGBOOST CLASSIFIER OBJECTIVE MULTITEST
import xgboost

objectives = ["reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror", "reg:absoluteerror", "binary:logistic", "binary:logitraw", "binary:hinge"] # removed reg:tweedie
results = dict.fromkeys(objectives, 0)

for objective in objectives:
    xgb_classifier = xgboost.XGBClassifier(
        n_estimators = 300, 
        max_depth = 11, 
        learning_rate = 0.1, 
        booster = "gbtree", 
        tree_method = "hist", 
        objective = objective, 
        eval_metric = "auc"
    )

    # 350 30 0.1 gbtree hist squaredlogerror 0.800 50crit
    # 350 30 0.1 gbtree approx squaredlogerror 0.806 50crit
    # 350 30 0.1 gbtree exact squaredlogerror 0.794 50crit

    xgb_classifier.fit(X_train, y_train, eval_set = [ (X_test, y_test) ], verbose = 0)

    print(f"Objective - {objective} complete!")
    test_preds = xgb_classifier.predict(X_test)
    results[objective] = roc_auc_score(y_test, test_preds)

results

Objective - reg:squarederror complete!
Objective - reg:squaredlogerror complete!
Objective - reg:logistic complete!
Objective - reg:pseudohubererror complete!
Objective - reg:absoluteerror complete!
Objective - binary:logistic complete!
Objective - binary:logitraw complete!
Objective - binary:hinge complete!


{'reg:squarederror': 0.8233081320302416,
 'reg:squaredlogerror': 0.7559468928637285,
 'reg:logistic': 0.7737230315323622,
 'reg:pseudohubererror': 0.8118138791566785,
 'reg:absoluteerror': 0.7764828815538755,
 'binary:logistic': 0.7737230315323622,
 'binary:logitraw': 0.7453439055873132,
 'binary:hinge': 0.7904296514844182}

In [35]:
from sklearn.ensemble import HistGradientBoostingClassifier

HGBClassifier = HistGradientBoostingClassifier(loss = 'log_loss', learning_rate=0.1, max_iter=300, n_iter_no_change=20, max_depth=10, scoring='roc_auc', min_samples_leaf=1, verbose=3, random_state=69)

HGBClassifier.fit(X_train, y_train)

test_preds = HGBClassifier.predict(X_test)
print(roc_auc_score(y_test, test_preds))

Binning 0.008 GB of training data: 0.052 s
Binning 0.001 GB of validation data: 0.001 s
Fitting gradient boosted rounds:
[1/300] 1 tree, 31 leaves, max depth = 9, train score: 0.70559, val score: 0.61112, in 0.023s
[2/300] 1 tree, 31 leaves, max depth = 10, train score: 0.83140, val score: 0.73006, in 0.018s
[3/300] 1 tree, 31 leaves, max depth = 10, train score: 0.86154, val score: 0.74660, in 0.018s
[4/300] 1 tree, 31 leaves, max depth = 9, train score: 0.89891, val score: 0.82227, in 0.024s
[5/300] 1 tree, 31 leaves, max depth = 10, train score: 0.92060, val score: 0.83038, in 0.016s
[6/300] 1 tree, 31 leaves, max depth = 7, train score: 0.93916, val score: 0.84791, in 0.018s
[7/300] 1 tree, 31 leaves, max depth = 8, train score: 0.94871, val score: 0.87624, in 0.017s
[8/300] 1 tree, 31 leaves, max depth = 10, train score: 0.95756, val score: 0.88659, in 0.018s
[9/300] 1 tree, 31 leaves, max depth = 10, train score: 0.96488, val score: 0.89001, in 0.018s
[10/300] 1 tree, 31 leaves, 

In [34]:
from sklearn.ensemble import GradientBoostingClassifier

GBClassifier = GradientBoostingClassifier(loss = 'log_loss', learning_rate=0.1, n_estimators=300, n_iter_no_change=10, max_depth=10, min_samples_leaf=1, verbose=1, random_state=69)

GBClassifier.fit(X_train, y_train)

test_preds = GBClassifier.predict(X_test)
print(roc_auc_score(y_test, test_preds))

      Iter       Train Loss   Remaining Time 
         1           0.2040            1.41m
         2           0.1626            1.41m
         3           0.1317            1.40m
         4           0.1173            1.40m
         5           0.1078            1.39m
         6           0.0973            1.40m
         7           0.0914            1.39m
         8           0.0838            1.39m
         9           0.0786            1.38m
        10           0.0743            1.38m
        20           0.0387            1.34m
        30           0.0214            1.30m
        40           0.0130            1.25m
0.778044133013707


In [70]:
from sklearn.ensemble import RandomForestClassifier
import time
start_time = time.time()

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=2500, max_depth=140, verbose=2, min_samples_leaf=1,n_jobs=-1,bootstrap=False,criterion="log_loss")

# Fit the Random Forest classifier to the training data
rf_classifier.fit(X_train, y_train)


# Evaluate the model
print("Test  Accuracy Score : %.2f" % rf_classifier.score(X_test, y_test))
print("Train Accuracy Score : %.2f" % rf_classifier.score(X_train, y_train))

print("\nRoc Auc Score: ")
test_preds = rf_classifier.predict(X_test)
print(roc_auc_score(y_test, test_preds))
end_time = time.time()
execution_time = end_time - start_time
print(f"RandomForestClassifier took {execution_time:.2f} seconds")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 2500
building tree 2 of 2500
building tree 3 of 2500
building tree 4 of 2500
building tree 5 of 2500
building tree 6 of 2500
building tree 7 of 2500
building tree 8 of 2500
building tree 9 of 2500
building tree 10 of 2500
building tree 11 of 2500
building tree 12 of 2500
building tree 13 of 2500
building tree 14 of 2500
building tree 15 of 2500
building tree 16 of 2500
building tree 17 of 2500
building tree 18 of 2500
building tree 19 of 2500
building tree 20 of 2500
building tree 21 of 2500
building tree 22 of 2500
building tree 23 of 2500
building tree 24 of 2500
building tree 25 of 2500
building tree 26 of 2500
building tree 27 of 2500
building tree 28 of 2500
building tree 29 of 2500
building tree 30 of 2500
building tree 31 of 2500
building tree 32 of 2500
building tree 33 of 2500
building tree 34 of 2500
building tree 35 of 2500
building tree 36 of 2500
building tree 37 of 2500
building tree 38 of 2500
building tree 39 of 2500
building tree 40 of 2500
building 

[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    1.3s


building tree 45 of 2500
building tree 46 of 2500
building tree 47 of 2500
building tree 48 of 2500
building tree 49 of 2500
building tree 50 of 2500
building tree 51 of 2500
building tree 52 of 2500
building tree 53 of 2500
building tree 54 of 2500
building tree 55 of 2500
building tree 56 of 2500
building tree 57 of 2500
building tree 58 of 2500
building tree 59 of 2500
building tree 60 of 2500
building tree 61 of 2500
building tree 62 of 2500
building tree 63 of 2500
building tree 64 of 2500
building tree 65 of 2500
building tree 66 of 2500
building tree 67 of 2500
building tree 68 of 2500
building tree 69 of 2500
building tree 70 of 2500
building tree 71 of 2500
building tree 72 of 2500
building tree 73 of 2500
building tree 74 of 2500
building tree 75 of 2500
building tree 76 of 2500
building tree 77 of 2500
building tree 78 of 2500
building tree 79 of 2500
building tree 80 of 2500
building tree 81 of 2500
building tree 82 of 2500
building tree 83 of 2500
building tree 84 of 2500


[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:    4.3s


building tree 163 of 2500
building tree 164 of 2500
building tree 165 of 2500
building tree 166 of 2500
building tree 167 of 2500
building tree 168 of 2500building tree 169 of 2500

building tree 170 of 2500
building tree 171 of 2500
building tree 172 of 2500
building tree 173 of 2500
building tree 174 of 2500
building tree 175 of 2500
building tree 176 of 2500
building tree 177 of 2500
building tree 178 of 2500
building tree 179 of 2500
building tree 180 of 2500
building tree 181 of 2500
building tree 182 of 2500
building tree 183 of 2500
building tree 184 of 2500
building tree 185 of 2500
building tree 186 of 2500
building tree 187 of 2500
building tree 188 of 2500
building tree 189 of 2500
building tree 190 of 2500
building tree 191 of 2500
building tree 192 of 2500
building tree 193 of 2500
building tree 194 of 2500
building tree 195 of 2500
building tree 196 of 2500
building tree 197 of 2500
building tree 198 of 2500
building tree 199 of 2500
building tree 200 of 2500
building tre

[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:    9.4s


building tree 367 of 2500
building tree 368 of 2500
building tree 369 of 2500
building tree 370 of 2500
building tree 371 of 2500
building tree 372 of 2500
building tree 373 of 2500
building tree 374 of 2500
building tree 375 of 2500
building tree 376 of 2500
building tree 377 of 2500
building tree 378 of 2500
building tree 379 of 2500
building tree 380 of 2500
building tree 381 of 2500
building tree 382 of 2500
building tree 383 of 2500
building tree 384 of 2500
building tree 385 of 2500
building tree 386 of 2500
building tree 387 of 2500
building tree 388 of 2500
building tree 389 of 2500
building tree 390 of 2500
building tree 391 of 2500
building tree 392 of 2500
building tree 393 of 2500
building tree 394 of 2500
building tree 395 of 2500
building tree 396 of 2500
building tree 397 of 2500
building tree 398 of 2500
building tree 399 of 2500
building tree 400 of 2500
building tree 401 of 2500
building tree 402 of 2500
building tree 403 of 2500
building tree 404 of 2500
building tre

[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:   17.7s


building tree 647 of 2500
building tree 648 of 2500
building tree 649 of 2500
building tree 650 of 2500
building tree 651 of 2500
building tree 652 of 2500
building tree 653 of 2500
building tree 654 of 2500
building tree 655 of 2500
building tree 656 of 2500
building tree 657 of 2500
building tree 658 of 2500
building tree 659 of 2500
building tree 660 of 2500
building tree 661 of 2500
building tree 662 of 2500
building tree 663 of 2500
building tree 664 of 2500
building tree 665 of 2500
building tree 666 of 2500
building tree 667 of 2500
building tree 668 of 2500
building tree 669 of 2500
building tree 670 of 2500
building tree 671 of 2500
building tree 672 of 2500
building tree 673 of 2500
building tree 674 of 2500
building tree 675 of 2500
building tree 676 of 2500
building tree 677 of 2500
building tree 678 of 2500
building tree 679 of 2500
building tree 680 of 2500
building tree 681 of 2500
building tree 682 of 2500
building tree 683 of 2500
building tree 684 of 2500
building tre

[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed:   28.1s


building tree 1013 of 2500
building tree 1014 of 2500
building tree 1015 of 2500
building tree 1016 of 2500
building tree 1017 of 2500
building tree 1018 of 2500
building tree 1019 of 2500
building tree 1020 of 2500
building tree 1021 of 2500
building tree 1022 of 2500
building tree 1023 of 2500
building tree 1024 of 2500
building tree 1025 of 2500
building tree 1026 of 2500
building tree 1027 of 2500
building tree 1028 of 2500
building tree 1029 of 2500
building tree 1030 of 2500
building tree 1031 of 2500
building tree 1032 of 2500
building tree 1033 of 2500
building tree 1034 of 2500
building tree 1035 of 2500
building tree 1036 of 2500
building tree 1037 of 2500
building tree 1038 of 2500
building tree 1039 of 2500
building tree 1040 of 2500
building tree 1041 of 2500
building tree 1042 of 2500
building tree 1043 of 2500
building tree 1044 of 2500
building tree 1045 of 2500
building tree 1046 of 2500
building tree 1047 of 2500
building tree 1048 of 2500
building tree 1049 of 2500
b

[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed:   40.5s


building tree 1456 of 2500
building tree 1457 of 2500
building tree 1458 of 2500
building tree 1459 of 2500
building tree 1460 of 2500
building tree 1461 of 2500
building tree 1462 of 2500
building tree 1463 of 2500
building tree 1464 of 2500
building tree 1465 of 2500
building tree 1466 of 2500
building tree 1467 of 2500
building tree 1468 of 2500
building tree 1469 of 2500
building tree 1470 of 2500
building tree 1471 of 2500
building tree 1472 of 2500
building tree 1473 of 2500
building tree 1474 of 2500
building tree 1475 of 2500
building tree 1476 of 2500
building tree 1477 of 2500
building tree 1478 of 2500
building tree 1479 of 2500
building tree 1480 of 2500
building tree 1481 of 2500
building tree 1482 of 2500
building tree 1483 of 2500
building tree 1484 of 2500
building tree 1485 of 2500
building tree 1486 of 2500
building tree 1487 of 2500
building tree 1488 of 2500
building tree 1489 of 2500
building tree 1490 of 2500
building tree 1491 of 2500
building tree 1492 of 2500
b

[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed:   54.9s


building tree 1988 of 2500
building tree 1989 of 2500
building tree 1990 of 2500
building tree 1991 of 2500
building tree 1992 of 2500
building tree 1993 of 2500
building tree 1994 of 2500
building tree 1995 of 2500
building tree 1996 of 2500
building tree 1997 of 2500
building tree 1998 of 2500
building tree 1999 of 2500
building tree 2000 of 2500
building tree 2001 of 2500
building tree 2002 of 2500
building tree 2003 of 2500
building tree 2004 of 2500
building tree 2005 of 2500
building tree 2006 of 2500
building tree 2007 of 2500
building tree 2008 of 2500
building tree 2009 of 2500
building tree 2010 of 2500
building tree 2011 of 2500
building tree 2012 of 2500
building tree 2013 of 2500
building tree 2014 of 2500
building tree 2015 of 2500
building tree 2016 of 2500
building tree 2017 of 2500
building tree 2018 of 2500
building tree 2019 of 2500
building tree 2020 of 2500
building tree 2021 of 2500
building tree 2022 of 2500
building tree 2023 of 2500
building tree 2024 of 2500
b

[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed:  1.2min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 1009 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 1454 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 1981 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done 2500 out of 2500 | elapsed:    1.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:    0.2s


Test  Accuracy Score : 0.98


[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 1009 tasks      | elapsed:    0.9s
[Parallel(n_jobs=2)]: Done 1454 tasks      | elapsed:    1.3s
[Parallel(n_jobs=2)]: Done 1981 tasks      | elapsed:    1.8s
[Parallel(n_jobs=2)]: Done 2500 out of 2500 | elapsed:    2.3s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:    0.2s


Train Accuracy Score : 1.00

Roc Auc Score: 


[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 1009 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 1454 tasks      | elapsed:    0.6s
[Parallel(n_jobs=2)]: Done 1981 tasks      | elapsed:    0.8s


0.8128833978732558
RandomForestClassifier took 74.88 seconds


[Parallel(n_jobs=2)]: Done 2500 out of 2500 | elapsed:    1.1s finished
