In [98]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
from matplotlib.pyplot import figure
import seaborn as sns

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.preprocessing import RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
import scikitplot as skplt
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import Accuracy, Precision, Recall, AUC, TrueNegatives, TruePositives, FalseNegatives, FalsePositives
from tensorflow.keras.optimizers import Adam, Adamax, Nadam
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [99]:
data = pd.read_csv('data_prepared_to_AI_101')
test = pd.read_csv('test_dataset_hackathon_mkb.csv', sep=';', encoding='cp1251')

train_data = data.query('sample == 1').drop(['sample'], axis=1)
test_data = data.query('sample == 0').drop(['sample'], axis=1)
test_data = test_data.drop(['TARGET'], axis=1)

y = train_data['TARGET']            # наш таргет
X = train_data.drop(['TARGET'], axis=1)

In [100]:
X_train, X_test, y_train, y_test = X.iloc[:14000,], X.iloc[14000:,], y[:14000,], y[14000:,]

In [101]:
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)
data_train = scaler.transform(X_train)
data_test = scaler.transform(X_test)

In [102]:
tf.keras.backend.clear_session()
encoding_dim = 30

#Encoder
input_ = Input(shape=(2167, ))
x = Dense(encoding_dim*3, activation='elu')(input_)
x = Dense(encoding_dim*2, activation='elu')(x)
encoded = Dense(encoding_dim, activation="sigmoid")(x)

#Decoder
input_encoded = Input(shape=(encoding_dim,))
x = Dense(encoding_dim*2, activation='elu')(input_encoded)
x = Dense(encoding_dim*3, activation='elu')(x)
flat_decoded = Dense(2167, activation='sigmoid')(x)
decoded = Reshape((2167, 1))(flat_decoded)

encoder = Model(input_, encoded, name="encoder")
decoder = Model(input_encoded, decoded, name="decoder")
autoencoder = Model(input_, decoder(encoder(input_)), name="autoencoder")

autoencoder.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

autoencoder.summary()

Model: "autoencoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2167)]            0         
                                                                 
 encoder (Functional)        (None, 30)                202410    
                                                                 
 decoder (Functional)        (None, 2167, 1)           204547    
                                                                 
Total params: 406,957
Trainable params: 406,957
Non-trainable params: 0
_________________________________________________________________


In [103]:
checkpoint = ModelCheckpoint('best_model_after_autoencoder.hdf5' , 
                             monitor = 'loss', 
                             verbose = 1, 
                             save_best_only=True,
                             mode = 'min',
                             save_weights_only=True,
                             save_freq='epoch'
                            )
callbacks_list = [checkpoint]

autoencoder.fit(data_train, data_train,
                epochs=100,
                shuffle=True,
                callbacks=callbacks_list,
                )

Epoch 1/100
Epoch 00001: loss improved from inf to 0.03486, saving model to best_model_after_autoencoder.hdf5
Epoch 2/100
Epoch 00002: loss improved from 0.03486 to 0.02184, saving model to best_model_after_autoencoder.hdf5
Epoch 3/100
Epoch 00003: loss improved from 0.02184 to 0.01661, saving model to best_model_after_autoencoder.hdf5
Epoch 4/100
Epoch 00004: loss improved from 0.01661 to 0.01554, saving model to best_model_after_autoencoder.hdf5
Epoch 5/100
Epoch 00005: loss improved from 0.01554 to 0.01486, saving model to best_model_after_autoencoder.hdf5
Epoch 6/100
Epoch 00006: loss improved from 0.01486 to 0.01420, saving model to best_model_after_autoencoder.hdf5
Epoch 7/100
Epoch 00007: loss improved from 0.01420 to 0.01344, saving model to best_model_after_autoencoder.hdf5
Epoch 8/100
Epoch 00008: loss improved from 0.01344 to 0.01248, saving model to best_model_after_autoencoder.hdf5
Epoch 9/100
Epoch 00009: loss improved from 0.01248 to 0.01179, saving model to best_model_a

Epoch 30/100
Epoch 00030: loss improved from 0.00775 to 0.00767, saving model to best_model_after_autoencoder.hdf5
Epoch 31/100
Epoch 00031: loss improved from 0.00767 to 0.00759, saving model to best_model_after_autoencoder.hdf5
Epoch 32/100
Epoch 00032: loss improved from 0.00759 to 0.00751, saving model to best_model_after_autoencoder.hdf5
Epoch 33/100
Epoch 00033: loss improved from 0.00751 to 0.00742, saving model to best_model_after_autoencoder.hdf5
Epoch 34/100
Epoch 00034: loss improved from 0.00742 to 0.00737, saving model to best_model_after_autoencoder.hdf5
Epoch 35/100
Epoch 00035: loss improved from 0.00737 to 0.00731, saving model to best_model_after_autoencoder.hdf5
Epoch 36/100
Epoch 00036: loss improved from 0.00731 to 0.00726, saving model to best_model_after_autoencoder.hdf5
Epoch 37/100
Epoch 00037: loss improved from 0.00726 to 0.00719, saving model to best_model_after_autoencoder.hdf5
Epoch 38/100
Epoch 00038: loss improved from 0.00719 to 0.00714, saving model to

Epoch 00058: loss improved from 0.00633 to 0.00629, saving model to best_model_after_autoencoder.hdf5
Epoch 59/100
Epoch 00059: loss improved from 0.00629 to 0.00627, saving model to best_model_after_autoencoder.hdf5
Epoch 60/100
Epoch 00060: loss improved from 0.00627 to 0.00624, saving model to best_model_after_autoencoder.hdf5
Epoch 61/100
Epoch 00061: loss improved from 0.00624 to 0.00620, saving model to best_model_after_autoencoder.hdf5
Epoch 62/100
Epoch 00062: loss improved from 0.00620 to 0.00619, saving model to best_model_after_autoencoder.hdf5
Epoch 63/100
Epoch 00063: loss improved from 0.00619 to 0.00614, saving model to best_model_after_autoencoder.hdf5
Epoch 64/100
Epoch 00064: loss improved from 0.00614 to 0.00613, saving model to best_model_after_autoencoder.hdf5
Epoch 65/100
Epoch 00065: loss improved from 0.00613 to 0.00611, saving model to best_model_after_autoencoder.hdf5
Epoch 66/100
Epoch 00066: loss improved from 0.00611 to 0.00608, saving model to best_model_a

Epoch 87/100
Epoch 00087: loss improved from 0.00562 to 0.00559, saving model to best_model_after_autoencoder.hdf5
Epoch 88/100
Epoch 00088: loss improved from 0.00559 to 0.00556, saving model to best_model_after_autoencoder.hdf5
Epoch 89/100
Epoch 00089: loss improved from 0.00556 to 0.00553, saving model to best_model_after_autoencoder.hdf5
Epoch 90/100
Epoch 00090: loss did not improve from 0.00553
Epoch 91/100
Epoch 00091: loss improved from 0.00553 to 0.00551, saving model to best_model_after_autoencoder.hdf5
Epoch 92/100
Epoch 00092: loss did not improve from 0.00551
Epoch 93/100
Epoch 00093: loss improved from 0.00551 to 0.00545, saving model to best_model_after_autoencoder.hdf5
Epoch 94/100
Epoch 00094: loss did not improve from 0.00545
Epoch 95/100
Epoch 00095: loss improved from 0.00545 to 0.00545, saving model to best_model_after_autoencoder.hdf5
Epoch 96/100
Epoch 00096: loss improved from 0.00545 to 0.00541, saving model to best_model_after_autoencoder.hdf5
Epoch 97/100
Ep

<keras.callbacks.History at 0x7f8211ba6850>

In [104]:
autoencoder.load_weights('best_model_after_autoencoder.hdf5')
predict_autoencoder_X = autoencoder.predict(data_train)
predict_autoencoder_test_data = autoencoder.predict(data_test)

In [105]:

data_predict_autoencoder_X = pd.DataFrame(predict_autoencoder_X.reshape(predict_autoencoder_X.shape[0], predict_autoencoder_X.shape[1]))
data_predict_autoencoder_test_data = pd.DataFrame(predict_autoencoder_test_data.reshape(predict_autoencoder_test_data.shape[0], predict_autoencoder_test_data.shape[1]))

In [106]:
# X_train, X_test, y_train, y_test = data_predict_autoencoder_X.iloc[:14000,], data_predict_autoencoder_X.iloc[14000:,], y[:14000,], y[14000:,]

In [107]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)
scale_pos_weight = sum(y_train == 0) / (1.0 * sum(y_train == 1))

RF_T = CatBoostClassifier(
      verbose=0
    #loss_function='Logloss'
)
RF_T.fit(data_predict_autoencoder_X, y_train)

test_pred = RF_T.predict_proba(data_predict_autoencoder_test_data)[:,1]
test_pred_bin = RF_T.predict(data_predict_autoencoder_test_data)

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision =  precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

print('val, train  AUC     :', "%0.2f" % roc_auc_score(y_test, test_pred), "%0.2f" % roc_auc_score(y_train, RF_T.predict_proba(data_predict_autoencoder_X)[:,1]))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, RF_T.predict(data_predict_autoencoder_X)))

print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, RF_T.predict(data_predict_autoencoder_test_data)))

FP, TP              : 194.31 700.85
ROC_AUC_SCORE       : 0.91
accuracy            : 0.81
precision           : 0.91
recall              : 0.51
f1                  : 0.66
val, train  AUC     : 0.91 0.99
val, train  AUC_2   : 0.74 0.96
[[2430   67]
 [ 681  713]]
              precision    recall  f1-score   support

           0       0.78      0.97      0.87      2497
           1       0.91      0.51      0.66      1394

    accuracy                           0.81      3891
   macro avg       0.85      0.74      0.76      3891
weighted avg       0.83      0.81      0.79      3891



In [108]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)
scale_pos_weight = sum(y_train == 0) / (1.0 * sum(y_train == 1))

RF_T = RandomForestClassifier( n_jobs=-1, random_state=0)
RF_T.fit(data_predict_autoencoder_X, y_train)

test_pred = RF_T.predict_proba(data_predict_autoencoder_test_data)[:,1]
test_pred_bin = RF_T.predict(data_predict_autoencoder_test_data)

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision =  precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

print('val, train  AUC     :', "%0.2f" % roc_auc_score(y_test, test_pred), "%0.2f" % roc_auc_score(y_train, RF_T.predict_proba(data_predict_autoencoder_X)[:,1]))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, RF_T.predict(data_predict_autoencoder_X)))

print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, RF_T.predict(data_predict_autoencoder_test_data)))

FP, TP              : 34.04 111.97
ROC_AUC_SCORE       : 0.90
accuracy            : 0.77
precision           : 0.91
recall              : 0.41
f1                  : 0.56
val, train  AUC     : 0.90 1.00
val, train  AUC_2   : 0.69 1.00
[[2441   56]
 [ 827  567]]
              precision    recall  f1-score   support

           0       0.75      0.98      0.85      2497
           1       0.91      0.41      0.56      1394

    accuracy                           0.77      3891
   macro avg       0.83      0.69      0.70      3891
weighted avg       0.81      0.77      0.74      3891



In [109]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)
scale_pos_weight = sum(y_train == 0) / (1.0 * sum(y_train == 1))

RF_T = LogisticRegression(random_state=0, n_jobs=-1)
RF_T.fit(data_predict_autoencoder_X, y_train)

test_pred = RF_T.predict_proba(data_predict_autoencoder_test_data)[:,1]
test_pred_bin = RF_T.predict(data_predict_autoencoder_test_data)

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision =  precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

print('val, train  AUC     :', "%0.2f" % roc_auc_score(y_test, test_pred), "%0.2f" % roc_auc_score(y_train, RF_T.predict_proba(data_predict_autoencoder_X)[:,1]))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, RF_T.predict(data_predict_autoencoder_X)))

print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, RF_T.predict(data_predict_autoencoder_test_data)))

FP, TP              : 194.84 668.72
ROC_AUC_SCORE       : 0.90
accuracy            : 0.80
precision           : 0.85
recall              : 0.54
f1                  : 0.66
val, train  AUC     : 0.90 0.94
val, train  AUC_2   : 0.74 0.87
[[2362  135]
 [ 643  751]]
              precision    recall  f1-score   support

           0       0.79      0.95      0.86      2497
           1       0.85      0.54      0.66      1394

    accuracy                           0.80      3891
   macro avg       0.82      0.74      0.76      3891
weighted avg       0.81      0.80      0.79      3891



In [110]:
RF_T = lgb.LGBMClassifier(num_leaves=20, learning_rate=0.04, n_estimators=int(1000*1),
                                          colsample_bytree=0.5, subsample=0.5,
                                          n_jobs=-1, random_state=0)
RF_T.fit(data_predict_autoencoder_X, y_train)

test_pred = RF_T.predict_proba(data_predict_autoencoder_test_data)[:,1]
test_pred_bin = RF_T.predict(data_predict_autoencoder_test_data)

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision =  precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

print('val, train  AUC     :', "%0.2f" % roc_auc_score(y_test, test_pred), "%0.2f" % roc_auc_score(y_train, RF_T.predict_proba(data_predict_autoencoder_X)[:,1]))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, RF_T.predict(data_predict_autoencoder_X)))

print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, RF_T.predict(data_predict_autoencoder_test_data)))

FP, TP              : 222.45 736.99
ROC_AUC_SCORE       : 0.90
accuracy            : 0.81
precision           : 0.91
recall              : 0.52
f1                  : 0.66
val, train  AUC     : 0.90 1.00
val, train  AUC_2   : 0.74 1.00
[[2428   69]
 [ 674  720]]
              precision    recall  f1-score   support

           0       0.78      0.97      0.87      2497
           1       0.91      0.52      0.66      1394

    accuracy                           0.81      3891
   macro avg       0.85      0.74      0.76      3891
weighted avg       0.83      0.81      0.79      3891



In [94]:
data_test_fin = scaler.transform(test_data)

In [95]:
predict_autoencoder_fin = autoencoder.predict(data_test_fin)

In [96]:
data_predict_autoencoder_fin = pd.DataFrame(predict_autoencoder_fin.reshape(predict_autoencoder_fin.shape[0], predict_autoencoder_fin.shape[1]))

In [97]:
test['TARGET'] = RF_T.predict_proba(data_predict_autoencoder_fin)[:,1]
test[['id_contract', 'TARGET']].to_csv('fin_1.csv', sep=';', index=False)
test[['id_contract', 'TARGET']].head()

Unnamed: 0,id_contract,TARGET
0,17892,0.10775
1,17893,0.006122
2,17894,0.035927
3,17895,0.207422
4,17896,0.054934
