In [52]:
from tsfresh.examples.har_dataset import download_har_dataset, load_har_dataset, load_har_classes
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import settings

import pandas as pd
pd.set_option('display.max_rows', 500)
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
from matplotlib.pyplot import figure
import seaborn as sns

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, roc_curve, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.preprocessing import RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression

from catboost import CatBoostClassifier
from category_encoders import TargetEncoder

import lightgbm as lgb
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest

from tqdm import tqdm

from category_encoders import TargetEncoder

In [53]:
# Load the prepared dataset and the separate hackathon test file.
data = pd.read_csv('30.csv')
test = pd.read_csv('test_dataset_hackathon_mkb.csv', encoding='cp1251', sep=';')

# Rows with sample == 1 go to train_data, rows with sample == 0 to test_data.
# The helper column `sample` is removed from both; test_data also loses TARGET.
train_data = data[data['sample'] == 1].drop(columns=['sample'])
test_data = data[data['sample'] == 0].drop(columns=['sample', 'TARGET'])

y = train_data['TARGET'].values            # target labels
X = train_data.drop(columns=['TARGET'])

# Ordered (unshuffled) hold-out split: first 14000 rows train, the rest validate.
X_train, y_train = X.iloc[:14000], y[:14000]
X_test, y_test = X.iloc[14000:], y[14000:]

In [54]:
# Alternative hold-out split, kept for reference (disabled):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)

# Despite the name `RF`, this is a LightGBM gradient-boosting classifier.
# NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect
# when `subsample_freq` > 0; it is not set here, so subsample=0.5 is presumably
# a no-op. Left unchanged so the reported numbers stay reproducible.
RF = lgb.LGBMClassifier(num_leaves=200, learning_rate=0.04, n_estimators=1000,
                        colsample_bytree=0.5, subsample=0.5,
                        n_jobs=-1, random_state=0)

# The model is fitted on a bare ndarray (no column names), so it is scored on
# ndarrays as well: fit and predict then see the same input type.
X_train_arr = X_train.values
X_test_arr = X_test.values
RF.fit(X_train_arr, y_train)

# Each prediction is computed once and reused by every metric below.
test_pred = RF.predict_proba(X_test_arr)[:, 1]      # second predict_proba column, hold-out
test_pred_bin = RF.predict(X_test_arr)              # hard labels, hold-out
train_pred = RF.predict_proba(X_train_arr)[:, 1]    # second predict_proba column, train
train_pred_bin = RF.predict(X_train_arr)            # hard labels, train

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision = precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

# NOTE(review): the 'FP, TP' line prints the sums of the ROC-curve FPR/TPR
# arrays, not false/true-positive counts (those are in the confusion matrix
# printed below). The printed value is kept as is so recorded outputs still match.
print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

# 'AUC' scores the predicted probabilities; 'AUC_2' scores the hard labels.
print('val, train  AUC     :', "%0.2f" % auc, "%0.2f" % roc_auc_score(y_train, train_pred))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, train_pred_bin))
print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, test_pred_bin))

FP, TP              : 361.33 867.14
ROC_AUC_SCORE       : 0.93
accuracy            : 0.86
precision           : 0.91
recall              : 0.69
f1                  : 0.78
val, train  AUC     : 0.93 1.00
val, train  AUC_2   : 0.83 0.99
[[2407   90]
 [ 437  957]]
              precision    recall  f1-score   support

           0       0.85      0.96      0.90      2497
           1       0.91      0.69      0.78      1394

    accuracy                           0.86      3891
   macro avg       0.88      0.83      0.84      3891
weighted avg       0.87      0.86      0.86      3891



In [55]:
# Alternative hold-out split, kept for reference (disabled):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)

# Ratio of class-0 to class-1 labels in the training part.
# NOTE(review): this value is computed but never passed to the classifier
# below - confirm whether it was meant to be used as a class weight.
scale_pos_weight = (y_train == 0).sum() / (1.0 * (y_train == 1).sum())

# CatBoost with library defaults; loss_function is not set explicitly
# (the 'Logloss' option was commented out in the original cell).
RF_T = CatBoostClassifier(verbose=0)
RF_T.fit(X_train, y_train)

# Each prediction is computed once and reused by every metric below.
test_pred = RF_T.predict_proba(X_test)[:, 1]      # second predict_proba column, hold-out
test_pred_bin = RF_T.predict(X_test)              # hard labels, hold-out
train_pred = RF_T.predict_proba(X_train)[:, 1]    # second predict_proba column, train
train_pred_bin = RF_T.predict(X_train)            # hard labels, train

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision = precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

# NOTE(review): the 'FP, TP' line prints the sums of the ROC-curve FPR/TPR
# arrays, not false/true-positive counts (those are in the confusion matrix
# printed below). The printed value is kept as is so recorded outputs still match.
print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

# 'AUC' scores the predicted probabilities; 'AUC_2' scores the hard labels.
print('val, train  AUC     :', "%0.2f" % auc, "%0.2f" % roc_auc_score(y_train, train_pred))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, train_pred_bin))

print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, test_pred_bin))

FP, TP              : 348.87 858.96
ROC_AUC_SCORE       : 0.94
accuracy            : 0.86
precision           : 0.88
recall              : 0.71
f1                  : 0.79
val, train  AUC     : 0.94 0.98
val, train  AUC_2   : 0.83 0.93
[[2368  129]
 [ 408  986]]
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      2497
           1       0.88      0.71      0.79      1394

    accuracy                           0.86      3891
   macro avg       0.87      0.83      0.84      3891
weighted avg       0.86      0.86      0.86      3891



In [56]:
# tsfresh feature-calculator presets, one object per preset class.
settings_comprehensive = settings.ComprehensiveFCParameters()
settings_efficient = settings.EfficientFCParameters()
settings_minimal = settings.MinimalFCParameters()

# The time-based preset is extended in place with the minimal preset's entries.
settings_time = settings.TimeBasedFCParameters()
settings_time.update(settings_minimal)

In [57]:
# Long-format frame for tsfresh: column 0 holds every cell of X in row-major
# order, column 1 holds the positional number of the X row the cell came from
# (so each X row becomes one "series" of X.shape[1] values).
n_rows, n_cols = X.shape
data_long = pd.DataFrame({0: X.values.ravel(),
                          1: np.repeat(np.arange(n_rows), n_cols)})
print(data_long.shape)
data_long.head()

(536730, 2)


Unnamed: 0,0,1
0,0.0,0
1,7.449408e+17,0
2,0.0,0
3,1.151712e+18,0
4,0.0,0


In [58]:
# Run tsfresh on the long frame with the "efficient" preset. Column 1 is the
# series id, and `impute` is handed to tsfresh as the imputation function.
# This is the slow step of the notebook (see the progress bar in the output).
X_2 = extract_features(
    data_long,
    column_id=1,
    default_fc_parameters=settings_efficient,
    impute_function=impute,
)
print(X_2.shape)

Feature Extraction: 100%|██████████| 40/40 [01:41<00:00,  2.53s/it]


(17891, 781)


In [59]:
# X.to_csv('X.csv', sep=';', index=False)

In [60]:
# Ordered hold-out split of the tsfresh features: first 14000 rows train, the rest validate.
X_train, y_train = X_2.iloc[:14000], y[:14000]
X_test, y_test = X_2.iloc[14000:], y[14000:]

In [61]:
# Alternative hold-out split, kept for reference (disabled):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)

# Despite the name `RF`, this is a LightGBM gradient-boosting classifier.
# NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect
# when `subsample_freq` > 0; it is not set here, so subsample=0.5 is presumably
# a no-op. Left unchanged so the reported numbers stay reproducible.
RF = lgb.LGBMClassifier(num_leaves=200, learning_rate=0.04, n_estimators=1000,
                        colsample_bytree=0.5, subsample=0.5,
                        n_jobs=-1, random_state=0)

# The model is fitted on a bare ndarray (no column names), so it is scored on
# ndarrays as well: fit and predict then see the same input type.
# NOTE(review): presumably `.values` is used because the tsfresh column names
# contain characters LightGBM rejects as feature names - TODO confirm.
X_train_arr = X_train.values
X_test_arr = X_test.values
RF.fit(X_train_arr, y_train)

# Each prediction is computed once and reused by every metric below.
test_pred = RF.predict_proba(X_test_arr)[:, 1]      # second predict_proba column, hold-out
test_pred_bin = RF.predict(X_test_arr)              # hard labels, hold-out
train_pred = RF.predict_proba(X_train_arr)[:, 1]    # second predict_proba column, train
train_pred_bin = RF.predict(X_train_arr)            # hard labels, train

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision = precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

# NOTE(review): the 'FP, TP' line prints the sums of the ROC-curve FPR/TPR
# arrays, not false/true-positive counts (those are in the confusion matrix
# printed below). The printed value is kept as is so recorded outputs still match.
print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

# 'AUC' scores the predicted probabilities; 'AUC_2' scores the hard labels.
print('val, train  AUC     :', "%0.2f" % auc, "%0.2f" % roc_auc_score(y_train, train_pred))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, train_pred_bin))
print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, test_pred_bin))

FP, TP              : 386.25 857.88
ROC_AUC_SCORE       : 0.91
accuracy            : 0.78
precision           : 0.90
recall              : 0.43
f1                  : 0.59
val, train  AUC     : 0.91 1.00
val, train  AUC_2   : 0.70 0.99
[[2432   65]
 [ 788  606]]
              precision    recall  f1-score   support

           0       0.76      0.97      0.85      2497
           1       0.90      0.43      0.59      1394

    accuracy                           0.78      3891
   macro avg       0.83      0.70      0.72      3891
weighted avg       0.81      0.78      0.76      3891



In [62]:
# Alternative hold-out split, kept for reference (disabled):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)

# Ratio of class-0 to class-1 labels in the training part.
# NOTE(review): this value is computed but never passed to the classifier
# below - confirm whether it was meant to be used as a class weight.
scale_pos_weight = (y_train == 0).sum() / (1.0 * (y_train == 1).sum())

# CatBoost with library defaults; loss_function is not set explicitly
# (the 'Logloss' option was commented out in the original cell).
RF_T = CatBoostClassifier(verbose=0)
RF_T.fit(X_train, y_train)

# Each prediction is computed once and reused by every metric below.
test_pred = RF_T.predict_proba(X_test)[:, 1]      # second predict_proba column, hold-out
test_pred_bin = RF_T.predict(X_test)              # hard labels, hold-out
train_pred = RF_T.predict_proba(X_train)[:, 1]    # second predict_proba column, train
train_pred_bin = RF_T.predict(X_train)            # hard labels, train

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision = precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

# NOTE(review): the 'FP, TP' line prints the sums of the ROC-curve FPR/TPR
# arrays, not false/true-positive counts (those are in the confusion matrix
# printed below). The printed value is kept as is so recorded outputs still match.
print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

# 'AUC' scores the predicted probabilities; 'AUC_2' scores the hard labels.
print('val, train  AUC     :', "%0.2f" % auc, "%0.2f" % roc_auc_score(y_train, train_pred))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, train_pred_bin))

print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, test_pred_bin))

FP, TP              : 351.38 833.35
ROC_AUC_SCORE       : 0.91
accuracy            : 0.80
precision           : 0.86
recall              : 0.52
f1                  : 0.64
val, train  AUC     : 0.91 0.99
val, train  AUC_2   : 0.73 0.95
[[2378  119]
 [ 674  720]]
              precision    recall  f1-score   support

           0       0.78      0.95      0.86      2497
           1       0.86      0.52      0.64      1394

    accuracy                           0.80      3891
   macro avg       0.82      0.73      0.75      3891
weighted avg       0.81      0.80      0.78      3891



In [63]:
# Combine the original columns with the tsfresh features column-wise.
# pd.concat(axis=1) aligns rows on index labels. X keeps whatever index it
# inherited from the filtered source frame, while X_2 is indexed by the
# positional ids (0..n-1) generated for tsfresh. Sorting X_2 by id and resetting
# both indexes makes the join purely positional, so rows cannot be mismatched
# or padded with NaN when X carries a non-default index.
assert len(X) == len(X_2), "X and X_2 must contain one row per sample"
X_new = pd.concat(
    [X.reset_index(drop=True), X_2.sort_index().reset_index(drop=True)],
    axis=1,
)

In [64]:
# Inspect the combined frame (original columns followed by the tsfresh
# features); the recorded output shows pandas abbreviating rows and columns.
X_new

Unnamed: 0,duplicate,DATEFIRSTREG_value,WORKERSRANGE_0.0,TAXREG_REGDATE_value,OKVED_CODE_433.0,OKTMO_CODE,DATEFIRSTREG_year,id_client,F1300,F2350,...,0__permutation_entropy__dimension_5__tau_1,0__permutation_entropy__dimension_6__tau_1,0__permutation_entropy__dimension_7__tau_1,0__query_similarity_count__query_None__threshold_0.0,"0__matrix_profile__feature_""min""__threshold_0.98","0__matrix_profile__feature_""max""__threshold_0.98","0__matrix_profile__feature_""mean""__threshold_0.98","0__matrix_profile__feature_""median""__threshold_0.98","0__matrix_profile__feature_""25""__threshold_0.98","0__matrix_profile__feature_""75""__threshold_0.98"
0,0,744940800000000000,0,1151712000000000000,0,3.701000e+09,1993.0,1847,1.960896e+09,1.479338e+09,...,2.864739,2.997069,3.004767,0.0,0.758160,3.624298,2.484177,2.972264,2.598394,3.008788
1,0,-9223372036854775808,0,-9223372036854775808,0,0.000000e+00,0.0,4650,0.000000e+00,0.000000e+00,...,1.482484,1.748862,1.929832,0.0,4.082714,5.772848,4.462192,4.469757,4.243056,4.469757
2,0,697161600000000000,0,1030665600000000000,0,3.371000e+10,1992.0,4770,9.073600e+07,1.494000e+06,...,2.918058,2.997069,3.004767,0.0,0.740080,4.797896,2.873394,2.376831,1.705382,4.110617
3,0,1120176000000000000,0,1120176000000000000,0,7.187600e+10,2005.0,12237,2.685905e+09,1.335281e+09,...,2.517644,2.809783,3.004767,0.0,1.888243,5.330304,3.402396,3.296659,2.786946,4.049347
4,0,-9223372036854775808,0,-9223372036854775808,0,0.000000e+00,0.0,9988,0.000000e+00,0.000000e+00,...,1.482484,1.748862,1.929832,0.0,4.082497,5.773462,4.460177,4.467697,4.223023,4.467697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17886,0,1178582400000000000,0,1178582400000000000,0,1.765010e+10,2007.0,3439,7.192800e+07,6.050000e+06,...,3.204778,3.218876,3.178054,0.0,1.038539,4.813402,3.058541,2.583702,2.229162,4.064096
17887,0,870307200000000000,0,1040342400000000000,0,9.870100e+10,1997.0,838,0.000000e+00,0.000000e+00,...,2.824489,3.052521,3.178054,0.0,0.000000,3.089800,0.554482,0.000000,0.000000,0.598656
17888,0,1476921600000000000,1,1476921600000000000,1,4.530900e+10,2016.0,10537,3.150000e+05,3.200000e+04,...,2.831544,3.052521,3.178054,0.0,2.832516,8.000000,5.141725,5.754323,4.199273,5.851840
17889,0,1503964800000000000,1,1503964800000000000,0,1.701000e+09,2017.0,1751,3.080000e+05,2.078000e+06,...,2.664532,2.886165,2.947005,0.0,0.721189,4.813198,3.001567,2.793857,1.554004,4.715013


In [65]:
# Ordered hold-out split of the combined features: first 14000 rows train, the rest validate.
X_train, y_train = X_new.iloc[:14000], y[:14000]
X_test, y_test = X_new.iloc[14000:], y[14000:]

In [66]:
# Alternative hold-out split, kept for reference (disabled):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)

# Despite the name `RF`, this is a LightGBM gradient-boosting classifier.
# NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect
# when `subsample_freq` > 0; it is not set here, so subsample=0.5 is presumably
# a no-op. Left unchanged so the reported numbers stay reproducible.
RF = lgb.LGBMClassifier(num_leaves=200, learning_rate=0.04, n_estimators=1000,
                        colsample_bytree=0.5, subsample=0.5,
                        n_jobs=-1, random_state=0)

# The model is fitted on a bare ndarray (no column names), so it is scored on
# ndarrays as well: fit and predict then see the same input type.
# NOTE(review): presumably `.values` is used because the tsfresh column names
# contain characters LightGBM rejects as feature names - TODO confirm.
X_train_arr = X_train.values
X_test_arr = X_test.values
RF.fit(X_train_arr, y_train)

# Each prediction is computed once and reused by every metric below.
test_pred = RF.predict_proba(X_test_arr)[:, 1]      # second predict_proba column, hold-out
test_pred_bin = RF.predict(X_test_arr)              # hard labels, hold-out
train_pred = RF.predict_proba(X_train_arr)[:, 1]    # second predict_proba column, train
train_pred_bin = RF.predict(X_train_arr)            # hard labels, train

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision = precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

# NOTE(review): the 'FP, TP' line prints the sums of the ROC-curve FPR/TPR
# arrays, not false/true-positive counts (those are in the confusion matrix
# printed below). The printed value is kept as is so recorded outputs still match.
print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

# 'AUC' scores the predicted probabilities; 'AUC_2' scores the hard labels.
print('val, train  AUC     :', "%0.2f" % auc, "%0.2f" % roc_auc_score(y_train, train_pred))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, train_pred_bin))
print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, test_pred_bin))

FP, TP              : 361.82 852.49
ROC_AUC_SCORE       : 0.93
accuracy            : 0.82
precision           : 0.92
recall              : 0.54
f1                  : 0.68
val, train  AUC     : 0.93 1.00
val, train  AUC_2   : 0.75 0.99
[[2430   67]
 [ 647  747]]
              precision    recall  f1-score   support

           0       0.79      0.97      0.87      2497
           1       0.92      0.54      0.68      1394

    accuracy                           0.82      3891
   macro avg       0.85      0.75      0.77      3891
weighted avg       0.84      0.82      0.80      3891



In [67]:
# Alternative hold-out split, kept for reference (disabled):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13, shuffle = False)

# Ratio of class-0 to class-1 labels in the training part.
# NOTE(review): this value is computed but never passed to the classifier
# below - confirm whether it was meant to be used as a class weight.
scale_pos_weight = (y_train == 0).sum() / (1.0 * (y_train == 1).sum())

# CatBoost with library defaults; loss_function is not set explicitly
# (the 'Logloss' option was commented out in the original cell).
RF_T = CatBoostClassifier(verbose=0)
RF_T.fit(X_train, y_train)

# Each prediction is computed once and reused by every metric below.
test_pred = RF_T.predict_proba(X_test)[:, 1]      # second predict_proba column, hold-out
test_pred_bin = RF_T.predict(X_test)              # hard labels, hold-out
train_pred = RF_T.predict_proba(X_train)[:, 1]    # second predict_proba column, train
train_pred_bin = RF_T.predict(X_train)            # hard labels, train

fpr, tpr, _ = roc_curve(y_test, test_pred)
auc = roc_auc_score(y_test, test_pred)
accuracy = accuracy_score(y_test, test_pred_bin)
f1 = f1_score(y_test, test_pred_bin)
precision = precision_score(y_test, test_pred_bin)
recall = recall_score(y_test, test_pred_bin)

# NOTE(review): the 'FP, TP' line prints the sums of the ROC-curve FPR/TPR
# arrays, not false/true-positive counts (those are in the confusion matrix
# printed below). The printed value is kept as is so recorded outputs still match.
print('FP, TP              :', "%0.2f" % sum(fpr), "%0.2f" % sum(tpr))
print('ROC_AUC_SCORE       :', "%0.2f" % auc)
print('accuracy            :', "%0.2f" % accuracy)
print('precision           :', "%0.2f" % precision)
print('recall              :', "%0.2f" % recall)
print('f1                  :', "%0.2f" % f1)

# 'AUC' scores the predicted probabilities; 'AUC_2' scores the hard labels.
print('val, train  AUC     :', "%0.2f" % auc, "%0.2f" % roc_auc_score(y_train, train_pred))
print('val, train  AUC_2   :', "%0.2f" % roc_auc_score(y_test, test_pred_bin), "%0.2f" % roc_auc_score(y_train, train_pred_bin))

print(confusion_matrix(y_test, test_pred_bin))
print(classification_report(y_test, test_pred_bin))

FP, TP              : 356.68 850.90
ROC_AUC_SCORE       : 0.92
accuracy            : 0.81
precision           : 0.87
recall              : 0.56
f1                  : 0.68
val, train  AUC     : 0.92 0.99
val, train  AUC_2   : 0.76 0.95
[[2378  119]
 [ 612  782]]
              precision    recall  f1-score   support

           0       0.80      0.95      0.87      2497
           1       0.87      0.56      0.68      1394

    accuracy                           0.81      3891
   macro avg       0.83      0.76      0.77      3891
weighted avg       0.82      0.81      0.80      3891

