In [49]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math


In [50]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_transformer
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

In [51]:
data = pd.read_csv('train_data.csv')
data

Unnamed: 0,client_id,cato_code,gender,country,citizenship,age_days,client_days,opsos_code,phone_digits,max_dup,...,salary_10,transfers_amount_in_8,transfers_count_in_8,transfers_amount_out_8,transfers_count_out_8,transactions_amount_8,transactions_count_8,auth_count_8,salary_8,label
0,2152686,17.0,1.0,0.0,0.0,11643.0,130.0,9.0,5.0,1.0,...,,,,,,,,,,0
1,1712777,2.0,,0.0,0.0,18095.0,994.0,5.0,5.0,1.0,...,,,,,,,,,,0
2,458915,,0.0,0.0,0.0,7955.0,1144.0,7.0,5.0,1.0,...,,,,,,0.037251,0.033580,0.03358,,0
3,530220,,0.0,0.0,0.0,22580.0,847.0,0.0,6.0,1.0,...,0.036560,,,,,0.115086,0.033581,0.03358,0.092161,0
4,1103654,12.0,0.0,0.0,0.0,13107.0,547.0,12.0,6.0,1.0,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941526,1244050,6.0,1.0,0.0,0.0,11813.0,462.0,8.0,4.0,1.0,...,,,,,,,,,,0
941527,1142266,12.0,0.0,0.0,0.0,11573.0,787.0,8.0,3.0,2.0,...,,,,,,,,,,0
941528,1327217,9.0,1.0,0.0,0.0,11268.0,455.0,11.0,4.0,1.0,...,,,,,,,,0.03358,,0
941529,1634056,5.0,1.0,0.0,,17215.0,400.0,10.0,4.0,1.0,...,0.038471,,,,,0.041267,0.033580,0.03358,0.037430,0


In [52]:
data['in_kaz'] = data['country'] == 0

In [53]:
data['is_kaz'] = data['citizenship'] == 0

In [54]:
data['gender'].median()

1.0

In [55]:
fillmedian = ['client_days', 'gender', 'client_days', 'age_days', 'opsos_code', 'phone_digits', 'max_dup']
fillzero = ['cato_code', 'country', 'citizenship']

In [56]:
data.fillna(data[fillmedian].median(), inplace = True)

In [57]:
data[fillzero] = data[fillzero].fillna(0)

In [58]:
data['age'] = data['age_days'] / 365

In [59]:
data['is_child'] = data['age'] < 18
data['is_adult'] = (data['age'] >= 18) & (data['age'] < 60)
data['is_old'] = data['age'] >= 60

In [60]:
data['have_installment'] = (pd.isnull(data['monthly_installment']))

In [61]:
data['have_deposit'] = (pd.isnull(data['deposit_sum_KZT']))

In [62]:
data['have_deal'] = (pd.isnull(data['deal_sum_kzt_express']))

In [63]:
data['have_product'] = (pd.isnull(data['deal_sum_kzt_product']))

In [64]:
data[['deal_sum_kzt_express', 'monthly_installment','transfers_amount_in_9', 'transfers_amount_in_10', 'transfers_amount_in_8',
                  'transfers_count_in_9', 'transfers_count_in_10', 'transfers_count_in_8',
                  'transfers_amount_out_9', 'transfers_amount_out_10', 'transfers_amount_out_8',
                 'transfers_count_out_9', 'transfers_count_out_10', 'transfers_count_out_8',
                  'transactions_amount_9', 'transactions_amount_10', 'transactions_amount_8',
                  'transactions_count_9', 'transactions_count_10', 'transactions_count_8',
                  'auth_count_9', 'auth_count_10', 'auth_count_8', 
                  'salary_9', 'salary_10', 'salary_8']] = data[['deal_sum_kzt_express', 'monthly_installment','transfers_amount_in_9', 'transfers_amount_in_10', 'transfers_amount_in_8',
                  'transfers_count_in_9', 'transfers_count_in_10', 'transfers_count_in_8',
                  'transfers_amount_out_9', 'transfers_amount_out_10', 'transfers_amount_out_8',
                 'transfers_count_out_9', 'transfers_count_out_10', 'transfers_count_out_8',
                  'transactions_amount_9', 'transactions_amount_10', 'transactions_amount_8',
                'transactions_count_9', 'transactions_count_10', 'transactions_count_8',
                  'auth_count_9', 'auth_count_10', 'auth_count_8', 
                  'salary_9', 'salary_10', 'salary_8']].fillna(0)

In [65]:
data['transfers_amount_in'] = data[['transfers_amount_in_9','transfers_amount_in_10','transfers_amount_in_8']].max(axis = 1)
data['transfers_count_in'] = data[['transfers_count_in_9','transfers_count_in_10','transfers_count_in_8']].max(axis = 1)
data['transfers_amount_out'] = data[['transfers_amount_out_9','transfers_amount_out_10','transfers_amount_out_8']].max(axis = 1)
data['transfers_count_out'] = data[['transfers_count_out_9','transfers_count_out_10','transfers_count_out_8']].max(axis = 1)
data['transactions_amount'] = data[['transactions_amount_9','transactions_amount_10','transactions_amount_8']].max(axis = 1)
data['transactions_count'] = data[['transactions_count_9','transactions_count_10','transactions_count_8']].max(axis = 1)
data['auth_count'] = data[['auth_count_9','auth_count_10','auth_count_8']].max(axis = 1)
data['salary'] = data[['salary_9','salary_10','salary_8']].max(axis = 1)

In [66]:
data = data.drop(['transfers_amount_in_9', 'transfers_amount_in_10', 'transfers_amount_in_8',
                  'transfers_count_in_9', 'transfers_count_in_10', 'transfers_count_in_8',
                  'transfers_amount_out_9', 'transfers_amount_out_10', 'transfers_amount_out_8',
                 'transfers_count_out_9', 'transfers_count_out_10', 'transfers_count_out_8',
                  'transactions_amount_9', 'transactions_amount_10', 'transactions_amount_8',
                'transactions_count_9', 'transactions_count_10', 'transactions_count_8',
                  'auth_count_9', 'auth_count_10', 'auth_count_8', 
                  'salary_9', 'salary_10', 'salary_8'], axis=1)

In [67]:
data['date_close'] = pd.to_datetime(data['date_close'])
data['date_close_year'] = 2022 - data['date_close'].dt.year
data['date_close_month'] = data['date_close'].dt.month
data = data.drop('date_close', axis=1)
data['deal_start_date_express'] = pd.to_datetime(data['deal_start_date_express'])
data['deal_start_date_express_year'] = 2022 - data['deal_start_date_express'].dt.year
data['deal_start_date_express_month'] = data['deal_start_date_express'].dt.month
data = data.drop('deal_start_date_express', axis=1)
data['plan_end_date_express'] = pd.to_datetime(data['plan_end_date_express'])
data['plan_end_date_express_year'] = 2022 - data['plan_end_date_express'].dt.year
data['plan_end_date_express_month'] = data['plan_end_date_express'].dt.month
data = data.drop('plan_end_date_express', axis=1)
data['deal_start_date_product'] = pd.to_datetime(data['deal_start_date_product'])
data['deal_start_date_product_year'] = 2022 -data['deal_start_date_product'].dt.year
data['deal_start_date_product_month'] = data['deal_start_date_product'].dt.month
data = data.drop('deal_start_date_product', axis=1)
data['plan_end_date_product'] = pd.to_datetime(data['plan_end_date_product'])
data['plan_end_date_product_year'] = 2022 - data['plan_end_date_product'].dt.year
data['plan_end_date_product_month'] = data['plan_end_date_product'].dt.month
data = data.drop('plan_end_date_product', axis=1)

In [68]:
data[['date_close_year', 'deal_start_date_express_year', 'plan_end_date_express_year',
      'deal_start_date_product_year', 'plan_end_date_product_year']] = data[['date_close_year', 'deal_start_date_express_year', 'plan_end_date_express_year',
      'deal_start_date_product_year', 'plan_end_date_product_year']].fillna(0)
data[['date_close_month', 'deal_start_date_express_month', 'plan_end_date_express_month',
      'deal_start_date_product_month', 'plan_end_date_product_month']] = data[['date_close_month', 'deal_start_date_express_month', 'plan_end_date_express_month',
      'deal_start_date_product_month', 'plan_end_date_product_month']].fillna(0)


In [69]:
data[['deposit_sum_KZT', 'deposit_sum_USD', 'deposit_sum_EUR']] = data[['deposit_sum_KZT', 'deposit_sum_USD', 'deposit_sum_EUR']].fillna(0)

In [70]:
num_features=['deal_sum_kzt_express', 'monthly_installment', 'client_days', 'phone_digits', 'max_dup',
             'transfers_amount_in', 'transfers_count_in', 'transfers_amount_out',
              'transfers_count_out', 'transactions_amount', 'transactions_count',
              'auth_count','salary', 'deposit_sum_KZT', 'deposit_sum_USD', 'deposit_sum_EUR']
cat_features = ['cato_code', 'gender', 'is_kaz', 'in_kaz', 'is_child', 'is_adult', 'is_old',
             'have_installment', 'have_deposit', 'have_deal', 'have_product', 'date_close_year',
             'deal_start_date_express_year', 'plan_end_date_express_year',
             'deal_start_date_product_year', 'plan_end_date_product_year', 'date_close_month', 'deal_start_date_express_month', 'plan_end_date_express_month',
      'deal_start_date_product_month', 'plan_end_date_product_month']
target='label'

In [71]:
preprocessor = make_column_transformer(
            (StandardScaler(), num_features),
            (OneHotEncoder(), cat_features),
            sparse_threshold=100)

In [72]:
X = data[num_features+cat_features]
y = data[target]
X = preprocessor.fit_transform(X)

In [73]:
X_train, X_tests, y_train, y_tests = train_test_split(X, y, test_size=0.05, stratify=y)

In [74]:
X_tests.shape

(47077, 158)

In [75]:
X_train.shape

(894454, 158)

In [76]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
import keras
from keras.models import Sequential
from keras.layers import Dense
import warnings
import tensorflow as tf

In [77]:
metrics = [keras.metrics.Precision(name = 'precision'),
           keras.metrics.Recall(name = 'recall')]

In [93]:
model = Sequential()
model.add(Dense(30, input_shape=(158,), activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [94]:
model.compile(loss = 'BinaryCrossentropy', optimizer=keras.optimizers.Adam(1e-2), metrics=metrics)

In [95]:
class_weights = {0:0.265,
                 1:1}

In [96]:
from sklearn.utils import class_weight
model.fit(X_train.toarray(), y_train, epochs=30, batch_size=1000, verbose = 1, class_weight=class_weights)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fd928354050>

In [97]:
y_res = model.predict(X_tests.toarray()) 



In [98]:
l = 0.2
while l < 0.7:
  y_r = y_res > l
  print(f1_score(y_tests, y_r))
  l += 0.01

0.2902865649423149
0.3251451540866458
0.3272892112420671
0.33195402298850574
0.3426103646833013
0.3453658536585366
0.3464955577492596
0.3487792725460887
0.3512293025589563
0.35492095869454354
0.3574380165289256
0.35808320168509744
0.36236559139784946
0.3631376851343938
0.3675977653631285
0.37032842582106457
0.3761574074074074
0.3780271707028942
0.3829018663455749
0.3848039215686275
0.3875
0.38881829733163914
0.389391979301423
0.3923633969716919
0.39381720430107525
0.39809134287661896
0.40083507306889354
0.4014035087719298
0.40228245363766046
0.3982621288921071
0.39674315321983716
0.3958020989505247
0.39083969465648855
0.3910355486862442
0.39274447949526814
0.39420756234915527
0.3967213114754098
0.3875838926174497
0.3918228279386712
0.3871527777777778
0.38237885462555066
0.37455197132616486
0.3713768115942029
0.3683241252302026
0.3709226467847157
0.36672967863894135
0.3634615384615385
0.36184857423795475
0.35856573705179284
0.35845213849287166


In [99]:
y_res = model.predict(X_train.toarray())



KeyboardInterrupt: ignored

In [101]:
y_r = y_res > 0.47
print(f1_score(y_train, y_r))

ValueError: ignored

In [102]:
data = pd.read_csv('test_data.csv')
data

Unnamed: 0,client_id,cato_code,gender,country,citizenship,age_days,client_days,opsos_code,phone_digits,max_dup,...,auth_count_10,salary_10,transfers_amount_in_8,transfers_count_in_8,transfers_amount_out_8,transfers_count_out_8,transactions_amount_8,transactions_count_8,auth_count_8,salary_8
0,2,2.0,1.0,0.0,0.0,17150.0,1638.0,2.0,3.0,2.0,...,,,,,,,,,,
1,5,3.0,1.0,0.0,,14278.0,416.0,3.0,6.0,1.0,...,,,,,,,,,,
2,10,4.0,1.0,0.0,,19040.0,225.0,0.0,5.0,1.0,...,0.033580,,,,,,0.057141,0.033580,0.033580,
3,11,4.0,1.0,0.0,0.0,28811.0,2919.0,,,,...,,,,,,,,,,
4,27,,0.0,0.0,0.0,26687.0,2919.0,0.0,6.0,1.0,...,0.033581,,,,,,0.157054,0.033583,0.033582,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463734,4433589,,,,,,,3.0,6.0,1.0,...,,,,,,,,,,
463735,4438614,,,,,,,8.0,7.0,0.0,...,,,,,,,,,,
463736,4439677,,,,,,,4.0,6.0,1.0,...,0.033580,,,,,,,,,
463737,4443125,,,,,,,11.0,6.0,1.0,...,,,,,,,,,,


In [103]:
data['in_kaz'] = data['country'] == 0

In [104]:
#data[data.label == 1].groupby('citizenship').citizenship.count()

In [105]:
data['is_kaz'] = data['citizenship'] == 0

In [106]:
fillmedian = ['client_days', 'gender', 'client_days', 'age_days', 'opsos_code', 'phone_digits', 'max_dup']
fillzero = ['cato_code', 'country', 'citizenship']

In [107]:
data.fillna(data[fillmedian].median(), inplace = True)

In [108]:
data[fillzero] = data[fillzero].fillna(0)

In [109]:
data['age'] = data['age_days'] / 365

In [110]:
data['is_child'] = data['age'] < 18
data['is_adult'] = (data['age'] >= 18) & (data['age'] < 60)
data['is_old'] = data['age'] >= 60

In [111]:
data['have_installment'] = (pd.isnull(data['monthly_installment']))
data['have_deposit'] = (pd.isnull(data['deposit_sum_KZT']))
data['have_deal'] = (pd.isnull(data['deal_sum_kzt_express']))
data['have_product'] = (pd.isnull(data['deal_sum_kzt_product']))

In [112]:
data[['deal_sum_kzt_express', 'monthly_installment','transfers_amount_in_9', 'transfers_amount_in_10', 'transfers_amount_in_8',
                  'transfers_count_in_9', 'transfers_count_in_10', 'transfers_count_in_8',
                  'transfers_amount_out_9', 'transfers_amount_out_10', 'transfers_amount_out_8',
                 'transfers_count_out_9', 'transfers_count_out_10', 'transfers_count_out_8',
                  'transactions_amount_9', 'transactions_amount_10', 'transactions_amount_8',
                  'transactions_count_9', 'transactions_count_10', 'transactions_count_8',
                  'auth_count_9', 'auth_count_10', 'auth_count_8', 
                  'salary_9', 'salary_10', 'salary_8']] = data[['deal_sum_kzt_express', 'monthly_installment','transfers_amount_in_9', 'transfers_amount_in_10', 'transfers_amount_in_8',
                  'transfers_count_in_9', 'transfers_count_in_10', 'transfers_count_in_8',
                  'transfers_amount_out_9', 'transfers_amount_out_10', 'transfers_amount_out_8',
                 'transfers_count_out_9', 'transfers_count_out_10', 'transfers_count_out_8',
                  'transactions_amount_9', 'transactions_amount_10', 'transactions_amount_8',
                'transactions_count_9', 'transactions_count_10', 'transactions_count_8',
                  'auth_count_9', 'auth_count_10', 'auth_count_8', 
                  'salary_9', 'salary_10', 'salary_8']].fillna(0)

In [113]:
data['transfers_amount_in'] = data[['transfers_amount_in_9','transfers_amount_in_10','transfers_amount_in_8']].max(axis = 1)
data['transfers_count_in'] = data[['transfers_count_in_9','transfers_count_in_10','transfers_count_in_8']].max(axis = 1)
data['transfers_amount_out'] = data[['transfers_amount_out_9','transfers_amount_out_10','transfers_amount_out_8']].max(axis = 1)
data['transfers_count_out'] = data[['transfers_count_out_9','transfers_count_out_10','transfers_count_out_8']].max(axis = 1)
data['transactions_amount'] = data[['transactions_amount_9','transactions_amount_10','transactions_amount_8']].max(axis = 1)
data['transactions_count'] = data[['transactions_count_9','transactions_count_10','transactions_count_8']].max(axis = 1)
data['auth_count'] = data[['auth_count_9','auth_count_10','auth_count_8']].max(axis = 1)
data['salary'] = data[['salary_9','salary_10','salary_8']].max(axis = 1)

In [114]:
data = data.drop(['transfers_amount_in_9', 'transfers_amount_in_10', 'transfers_amount_in_8',
                  'transfers_count_in_9', 'transfers_count_in_10', 'transfers_count_in_8',
                  'transfers_amount_out_9', 'transfers_amount_out_10', 'transfers_amount_out_8',
                 'transfers_count_out_9', 'transfers_count_out_10', 'transfers_count_out_8',
                  'transactions_amount_9', 'transactions_amount_10', 'transactions_amount_8',
                'transactions_count_9', 'transactions_count_10', 'transactions_count_8',
                  'auth_count_9', 'auth_count_10', 'auth_count_8', 
                  'salary_9', 'salary_10', 'salary_8'], axis=1)

In [115]:
data['date_close'] = pd.to_datetime(data['date_close'])
data['date_close_year'] = 2022 - data['date_close'].dt.year
data['date_close_month'] = data['date_close'].dt.month
data = data.drop('date_close', axis=1)
data['deal_start_date_express'] = pd.to_datetime(data['deal_start_date_express'])
data['deal_start_date_express_year'] = 2022 - data['deal_start_date_express'].dt.year
data['deal_start_date_express_month'] = data['deal_start_date_express'].dt.month
data = data.drop('deal_start_date_express', axis=1)
data['plan_end_date_express'] = pd.to_datetime(data['plan_end_date_express'])
data['plan_end_date_express_year'] = 2022 - data['plan_end_date_express'].dt.year
data['plan_end_date_express_month'] = data['plan_end_date_express'].dt.month
data = data.drop('plan_end_date_express', axis=1)
data['deal_start_date_product'] = pd.to_datetime(data['deal_start_date_product'])
data['deal_start_date_product_year'] = 2022 -data['deal_start_date_product'].dt.year
data['deal_start_date_product_month'] = data['deal_start_date_product'].dt.month
data = data.drop('deal_start_date_product', axis=1)
data['plan_end_date_product'] = pd.to_datetime(data['plan_end_date_product'])
data['plan_end_date_product_year'] = 2022 - data['plan_end_date_product'].dt.year
data['plan_end_date_product_month'] = data['plan_end_date_product'].dt.month
data = data.drop('plan_end_date_product', axis=1)

In [116]:
data[['date_close_year', 'deal_start_date_express_year', 'plan_end_date_express_year',
      'deal_start_date_product_year', 'plan_end_date_product_year']] = data[['date_close_year', 'deal_start_date_express_year', 'plan_end_date_express_year',
      'deal_start_date_product_year', 'plan_end_date_product_year']].fillna(0)
data[['date_close_month', 'deal_start_date_express_month', 'plan_end_date_express_month',
      'deal_start_date_product_month', 'plan_end_date_product_month']] = data[['date_close_month', 'deal_start_date_express_month', 'plan_end_date_express_month',
      'deal_start_date_product_month', 'plan_end_date_product_month']].fillna(0)

In [117]:
data[['deposit_sum_KZT', 'deposit_sum_USD', 'deposit_sum_EUR']] = data[['deposit_sum_KZT', 'deposit_sum_USD', 'deposit_sum_EUR']].fillna(0)

In [118]:
X = data[num_features+cat_features]
X = preprocessor.fit_transform(X)

In [119]:
X.shape

(463739, 158)

In [120]:
y = model.predict(X)



In [194]:
yr = y > 0.30

In [195]:
yr = 1*yr

In [196]:
yr

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [199]:
data['target']=yr
data['id'] = data['client_id']
data.reset_index()[['id', 'target']].to_csv("outputf110.csv", index=False)

In [200]:
data.groupby('target').target.count()

target
0    457539
1      6200
Name: target, dtype: int64