**Подключаем библиотеки**

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn import preprocessing

In [None]:
#!pip install -U lightautoml

In [None]:
import os
import time
import requests
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, recall_score, precision_score, f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

#from lightautoml.automl.presets.tabular_presets import TabularAutoML
#from lightautoml.tasks import Task
#import torch

**Загружаем данные, обрабатываем пустые ячейки**

In [None]:
client_in = pd.read_excel("data_2.xlsx")

In [None]:
client_in = client_in[client_in['Согласие клиента'] == 1]

In [None]:
client_in = client_in.drop(columns = ['Согласие клиента'])

In [None]:
client = client_in[['ID клиента', 'ОПФ', 'ОКВЭД', 'Зона благонадежности', 'Средний кредитовый оборот за 6 мес',\
                    'Средний дебетовый оборот за 6 мес', 'Количество контрагентов по кредиту за 3 мес',\
                    'Количество контрагентов по дебету за 3 мес', 'Валюта баланса', 'Выручка', 'Чистая прибыль',
                    'Активы', 'Собственный капитал']]

In [None]:
deal = client_in[['Одобренная сумма', 'Запрошенная сумма',  'Одобренный срок', 'Запрошенный срок',\
                  'Процент одобренной суммы от запрошенной', 'Цель', 'Маршрут рассмотрения', 'Залог','Грейс период',\
                  'Подтверждение целевого использования кредитных средств', 'Требуется предоставление отчетности',\
                  'Гос.программа', 'Период отсрочки', 'Форма выдачи', 'Период доступности', 'ID клиента']]

In [None]:
#%xdel client_in

In [None]:
# описательная статистика
client.describe()

In [None]:
deal.describe()

In [None]:
client.shape

In [None]:
deal.shape

In [None]:
client = client.fillna(0)
deal = deal.fillna(0)

Удалим дубликаты строк

In [None]:
# Drop exact duplicate rows and renumber the remaining rows from 0.
# drop=True discards the old index instead of inserting it as an 'index'
# column, which would otherwise be scaled and used as a feature downstream.
client = client.drop_duplicates().reset_index(drop=True)
deal = deal.drop_duplicates().reset_index(drop=True)

In [None]:
client

In [None]:
deal

Сохраним идентификатор клиента в отдельные таблицы для дальнейшего использования на этапе связывания таблиц

In [None]:
client_ID = client[['ID клиента']]
deal_client_ID = deal[['ID клиента']]

In [None]:
print(f'client: {len(client_ID)} = {len(client)}')
print(f'deal: {len(deal_client_ID)} = {len(deal)}')

In [None]:
client = client.drop(['ID клиента'], axis=1)
deal = deal.drop(['ID клиента'], axis=1)

Преобразуем категориальные переменные в числовые

In [None]:
def categorialToFloat(df):
  """Return a copy of df with every object-dtype column replaced by integer codes.

  Codes are assigned per column by pd.factorize, in order of first appearance.
  Columns of any other dtype are returned unchanged. The input frame is not
  modified, so re-running the calling cell gives the same result.

  Parameters:
    df: pandas DataFrame to encode.

  Returns:
    A new DataFrame with the same columns and index as df.
  """
  encoded = df.copy()
  cat_columns = encoded.select_dtypes(['object']).columns
  for col in cat_columns:
    # factorize returns (codes, uniques); only the integer codes are kept
    encoded[col] = pd.factorize(encoded[col])[0]
  return encoded

In [None]:
client = categorialToFloat(client)

In [None]:
deal = categorialToFloat(deal)

Нормализуем данные, используя preprocessing из sklearn

In [None]:
from sklearn import preprocessing

In [None]:
def normalMinMax(df):
  """Min-max scale all columns of df and return the result as a NumPy array.

  A fresh sklearn MinMaxScaler is fitted on df.values and applied to the same
  values, so the scaling is computed per column from this frame alone.
  """
  scaler = preprocessing.MinMaxScaler()
  return scaler.fit_transform(df.values)

In [None]:
client_norm_numpy = normalMinMax(client)
deal_norm_numpy = normalMinMax(deal)

In [None]:
client_norm = pd.DataFrame(client_norm_numpy, columns=client.columns)
deal_norm = pd.DataFrame(deal_norm_numpy, columns=deal.columns)

Удаляем коррелирующие столбцы

In [None]:
corr = client_norm.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
dcorr = deal_norm.corr()
dcorr.style.background_gradient(cmap='coolwarm')

In [None]:
def correlationDelete(df, threshold):
    """Return a copy of df without columns that are highly correlated with an earlier column.

    Columns are scanned left to right. A column is dropped when the absolute
    value of its correlation with some earlier column that has not itself been
    dropped is at least `threshold`. Strong negative correlations therefore
    count as redundant too. The input frame is left untouched.

    Parameters:
        df: pandas DataFrame; correlations come from df.corr().
        threshold: minimum absolute correlation at which the later column is dropped.

    Returns:
        A new DataFrame with the surviving columns in their original order.
    """
    corr_matrix = df.corr().abs()
    columns = corr_matrix.columns
    dropped = set()
    for i in range(len(columns)):
        for j in range(i):
            # Compare only against columns that are still kept, so one
            # representative of every correlated group survives.
            if corr_matrix.iloc[i, j] >= threshold and columns[j] not in dropped:
                dropped.add(columns[i])
                break
    return df.drop(columns=list(dropped))

In [None]:
client_norm = correlationDelete(client_norm, threshold=0.75)
deal_norm = correlationDelete(deal_norm, threshold=0.75)

In [None]:
#%xdel client
#%xdel deal

In [None]:
len_deal = deal_norm.shape[0]
len_deal_plot = len_deal // 40
len_deal_plot

**Кластеризация кредитных заявок**

In [None]:
k_means = KMeans(n_clusters=20, random_state=0) # n_clusters - number of clusters
k_means = k_means.fit(deal_norm) # fit the clustering on the deal table
clusters = k_means.predict(deal_norm) # predicted cluster label for every deal row


In [None]:
#db = DBSCAN(eps=0.3, min_samples=100)
#db = db.fit(deal_norm)
#clusters = db.labels_

In [None]:
clusters.shape

In [None]:
%%time
from sklearn.manifold import TSNE
X = deal_norm.iloc[:len_deal_plot, :-1]
# Создаем алгоритм t-SNE с двумя главными компонентами и перплексией
tsne = TSNE(n_components=2, perplexity=40)
deal_2d = tsne.fit_transform(X)

In [None]:
# Colour the embedded points by the KMeans cluster of each deal (the same
# leading rows that were passed to t-SNE), not by the last feature column.
y_plot = clusters[:len_deal_plot]
y_plot

In [None]:
# Two panels in a single figure: the plain embedding (left) and the same
# embedding coloured by y_plot (right). The extra empty figure and the colorbar
# on the uncoloured panel were removed because they carried no information.
cmap = plt.get_cmap('jet', k_means.n_clusters)  # one discrete colour per cluster
plt.figure(figsize=(25, 10))
plt.subplot(1, 2, 1)
plt.scatter(deal_2d[:, 0], deal_2d[:, 1], s=2)
plt.subplot(1, 2, 2)
# np.ravel: scatter expects one colour value per point as a flat sequence
plt.scatter(deal_2d[:, 0], deal_2d[:, 1], c=np.ravel(y_plot), cmap=cmap, s=2)
plt.colorbar()
plt.show()

In [None]:
deal_norm['Group_deals'] = clusters

In [None]:
deal_norm.groupby(['Group_deals']).count()

In [None]:
from sklearn.metrics import silhouette_score as sc

In [None]:
#deal_score = sc(deal_norm, k_means.labels_)
#deal_score

**Предсказываем группу заявки для клиента - решаем задачу мультиклассовой классификации**

Добавим в датасет с клиентом группу заявки, полученную в результате кластеризации заявок

In [None]:
client_norm['ID клиента'] = client_ID['ID клиента']
deal_norm['ID клиента'] = deal_client_ID['ID клиента']

In [None]:
client_ID.shape

(189391, 1)

In [None]:
deal_client_ID.shape

(205901, 1)

In [None]:
client_norm.shape

(189391, 14)

In [None]:
deal_norm.shape

(205901, 15)

In [None]:
client_norm

Unnamed: 0,index,ОПФ,ОКВЭД,Зона благонадежности,Средний кредитовый оборот за 6 мес,Средний дебетовый оборот за 6 мес,Количество контрагентов по кредиту за 3 мес,Количество контрагентов по дебету за 3 мес,Валюта баланса,Выручка,Чистая прибыль,Собственный капитал,Group_clients,ID клиента
0,0.000000,0.0,0.474227,0.0,0.000100,0.000048,0.001692,0.000719,0.000220,0.000519,0.172308,0.534756,4,3
1,0.000010,0.0,0.010309,0.0,0.000756,0.001955,0.000376,0.004317,0.000220,0.000519,0.172308,0.534756,19,7
2,0.000013,0.0,0.010309,0.0,0.001665,0.000599,0.000000,0.000719,0.000220,0.000519,0.172308,0.534756,19,7
3,0.000016,0.0,0.989691,0.0,0.000307,0.002347,0.000188,0.012230,0.000220,0.000519,0.172308,0.534756,16,8
4,0.000020,1.0,0.010309,0.0,0.002021,0.003785,0.003196,0.010072,0.000220,0.000519,0.172308,0.534756,17,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189386,0.999984,0.0,0.484536,0.0,0.000266,0.000000,0.001128,0.000000,0.000220,0.000519,0.172308,0.534756,5,317492
189387,0.999990,0.0,0.701031,0.0,0.000366,0.000110,0.000752,0.000000,0.000220,0.000519,0.172308,0.534756,5,317498
189388,0.999993,1.0,0.484536,0.0,0.000000,0.002091,0.000000,0.010791,0.000504,0.000671,0.172346,0.534786,0,317501
189389,0.999997,0.0,0.835052,0.0,0.000035,0.000000,0.000752,0.000000,0.000220,0.000519,0.172308,0.534756,12,317503


In [None]:
deal_norm

Unnamed: 0,index,Одобренная сумма,Запрошенная сумма,Одобренный срок,Процент одобренной суммы от запрошенной,Цель,Маршрут рассмотрения,Залог,Грейс период,Подтверждение целевого использования кредитных средств,Гос.программа,Период отсрочки,Форма выдачи,Group_deals,ID клиента
0,0.000000,0.000983,0.000298,0.195531,0.002312,0.000000,0.000000,0.0,0.0,1.0,0.0,0.25,0.00,9,3
1,0.000010,0.005750,0.000598,0.061453,0.006663,0.000000,0.058824,1.0,0.0,1.0,0.1,0.00,0.25,13,7
2,0.000013,0.009789,0.001018,0.061453,0.006663,0.000000,0.058824,1.0,0.0,1.0,0.1,0.00,0.25,13,7
3,0.000016,0.001327,0.000138,0.128492,0.006663,0.090909,0.117647,0.0,0.0,0.0,0.2,0.00,0.25,5,8
4,0.000020,0.010558,0.001098,0.061453,0.006663,0.000000,0.176471,1.0,0.0,1.0,0.1,0.00,0.25,13,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205896,0.999984,0.001904,0.000198,0.195531,0.006663,0.090909,0.117647,0.0,0.0,0.0,0.2,0.00,0.25,1,317492
205897,0.999990,0.000077,0.000008,0.195531,0.006663,0.272727,0.294118,0.0,1.0,0.0,0.2,0.00,0.25,11,317498
205898,0.999993,0.000606,0.000198,0.195531,0.002163,0.090909,0.117647,0.0,0.0,0.0,0.2,0.00,0.25,1,317501
205899,0.999997,0.001327,0.000138,0.189944,0.006663,0.090909,0.117647,0.0,0.0,0.0,0.2,0.00,0.25,1,317503


In [None]:
client_norm = client_norm.drop_duplicates()

In [None]:
deal_norm = deal_norm.drop_duplicates()

In [None]:
client_norm.shape

(189391, 14)

In [None]:
deal_norm.shape

(205901, 15)

In [None]:
deal_norm_group = deal_norm[['ID клиента', 'Group_deals']].drop_duplicates()

In [None]:
deal_norm_group.shape

(182311, 2)

In [None]:
# NOTE(review): the original merged `client_norm_group`, which is never defined in
# this notebook and raises NameError on a fresh kernel. Assuming the client
# feature table `client_norm` was intended - confirm.
# The inner join attaches each client's deal cluster(s) as the last column.
client_gr = client_norm.merge(deal_norm_group, how='inner', left_on='ID клиента', right_on='ID клиента')

In [None]:
# Keep a random half of the rows; the fixed seed makes the subsample reproducible.
client_gr = client_gr.sample(frac=0.5, random_state=42)

In [None]:
data_types_dict = {'Group_deals': int}

In [None]:
client_gr  = client_gr.astype(data_types_dict)

In [None]:
client_gr.rename(columns=lambda x: str(x) , inplace=True)

In [None]:
client_gr.columns[(client_gr == 0).all()]

Index([], dtype='object')

In [None]:
# Target: the deal cluster, selected by name as a 1-D label vector
# (a positional [:, -1:] slice yields an (n, 1) column vector instead).
y = client_gr['Group_deals'].to_numpy()

In [None]:
y

array([[17],
       [ 1],
       [ 2],
       ...,
       [ 6],
       [ 8],
       [10]])

In [None]:
# Features: every column except the client key and the target, removed by name
# so the result does not depend on the target being the last column.
X = client_gr.drop(columns=['ID клиента', 'Group_deals']).to_numpy()

In [None]:
len(X)

93110

In [None]:
len(y)

93110

In [None]:
# Hold out a test set; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

**Используем алгоритмы мультиклассовой классификации из библиотеки scikit-learn**

DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth-limited decision tree; random_state fixes the tie-breaking between
# equally good splits so the reported metrics are reproducible.
dtree_model = DecisionTreeClassifier(max_depth=20, random_state=42).fit(X_train, y_train)
dtree_predictions = dtree_model.predict(X_test)

In [None]:
dtree_predictions

array([ 5,  5, 13, ..., 13,  1,  5])

In [None]:
precision_score(y_test, dtree_predictions, average='macro')

  _warn_prf(average, modifier, msg_start, len(result))


0.09758892348158828

In [None]:
recall_score(y_test, dtree_predictions, average='macro')

0.13437800733523356

In [None]:
f1_score(y_test, dtree_predictions, average='macro')

0.0983456834257753

In [None]:
cr = classification_report(y_test, dtree_predictions)
print(cr)

              precision    recall  f1-score   support

           0       0.14      0.07      0.09       831
           1       0.38      0.76      0.51      2963
           2       0.21      0.05      0.08      2456
           3       0.00      0.00      0.00       558
           4       0.00      0.00      0.00       429
           5       0.39      0.73      0.51      3176
           6       0.00      0.00      0.00      2423
           7       0.00      0.00      0.00       130
           8       0.00      0.00      0.00       640
           9       0.00      0.00      0.00       852
          10       0.17      0.14      0.15      1000
          11       0.23      0.05      0.09      2383
          12       0.28      0.75      0.41      3136
          13       0.15      0.14      0.14       630
          14       0.00      0.00      0.00       115
          15       0.00      0.00      0.00       343
          16       0.00      0.00      0.00       123
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
balanced_accuracy_score(y_test, dtree_predictions)

0.13437800733523356

SVM (Support vector machine) classifier

In [None]:
#from sklearn.svm import SVC
#svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
#svm_predictions = svm_model_linear.predict(X_test)

In [None]:
#print(vm_model_linear.score(y_test, svm_predictions))

KNN (k-nearest neighbors) classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)
knn_predictions = knn.predict(X_test)

In [None]:
print(knn.score(X_test, y_test))

In [None]:
# Per-class report for the KNN predictions (the original printed `cr`,
# which is the decision-tree report).
cr_knn = classification_report(y_test, knn_predictions)
print(cr_knn)

Naive Bayes classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(X_train, y_train)
gnb_predictions = gnb.predict(X_test)

In [None]:
print(gnb.score(X_test, y_test))

*RandomForest со случайным поиском гиперпараметров*

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

In [None]:
# Hyperparameter search space for the randomized search
param_dist = {
    'n_estimators': randint(50, 200),  # integer drawn from [50, 200)
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 2, 4],
}


In [None]:
# Fit a random forest with randomized hyperparameter search: 100 sampled
# configurations, each scored with 5-fold cross-validation. Fixed seeds make
# both the forest and the sampled configurations reproducible; n_jobs=-1 runs
# the candidate fits in parallel without changing the result.
rf_model = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, n_iter=100, cv=5,
                                   random_state=42, n_jobs=-1)
# np.ravel passes the labels as a flat vector even if y_train is an (n, 1) column
random_search.fit(X_train, np.ravel(y_train))

In [None]:
# Лучшие гиперпараметры
random_search.best_params_

In [None]:
# Лучшие оценки
random_search.best_score_

Используем LightAutoML для решения задачи мультиклассовой классификации

In [None]:
'''
task = Task('multiclass', loss='crossentropy', metric='crossentropy')
N_THREADS = 4
RANDOM_STATE = 42
TIMEOUT = 8 * 100
TARGET_NAME = 'Group'
TEST_SIZE = 0.2
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)
tr_data, te_data = train_test_split(client_gr, test_size=TEST_SIZE, stratify=client_gr['Group'], random_state=RANDOM_STATE)
roles = {
    'target': TARGET_NAME,
    'drop': 'SK_ID_CURR',
}
automl = TabularAutoML(task = task,
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS)
%%time

oof_pred = automl.fit_predict(tr_data, roles = roles)

print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))
%%time

te_pred = automl.predict(te_data)
print(f'Prediction for te_data:\n{te_pred}\nShape = {te_pred.shape}')
tr_data['Group']
class_result = classification_report(y_true=tr_data['Group'].values, y_pred=oof_pred[:, 0])
print(class_result)
'''

In [None]:
'''
np_oof_pred = np.array(oof_pred)
np_oof_pred
len(oof_pred)
np.save('/content/drive/MyDrive/Colab Notebooks/archive-2/oof_pred', np_oof_pred)
np_oof_pred1 = np.load('/content/drive/MyDrive/Colab Notebooks/archive-2/oof_pred.npy')
np_oof_pred1
'''

**Кластеризация клиентов**

# Cluster clients on their feature columns only: the raw 'ID клиента' key added
# earlier (and a 'Group_clients' label left over from a previous run) must not
# take part in the distance computation.
client_features = client_norm.drop(columns=['ID клиента', 'Group_clients'], errors='ignore')
k_means = KMeans(n_clusters=20, random_state=0) # n_clusters - number of clusters
k_means = k_means.fit(client_features) # fit the clustering
clusters_client = k_means.predict(client_features) # predicted cluster label for every client row


len_client = client_norm.shape[0]
len_client_plot = len_client // 40
len_client_plot

%%time
from sklearn.manifold import TSNE
X = client_norm.iloc[:len_client_plot, :-1]
# Создаем алгоритм t-SNE с двумя главными компонентами и перплексией
tsne = TSNE(n_components=2, perplexity=40)
client_2d = tsne.fit_transform(X)

# Colour the embedded points by the KMeans cluster of each client (the same
# leading rows that were passed to t-SNE), not by the last column of client_norm.
y_plot = clusters_client[:len_client_plot]
y_plot

# Two panels in a single figure: the plain embedding (left) and the same
# embedding coloured by y_plot (right). The extra empty figure and the colorbar
# on the uncoloured panel were removed because they carried no information.
cmap = plt.get_cmap('jet', k_means.n_clusters)  # one discrete colour per cluster
plt.figure(figsize=(25, 10))
plt.subplot(1, 2, 1)
plt.scatter(client_2d[:, 0], client_2d[:, 1], s=2)
plt.subplot(1, 2, 2)
# np.ravel: scatter expects one colour value per point as a flat sequence
plt.scatter(client_2d[:, 0], client_2d[:, 1], c=np.ravel(y_plot), cmap=cmap, s=2)
plt.colorbar()
plt.show()

client_norm['Group_clients'] = clusters_client

client_norm.groupby(['Group_clients']).count()

# Silhouette score of the client clustering.
# - The fitted attribute is `labels_` (`label_` raises AttributeError).
# - Score exactly the columns KMeans was fitted on, so the cluster label added
#   to client_norm afterwards does not leak into the distances.
# - sample_size turns the score into an estimate on a random subset, because the
#   full computation is quadratic in the number of rows; random_state fixes the subset.
client_features = client_norm[list(k_means.feature_names_in_)]
client_score = sc(client_features, k_means.labels_, sample_size=10000, random_state=0)
client_score