# Установка библиотек и загрузка данных

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
# Set the default font size used by matplotlib figures.
plt.rcParams['font.size'] = 10

import warnings
# Ignore every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices raised by the libraries.
warnings.filterwarnings('ignore')

In [None]:
# Print the installed versions of the libraries used in this notebook.
# grep matches substrings, so unrelated packages whose names contain
# "pandas" or "matplotlib" (e.g. geopandas, matplotlib-venn) are listed too.
!pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn\|seaborn\|matplotlib"

Библиотеки:

geopandas==0.13.2

lightgbm==4.1.0

matplotlib==3.7.1

matplotlib-inline==0.1.6

matplotlib-venn==0.11.10

numpy==1.25.2

pandas==1.5.3

pandas-datareader==0.10.0

pandas-gbq==0.19.2

pandas-stubs==1.5.3.230304

scikit-learn==1.2.2

seaborn==0.13.1

sklearn-pandas==2.2.0

In [2]:
# When running in Google Colab, mount Google Drive so the data files are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Directory on the mounted drive from which all input files are read.
FILE_PATH = '/content/drive/MyDrive/CLTV_Alfa/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the train and test tables from the parquet files under FILE_PATH.
train_df = pd.read_parquet(f"{FILE_PATH}train_data.pqt")
test_df = pd.read_parquet(f"{FILE_PATH}test_data.pqt")

# Show both shapes as a quick sanity check.
(train_df.shape, test_df.shape)

((600000, 93), (290120, 92))

# Создаем новые фичи

Фича 'filled_cols' описывает, сколько заполненных колонок есть в каждой строке

In [4]:
# Count the non-null feature values in every row.
# Identifier, period, cluster and target columns are excluded. The derived
# columns ('filled_cols' itself and 'end_cluster_1') are also dropped when they
# already exist, so re-running this cell yields the same counts instead of
# counting previously engineered features.
train_df['filled_cols'] = (
    train_df
    .drop(columns=['id', 'date', 'start_cluster', 'end_cluster'])
    .drop(columns=['filled_cols', 'end_cluster_1'], errors='ignore')
    .count(axis=1)
)
test_df['filled_cols'] = (
    test_df
    .drop(columns=['id', 'date', 'start_cluster'])
    .drop(columns=['filled_cols', 'end_cluster_1'], errors='ignore')
    .count(axis=1)
)

Наиболее вероятный конечный кластер для заданного начального кластера

In [5]:
# Lookup table: start cluster -> the end cluster used as the 'end_cluster_1' feature
# (described in the notebook as the most likely end cluster for that start cluster).
# Most clusters map to themselves; the exceptions are
# '{α, δ}' -> '{α}', '{α, π}' -> '{other}' and '{λ}' -> '{α, λ}'.
clusters = {
 '{other}': '{other}',
 '{}': '{}',
 '{α, β}': '{α, β}',
 '{α, γ}': '{α, γ}',
 '{α, δ}': '{α}',
 '{α, ε, η}': '{α, ε, η}',
 '{α, ε, θ}': '{α, ε, θ}',
 '{α, ε, ψ}': '{α, ε, ψ}',
 '{α, ε}': '{α, ε}',
 '{α, η}': '{α, η}',
 '{α, θ}': '{α, θ}',
 '{α, λ}': '{α, λ}',
 '{α, μ}': '{α, μ}',
 '{α, π}': '{other}',
 '{α, ψ}': '{α, ψ}',
 '{α}': '{α}',
 '{λ}': '{α, λ}'
 }

Заполняем начальный кластер в month_6 значениями из month_5

In [6]:
# Fill the missing start clusters with the same client's previous known value.
# Forward-filling inside each `id` group keeps one client's cluster from being
# carried into another client's rows. Assigning the result back replaces the
# chained `fillna(..., inplace=True)` call, which silently does nothing under
# pandas Copy-on-Write and relies on the deprecated `method=` argument.
# NOTE(review): assumes each client's rows are ordered by month — confirm.
test_df['start_cluster'] = test_df.groupby('id', sort=False)['start_cluster'].ffill()

Предсказание конечного кластера по матрице вероятностей

In [7]:
# Add the 'end_cluster_1' feature to both tables: the start cluster translated
# through the `clusters` lookup (values without an entry become NaN).
for frame in (train_df, test_df):
    frame['end_cluster_1'] = frame['start_cluster'].map(clusters)

# LGBM модель

In [8]:
# Columns that are cast to the pandas "category" dtype before training.
# The last entry is the engineered 'end_cluster_1' feature.
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
    'end_cluster_1'
]

Обозначение категориальных признаков

In [9]:
# Cast every column listed in `cat_cols` to the pandas "category" dtype,
# in the train table and in the test table.
for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].astype("category")

Возьмем фичи с наибольшей значимостью на CatBoost и добавим две новые фичи.

In [10]:
# Candidate features. The two engineered features come first, followed by the
# original columns (per the notebook text, ordered by CatBoost importance —
# the ranking itself is not computed in this notebook).
top_features = ['filled_cols',
                'end_cluster_1',
    'start_cluster', 'okved', 'index_city_code', 'channel_code', 'city', 'balance_amt_min', 'segment', 'balance_amt_max', 'ogrn_days_end_quarter', 'sum_of_paym_1y', 'ogrn_days_end_month', 'cnt_a_oper_1m', 'min_founderpres',
 'ogrn_exist_months', 'ogrn_month', 'ft_registration_date', 'ogrn_year', 'sum_of_paym_6m', 'max_founderpres', 'sum_deb_e_oper_3m', 'balance_amt_avg', 'sum_of_paym_2m', 'sum_cred_e_oper_3m', 'cnt_days_deb_e_oper_3m',
 'balance_amt_day_avg', 'cnt_cred_e_oper_3m', 'sum_cred_e_oper_1m', 'sum_cred_h_oper_3m', 'cnt_days_cred_e_oper_3m',
 'cnt_deb_e_oper_3m', 'sum_deb_h_oper_3m', 'cnt_a_oper_3m', 'sum_deb_f_oper_3m', 'cnt_cred_e_oper_1m', 'sum_deb_g_oper_3m', 'sum_deb_e_oper_1m',
 'cnt_cred_h_oper_3m', 'cnt_deb_h_oper_3m', 'sum_deb_d_oper_3m', 'cnt_deb_e_oper_1m', 'sum_deb_f_oper_1m', 'sum_deb_h_oper_1m',
 'cnt_days_cred_e_oper_1m', 'cnt_deb_g_oper_3m', 'cnt_days_cred_h_oper_3m', 'sum_cred_g_oper_3m', 'sum_c_oper_3m',
 'sum_a_oper_3m', 'sum_cred_h_oper_1m', 'city_type', 'cnt_deb_d_oper_3m', 'sum_deb_d_oper_1m', 'cnt_days_deb_e_oper_1m', 'cnt_deb_f_oper_3m', 'sum_deb_g_oper_1m', 'cnt_days_deb_h_oper_3m', 'cnt_days_deb_f_oper_3m',
 'cnt_c_oper_3m', 'cnt_days_deb_g_oper_3m', 'cnt_b_oper_1m', 'cnt_days_deb_h_oper_1m', 'cnt_days_cred_h_oper_1m',
 'cnt_deb_f_oper_1m', 'sum_cred_f_oper_3m', 'cnt_days_cred_g_oper_1m', 'cnt_cred_g_oper_3m', 'sum_cred_d_oper_3m', 'sum_b_oper_3m', 'sum_c_oper_1m', 'sum_a_oper_1m', 'cnt_deb_g_oper_1m', 'cnt_deb_h_oper_1m',
                'cnt_b_oper_3m', 'cnt_days_cred_f_oper_1m', 'cnt_c_oper_1m', 'cnt_days_deb_g_oper_1m',
                'cnt_days_deb_f_oper_1m', 'sum_cred_g_oper_1m', 'cnt_cred_h_oper_1m', 'cnt_cred_d_oper_1m', 'sum_b_oper_1m', 'cnt_deb_d_oper_1m', 'cnt_cred_f_oper_3m',
 'cnt_cred_d_oper_3m', 'sum_cred_f_oper_1m', 'cnt_cred_f_oper_1m', 'sum_cred_d_oper_1m', 'cnt_cred_g_oper_1m', 'cnt_days_cred_g_oper_3m', 'cnt_days_cred_f_oper_3m']

# Only the first 53 entries of the list are used for training and prediction.
selected_features = top_features[:53]

In [11]:
# Target column.
y = train_df["end_cluster"]

# Feature table: the train table without the id, the period label and the target.
X = train_df.drop(columns=["id", "date", "end_cluster"])

In [13]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X[selected_features],
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Validation model: a LightGBM classifier fitted on the training split only.
model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=10,
    lambda_l2=3,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)
model.fit(x_train, y_train)

Зададим функцию для взвешенной метрики roc auc

In [15]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Return the class-weighted one-vs-rest ROC AUC.

    Parameters
    ----------
    y_true
        True class labels, passed unchanged to ``roc_auc_score``.
    y_pred
        Predicted scores, passed unchanged to ``roc_auc_score``.
    labels
        Class labels; forwarded as ``labels=`` to ``roc_auc_score`` and used
        to look up the weight of each class.
    weights_dict
        Mapping from a class label to its unnormalised weight.

    Returns
    -------
    The sum of the per-class AUC values, each multiplied by its weight after
    the weights have been normalised to sum to one.
    """
    raw_weights = np.array([weights_dict[lbl] for lbl in labels])
    normalised = raw_weights / raw_weights.sum()
    # average=None makes roc_auc_score return one AUC value per class.
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )
    return sum(normalised * per_class_auc)

In [16]:
# Load the per-cluster weights and index them by cluster label.
cluster_weights = (
    pd.read_excel(f"{FILE_PATH}cluster_weights.xlsx")
    .set_index("cluster")
)

# Plain dict: cluster label -> unnormalised weight.
weights_dict = cluster_weights["unnorm_weight"].to_dict()

Проверка работы модели

In [17]:
# Class probabilities predicted by the validation model for the held-out rows.
y_pred_proba = model.predict_proba(x_val)
# Display the shape of the probability array as a sanity check.
y_pred_proba.shape

(120000, 17)

In [18]:
# Weighted ROC AUC on the validation split; model.classes_ supplies the label
# order that is matched against the predict_proba columns.
weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

0.9430586606486161

Метрика на валидации 0.9430586606486161

## Прогноз на тестовой выборке

Обучаем модель на всей выборке

In [105]:
# Final model: same hyperparameters as the validation model, fitted on the
# whole training set. Takes about 30 minutes.
model_full = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=10,
    lambda_l2=3,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)
model_full.fit(X[selected_features], y)

In [78]:
# Load the submission template that the predictions are written into.
sample_submission_df = pd.read_csv(f"{FILE_PATH}sample_submission.csv")

In [19]:
# Display the template's shape (rows, columns).
sample_submission_df.shape

(100000, 18)

In [20]:
# Preview the first rows of the template: an id column plus one column per cluster.
sample_submission_df.head()

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,200001,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,200002,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
3,200003,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
4,200004,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


Для тестовой выборки будем использовать только последний месяц

In [19]:
# Keep only the "month_6" rows of the test table and drop the columns
# that are not model features.
last_m_test_df = (
    test_df
    .loc[test_df["date"] == "month_6"]
    .drop(columns=["id", "date"])
)

In [None]:
# Predict class probabilities for the last-month test rows with the model
# that was fitted on the full training set.
test_pred_proba = model_full.predict_proba(last_m_test_df[selected_features])

# Label the columns with the classes of the estimator that produced the
# probabilities (model_full). Using the validation model's classes_ here would
# make this cell depend on an unrelated object and could mislabel columns if
# the two estimators ever saw different class sets.
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=model_full.classes_)

# Put the class columns into sorted label order; the same list is reused
# when the predictions are copied into the submission table.
sorted_classes = sorted(test_pred_proba_df.columns.to_list())
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [25]:
# Display the shape of the prediction table (rows, class columns).
test_pred_proba_df.shape

(100000, 17)

In [93]:
# Preview the first ten rows of predicted class probabilities.
test_pred_proba_df.head(10)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.019205,0.034602,0.030663,0.028498,0.002695,0.000423,0.00066,0.000302,0.003421,0.011316,0.014111,0.000269,0.002111,8e-06,0.000864,0.850791,6.3e-05
1,0.014949,0.599694,0.004097,0.010794,0.001728,0.00116,0.000741,0.000145,0.00286,0.015703,0.003345,0.000264,0.000913,9e-06,0.00269,0.340794,0.000113
2,0.749838,0.024284,0.009491,0.045721,0.011495,0.001067,0.003439,0.00178,0.022122,0.015922,0.014727,0.000438,0.003226,1.6e-05,0.024966,0.071395,7.2e-05
3,0.053507,0.576895,0.004384,0.011626,0.001348,0.00073,0.000657,0.000262,0.014055,0.022412,0.003792,0.000277,0.000915,1e-05,0.000795,0.308298,3.8e-05
4,0.051955,0.180072,0.008029,0.023582,0.003885,0.001337,0.000616,0.00017,0.004084,0.264963,0.005932,0.00044,0.003186,1.5e-05,0.001352,0.450313,7.1e-05
5,0.075067,0.041817,0.041779,0.078582,0.040956,0.000444,0.004566,0.000195,0.009593,0.010145,0.0112,0.016953,0.00097,1.5e-05,0.003567,0.664075,7.7e-05
6,0.01352,0.663172,0.003623,0.008904,0.00134,0.000396,0.000518,9.1e-05,0.001917,0.009051,0.002987,0.000875,0.000686,1.3e-05,0.001549,0.291319,4.1e-05
7,0.021394,0.0275,0.015808,0.030849,0.003047,0.000371,0.000301,7.4e-05,0.002422,0.028712,0.004699,0.000989,0.018619,7e-06,0.000623,0.844557,2.9e-05
8,0.056336,0.305281,0.00819,0.023505,0.003277,0.001213,0.000863,0.000192,0.006082,0.119744,0.006158,0.000488,0.002103,1.6e-05,0.001639,0.464805,0.000107
9,0.024462,0.241121,0.063267,0.113612,0.002255,0.000666,0.000559,0.000152,0.00376,0.018643,0.00541,0.000459,0.001935,1.4e-05,0.001227,0.522374,8.6e-05


In [104]:
# Copy the predicted probabilities into the matching class columns of the template.
# NOTE(review): the assignment aligns rows by index, so it relies on the template
# rows being in the same order as the month_6 test rows — confirm the ids match.
sample_submission_df[sorted_classes] = test_pred_proba_df
# Write the submission file without the DataFrame index.
sample_submission_df.to_csv("lgbm_submission_par_l2_sel_both_500.csv", index=False)