<h1>Содержание<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Обзор-данных" data-toc-modified-id="Обзор-данных-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Обзор данных</a></span><ul class="toc-item"><li><span><a href="#Объявление-функций" data-toc-modified-id="Объявление-функций-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Объявление функций</a></span></li><li><span><a href="#Распределение-типов-переменных" data-toc-modified-id="Распределение-типов-переменных-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Распределение типов переменных</a></span></li></ul></li><li><span><a href="#Очистка-датасета" data-toc-modified-id="Очистка-датасета-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Очистка датасета</a></span><ul class="toc-item"><li><span><a href="#Отбираем-кат-признаки" data-toc-modified-id="Отбираем-кат-признаки-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Отбираем кат признаки</a></span></li><li><span><a href="#Отбираем-числовые-признаки" data-toc-modified-id="Отбираем-числовые-признаки-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Отбираем числовые признаки</a></span></li><li><span><a href="#Получившиеся-фичи" data-toc-modified-id="Получившиеся-фичи-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Получившиеся фичи</a></span></li></ul></li><li><span><a href="#get_features_importances" data-toc-modified-id="get_features_importances-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>get_features_importances</a></span><ul class="toc-item"><li><span><a href="#Downsample" data-toc-modified-id="Downsample-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Downsample</a></span><ul class="toc-item"><li><span><a href="#Обычная-модель" data-toc-modified-id="Обычная-модель-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>Обычная модель</a></span></li><li><span><a href="#GridSearchCV" data-toc-modified-id="GridSearchCV-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>GridSearchCV</a></span></li><li><span><a href="#feature_importance" data-toc-modified-id="feature_importance-3.1.3"><span class="toc-item-num">3.1.3&nbsp;&nbsp;</span>feature_importance</a></span></li></ul></li><li><span><a href="#class_weight-=-'balanced'" data-toc-modified-id="class_weight-=-'balanced'-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>class_weight = 'balanced'</a></span><ul class="toc-item"><li><span><a href="#Обычная-модель" data-toc-modified-id="Обычная-модель-3.2.1"><span class="toc-item-num">3.2.1&nbsp;&nbsp;</span>Обычная модель</a></span></li><li><span><a href="#GridSearchCV" data-toc-modified-id="GridSearchCV-3.2.2"><span class="toc-item-num">3.2.2&nbsp;&nbsp;</span>GridSearchCV</a></span></li><li><span><a href="#feature_importnace" data-toc-modified-id="feature_importnace-3.2.3"><span class="toc-item-num">3.2.3&nbsp;&nbsp;</span>feature_importnace</a></span></li></ul></li><li><span><a href="#Отбираем-признаки" data-toc-modified-id="Отбираем-признаки-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Отбираем признаки</a></span></li></ul></li></ul></div>

# Отбор значимых непрерывных признаков

## Обзор данных

In [2]:
# импорт основных библиотек
import pandas as pd
import numpy as np

# импорт библиотек текущего модуля
from lightgbm import LGBMClassifier
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score
)
from sklearn.metrics import (
    roc_auc_score, 
    accuracy_score, 
    confusion_matrix
)
from sklearn.utils import resample
from scipy.stats import pointbiserialr

# настройки
pd.set_option('display.max_columns', 100)

# пути
dataset_train_path = '../../data/dataset_train.parquet'
dataset_test_path = '../../data/features_oot.parquet'
features_types_path = '../../data/features_types.json'
sample_submission_path = '../../data/sample_submission.csv'

# константы
RANDOM_STATE = 42


### Объявление функций

In [3]:
def read_data(path: str, left: int, right: int):
    """
    Функция для считывания определенного количества столбцов из parquet файла.
    ---
    Принимает на вход str путь к файлу, int левую и int правую границу.
    Столбец с таргетом добавляется в конец автоматически!
    Функция возвращает pd.DataFrame.
    """
    # следующие т столбцов, которые будем считывать + target
    columns_to_read = features_types['feature'][left:right].to_list() + ['channel_name'] + ['target']

    # считываем столбцы в файл
    data = pd.read_parquet(path, engine='pyarrow', columns=columns_to_read)

    return data


def variance_filter(data: pd.DataFrame, threshold: float):
    """
    Функция для проверки датасета на наличие константных (или около того) значений.
    ---
    Принимает на вход pd.DataFrame и float с значением порога для дисперсии.
    Функция возвращает отфильтрованный pd.DataFrame.
    """
    # порог для дисперсии
    threshold = threshold

    # считаем дисперсию для каждого признака
    variance = data.iloc[:, :-1].var()

    # отбираем те фичи, у которых var > threshold
    selected_features = variance[variance > threshold].index.tolist()

    # собираем в датасет
    filtered_data = data[selected_features + ['target']]
    
    return filtered_data
    

def check_corr_biserial(data: pd.DataFrame):
    """
    Функция для расчета коэф. корреляции бисериального всех признаков с целевой переменной.
    ---
    Принимает на вход pd.DataFrame.
    Возвращает pd.Series с значением коэффициента корреляции бисериального для каждого признака.
    """
    # все признаки, кроме последнего столбца с целевой переменной
    features = data.iloc[:, :-1]

    # пустой словарь для хранения результатов
    correlations = {}

    # проходимся по каждому признаку и вычисляем коэффициент корреляции
    for feature_name in features.columns:
        feature = data[feature_name]
        target = data['target']
        correlation, p_value = pointbiserialr(feature, target)
        correlations[feature_name] = correlation

    # создаем DataFrame из словаря корреляций
    correlation = pd.DataFrame.from_dict(correlations, orient='index', columns=['point_biserial_correlation'])
    
    # сортируем
    correlation = correlation.sort_values(by='point_biserial_correlation', ascending=False)
    
    return correlation
    
    
def check_corr_pearson(data: pd.DataFrame):
    """
    Функция для расчета корреляции Пирсона всех признаков с целевой переменной.
    ---
    Принимает на вход pd.DataFrame.
    Возвращает pd.Series с значением коэффициента корреляции для каждого признака.
    """
    # чекаем корреляцию с таргетом
    correlation = data.iloc[:, :-1].corrwith(data['target'])
    
    # сортируем
    correlation = correlation.sort_values(ascending=False)
    
    return correlation


def reshape_corr_matrix(data: pd.DataFrame):
    """
    Функция для преобразования вида корреляционной матрицы
    ---
    Принимает на вход pd.DataFrame (корр. матрицу)
    Возвращает pd.DataFrame со столбцами: ['feature_1', 'feature_2', 'corr']
    """
    # создаем пустой df
    reshaped_data = pd.DataFrame()
    
    # в цикле преобразовываем данные
    for feature in data.columns:    
        temp_data = data[feature] \
            .reset_index() \
            .rename(columns={'index': 'feature_1', feature: 'corr'})
    
        temp_data['feature_2'] = feature
    
        reshaped_data = pd.concat([reshaped_data, temp_data])
    
    # меняем порядок столбцов
    reshaped_data = reshaped_data[['feature_1', 'feature_2', 'corr']]
    
    # выводим результат
    return reshaped_data

### Распределение типов переменных

In [4]:
# читаем файлы
features_types = pd.read_json(features_types_path, orient='index')
sample_submission = pd.read_csv(sample_submission_path)


In [5]:
sample_submission

Unnamed: 0,id,target
0,0,0.343518
1,1,0.591216
2,2,0.913150
3,3,0.560035
4,4,0.352795
...,...,...
60656,60656,0.765319
60657,60657,0.533016
60658,60658,0.784497
60659,60659,0.804431


In [6]:
# приводим features_types к читаемому виду
features_types = features_types \
    .reset_index() \
    .rename(columns={'index': 'feature', 0: 'type'})

# выводим на экран
features_types


Unnamed: 0,feature,type
0,markers_0_1_cnt,numeric
1,markers_1_1_cnt,numeric
2,markers_2_1_cnt,numeric
3,markers_3_1_cnt,numeric
4,markers_4_1_cnt,numeric
...,...,...
2771,markers_941_1_cnt,numeric
2772,markers_942_1_cnt,numeric
2773,markers_943_1_cnt,numeric
2774,markers_944_1_cnt,numeric


In [7]:
# распределение типов данных
features_types.type.value_counts()


type
numeric            2607
categorical_int     138
categorical_str      31
Name: count, dtype: int64

In [8]:
# смотрим на все фичи в паркете
print(features_types['feature'].to_list())


['markers_0_1_cnt', 'markers_1_1_cnt', 'markers_2_1_cnt', 'markers_3_1_cnt', 'markers_4_1_cnt', 'markers_5_1_cnt', 'markers_6_1_cnt', 'markers_7_1_cnt', 'markers_8_1_cnt', 'markers_9_1_cnt', 'markers_10_1_cnt', 'markers_11_1_cnt', 'markers_12_1_cnt', 'markers_13_1_cnt', 'markers_14_1_cnt', 'markers_15_1_cnt', 'markers_16_1_cnt', 'markers_17_1_cnt', 'markers_18_1_cnt', 'markers_19_1_cnt', 'markers_20_1_cnt', 'markers_21_1_cnt', 'markers_22_1_cnt', 'markers_23_1_cnt', 'markers_24_1_cnt', 'markers_25_1_cnt', 'markers_26_1_cnt', 'markers_27_1_cnt', 'markers_28_1_cnt', 'markers_29_1_cnt', 'markers_30_1_cnt', 'markers_31_1_cnt', 'markers_32_1_cnt', 'markers_33_1_cnt', 'markers_34_1_cnt', 'markers_35_1_cnt', 'markers_36_1_cnt', 'markers_37_1_cnt', 'markers_38_1_cnt', 'markers_39_1_cnt', 'markers_40_1_cnt', 'markers_41_1_cnt', 'markers_42_1_cnt', 'markers_43_1_cnt', 'markers_44_1_cnt', 'markers_45_1_cnt', 'markers_46_1_cnt', 'markers_47_1_cnt', 'markers_48_1_cnt', 'markers_49_1_cnt', 'markers_

## Очистка датасета

In [9]:
# считаем файл
dataset_train = read_data(dataset_train_path, 0, 930)
# dataset_test = pd.read_parquet(dataset_test_path, engine='pyarrow')

### Отбираем кат признаки

In [10]:
# получим список кат фичей
cat_features = []

for feature in dataset_train.columns[:-2]:
    if feature.split('_')[-1] in ['flg', 'ctg']:
        cat_features.append(feature)
        
cat_features

['materials_details_0_1_ctg',
 'communication_availability_0_1_flg',
 'communication_availability_1_1_ctg',
 'communication_availability_2_1_flg',
 'materials_details_1_1_ctg',
 'user_lifetime_0_1_ctg',
 'user_lifetime_1_1_flg',
 'communication_availability_3_1_flg',
 'materials_details_5_1_flg',
 'materials_details_7_1_flg',
 'materials_details_8_1_flg']

In [11]:
dataset_train[cat_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Data columns (total 11 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   materials_details_0_1_ctg           702086 non-null  int64
 1   communication_availability_0_1_flg  702086 non-null  int32
 2   communication_availability_1_1_ctg  702086 non-null  int32
 3   communication_availability_2_1_flg  702086 non-null  int32
 4   materials_details_1_1_ctg           702086 non-null  int64
 5   user_lifetime_0_1_ctg               702086 non-null  int64
 6   user_lifetime_1_1_flg               702086 non-null  int32
 7   communication_availability_3_1_flg  702086 non-null  int32
 8   materials_details_5_1_flg           702086 non-null  int8 
 9   materials_details_7_1_flg           702086 non-null  int8 
 10  materials_details_8_1_flg           702086 non-null  int8 
dtypes: int32(5), int64(3), int8(3)
memory usage: 31.5 MB

In [12]:
# считаем корреляцию кт признаков между собой
cat_corr = dataset_train[cat_features].corr(method='spearman')

# удаляем категориальные признаки, которые сильно коррелируют друг с другом
cat_corr_reshaped = reshape_corr_matrix(cat_corr)

features_to_delete = cat_corr_reshaped \
    .query('corr != 1 and corr > 0.5') \
    .sort_values(by='corr', ascending=False)

features_to_delete

Unnamed: 0,feature_1,feature_2,corr
10,materials_details_8_1_flg,materials_details_7_1_flg,0.995847
9,materials_details_7_1_flg,materials_details_8_1_flg,0.995847
10,materials_details_8_1_flg,materials_details_5_1_flg,0.995015
8,materials_details_5_1_flg,materials_details_8_1_flg,0.995015
9,materials_details_7_1_flg,materials_details_5_1_flg,0.99207
8,materials_details_5_1_flg,materials_details_7_1_flg,0.99207
6,user_lifetime_1_1_flg,user_lifetime_0_1_ctg,0.977829
5,user_lifetime_0_1_ctg,user_lifetime_1_1_flg,0.977829
7,communication_availability_3_1_flg,communication_availability_0_1_flg,0.600903
1,communication_availability_0_1_flg,communication_availability_3_1_flg,0.600903


In [13]:
# отбираем кат признаки
cat_features = [
    'materials_details_8_1_flg',
    'user_lifetime_1_1_flg',
    'communication_availability_3_1_flg',
    'materials_details_1_1_ctg'
]

dataset_train[cat_features].corrwith(dataset_train.target, method='spearman').sort_values(ascending=False)

materials_details_8_1_flg             0.011134
communication_availability_3_1_flg    0.005204
user_lifetime_1_1_flg                 0.003914
materials_details_1_1_ctg            -0.002907
dtype: float64

### Отбираем числовые признаки

In [14]:
# теперь отберем числовые признаки
num_features = []

for feature in dataset_train.columns[:-2]:
    if feature.split('_')[-1] in ['num', 'sum', 'avg', 'cnt']:
        num_features.append(feature)
        
len(num_features)

907

Итого имеем отдельно категориальные и чистовые признаки!

Теперь очистим числовые с помощью дисперсии и корреляции

In [15]:
len(num_features)

907

In [16]:
# убираем константные столбцы
dataset_num = variance_filter(dataset_train[num_features + ['target']], 0.5)
dataset_num.shape

(702086, 877)

Убрали ~ 70 признаков

Теперь удалим столбцы в которых много пропусков.

In [21]:
# удалим столбцы где много пропусков
dataset_num = dataset_num.dropna(axis=1, thresh=dataset_num.shape[0]/2)
dataset_num = dataset_num.dropna(axis=0, thresh=dataset_num.shape[1]/2)
dataset_num.shape

(671264, 849)

In [23]:
# перезаписали
num_features = list(dataset_num.columns[:-1])

Теперь проведем корр анализ и также уберем лишние признаки

In [24]:
# установим трешхолд для корреляции
corr_threshold = 0.75

In [None]:
try:
    data_corr = pd.read_csv('../../data/data_corr.csv')
except:
    data_corr = dataset_train[num_features].corr()

In [28]:
# приводим в должный вид
reshaped_corr_matrix = reshape_corr_matrix(data_corr)

In [29]:
reshaped_corr_matrix \
    .query('corr != 1.0 and corr > 0.75') \
    .sort_values(by='corr', ascending=False)

Unnamed: 0,feature_1,feature_2,corr
800,charges_details_8_6_avg,charges_details_11_6_avg,0.999999
803,charges_details_11_6_avg,charges_details_8_6_avg,0.999999
820,charges_details_26_6_avg,charges_details_23_6_avg,0.999999
817,charges_details_23_6_avg,charges_details_26_6_avg,0.999999
799,charges_details_7_3_avg,charges_details_10_3_avg,0.999999
...,...,...,...
471,markers_509_1_cnt,markers_281_1_cnt,0.751511
679,markers_734_1_cnt,markers_530_1_cnt,0.751121
491,markers_530_1_cnt,markers_734_1_cnt,0.751121
835,payments_details_50_6_sum,plan_costs_4_3_avg,0.750007


In [30]:
features_to_delete = reshaped_corr_matrix \
    .query('corr != 1.0 and corr > 0.75') \
    .sort_values(by='corr', ascending=False) \
    .iloc[::2, 1].to_list()

len(features_to_delete)

286

In [31]:
dataset_num = dataset_num.drop(features_to_delete, axis=1)
dataset_num.shape

(671264, 756)

In [32]:
# опять перезапишем
num_features = list(dataset_num.columns[:-1])

### Получившиеся фичи

In [33]:
my_test_features = num_features + cat_features + ['channel_name'] + ['target']
len(my_test_features)

761

In [34]:
dataset_train = dataset_train[my_test_features]
dataset_train.head()

Unnamed: 0,markers_0_1_cnt,markers_1_1_cnt,markers_2_1_cnt,markers_3_1_cnt,markers_4_1_cnt,markers_5_1_cnt,markers_6_1_cnt,markers_7_1_cnt,markers_8_1_cnt,markers_9_1_cnt,markers_10_1_cnt,markers_11_1_cnt,markers_12_1_cnt,markers_13_1_cnt,markers_14_1_cnt,markers_15_1_cnt,markers_16_1_cnt,markers_17_1_cnt,markers_18_1_cnt,markers_19_1_cnt,markers_20_1_cnt,markers_21_1_cnt,markers_22_1_cnt,markers_24_1_cnt,markers_25_1_cnt,markers_26_1_cnt,markers_27_1_cnt,markers_28_1_cnt,markers_29_1_cnt,markers_30_1_cnt,markers_31_1_cnt,markers_32_1_cnt,markers_33_1_cnt,markers_34_1_cnt,markers_35_1_cnt,markers_36_1_cnt,markers_37_1_cnt,markers_38_1_cnt,markers_39_1_cnt,markers_40_1_cnt,markers_41_1_cnt,markers_42_1_cnt,markers_43_1_cnt,markers_44_1_cnt,markers_45_1_cnt,markers_46_1_cnt,markers_47_1_cnt,markers_48_1_cnt,markers_49_1_cnt,markers_50_1_cnt,...,markers_780_1_cnt,markers_781_1_cnt,markers_782_1_cnt,markers_783_1_cnt,markers_784_1_cnt,markers_785_1_cnt,markers_786_1_cnt,markers_787_1_cnt,markers_788_1_cnt,markers_789_1_cnt,markers_790_1_cnt,markers_791_1_cnt,markers_793_1_cnt,markers_794_1_cnt,markers_795_1_cnt,markers_796_1_cnt,markers_797_1_cnt,markers_798_1_cnt,markers_799_1_cnt,markers_800_1_cnt,markers_801_1_cnt,markers_802_1_cnt,markers_803_1_cnt,markers_804_1_cnt,markers_805_1_cnt,spas_symptoms_agr_7_6_sum,payments_details_10_3_cnt,payments_details_11_6_cnt,payments_details_16_1d3_avg,payments_details_23_3d6_avg,payments_details_24_3d6_cnt,user_devices_0_1_cnt,user_devices_1_1_cnt,user_devices_2_1_cnt,balance_details_0_1_num,charges_details_13_3_avg,tariff_plans_4_1_num,payments_details_30_1_sum,payments_details_35_6_sum,payments_details_39_1_sum,payments_details_46_1_sum,payments_details_47_3_avg,payments_details_49_6_avg,plan_costs_8_6_avg,materials_details_8_1_flg,user_lifetime_1_1_flg,communication_availability_3_1_flg,materials_details_1_1_ctg,channel_name,target
0,-0.149534,-0.096585,-0.40087,-0.362458,0.302917,-0.324892,-0.046691,-0.554345,-0.016534,-0.05408,-0.044653,-0.208968,-0.017454,-0.008052,-0.028391,-0.031395,-0.199957,-0.064366,-0.026839,-0.027484,-0.03147,-0.080534,-0.110653,-0.02527,-0.115151,-0.04685,-0.033911,-0.00794,-0.025816,-0.088375,-0.009532,-0.237766,-0.041626,-0.012926,-0.028445,-0.009258,-0.026347,-0.038496,-0.08904,-0.401337,-0.03564,-0.100471,-0.035238,-0.009976,-0.031354,-0.006144,-0.070189,-0.047567,-0.050881,-0.190083,...,-0.026327,-0.047214,-0.024438,-0.034565,-0.112963,-0.038788,-0.116006,-0.06547,-0.206636,-0.212451,-0.009755,-0.111689,-0.117005,-0.093901,-0.033756,-0.187771,-0.0051,-0.00963,-0.007384,-0.011695,-0.065081,-0.009588,-0.020736,-0.149392,-0.3601,0.475338,-0.007544,-0.009613,0.358846,0.172597,-0.107577,-0.310186,-0.03573,-0.039662,-0.06716,-0.8548,-2.232055,-0.396153,-0.509249,-0.00359,-0.66455,-0.790306,-0.790365,-0.05821,-1,-1,1,1,5,0
1,-0.149534,-0.096585,0.196468,-0.362458,0.092276,-0.324892,-0.046691,-0.554345,-0.016534,-0.05408,-0.044653,-0.208968,-0.017454,-0.008052,-0.028391,-0.031395,-0.199957,-0.064366,-0.026839,-0.027484,-0.03147,-0.080534,-0.110653,-0.02527,-0.115151,-0.04685,-0.033911,-0.00794,-0.025816,-0.088375,-0.009532,-0.237766,-0.041626,-0.012926,-0.028445,-0.009258,-0.026347,-0.038496,-0.08904,-0.401337,-0.03564,-0.100471,-0.035238,-0.009976,-0.031354,-0.006144,-0.070189,-0.047567,-0.050881,-0.190083,...,-0.026327,-0.047214,-0.024438,-0.034565,-0.112963,-0.038788,-0.116006,-0.06547,-0.206636,-0.212451,-0.009755,-0.111689,-0.117005,-0.093901,-0.033756,-0.187771,-0.0051,-0.00963,-0.007384,-0.011695,-0.065081,-0.009588,-0.020736,-0.149392,-0.3601,0.475338,-0.007544,-0.009613,-2.358446,-0.006716,-0.775059,-0.310186,-0.03573,-0.039662,-0.490323,-0.8548,-2.232055,-0.396153,-0.630958,-0.00359,-1.282044,-0.790306,-0.767356,-0.284361,-1,-1,1,1,5,0
2,-0.149534,-0.096585,-0.102201,0.911996,0.7242,0.445189,-0.046691,-0.397624,-0.016534,-0.05408,-0.044653,-0.208968,-0.017454,-0.008052,-0.028391,-0.031395,-0.199957,-0.064366,-0.026839,-0.027484,-0.03147,-0.080534,-0.110653,-0.02527,-0.115151,-0.04685,-0.033911,-0.00794,-0.025816,-0.088375,-0.009532,-0.237766,-0.041626,-0.012926,-0.028445,-0.009258,-0.026347,-0.038496,-0.08904,-0.044543,-0.03564,5.115505,-0.035238,-0.009976,-0.031354,-0.006144,-0.070189,-0.047567,-0.050881,-0.190083,...,-0.026327,-0.047214,-0.024438,-0.034565,-0.112963,-0.038788,-0.116006,-0.06547,-0.206636,-0.212451,-0.009755,-0.111689,3.52422,-0.093901,-0.033756,-0.187771,-0.0051,-0.00963,-0.007384,-0.011695,-0.065081,-0.009588,-0.020736,-0.149392,-0.3601,0.475338,-0.007544,-0.009613,0.24338,-0.006716,-0.107577,-0.310186,-0.03573,-0.039662,-1.184311,-0.8548,-0.866785,-0.396153,0.182613,-0.00359,-0.285819,-0.343081,-0.365943,-0.01298,-1,-1,1,1,5,1
3,-0.149534,-0.096585,-0.40087,-0.362458,1.145482,-0.324892,-0.046691,-0.554345,-0.016534,-0.05408,-0.044653,-0.208968,-0.017454,-0.008052,-0.028391,-0.031395,-0.199957,-0.064366,-0.026839,-0.027484,-0.03147,-0.080534,-0.110653,-0.02527,-0.115151,-0.04685,-0.033911,-0.00794,-0.025816,-0.088375,-0.009532,-0.237766,-0.041626,-0.012926,-0.028445,-0.009258,-0.026347,-0.038496,2.330488,0.669046,-0.03564,-0.100471,-0.035238,-0.009976,-0.031354,-0.006144,-0.070189,-0.047567,-0.050881,-0.190083,...,-0.026327,-0.047214,-0.024438,-0.034565,-0.112963,-0.038788,-0.116006,-0.06547,-0.206636,-0.212451,-0.009755,-0.111689,-0.117005,-0.093901,-0.033756,-0.187771,-0.0051,-0.00963,-0.007384,-0.011695,-0.065081,-0.009588,-0.020736,-0.149392,-0.3601,0.475338,-0.007544,-0.009613,0.677895,-0.904191,0.149148,-0.310186,-0.03573,-0.039662,0.192764,-0.8548,0.201687,0.773879,-1.359464,-0.00359,-0.302286,-0.979679,-0.830346,1.81884,-1,-1,1,1,3,0
4,-0.149534,-0.096585,0.196468,-0.362458,0.934841,-0.324892,-0.046691,0.072541,-0.016534,-0.05408,-0.044653,-0.208968,-0.017454,-0.008052,-0.028391,-0.031395,-0.199957,-0.064366,-0.026839,-0.027484,-0.03147,-0.080534,-0.110653,-0.02527,-0.115151,-0.04685,-0.033911,-0.00794,-0.025816,-0.088375,-0.009532,-0.237766,-0.041626,-0.012926,-0.028445,-0.009258,-0.026347,-0.038496,-0.08904,1.739428,-0.03564,-0.100471,-0.035238,-0.009976,-0.031354,-0.006144,-0.070189,-0.047567,-0.050881,-0.190083,...,-0.026327,-0.047214,-0.024438,-0.034565,-0.112963,-0.038788,-0.116006,-0.06547,-0.206636,-0.212451,-0.009755,-0.111689,-0.117005,-0.093901,-0.033756,-0.187771,-0.0051,-0.00963,-0.007384,-0.011695,-0.065081,-0.009588,-0.020736,-0.149392,-0.3601,0.4577,-0.007544,-0.009613,0.286385,-0.006716,-0.58435,-0.310186,-0.03573,-0.039662,0.169811,-0.8548,-1.044864,-0.396153,0.359287,-0.00359,0.64454,-0.396748,-0.414112,-0.216516,-1,-1,1,1,1,0


## get_features_importances

Приступим к анализу feature_importance

In [36]:
dataset_train['target'].value_counts()

target
0    666074
1      5190
Name: count, dtype: int64

### Downsample

In [37]:
dataset_train['channel_name'] = dataset_train['channel_name'].astype('int')

In [38]:
data_1 = dataset_train[dataset_train["target"] == 1]
data_0 = dataset_train[dataset_train["target"] == 0]

print(data_1.shape)
print(data_0.shape)

(5190, 761)
(666074, 761)


In [39]:
data_0_downsampled = resample(
    data_0,
    replace=True,
    n_samples=len(data_1),
    random_state=RANDOM_STATE
)

print(data_0_downsampled.shape)

(5190, 761)


In [40]:
data_downsampled = pd.concat([data_0_downsampled, data_1])

print(data_downsampled["target"].value_counts())

target
0    5190
1    5190
Name: count, dtype: int64


In [41]:
X_downsampled = data_downsampled.iloc[:, :-1]
y_downsampled = data_downsampled['target']

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_downsampled, y_downsampled, test_size=0.25, stratify=y_downsampled, random_state=RANDOM_STATE
)

In [42]:
cat_features_lgbm = [
    'materials_details_8_1_flg', 
    'user_lifetime_1_1_flg', 
    'communication_availability_3_1_flg', 
    'materials_details_1_1_ctg',
    'channel_name'
]

#### Обычная модель

In [55]:
%%time

# модель
lgbm_model = LGBMClassifier(
    n_jobs=-1, 
    random_state=RANDOM_STATE
)

# fit predict
lgbm_model.fit(
    X_train_d, y_train_d,
    eval_set=[(X_test_d, y_test_d)],
    eval_metric='roc_auc',
    categorical_feature=cat_features_lgbm
)

y_pred_proba = lgbm_model.predict_proba(X_test_d)
y_pred = lgbm_model.predict(X_test_d)

# скоры
roc_score = roc_auc_score(y_test_d, y_pred_proba[:,1])
gini_score = (2 * roc_score) - 1
acc_score = accuracy_score(y_test_d, y_pred)

# выводим информацию на экран
print(f'LGBM ROC AUC SCORE: {roc_score:.2f}')
print(f'LGBM GINI SCORE: {gini_score:.2f}')
print(f'LGBM ACCURACY SCORE: {acc_score:.2f}')
print('-' * 25)

New categorical_feature is ['channel_name', 'communication_availability_3_1_flg', 'materials_details_1_1_ctg', 'materials_details_8_1_flg', 'user_lifetime_1_1_flg']


[1]	valid_0's binary_logloss: 0.678827
[2]	valid_0's binary_logloss: 0.667676
[3]	valid_0's binary_logloss: 0.657792
[4]	valid_0's binary_logloss: 0.649927
[5]	valid_0's binary_logloss: 0.643783
[6]	valid_0's binary_logloss: 0.63904
[7]	valid_0's binary_logloss: 0.634901
[8]	valid_0's binary_logloss: 0.630921
[9]	valid_0's binary_logloss: 0.627894
[10]	valid_0's binary_logloss: 0.625251
[11]	valid_0's binary_logloss: 0.623311
[12]	valid_0's binary_logloss: 0.622222
[13]	valid_0's binary_logloss: 0.620607
[14]	valid_0's binary_logloss: 0.61911
[15]	valid_0's binary_logloss: 0.6177
[16]	valid_0's binary_logloss: 0.615744
[17]	valid_0's binary_logloss: 0.61483
[18]	valid_0's binary_logloss: 0.614248
[19]	valid_0's binary_logloss: 0.613578
[20]	valid_0's binary_logloss: 0.613043
[21]	valid_0's binary_logloss: 0.612985
[22]	valid_0's binary_logloss: 0.612101
[23]	valid_0's binary_logloss: 0.611231
[24]	valid_0's binary_logloss: 0.611213
[25]	valid_0's binary_logloss: 0.610419
[26]	valid_0's

#### GridSearchCV

In [44]:
%%time

# задаем сетку параметров
param_grid_lgbm = {
    'n_estimators': range(100, 1001, 100),
    'max_depth': range(1, 5, 1),
    'learning_rate': [0.01, 0.05, 0.1]
}

# описываем gs
gs_lgbm_pl = GridSearchCV(
    lgbm_model, 
    param_grid=param_grid_lgbm, 
    scoring='roc_auc', 
    n_jobs=-1,
    cv=5
)

# фиттим модель
gs_lgbm_pl.fit(
    X_train_d, y_train_d,
    eval_set=[(X_test_d, y_test_d)],
    eval_metric='roc_auc',
    categorical_feature=cat_features_lgbm
)

# сохраняем лучший скор и параметры
gs_lgbm_best_score = gs_lgbm_pl.best_score_
gs_lgbm_best_params = gs_lgbm_pl.best_params_


# выводим информацию на экран
print(f'LGBM BEST ROC AUC SCORE: {gs_lgbm_best_score:.2f}')
print(f'LGBM BEST PARAMS: {gs_lgbm_best_params}')
print('-' * 93)

New categorical_feature is ['channel_name', 'communication_availability_3_1_flg', 'materials_details_1_1_ctg', 'materials_details_8_1_flg', 'user_lifetime_1_1_flg']


[1]	valid_0's binary_logloss: 0.68733
[2]	valid_0's binary_logloss: 0.68208
[3]	valid_0's binary_logloss: 0.677355
[4]	valid_0's binary_logloss: 0.673085
[5]	valid_0's binary_logloss: 0.669315
[6]	valid_0's binary_logloss: 0.66588
[7]	valid_0's binary_logloss: 0.662516
[8]	valid_0's binary_logloss: 0.659696
[9]	valid_0's binary_logloss: 0.656906
[10]	valid_0's binary_logloss: 0.654532
[11]	valid_0's binary_logloss: 0.652164
[12]	valid_0's binary_logloss: 0.650237
[13]	valid_0's binary_logloss: 0.648321
[14]	valid_0's binary_logloss: 0.646676
[15]	valid_0's binary_logloss: 0.645084
[16]	valid_0's binary_logloss: 0.643594
[17]	valid_0's binary_logloss: 0.642385
[18]	valid_0's binary_logloss: 0.641095
[19]	valid_0's binary_logloss: 0.640086
[20]	valid_0's binary_logloss: 0.63901
[21]	valid_0's binary_logloss: 0.638053
[22]	valid_0's binary_logloss: 0.637033
[23]	valid_0's binary_logloss: 0.636148
[24]	valid_0's binary_logloss: 0.635481
[25]	valid_0's binary_logloss: 0.634869
[26]	valid_0'

[211]	valid_0's binary_logloss: 0.608032
[212]	valid_0's binary_logloss: 0.608041
[213]	valid_0's binary_logloss: 0.608038
[214]	valid_0's binary_logloss: 0.608056
[215]	valid_0's binary_logloss: 0.607999
[216]	valid_0's binary_logloss: 0.607932
[217]	valid_0's binary_logloss: 0.607991
[218]	valid_0's binary_logloss: 0.608008
[219]	valid_0's binary_logloss: 0.60793
[220]	valid_0's binary_logloss: 0.607931
[221]	valid_0's binary_logloss: 0.607917
[222]	valid_0's binary_logloss: 0.607787
[223]	valid_0's binary_logloss: 0.60779
[224]	valid_0's binary_logloss: 0.607758
[225]	valid_0's binary_logloss: 0.607709
[226]	valid_0's binary_logloss: 0.607641
[227]	valid_0's binary_logloss: 0.607571
[228]	valid_0's binary_logloss: 0.607499
[229]	valid_0's binary_logloss: 0.607551
[230]	valid_0's binary_logloss: 0.607466
[231]	valid_0's binary_logloss: 0.607465
[232]	valid_0's binary_logloss: 0.607474
[233]	valid_0's binary_logloss: 0.607488
[234]	valid_0's binary_logloss: 0.60736
[235]	valid_0's bin

#### feature_importance

In [60]:
# Получите важности признаков
feature_importance = lgbm_model.feature_importances_

# Создайте DataFrame с названиями признаков и их важностями
downsample_importance_df = pd.DataFrame({'feature': X_train_d.columns, 'importance': feature_importance})

# Отсортируйте по убыванию важности
downsample_importance_df = downsample_importance_df.sort_values('importance', ascending=False)

# Выведите результаты
downsample_importance_df.head(30)


Unnamed: 0,feature,importance
736,spas_symptoms_agr_7_6_sum,67
752,payments_details_47_3_avg,55
751,payments_details_46_1_sum,54
747,tariff_plans_4_1_num,51
753,payments_details_49_6_avg,51
746,charges_details_13_3_avg,51
59,markers_60_1_cnt,41
643,markers_706_1_cnt,40
749,payments_details_35_6_sum,40
740,payments_details_23_3d6_avg,38


### class_weight = 'balanced'

In [61]:
X = dataset_train.iloc[:, :-1]
y = dataset_train['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

#### Обычная модель

In [73]:
%%time

# модель
lgbm_model = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=400,
    n_jobs=-1, 
    class_weight='balanced', 
    random_state=RANDOM_STATE
)

# fit predict
lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='roc_auc',
    categorical_feature=cat_features_lgbm
)

y_pred_proba = lgbm_model.predict_proba(X_test)
y_pred = lgbm_model.predict(X_test)

# скоры
roc_score = roc_auc_score(y_test, y_pred_proba[:,1])
gini_score = (2 * roc_score) - 1
acc_score = accuracy_score(y_test, y_pred)

# выводим информацию на экран
print(f'LGBM ROC AUC SCORE: {roc_score:.2f}')
print(f'LGBM GINI SCORE: {gini_score:.2f}')
print(f'LGBM ACCURACY SCORE: {acc_score:.2f}')
print('-' * 25)

New categorical_feature is ['channel_name', 'communication_availability_3_1_flg', 'materials_details_1_1_ctg', 'materials_details_8_1_flg', 'user_lifetime_1_1_flg']


[1]	valid_0's binary_logloss: 0.684178
[2]	valid_0's binary_logloss: 0.6761
[3]	valid_0's binary_logloss: 0.668772
[4]	valid_0's binary_logloss: 0.66206
[5]	valid_0's binary_logloss: 0.655905
[6]	valid_0's binary_logloss: 0.65023
[7]	valid_0's binary_logloss: 0.645123
[8]	valid_0's binary_logloss: 0.640285
[9]	valid_0's binary_logloss: 0.635755
[10]	valid_0's binary_logloss: 0.631733
[11]	valid_0's binary_logloss: 0.627862
[12]	valid_0's binary_logloss: 0.624348
[13]	valid_0's binary_logloss: 0.621124
[14]	valid_0's binary_logloss: 0.618031
[15]	valid_0's binary_logloss: 0.615063
[16]	valid_0's binary_logloss: 0.612338
[17]	valid_0's binary_logloss: 0.609799
[18]	valid_0's binary_logloss: 0.60744
[19]	valid_0's binary_logloss: 0.60524
[20]	valid_0's binary_logloss: 0.603111
[21]	valid_0's binary_logloss: 0.601011
[22]	valid_0's binary_logloss: 0.599068
[23]	valid_0's binary_logloss: 0.597244
[24]	valid_0's binary_logloss: 0.595427
[25]	valid_0's binary_logloss: 0.593864
[26]	valid_0's 

[206]	valid_0's binary_logloss: 0.47262
[207]	valid_0's binary_logloss: 0.472241
[208]	valid_0's binary_logloss: 0.471848
[209]	valid_0's binary_logloss: 0.471357
[210]	valid_0's binary_logloss: 0.470947
[211]	valid_0's binary_logloss: 0.470392
[212]	valid_0's binary_logloss: 0.469957
[213]	valid_0's binary_logloss: 0.469622
[214]	valid_0's binary_logloss: 0.46925
[215]	valid_0's binary_logloss: 0.468864
[216]	valid_0's binary_logloss: 0.468512
[217]	valid_0's binary_logloss: 0.468134
[218]	valid_0's binary_logloss: 0.467777
[219]	valid_0's binary_logloss: 0.467305
[220]	valid_0's binary_logloss: 0.466962
[221]	valid_0's binary_logloss: 0.466498
[222]	valid_0's binary_logloss: 0.466105
[223]	valid_0's binary_logloss: 0.465762
[224]	valid_0's binary_logloss: 0.465256
[225]	valid_0's binary_logloss: 0.464819
[226]	valid_0's binary_logloss: 0.464407
[227]	valid_0's binary_logloss: 0.464098
[228]	valid_0's binary_logloss: 0.463657
[229]	valid_0's binary_logloss: 0.463276
[230]	valid_0's bi

#### GridSearchCV

In [78]:
%%time

# задаем сетку параметров
param_grid_lgbm = {
    'n_estimators': range(100, 501, 100),
    'max_depth': range(1, 5, 1),
    'learning_rate': [0.01, 0.05, 0.1]
}

# описываем gs
gs_lgbm_pl = GridSearchCV(
    lgbm_model, 
    param_grid=param_grid_lgbm, 
    scoring='roc_auc', 
    n_jobs=-1,
    cv=5
)

# фиттим модель
gs_lgbm_pl.fit(
    X_train_d, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='roc_auc',
    categorical_feature=cat_features_lgbm
)

# сохраняем лучший скор и параметры
gs_lgbm_best_score = gs_lgbm_pl.best_score_
gs_lgbm_best_params = gs_lgbm_pl.best_params_


# выводим информацию на экран
print(f'LGBM BEST ROC AUC SCORE: {gs_lgbm_best_score:.2f}')
print(f'LGBM BEST PARAMS: {gs_lgbm_best_params}')
print('-' * 93)

ValueError: Found input variables with inconsistent numbers of samples: [7785, 503448]

#### feature_importnace

In [76]:
# Получите важности признаков
feature_importance = lgbm_model.feature_importances_

# Создайте DataFrame с названиями признаков и их важностями
balanced_importance_df = pd.DataFrame({'feature': X_train.columns, 'importance': feature_importance})

# Отсортируйте по убыванию важности
balanced_importance_df = balanced_importance_df.sort_values('importance', ascending=False)

# Выведите результаты
balanced_importance_df.head(30)


Unnamed: 0,feature,importance
747,tariff_plans_4_1_num,209
752,payments_details_47_3_avg,196
740,payments_details_23_3d6_avg,195
753,payments_details_49_6_avg,193
736,spas_symptoms_agr_7_6_sum,182
749,payments_details_35_6_sum,163
751,payments_details_46_1_sum,154
746,charges_details_13_3_avg,141
100,markers_103_1_cnt,120
759,channel_name,118


### Отбираем признаки

In [75]:
# сохраним результаты
for_now = balanced_importance_df['feature'].head(30).to_list()
for_now

['tariff_plans_4_1_num',
 'payments_details_47_3_avg',
 'payments_details_23_3d6_avg',
 'payments_details_49_6_avg',
 'spas_symptoms_agr_7_6_sum',
 'payments_details_35_6_sum',
 'payments_details_46_1_sum',
 'charges_details_13_3_avg',
 'markers_103_1_cnt',
 'channel_name',
 'markers_349_1_cnt',
 'markers_706_1_cnt',
 'markers_330_1_cnt',
 'balance_details_0_1_num',
 'markers_346_1_cnt',
 'markers_324_1_cnt',
 'markers_508_1_cnt',
 'markers_348_1_cnt',
 'markers_146_1_cnt',
 'markers_333_1_cnt',
 'markers_772_1_cnt',
 'payments_details_16_1d3_avg',
 'markers_104_1_cnt',
 'markers_533_1_cnt',
 'markers_542_1_cnt',
 'markers_537_1_cnt',
 'markers_4_1_cnt',
 'markers_65_1_cnt',
 'plan_costs_8_6_avg',
 'markers_318_1_cnt']

In [77]:
# Получение матрицы ошибок
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Создание pd.DataFrame для матрицы ошибок
data = {'Прогноз (+)': [tp, fp], 'Прогноз (-)': [fn, tn]}
index = ['Реальность (+)', 'Реальность (-)']
conf_matrix = pd.DataFrame(data=data, index=index)

# Вывод матрицы ошибок
print(conf_matrix)

                Прогноз (+)  Прогноз (-)
Реальность (+)          598          699
Реальность (-)        31002       135517
