## Изучение данных

In [1]:
#!pip install phik

In [31]:
import json
import pandas as pd
import numpy as np

from phik import resources
from phik.binning import bin_data
from phik.report import plot_correlation_matrix
from phik import report

import catboost as cb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from catboost import CatBoostClassifier
from sklearn.utils import shuffle

In [3]:
f = open('features_types.json')
features_types = json.load(f)

In [4]:
data_features = pd.DataFrame(features_types.items(), columns=['features', 'types'])
data_features.head()

Unnamed: 0,features,types
0,markers_0_1_cnt,numeric
1,markers_1_1_cnt,numeric
2,markers_2_1_cnt,numeric
3,markers_3_1_cnt,numeric
4,markers_4_1_cnt,numeric


Создадим отдельно таргет.

In [5]:
target=['target']

In [6]:
target = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=target)
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   target  702086 non-null  int32
dtypes: int32(1)
memory usage: 2.7 MB


In [7]:
target.value_counts()

target
0         696617
1           5469
dtype: int64

Выбираю только свою часть признаков.

In [8]:
my_data_features = data_features.iloc[930:1860]

In [9]:
my_data_features.types.value_counts()

numeric            851
categorical_int     69
categorical_str     10
Name: types, dtype: int64

In [10]:
my_data_features.head()

Unnamed: 0,features,types
930,vas_details_1_3_sum,numeric
931,vas_details_2_6_sum,numeric
932,vas_details_3_1_sum,numeric
933,vas_details_4_3_sum,numeric
934,vas_details_5_6_sum,numeric


In [11]:
my_data_features_num = my_data_features[my_data_features['types'] == 'numeric']

In [12]:
my_data_features_cat = my_data_features[my_data_features['types'] != 'numeric']

In [13]:
my_data_features_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 851 entries, 930 to 1859
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   features  851 non-null    object
 1   types     851 non-null    object
dtypes: object(2)
memory usage: 19.9+ KB


## Обработка и удаление лишних числовых признаков

In [14]:
def read_data(left, right):

    # следующие т столбцов, которые будем считывать + target
    columns_to_read = my_data_features_num['features'][left:right].to_list() + ['target']

    # считываем столбцы в файл
    data = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=columns_to_read)

    return data

def variance_filter(data, threshold=0.5):

    # считаем дисперсию для каждого признака
    variance = data.var()

    # отбираем те фичи, у которых var > threshold
    selected_features = variance[variance > threshold].index.tolist()

    # собираем в датасет
    filtered_data = data[selected_features + ['target']]
    
    return filtered_data


def corr_with_target(data):

    # чекаем корреляцию с таргетом
    correlation = data.corrwith(data['target']).abs()
    
    # сортируем
    correlation = correlation.sort_values(ascending=False)
    
    return correlation


def common_correlation(data):
    corr_matrix = data.corr().abs()
    
    # выбираем верхний треугольник матрицы корреляции
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    
    # находим индексы признаков с корреляцией больше 0.90
    to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
    
    # удаляем эти признаки из датафрейма
    data.drop(data[to_drop], axis=1, inplace=True)
    
    return data    

Возьму сразу все числовые признаки.

In [15]:
dataset_num = read_data(0, -1)
dataset_num.head()

Unnamed: 0,vas_details_1_3_sum,vas_details_2_6_sum,vas_details_3_1_sum,vas_details_4_3_sum,vas_details_5_6_sum,vas_details_6_1_sum,vas_details_7_3_sum,vas_details_8_6_sum,vas_details_9_1_sum,vas_details_10_3_sum,...,issues_47_6_sum,issues_48_1_sum,issues_49_3_sum,issues_50_6_sum,issues_51_1_sum,issues_52_3_sum,issues_53_6_sum,issues_54_1_sum,issues_55_3_sum,target
0,,,,,,,,,,,...,,,,,,,,,,0
1,,,,,,,,,,,...,,,,,,,,,,0
2,,,,,,,,,,,...,,,,,,,,,,1
3,,,,,,,,,,,...,,,,,,,,,,0
4,,,,,,,,,,,...,,,,,,,,,,0


In [16]:
dataset_num = dataset_num.fillna(0)
dataset_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Columns: 851 entries, vas_details_1_3_sum to target
dtypes: float64(850), int32(1)
memory usage: 4.4 GB


In [17]:
variance_filter(dataset_num)

Unnamed: 0,user_devices_3_1_cnt,user_devices_4_1_cnt,user_devices_5_1_cnt,user_devices_6_1_cnt,user_devices_7_1_cnt,spas_symptoms_int_0_1_cnt,spas_symptoms_int_2_1_cnt,spas_symptoms_int_3_1_cnt,spas_symptoms_int_4_1_cnt,spas_symptoms_int_5_1_cnt,...,spas_symptoms_agr_230_3_avg,spas_symptoms_agr_231_3_std,spas_symptoms_agr_232_3_sum,spas_symptoms_agr_233_6_avg,spas_symptoms_agr_234_6_std,spas_symptoms_agr_235_6_sum,issues_12_1_sum,issues_39_1_sum,issues_40_3_sum,target
0,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,0.349689,-0.013018,-1.529651,1.552915,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,0.000000,0.000000,0.000000,0
1,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,0.349689,-0.013018,-1.529651,1.552915,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,0.000000,0.000000,0.000000,0
2,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,0.349689,-0.013018,-1.529651,1.552915,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,0.000000,0.000000,0.000000,1
3,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,0.349689,-0.013018,-1.529651,1.552915,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,0.000000,0.000000,0.000000,0
4,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,-0.245287,-0.013018,-1.529651,1.338408,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702081,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,-0.245287,-0.013018,1.000743,-1.021166,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,-0.083628,-0.001627,-0.002301,0
702082,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,0.349689,-0.013018,0.540671,-0.377646,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,0.000000,0.000000,0.000000,0
702083,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,0.349689,-0.013018,0.770707,-0.592152,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,0.000000,0.000000,0.000000,0
702084,-0.006703,-0.079334,-0.001046,-0.037455,-0.038275,0.349689,-0.013018,1.000743,-0.806659,-0.017977,...,-0.123783,-0.113338,-0.123765,-0.125866,-0.125118,-0.125834,0.000000,0.000000,0.000000,0


In [18]:
corr_with_target(dataset_num)

target                          1.000000
spas_symptoms_agr_105_12_std    0.029198
spas_symptoms_agr_114_6_std     0.025408
spas_symptoms_ott_93_1_cnt      0.023776
spas_symptoms_ott_74_1_cnt      0.023636
                                  ...   
spas_symptoms_agr_210_6_std          NaN
spas_symptoms_agr_211_6_sum          NaN
issues_24_1_sum                      NaN
issues_25_3_sum                      NaN
issues_26_6_sum                      NaN
Length: 851, dtype: float64

In [19]:
common_correlation(dataset_num)

Unnamed: 0,vas_details_1_3_sum,vas_details_6_1_sum,vas_details_7_3_sum,vas_details_8_6_sum,vas_details_9_1_sum,vas_details_10_3_sum,vas_details_11_6_sum,vas_details_12_1_sum,vas_details_13_3_sum,vas_details_14_6_sum,...,issues_47_6_sum,issues_48_1_sum,issues_49_3_sum,issues_50_6_sum,issues_51_1_sum,issues_52_3_sum,issues_53_6_sum,issues_54_1_sum,issues_55_3_sum,target
0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0
1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0
2,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,1
3,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0
4,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702081,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.484357,-0.083936,-0.167973,-0.22824,-0.028022,-0.049234,-0.05957,-0.014744,-0.026752,0
702082,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0
702083,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0
702084,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.00000,0.000000,0.000000,0


In [20]:
dataset_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Columns: 449 entries, vas_details_1_3_sum to target
dtypes: float64(448), int32(1)
memory usage: 2.3 GB


## Категориальные признаки

In [21]:
columns_to_read = my_data_features_cat['features'].to_list()

dataset_cat = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=columns_to_read)

In [22]:
dataset_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Data columns (total 79 columns):
 #   Column                               Non-Null Count   Dtype
---  ------                               --------------   -----
 0   communication_availability_4_1_flg   702086 non-null  int32
 1   communication_availability_5_1_flg   702086 non-null  int32
 2   communication_availability_6_1_flg   702086 non-null  int32
 3   social_dem_0_0_flg                   702086 non-null  int32
 4   communication_availability_7_1_ctg   702086 non-null  int32
 5   communication_availability_8_1_flg   702086 non-null  int32
 6   communication_availability_9_1_flg   702086 non-null  int32
 7   communication_availability_10_1_ctg  702086 non-null  int32
 8   communication_availability_11_1_flg  702086 non-null  int32
 9   communication_availability_12_1_flg  702086 non-null  int32
 10  communication_availability_13_1_flg  702086 non-null  int32
 11  communication_availability_14_1_flg  70

In [23]:
dataset_cat.describe()

Unnamed: 0,communication_availability_4_1_flg,communication_availability_5_1_flg,communication_availability_6_1_flg,social_dem_0_0_flg,communication_availability_7_1_ctg,communication_availability_8_1_flg,communication_availability_9_1_flg,communication_availability_10_1_ctg,communication_availability_11_1_flg,communication_availability_12_1_flg,...,tariff_plans_17_src_id,deact_react_5_1_ctg,communication_availability_45_1_flg,user_active_4_1_flg,deact_react_7_1_ctg,exist_tariff_1_1_flg,tariff_plans_18_1_ctg,tariff_plans_19_src_id,deact_react_8_1_ctg,deact_react_10_1_ctg
count,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,...,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0,702086.0
mean,0.566758,0.501732,0.574743,-0.865487,3.980877,-0.038609,-0.050518,100.707492,0.17548,0.012487,...,28289.348044,0.039458,0.575189,0.983637,0.1641,0.09844,2923.678856,60700.114218,-0.955638,0.21942
std,0.605814,0.609479,0.604881,0.500932,1.463002,0.285249,0.261527,81.25525,0.515901,0.365784,...,33383.682066,0.459654,0.604825,0.179883,0.861815,0.337596,1201.665947,22220.519884,0.219681,1.087445
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,-1.0,-1.0,0.0
25%,0.0,0.0,0.0,-1.0,4.0,0.0,0.0,75.0,0.0,0.0,...,-1.0,0.0,0.0,1.0,0.0,0.0,1887.0,45407.0,-1.0,0.0
50%,1.0,1.0,1.0,-1.0,4.0,0.0,0.0,100.0,0.0,0.0,...,-1.0,0.0,1.0,1.0,0.0,0.0,2669.0,55085.0,-1.0,0.0
75%,1.0,1.0,1.0,-1.0,5.0,0.0,0.0,100.0,0.0,0.0,...,53628.0,0.0,1.0,1.0,0.0,0.0,3872.0,75261.0,-1.0,0.0
max,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1000.0,1.0,1.0,...,114540.0,7.0,1.0,1.0,6.0,1.0,5664.0,115984.0,1.0,6.0


In [24]:
data = dataset_num.join(dataset_cat)

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Columns: 528 entries, vas_details_1_3_sum to deact_react_10_1_ctg
dtypes: float64(448), int32(60), int64(10), int8(10)
memory usage: 2.6 GB


In [28]:
data.target.value_counts()

0    696617
1      5469
Name: target, dtype: int64

## Важность признаков

In [26]:
features = data.drop(['target'], axis=1)
target = data['target']

In [35]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=42)
    
    return features_upsampled, target_upsampled

features_upsampled, target_upsampled = upsample(features, target, 126)

In [36]:
model_cat = CatBoostClassifier(verbose=False, random_state=42)
model_cat.fit(features_upsampled, target_upsampled)

<catboost.core.CatBoostClassifier at 0x1339d5bf970>

In [40]:
cat_imp = pd.Series(model_cat.get_feature_importance(), features_train.columns)
cat_imp.sort_values(ascending=False).head(40)

tariff_plans_18_1_ctg           3.063124
user_lifetime_2_1_num           2.811563
area_0_0_num                    2.766040
info_house_5_0_num              2.704146
traffic_details_45_3_avg        2.252905
tariff_plans_19_src_id          2.249622
traffic_details_42_1_avg        2.173714
info_house_6_0_num              2.053878
traffic_details_60_1_avg        1.891650
traffic_details_43_1_std        1.872938
traffic_details_25_3d6_std      1.740090
traffic_details_54_3_avg        1.706893
traffic_details_9_3d6_part      1.633744
traffic_details_19_1d6_std      1.588256
traffic_details_32_1d3_avg      1.529805
traffic_details_5_1d3_part      1.528148
traffic_details_63_3_avg        1.515617
spas_symptoms_agr_140_12_avg    1.507206
traffic_details_51_1_avg        1.470906
traffic_details_27_6_avg        1.432484
traffic_details_16_1d3_std      1.414750
traffic_details_21_3_avg        1.380340
traffic_details_7_3_cnt         1.346959
spas_symptoms_agr_154_12_sum    1.311702
traffic_details_