## Изучение данных

In [1]:
#!pip install phik

In [47]:
import json
import pandas as pd
import numpy as np

from phik import resources
from phik.binning import bin_data
from phik.report import plot_correlation_matrix
from phik import report

import catboost as cb
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from catboost import CatBoostClassifier
from sklearn.utils import shuffle

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
f = open('features_types.json')
features_types = json.load(f)

In [4]:
data_features = pd.DataFrame(features_types.items(), columns=['features', 'types'])
data_features.head()

Unnamed: 0,features,types
0,markers_0_1_cnt,numeric
1,markers_1_1_cnt,numeric
2,markers_2_1_cnt,numeric
3,markers_3_1_cnt,numeric
4,markers_4_1_cnt,numeric


Создадим отдельно таргет.

In [5]:
target=['target']

In [6]:
target = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=target)
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   target  702086 non-null  int32
dtypes: int32(1)
memory usage: 2.7 MB


In [7]:
target.value_counts()

target
0         696617
1           5469
dtype: int64

Выбираю только свою часть признаков.

In [8]:
my_data_features = data_features.iloc[930:1860]

In [9]:
my_data_features.types.value_counts()

numeric            851
categorical_int     69
categorical_str     10
Name: types, dtype: int64

In [10]:
my_data_features.head()

Unnamed: 0,features,types
930,vas_details_1_3_sum,numeric
931,vas_details_2_6_sum,numeric
932,vas_details_3_1_sum,numeric
933,vas_details_4_3_sum,numeric
934,vas_details_5_6_sum,numeric


In [11]:
my_data_features_num = my_data_features[my_data_features['types'] == 'numeric']

In [12]:
my_data_features_cat = my_data_features[my_data_features['types'] != 'numeric']

In [13]:
my_data_features_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 851 entries, 930 to 1859
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   features  851 non-null    object
 1   types     851 non-null    object
dtypes: object(2)
memory usage: 19.9+ KB


## Обработка и удаление лишних числовых признаков

In [14]:
def read_data(left, right):

    # следующие т столбцов, которые будем считывать + target
    columns_to_read = my_data_features_num['features'][left:right].to_list() + ['target']

    # считываем столбцы в файл
    data = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=columns_to_read)

    return data

def variance_filter(data, threshold=0.5):

    # считаем дисперсию для каждого признака
    variance = data.var()

    # отбираем те фичи, у которых var > threshold
    selected_features = variance[variance > threshold].index.tolist()

    # собираем в датасет
    filtered_data = data[selected_features + ['target']]
    
    return filtered_data


def corr_with_target(data):

    # чекаем корреляцию с таргетом
    correlation = data.corrwith(data['target']).abs()
    
    # сортируем
    correlation = correlation.sort_values(ascending=False)
    
    return correlation


def common_correlation(data):
    corr_matrix = data.corr().abs()
    
    # выбираем верхний треугольник матрицы корреляции
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    
    # находим индексы признаков с корреляцией больше 0.90
    to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
    
    # удаляем эти признаки из датафрейма
    data.drop(data[to_drop], axis=1, inplace=True)
    
    return data


def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=42)
    
    return features_downsampled, target_downsampled

Создадим датасет с числовыми признаками.

In [15]:
dataset_num = read_data(0, -1)
dataset_num.head()

Unnamed: 0,vas_details_1_3_sum,vas_details_2_6_sum,vas_details_3_1_sum,vas_details_4_3_sum,vas_details_5_6_sum,vas_details_6_1_sum,vas_details_7_3_sum,vas_details_8_6_sum,vas_details_9_1_sum,vas_details_10_3_sum,...,issues_47_6_sum,issues_48_1_sum,issues_49_3_sum,issues_50_6_sum,issues_51_1_sum,issues_52_3_sum,issues_53_6_sum,issues_54_1_sum,issues_55_3_sum,target
0,,,,,,,,,,,...,,,,,,,,,,0
1,,,,,,,,,,,...,,,,,,,,,,0
2,,,,,,,,,,,...,,,,,,,,,,1
3,,,,,,,,,,,...,,,,,,,,,,0
4,,,,,,,,,,,...,,,,,,,,,,0


In [16]:
dataset_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Columns: 851 entries, vas_details_1_3_sum to target
dtypes: float64(850), int32(1)
memory usage: 4.4 GB


Теперь категориальные:

In [17]:
columns_to_read = my_data_features_cat['features'].to_list()

dataset_cat = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=columns_to_read)

In [18]:
dataset_cat.head()

Unnamed: 0,communication_availability_4_1_flg,communication_availability_5_1_flg,communication_availability_6_1_flg,social_dem_0_0_flg,communication_availability_7_1_ctg,communication_availability_8_1_flg,communication_availability_9_1_flg,communication_availability_10_1_ctg,communication_availability_11_1_flg,communication_availability_12_1_flg,...,tariff_plans_17_src_id,deact_react_5_1_ctg,communication_availability_45_1_flg,user_active_4_1_flg,deact_react_7_1_ctg,exist_tariff_1_1_flg,tariff_plans_18_1_ctg,tariff_plans_19_src_id,deact_react_8_1_ctg,deact_react_10_1_ctg
0,1,1,1,-1,4,0,0,50,1,0,...,-1,0,1,1,0,0,4055,51551,-1,0
1,1,1,1,-1,4,0,0,50,1,0,...,-1,0,1,1,0,0,4055,51551,-1,0
2,1,1,1,-1,5,0,0,100,0,0,...,-1,0,1,1,0,0,3755,51529,-1,0
3,1,1,1,-1,4,0,0,100,0,0,...,-1,0,1,1,0,0,3637,51412,-1,0
4,1,1,1,-1,4,0,0,100,0,0,...,-1,0,1,1,0,0,2601,51675,-1,0


In [19]:
dataset_cat = dataset_cat.astype('category')

In [20]:
dataset_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Data columns (total 79 columns):
 #   Column                               Non-Null Count   Dtype   
---  ------                               --------------   -----   
 0   communication_availability_4_1_flg   702086 non-null  category
 1   communication_availability_5_1_flg   702086 non-null  category
 2   communication_availability_6_1_flg   702086 non-null  category
 3   social_dem_0_0_flg                   702086 non-null  category
 4   communication_availability_7_1_ctg   702086 non-null  category
 5   communication_availability_8_1_flg   702086 non-null  category
 6   communication_availability_9_1_flg   702086 non-null  category
 7   communication_availability_10_1_ctg  702086 non-null  category
 8   communication_availability_11_1_flg  702086 non-null  category
 9   communication_availability_12_1_flg  702086 non-null  category
 10  communication_availability_13_1_flg  702086 non-null  category
 11  

Объединим их:

In [21]:
my_data = dataset_num.join(dataset_cat)

In [22]:
my_data.head()

Unnamed: 0,vas_details_1_3_sum,vas_details_2_6_sum,vas_details_3_1_sum,vas_details_4_3_sum,vas_details_5_6_sum,vas_details_6_1_sum,vas_details_7_3_sum,vas_details_8_6_sum,vas_details_9_1_sum,vas_details_10_3_sum,...,tariff_plans_17_src_id,deact_react_5_1_ctg,communication_availability_45_1_flg,user_active_4_1_flg,deact_react_7_1_ctg,exist_tariff_1_1_flg,tariff_plans_18_1_ctg,tariff_plans_19_src_id,deact_react_8_1_ctg,deact_react_10_1_ctg
0,,,,,,,,,,,...,-1,0,1,1,0,0,4055,51551,-1,0
1,,,,,,,,,,,...,-1,0,1,1,0,0,4055,51551,-1,0
2,,,,,,,,,,,...,-1,0,1,1,0,0,3755,51529,-1,0
3,,,,,,,,,,,...,-1,0,1,1,0,0,3637,51412,-1,0
4,,,,,,,,,,,...,-1,0,1,1,0,0,2601,51675,-1,0


In [23]:
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Columns: 930 entries, vas_details_1_3_sum to deact_react_10_1_ctg
dtypes: category(79), float64(850), int32(1)
memory usage: 4.5 GB


Удалим столбцы, где больше 1/14 пропусков.

In [24]:
my_data.dropna(thresh=len(my_data)/14, axis=1, inplace=True)

In [27]:
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Columns: 647 entries, vas_details_1_3_sum to target
dtypes: float64(646), int32(1)
memory usage: 3.4 GB


In [52]:
my_data.drop_duplicates(inplace=True)

In [53]:
my_data['target'].value_counts()

0    696446
1      5469
Name: target, dtype: int64

## Важность признаков

Выделим таргет и проведем даунсемплинг.

In [29]:
features = my_data.drop(['target'], axis=1)
target = my_data['target']

In [30]:
features_downsampled, target_downsampled = downsample(features, target, 0.01)

### Catboost

In [31]:
cat_features = features_downsampled.select_dtypes(include='category').columns.to_list()

In [32]:
model_cat = CatBoostClassifier(loss_function='Logloss', verbose=False,\
                               cat_features=cat_features, eval_metric='AUC:hints=skip_train~false',\
                               random_state=42)
model_cat.fit(features_downsampled, target_downsampled)

<catboost.core.CatBoostClassifier at 0x19ce8adfd90>

In [34]:
cat_imp = pd.Series(model_cat.get_feature_importance(), features_downsampled.columns)
cat_imp.sort_values(ascending=False).head(20)

info_house_5_0_num              2.252200
user_lifetime_3_0_dt            1.740520
info_house_6_0_num              1.644187
area_0_0_num                    1.597189
user_lifetime_2_1_num           1.406348
traffic_details_43_1_std        1.371310
tariff_plans_22_1_min           1.367264
user_devices_11_1_cnt           1.310490
tariff_plans_21_1_max           1.294540
tariff_plans_20_1_ctg           1.293514
spas_symptoms_agr_104_12_avg    1.269763
traffic_details_16_1d3_std      1.203165
spas_symptoms_agr_79_6_sum      1.158814
traffic_details_49_6_std        1.125087
traffic_details_44_1_sum        1.111514
traffic_details_9_3d6_part      1.065838
traffic_details_67_6_std        1.017557
traffic_details_10_6_cnt        1.007405
traffic_details_1_3_cnt         0.931660
traffic_details_39_3d6_std      0.928013
dtype: float64

### LGBM

In [38]:
model_lgbm = LGBMClassifier(random_state=42)
model_lgbm.fit(features_downsampled, target_downsampled)

LGBMClassifier(random_state=42)

In [46]:
lgbm_imp = pd.Series(model_lgbm.feature_importances_, features_downsampled.columns)
lgbm_imp.sort_values(ascending=False).head(100)

area_0_0_num                   82
info_house_5_0_num             74
user_lifetime_2_1_num          68
info_house_6_0_num             68
traffic_details_43_1_std       46
                               ..
traffic_details_36_3_avg       13
traffic_details_54_3_avg       12
traffic_details_23_3_sum       12
traffic_details_15_1d3_avg     12
spas_symptoms_agr_146_3_avg    12
Length: 100, dtype: int32

In [50]:
feature_importance = np.array(model_lgbm.feature_importances_)
feature_names = np.array(features_downsampled.columns)
data = {'feature_names':feature_names,'feature_importance':feature_importance}
fitures_df = pd.DataFrame(data=data, dtype = np.int32).sort_values(by=['feature_importance'], ascending=False).head(25)
fitures_df

  fitures_df = pd.DataFrame(data=data, dtype = np.int32).sort_values(by=['feature_importance'], ascending=False).head(25)


Unnamed: 0,feature_names,feature_importance
320,area_0_0_num,82
465,info_house_5_0_num,74
394,user_lifetime_2_1_num,68
466,info_house_6_0_num,68
365,traffic_details_43_1_std,46
371,traffic_details_49_6_std,45
347,traffic_details_25_3d6_std,41
467,tariff_plans_20_1_ctg,40
482,spas_symptoms_agr_104_12_avg,36
366,traffic_details_44_1_sum,35


In [58]:
important_futures = fitures_df['feature_names'].to_list()
important_futures

['area_0_0_num',
 'info_house_5_0_num',
 'user_lifetime_2_1_num',
 'info_house_6_0_num',
 'traffic_details_43_1_std',
 'traffic_details_49_6_std',
 'traffic_details_25_3d6_std',
 'tariff_plans_20_1_ctg',
 'spas_symptoms_agr_104_12_avg',
 'traffic_details_44_1_sum',
 'traffic_details_48_6_avg',
 'traffic_details_5_1d3_part',
 'traffic_details_9_3d6_part',
 'spas_symptoms_agr_153_12_std',
 'traffic_details_28_6_std',
 'traffic_details_46_3_std',
 'traffic_details_16_1d3_std',
 'spas_symptoms_agr_79_6_sum',
 'traffic_details_10_6_cnt',
 'traffic_details_67_6_std',
 'traffic_details_39_3d6_std',
 'traffic_details_38_3d6_avg',
 'traffic_details_18_1d6_avg',
 'spas_symptoms_agr_154_12_sum',
 'traffic_details_24_3d6_avg']

Посмотрим на полученный датасет с важными признаками:

In [60]:
columns_to_read = fitures_df['feature_names'].to_list() + ['target']

dataset_for_test = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=columns_to_read)

In [61]:
dataset_for_test.head()

Unnamed: 0,area_0_0_num,info_house_5_0_num,user_lifetime_2_1_num,info_house_6_0_num,traffic_details_43_1_std,traffic_details_49_6_std,traffic_details_25_3d6_std,tariff_plans_20_1_ctg,spas_symptoms_agr_104_12_avg,traffic_details_44_1_sum,...,traffic_details_16_1d3_std,spas_symptoms_agr_79_6_sum,traffic_details_10_6_cnt,traffic_details_67_6_std,traffic_details_39_3d6_std,traffic_details_38_3d6_avg,traffic_details_18_1d6_avg,spas_symptoms_agr_154_12_sum,traffic_details_24_3d6_avg,target
0,-0.389706,-0.14801,1.047965,0.195933,-0.003515,-0.00105,0.211627,-0.748844,-0.312153,0.019032,...,-0.749995,0.287253,-0.822154,-0.001062,0.440552,-0.211741,-0.352481,1.203344,-0.525663,0
1,-0.360158,-0.15488,1.08466,0.19646,-0.025559,-0.001079,0.132429,-0.748844,-0.312153,-0.067416,...,0.355538,0.287253,0.510383,-0.001064,0.309751,-0.062897,-0.15959,1.34977,0.08935,0
2,,-0.159672,0.94902,0.186659,-0.01255,-0.001077,0.639417,-0.037023,5.598951,-0.041754,...,-0.609298,0.287253,-1.205258,-0.001064,0.931132,0.003689,-0.510244,-3.362484,-0.189385,1
3,-0.104079,-0.137583,0.984405,0.2002,-0.019408,-0.001057,0.997187,-0.037023,-0.312153,-0.059123,...,2.050425,0.287253,-2.387885,-0.001063,0.342601,2.747052,4.926261,1.416327,0.765043,0
4,1.762342,-0.122424,0.94771,0.20019,-0.00208,-0.001075,-2.94814,-0.037023,-0.312153,0.094784,...,-0.047792,0.252429,-2.554452,-0.001059,-2.72617,-3.368683,-1.38112,-2.723534,-3.136473,0


In [63]:
corr_with_target(dataset_for_test)

target                          1.000000
spas_symptoms_agr_104_12_avg    0.022623
tariff_plans_20_1_ctg           0.014144
traffic_details_67_6_std        0.013640
traffic_details_49_6_std        0.013639
traffic_details_28_6_std        0.013639
traffic_details_48_6_avg        0.013638
spas_symptoms_agr_154_12_sum    0.012160
traffic_details_10_6_cnt        0.012063
spas_symptoms_agr_79_6_sum      0.009684
info_house_6_0_num              0.006218
user_lifetime_2_1_num           0.005308
info_house_5_0_num              0.003528
traffic_details_24_3d6_avg      0.002566
area_0_0_num                    0.002379
traffic_details_44_1_sum        0.001838
traffic_details_25_3d6_std      0.001823
traffic_details_16_1d3_std      0.001588
traffic_details_18_1d6_avg      0.001219
traffic_details_38_3d6_avg      0.001037
traffic_details_46_3_std        0.000682
traffic_details_43_1_std        0.000607
spas_symptoms_agr_153_12_std    0.000525
traffic_details_9_3d6_part      0.000421
traffic_details_

In [64]:
common_correlation(dataset_for_test)

Unnamed: 0,area_0_0_num,info_house_5_0_num,user_lifetime_2_1_num,info_house_6_0_num,traffic_details_43_1_std,traffic_details_49_6_std,traffic_details_25_3d6_std,tariff_plans_20_1_ctg,spas_symptoms_agr_104_12_avg,traffic_details_44_1_sum,...,spas_symptoms_agr_153_12_std,traffic_details_46_3_std,traffic_details_16_1d3_std,spas_symptoms_agr_79_6_sum,traffic_details_10_6_cnt,traffic_details_38_3d6_avg,traffic_details_18_1d6_avg,spas_symptoms_agr_154_12_sum,traffic_details_24_3d6_avg,target
0,-0.389706,-0.148010,1.047965,0.195933,-0.003515,-0.001050,0.211627,-0.748844,-0.312153,0.019032,...,-1.427644,0.029845,-0.749995,0.287253,-0.822154,-0.211741,-0.352481,1.203344,-0.525663,0
1,-0.360158,-0.154880,1.084660,0.196460,-0.025559,-0.001079,0.132429,-0.748844,-0.312153,-0.067416,...,-2.039655,-0.015781,0.355538,0.287253,0.510383,-0.062897,-0.159590,1.349770,0.089350,0
2,,-0.159672,0.949020,0.186659,-0.012550,-0.001077,0.639417,-0.037023,5.598951,-0.041754,...,-3.600635,-0.013113,-0.609298,0.287253,-1.205258,0.003689,-0.510244,-3.362484,-0.189385,1
3,-0.104079,-0.137583,0.984405,0.200200,-0.019408,-0.001057,0.997187,-0.037023,-0.312153,-0.059123,...,-2.439662,-0.014456,2.050425,0.287253,-2.387885,2.747052,4.926261,1.416327,0.765043,0
4,1.762342,-0.122424,0.947710,0.200190,-0.002080,-0.001075,-2.948140,-0.037023,-0.312153,0.094784,...,-0.514963,-0.009937,-0.047792,0.252429,-2.554452,-3.368683,-1.381120,-2.723534,-3.136473,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
702081,,1.536492,-0.577086,-1.179092,-0.016526,-0.001068,0.239084,-0.037023,-0.312153,-0.027575,...,0.832499,-0.002116,0.314891,0.252429,-1.904840,0.053374,-0.137927,-0.394031,-0.012686,0
702082,1.457017,1.550355,-0.675375,-1.185673,-0.025548,-0.001078,0.293960,-0.037023,-0.312153,-0.064967,...,0.503083,-0.013883,-0.079155,-0.026165,0.310503,0.190668,-0.166079,0.045247,0.379010,0
702083,,1.615714,-0.696999,-1.195002,-0.025158,-0.001079,0.476393,-0.037023,-0.312153,-0.064145,...,0.525879,-0.015640,-0.850405,0.043484,-0.505676,0.121722,-0.408672,0.058559,0.307509,0
702084,,1.546459,-0.735659,-1.186741,-0.025581,-0.001079,0.461791,-0.037023,-0.312153,-0.067542,...,0.630811,-0.015828,0.899857,0.287253,-1.255228,1.102863,0.292478,-0.021310,0.385311,0


In [66]:
corr_matrix = dataset_for_test.corr().abs()
corr_matrix

Unnamed: 0,area_0_0_num,info_house_5_0_num,user_lifetime_2_1_num,info_house_6_0_num,traffic_details_43_1_std,traffic_details_49_6_std,traffic_details_25_3d6_std,tariff_plans_20_1_ctg,spas_symptoms_agr_104_12_avg,traffic_details_44_1_sum,...,spas_symptoms_agr_153_12_std,traffic_details_46_3_std,traffic_details_16_1d3_std,spas_symptoms_agr_79_6_sum,traffic_details_10_6_cnt,traffic_details_38_3d6_avg,traffic_details_18_1d6_avg,spas_symptoms_agr_154_12_sum,traffic_details_24_3d6_avg,target
area_0_0_num,1.0,0.040719,0.064328,0.037293,0.001369,0.00011445,0.00264,0.048053,0.007932,0.006732,...,0.013702,0.002429,0.009796,0.025265,0.02854432,0.000802,0.00347,0.013087,0.003125,0.002379
info_house_5_0_num,0.040719,1.0,0.004217,0.20637,0.001022,0.001842915,0.042545,0.051935,0.033091,0.007576,...,0.06887,0.000608,0.021373,0.022339,0.07235112,0.019423,0.030499,0.061681,0.036408,0.003528
user_lifetime_2_1_num,0.064328,0.004217,1.0,0.069829,0.003281,0.001497296,0.003402,0.10262,0.126701,0.004528,...,0.038125,0.003075,0.007862,0.11012,0.04583875,0.023178,0.003328,0.111924,0.007442,0.005308
info_house_6_0_num,0.037293,0.20637,0.069829,1.0,0.002599,0.001407621,0.062581,0.103653,0.025941,0.008684,...,0.361384,0.002065,0.029247,0.007732,0.04639116,0.075429,0.032545,0.218296,0.060427,0.006218
traffic_details_43_1_std,0.001369,0.001022,0.003281,0.002599,1.0,7.537455e-05,0.010449,0.011201,0.002491,0.826252,...,0.00556,0.167756,0.023347,0.000406,0.006678573,0.015148,0.048431,0.00647,0.014131,0.000607
traffic_details_49_6_std,0.000114,0.001843,0.001497,0.001408,7.5e-05,1.0,0.00353,0.007613,0.000386,3.3e-05,...,0.000825,0.003623,0.002152,0.000351,6.147284e-07,0.004079,0.001654,0.000193,0.003791,0.013639
traffic_details_25_3d6_std,0.00264,0.042545,0.003402,0.062581,0.010449,0.003529871,1.0,0.019662,0.000629,0.011351,...,0.058524,0.011518,0.000178,0.02063,0.06953915,0.69964,0.340946,0.057372,0.717102,0.001823
tariff_plans_20_1_ctg,0.048053,0.051935,0.10262,0.103653,0.011201,0.007612524,0.019662,1.0,0.03124,0.051537,...,0.033185,0.012775,0.006916,0.103368,0.009116763,0.018501,0.003813,0.132633,0.011871,0.014144
spas_symptoms_agr_104_12_avg,0.007932,0.033091,0.126701,0.025941,0.002491,0.0003859975,0.000629,0.03124,1.0,0.007196,...,0.23519,0.002192,0.001152,0.000763,0.01322816,0.010196,0.002859,0.555231,0.009963,0.022623
traffic_details_44_1_sum,0.006732,0.007576,0.004528,0.008684,0.826252,3.301549e-05,0.011351,0.051537,0.007196,1.0,...,0.008,0.171154,0.021339,8.2e-05,0.02046577,0.017437,0.042585,0.005057,0.015547,0.001838


In [67]:
dataset_for_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Data columns (total 22 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   area_0_0_num                  467367 non-null  float64
 1   info_house_5_0_num            696471 non-null  float64
 2   user_lifetime_2_1_num         696827 non-null  float64
 3   info_house_6_0_num            696471 non-null  float64
 4   traffic_details_43_1_std      686189 non-null  float64
 5   traffic_details_49_6_std      686189 non-null  float64
 6   traffic_details_25_3d6_std    686189 non-null  float64
 7   tariff_plans_20_1_ctg         697313 non-null  float64
 8   spas_symptoms_agr_104_12_avg  692032 non-null  float64
 9   traffic_details_44_1_sum      686189 non-null  float64
 10  traffic_details_5_1d3_part    686189 non-null  float64
 11  traffic_details_9_3d6_part    686189 non-null  float64
 12  spas_symptoms_agr_153_12_std  691863 non-nul

In [70]:
important_futures = dataset_for_test.drop('target', axis=1).columns.to_list()
important_futures

['area_0_0_num',
 'info_house_5_0_num',
 'user_lifetime_2_1_num',
 'info_house_6_0_num',
 'traffic_details_43_1_std',
 'traffic_details_49_6_std',
 'traffic_details_25_3d6_std',
 'tariff_plans_20_1_ctg',
 'spas_symptoms_agr_104_12_avg',
 'traffic_details_44_1_sum',
 'traffic_details_5_1d3_part',
 'traffic_details_9_3d6_part',
 'spas_symptoms_agr_153_12_std',
 'traffic_details_46_3_std',
 'traffic_details_16_1d3_std',
 'spas_symptoms_agr_79_6_sum',
 'traffic_details_10_6_cnt',
 'traffic_details_38_3d6_avg',
 'traffic_details_18_1d6_avg',
 'spas_symptoms_agr_154_12_sum',
 'traffic_details_24_3d6_avg']