In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
# Load the feature-name -> feature-type mapping from JSON.
# The context manager guarantees the file handle is closed once parsing is
# done (the bare open() call previously leaked it for the kernel's lifetime).
with open('features_types.json') as f:
    features_types = json.load(f)

In [3]:
# Turn the {feature name: feature type} mapping into a two-column table,
# one row per feature, then preview the first rows.
data_features = pd.DataFrame(
    {
        'features': list(features_types.keys()),
        'types': list(features_types.values()),
    }
)
data_features.head()

Unnamed: 0,features,types
0,markers_0_1_cnt,numeric
1,markers_1_1_cnt,numeric
2,markers_2_1_cnt,numeric
3,markers_3_1_cnt,numeric
4,markers_4_1_cnt,numeric


Создадим отдельно таргет.

In [4]:
# Name of the label column, wrapped in a list.
target = ['target']

In [5]:
# Read only the label column from the training set.
# The column list is spelled out literally so this cell no longer consumes the
# name `target` as its input while rebinding that same name to the resulting
# DataFrame; the cell now gives the same result however often it is run.
target = pd.read_parquet('dataset_train.parquet', engine='pyarrow', columns=['target'])
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702086 entries, 0 to 702085
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   target  702086 non-null  int32
dtypes: int32(1)
memory usage: 2.7 MB


In [6]:
# Count how many rows fall into each label value (class balance check).
target.value_counts()

target
0         696617
1           5469
dtype: int64

Выбираю только свою часть признаков.

In [7]:
# Positional row range (930 inclusive .. 1860 exclusive) of the feature table
# that is analysed in this notebook.
my_rows = slice(930, 1860)
my_data_features = data_features.iloc[my_rows]

In [8]:
# How many features of each declared type are in the selected range.
my_data_features['types'].value_counts()

numeric            851
categorical_int     69
categorical_str     10
Name: types, dtype: int64

In [9]:
# Preview the first rows of the selected feature range.
my_data_features.head()

Unnamed: 0,features,types
930,vas_details_1_3_sum,numeric
931,vas_details_2_6_sum,numeric
932,vas_details_3_1_sum,numeric
933,vas_details_4_3_sum,numeric
934,vas_details_5_6_sum,numeric


In [10]:
# Keep only the rows whose declared type is exactly 'numeric'.
is_numeric = my_data_features['types'].eq('numeric')
my_data_features_num = my_data_features.loc[is_numeric]

In [11]:
# Keep the complement: every row whose declared type is not 'numeric'.
is_not_numeric = my_data_features['types'].ne('numeric')
my_data_features_cat = my_data_features.loc[is_not_numeric]

In [12]:
# Summary (row count, index range, dtypes) of the numeric-feature table.
my_data_features_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 851 entries, 930 to 1859
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   features  851 non-null    object
 1   types     851 non-null    object
dtypes: object(2)
memory usage: 19.9+ KB


Возьмем первые 200 признаков:

In [13]:
# Names of the first 200 numeric features (taken by position), followed by
# the label column so both are loaded together.
first_numeric = my_data_features_num['features'].iloc[:200]
columns_to_read_1 = [*first_numeric, 'target']

In [14]:
# Load only the selected feature columns (plus the label) from the training set.
raw_data_1 = pd.read_parquet(
    'dataset_train.parquet',
    columns=columns_to_read_1,
    engine='pyarrow',
)

Отбрасываю те признаки, где больше 1/3 пропусков.

In [15]:
# Drop every column in which more than 1/3 of the values are missing.
# `thresh` is the minimum number of NON-missing values a column needs in order
# to be kept, so the bound is ceil(2/3 * n_rows). The previous value,
# len(raw_data_1) / 3, only removed columns with more than 2/3 missing and was
# a float where pandas documents an int.
n_rows = len(raw_data_1)
min_non_null = (2 * n_rows + 2) // 3  # integer form of ceil(2 * n_rows / 3)
raw_data_1 = raw_data_1.dropna(axis=1, thresh=min_non_null)

Отбрасываю строки, в которых остались пропуски.

In [16]:
# Remove every row that still contains at least one missing value.
raw_data_1 = raw_data_1.dropna(axis=0, how='any')

Посмотрим на корреляцию с целевым признаком: если она выше 0.95 — удаляем признак. Дальше делаем то же самое для корреляции всех признаков между собой (в коде ниже используется порог 0.90).

In [17]:
# Absolute pairwise Pearson correlations between all remaining columns
# (the label column is part of the frame and therefore of the matrix).
pairwise_corr = raw_data_1.corr(method='pearson')
corr_matrix = pairwise_corr.abs()

In [18]:
# Absolute correlation of every column with the label column.
label = raw_data_1['target']
corr_with_target = raw_data_1.corrwith(label).abs()

In [19]:
# Rank columns by absolute correlation with the label, strongest first.
# NOTE(review): NaN entries presumably come from columns with zero variance
# after the row filtering — confirm.
corr_with_target.sort_values(ascending=False)

target                         1.000000
spas_symptoms_int_130_1_cnt    0.018977
spas_symptoms_int_93_1_cnt     0.018241
spas_symptoms_int_134_1_cnt    0.012546
spas_symptoms_int_92_1_cnt     0.009987
                                 ...   
spas_symptoms_int_60_1_cnt     0.000040
spas_symptoms_int_1_1_cnt           NaN
spas_symptoms_int_8_1_cnt           NaN
spas_symptoms_int_125_1_cnt         NaN
spas_symptoms_int_132_1_cnt         NaN
Length: 154, dtype: float64

Переходим к корреляции между признаками:

In [20]:
# Keep only the strict upper triangle of the correlation matrix (k=1 skips
# the diagonal); every other cell becomes NaN, so each pair is seen once.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(strict_upper_mask)

In [21]:
# Collect the columns whose absolute correlation with some earlier column
# exceeds 0.90 (NaN cells compare as False and are ignored).
# The label is excluded explicitly: it is a column of the correlation matrix
# too, and a feature strongly correlated with it must never cause the label
# itself to be removed from the training frame.
to_drop = [
    column
    for column in upper.columns
    if column != 'target' and (upper[column] > 0.90).any()
]

In [22]:
# Remove the highly correlated features from the frame.
# The labels are passed directly via `columns=`; the previous form built the
# whole `raw_data_1[to_drop]` sub-frame just to have its column names
# iterated. Reassigning instead of `inplace=True` avoids mutating the frame
# that earlier cells produced.
raw_data_1 = raw_data_1.drop(columns=to_drop)

In [23]:
# Summary of the frame after filtering: remaining columns, non-null counts, dtypes.
raw_data_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 692072 entries, 0 to 702085
Data columns (total 75 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   user_devices_3_1_cnt         692072 non-null  float64
 1   user_devices_4_1_cnt         692072 non-null  float64
 2   user_devices_5_1_cnt         692072 non-null  float64
 3   user_devices_6_1_cnt         692072 non-null  float64
 4   user_devices_7_1_cnt         692072 non-null  float64
 5   spas_symptoms_int_0_1_cnt    692072 non-null  float64
 6   spas_symptoms_int_1_1_cnt    692072 non-null  float64
 7   spas_symptoms_int_2_1_cnt    692072 non-null  float64
 8   spas_symptoms_int_3_1_cnt    692072 non-null  float64
 9   spas_symptoms_int_5_1_cnt    692072 non-null  float64
 10  spas_symptoms_int_8_1_cnt    692072 non-null  float64
 11  spas_symptoms_int_11_1_cnt   692072 non-null  float64
 12  spas_symptoms_int_12_1_cnt   692072 non-null  float64
 13 