In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour

import category_encoders as ce

plt.style.use('seaborn-colorblind')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [84]:
# Column names assigned to the header-less file 'datasets/adult.data'.
# Defined once and reused below so the list cannot drift out of sync.
use_cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country'
]

# NOTE(review): later outputs show '?' placeholders in categorical columns and
# what looks like a leading space in string values; neither is parsed as NaN
# here. Consider `na_values='?'` / `skipinitialspace=True` — confirm against
# the raw file before changing, since it alters every downstream output.
data = pd.read_csv('datasets/adult.data', index_col=False, names=use_cols)
print(data.shape)
data.head(8)

(32561, 14)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States


## Пропущенные значения

Проверка пропущенных значений

In [3]:
def check_missing(data, output_path=None):
    """Summarise null values per column of ``data``.

    Returns a DataFrame indexed by column label (converted to ``str``) with
    two columns: ``'total missing'`` (number of nulls) and ``'proportion'``
    (share of nulls). When ``output_path`` is truthy, the summary is also
    written to ``output_path + 'missing.csv'`` and that path is printed.
    """
    null_mask = data.isnull()
    summary = pd.concat(
        [null_mask.sum(), null_mask.mean()],
        axis=1,
        keys=['total missing', 'proportion'],
    ).rename(index=str)

    if output_path:
        target = output_path + 'missing.csv'
        summary.to_csv(target)
        print(target)
    return summary

In [4]:
check_missing(data, 'datasets/')

datasets/missing.csv


Unnamed: 0,total missing,proportion
age,0,0.0
workclass,0,0.0
fnlwgt,0,0.0
education,0,0.0
education-num,0,0.0
marital-status,0,0.0
occupation,0,0.0
relationship,0,0.0
race,0,0.0
sex,0,0.0


In [5]:
def drop_missing(data, axis=0):
    """Return a copy of ``data`` without the rows (``axis=0``) or columns
    (``axis=1``) that contain at least one null value.

    The input frame is left untouched.
    """
    return data.copy().dropna(axis=axis)

In [6]:
data2 = drop_missing(data)
data2.shape

(32561, 14)

Добавление переменной-индикатора пропущенных значений

In [7]:
def add_var_denote_NA(data, NA_col=[]):
    """Add a 0/1 indicator column for each listed column that has nulls.

    Parameters
    ----------
    data : DataFrame to process; it is copied, never modified.
    NA_col : iterable of column labels to inspect.

    Returns
    -------
    A copy of ``data`` with an extra ``<col>_is_NA`` column (1 where the value
    is null, 0 otherwise) for every listed column that contains nulls.
    Columns without nulls get no indicator; a warning is issued instead.
    """
    data_copy = data.copy()
    for i in NA_col:
        is_null = data_copy[i].isnull()
        if is_null.sum() > 0:
            data_copy[i + '_is_NA'] = np.where(is_null, 1, 0)
        else:
            # The original called an undefined `warn` with a format string
            # that had no placeholder, so this branch always raised.
            warnings.warn('Нет пропущенных значений: {}'.format(i))
    return data_copy

In [8]:
# data3 = add_var_denote_NA(data, NA_col=['age'])
# print(data3.age_is_NA.value_counts())
# data3.head(8)

Заполнение пропусков произвольным значением

In [9]:
def impute_NA_with_arbitrary(data, impute_value, NA_col=[]):
    """Fill nulls in the listed columns with a fixed value.

    Parameters
    ----------
    data : DataFrame to process; it is copied, never modified.
    impute_value : value substituted for nulls.
    NA_col : iterable of column labels to impute.

    Returns
    -------
    A copy of ``data`` with a new ``<col>_<impute_value>`` column for every
    listed column that contains nulls. Columns without nulls are skipped and
    a warning is issued for each of them.
    """
    data_copy = data.copy()
    for i in NA_col:
        if data_copy[i].isnull().sum() > 0:
            data_copy[i + '_' + str(impute_value)] = data_copy[i].fillna(impute_value)
        else:
            # The original called an undefined `warn` with a format string
            # that had no placeholder, so this branch always raised.
            warnings.warn('Нет пропущенных значений: {}'.format(i))
    return data_copy

In [10]:
# data4 = impute_NA_with_arbitrary(data, -999, ['age'])
# data4.head(8)

Заполнение пропущенных значений средним/медианой/модой

In [11]:
def impute_with_avg(data, strategy='mean', NA_col=[]):
    """Fill nulls in the listed columns with the column mean, median or mode.

    Parameters
    ----------
    data : DataFrame to process; it is copied, never modified.
    strategy : ``'mean'``, ``'median'`` or ``'mode'``. Any other value leaves
        the column without an imputed counterpart.
    NA_col : iterable of column labels to impute.

    Returns
    -------
    A copy of ``data`` with a new ``<col>_impute_<strategy>`` column for every
    listed column that contains nulls. Columns without nulls are skipped and
    a warning is issued for each of them.
    """
    data_copy = data.copy()
    for i in NA_col:
        if data_copy[i].isnull().sum() > 0:
            if strategy == 'mean':
                data_copy[i + '_impute_mean'] = data_copy[i].fillna(data[i].mean())
            elif strategy == 'median':
                data_copy[i + '_impute_median'] = data_copy[i].fillna(data[i].median())
            elif strategy == 'mode':
                # `mode()` returns a Series; passing it to `fillna` aligns on
                # the index and fills only matching rows. Use the first modal
                # value as a scalar (nothing to fill with if there is none).
                modes = data[i].mode()
                fill_value = modes.iloc[0] if len(modes) > 0 else np.nan
                data_copy[i + '_impute_mode'] = data_copy[i].fillna(fill_value)
        else:
            # The original called an undefined `warn` with a format string
            # that had no placeholder, so this branch always raised.
            warnings.warn('Нет пропущенных данных: {}'.format(i))
    return data_copy

In [12]:
# print(data.age.median())
# data5 = impute_with_avg(data, 'median', ['age'])
# data5.head(8)

Заполнение пропусков из хвоста распределения

In [13]:
def impute_NA_with_end_of_distribution(data, NA_col=[]):
    """Fill nulls with a value from the upper tail: ``mean + 3 * std``.

    Parameters
    ----------
    data : DataFrame to process; it is copied, never modified.
    NA_col : iterable of column labels to impute.

    Returns
    -------
    A copy of ``data`` with a new ``<col>_impute_end_of_distri`` column for
    every listed column that contains nulls. Columns without nulls are
    skipped and a warning is issued for each of them.
    """
    data_copy = data.copy()
    for i in NA_col:
        if data_copy[i].isnull().sum() > 0:
            tail_value = data[i].mean() + 3 * data[i].std()
            data_copy[i + '_impute_end_of_distri'] = data_copy[i].fillna(tail_value)
        else:
            # The original called an undefined `warn` with a format string
            # that had no placeholder, so this branch always raised.
            warnings.warn('Нет пропущенных данных: {}'.format(i))
    return data_copy

In [14]:
# data6 = impute_NA_with_end_of_distribution(data, ['age'])
# data6.head(8)

Заполнение пропусков случайными значениями

In [15]:
def imput_NA_with_random(data, NA_col=[], random_state=0):
    """Fill nulls with values sampled at random from the observed ones.

    Parameters
    ----------
    data : DataFrame to process; it is copied, never modified.
    NA_col : iterable of column labels to impute.
    random_state : seed passed to ``Series.sample`` for reproducibility.

    Returns
    -------
    A copy of ``data`` with a new ``<col>_random`` column for every listed
    column that contains nulls. Columns without nulls are skipped and a
    warning is issued for each of them.
    """
    data_copy = data.copy()
    for i in NA_col:
        is_null = data_copy[i].isnull()
        n_missing = is_null.sum()
        if n_missing > 0:
            new_col = str(i) + '_random'
            data_copy[new_col] = data_copy[i]
            observed = data_copy[i].dropna()
            # Sample with replacement only when there are fewer observed
            # values than gaps; otherwise the draw matches the original.
            random_sample = observed.sample(n_missing,
                                            replace=n_missing > len(observed),
                                            random_state=random_state)
            # Re-label the sample with the null rows so assignment aligns.
            random_sample.index = data_copy[is_null].index
            data_copy.loc[is_null, new_col] = random_sample
        else:
            # The original called an undefined `warn` with a format string
            # that had no placeholder, so this branch always raised.
            warnings.warn('Нет пропущенных данных: {}'.format(i))
    return data_copy

In [16]:
# data7 = imput_NA_with_random(data, ['age'])
# data7.head(8)

# Выбросы

Детекция с помощью произвольно заданных границ

In [17]:
def outlier_detect_arbitrary(data, col, uppper_fence, lower_fence):
    """Flag values of ``data[col]`` outside user-supplied fences.

    Parameters
    ----------
    data : DataFrame holding the column.
    col : label of the column to inspect.
    uppper_fence : values strictly greater than this are outliers.
    lower_fence : values strictly smaller than this are outliers.

    Returns
    -------
    (outlier_index, para) where ``outlier_index`` is a boolean Series aligned
    with ``data`` (True marks an outlier) and ``para`` is
    ``(uppper_fence, lower_fence)``. The count and share of outliers are
    printed as a side effect.
    """
    para = (uppper_fence, lower_fence)
    outlier_index = (data[col] > uppper_fence) | (data[col] < lower_fence)
    # Count True values directly: `value_counts()[1]` fails when there are no
    # outliers and picks the wrong entry when outliers are the majority.
    n_outliers = int(outlier_index.sum())
    n_total = len(outlier_index)
    print('Количество выбросов в данных:', n_outliers)
    print('Доля выбросов', n_outliers / n_total if n_total else 0.0)
    return outlier_index, para

In [25]:
data.age.max()

90

In [28]:
index, para = outlier_detect_arbitrary(data, 'age', 89, 5)
print('Верхняя граница:', para[0], '\nНижняя граница:', para[1])

Количество выбросов в данных: 43
Доля выбросов 0.0013205982617241485
Верхняя граница: 89 
Нижняя граница: 5


In [29]:
data.loc[index, 'age'].sort_values()

222      90
14159    90
15356    90
15892    90
18277    90
18413    90
18725    90
18832    90
19212    90
12975    90
19489    90
20610    90
22220    90
24043    90
24238    90
25303    90
28463    90
31030    90
31696    90
19747    90
32277    90
12451    90
11731    90
1040     90
1935     90
2303     90
2891     90
4070     90
4109     90
5104     90
5272     90
11996    90
5370     90
6232     90
6624     90
8806     90
8963     90
8973     90
10210    90
10545    90
11512    90
5406     90
32367    90
Name: age, dtype: int64

Межквартильный размах (IQR)

In [73]:
def outlier_detect_IQR(data, col, threshold=3):
    """Flag values of ``data[col]`` lying beyond ``threshold`` IQRs.

    The fences are ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR``,
    where ``IQR = Q3 - Q1``.

    Returns
    -------
    (outlier_index, para) where ``outlier_index`` is a boolean Series aligned
    with ``data`` (True marks an outlier) and ``para`` is
    ``(Upper_fence, Lower_fence)``. The count and share of outliers are
    printed as a side effect.
    """
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    IQR = q3 - q1
    Lower_fence = q1 - (IQR * threshold)
    Upper_fence = q3 + (IQR * threshold)
    para = (Upper_fence, Lower_fence)
    outlier_index = (data[col] > Upper_fence) | (data[col] < Lower_fence)
    # Count True values directly: `value_counts()[1]` fails when there are no
    # outliers and picks the wrong entry when outliers are the majority.
    n_outliers = int(outlier_index.sum())
    n_total = len(outlier_index)
    print('Количество выбросов в данных:', n_outliers)
    print('Доля выбросов', n_outliers / n_total if n_total else 0.0)
    return outlier_index, para

In [85]:
# index, para = outlier_detect_IQR(data, 'age', 3)
# print('Верхняя граница:', para[0], '\nНижняя граница:', para[1])

In [35]:
data.loc[index, 'age'].sort_values()

222      90
14159    90
15356    90
15892    90
18277    90
18413    90
18725    90
18832    90
19212    90
12975    90
19489    90
20610    90
22220    90
24043    90
24238    90
25303    90
28463    90
31030    90
31696    90
19747    90
32277    90
12451    90
11731    90
1040     90
1935     90
2303     90
2891     90
4070     90
4109     90
5104     90
5272     90
11996    90
5370     90
6232     90
6624     90
8806     90
8963     90
8973     90
10210    90
10545    90
11512    90
5406     90
32367    90
Name: age, dtype: int64

Среднее и среднеквадратичное отклонение

In [37]:
def outlier_detect_mean_std(data, col, threshold=3):
    """Flag values of ``data[col]`` further than ``threshold`` standard
    deviations from the mean.

    Returns
    -------
    (outlier_index, para) where ``outlier_index`` is a boolean Series aligned
    with ``data`` (True marks an outlier) and ``para`` is
    ``(Upper_fence, Lower_fence)``. The count and share of outliers are
    printed as a side effect.
    """
    center = data[col].mean()
    spread = data[col].std()
    Upper_fence = center + threshold * spread
    Lower_fence = center - threshold * spread
    para = (Upper_fence, Lower_fence)
    outlier_index = (data[col] > Upper_fence) | (data[col] < Lower_fence)
    # Count True values directly: `value_counts()[1]` fails when there are no
    # outliers and picks the wrong entry when outliers are the majority.
    n_outliers = int(outlier_index.sum())
    n_total = len(outlier_index)
    print('Количество выбросов в данных:', n_outliers)
    print('Доля выбросов', n_outliers / n_total if n_total else 0.0)
    return outlier_index, para

In [38]:
index, para = outlier_detect_mean_std(data, 'age', 3)
print('Верхняя граница:', para[0], '\nНижняя граница:', para[1])

Количество выбросов в данных: 121
Доля выбросов 0.00371610208531679
Верхняя граница: 79.5029444160648 
Нижняя граница: -2.339650905423241


In [39]:
data.loc[index, 'age'].sort_values()

20953    80
20482    80
20249    80
6439     80
19689    80
         ..
20610    90
6624     90
15356    90
19212    90
222      90
Name: age, Length: 121, dtype: int64

Медианное абсолютное отклонение (MAD)

In [40]:
def outlier_detect_mad(data, col, threshold=3.5):
    """Flag outliers in ``data[col]`` using the modified z-score.

    The score is ``0.6745 * (x - median) / MAD``, where MAD is the median of
    the absolute deviations from the median. A value is an outlier when the
    absolute score exceeds ``threshold``.

    Returns
    -------
    Boolean Series indexed like ``data`` (True marks an outlier), so it can
    be used directly in ``data.loc[...]``. The count and share of outliers
    are printed as a side effect.

    Notes
    -----
    When MAD is zero the division yields inf/NaN, so every value that differs
    from the median is flagged.
    """
    values = data[col]
    median = values.median()
    deviation = values - median
    median_absolute_deviation = deviation.abs().median()
    # Vectorised, and keeps `data.index`; building the Series from a list
    # produced a fresh RangeIndex that misaligned with non-default indexes.
    modified_z_scores = 0.6745 * deviation / median_absolute_deviation
    outlier_index = modified_z_scores.abs() > threshold
    # Count True values directly: `value_counts()[1]` fails when there are no
    # outliers and picks the wrong entry when outliers are the majority.
    n_outliers = int(outlier_index.sum())
    n_total = len(outlier_index)
    print('Количество выбросов в данных:', n_outliers)
    print('Доля выбросов', n_outliers / n_total if n_total else 0.0)
    return outlier_index

In [41]:
index = outlier_detect_mad(data, 'age', 3.5)

Количество выбросов в данных: 43
Доля выбросов 0.0013205982617241485


Замена выбросов произвольным значением

In [42]:
def impute_outlier_with_arbitrary(data, outlier_index, value, col=[]):
    """Overwrite flagged rows with a fixed value in each listed column.

    ``outlier_index`` selects the rows (as accepted by ``DataFrame.loc``),
    ``value`` is written into every column named in ``col``. The input frame
    is not modified; a patched copy is returned.
    """
    patched = data.copy()
    for column_name in col:
        patched.loc[outlier_index, column_name] = value
    return patched

In [43]:
data2 = impute_outlier_with_arbitrary(data, index, -999, ['age'])
data2[25:35]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
25,56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States
26,19,Private,168294,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States
27,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South
28,39,Private,367260,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,80,United-States
29,49,Private,193366,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
30,23,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52,United-States
31,20,Private,266015,Some-college,10,Never-married,Sales,Own-child,Black,Male,0,0,44,United-States
32,45,Private,386940,Bachelors,13,Divorced,Exec-managerial,Own-child,White,Male,0,1408,40,United-States
33,30,Federal-gov,59951,Some-college,10,Married-civ-spouse,Adm-clerical,Own-child,White,Male,0,0,40,United-States
34,22,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,15,United-States


Винзоризация

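Пример (90%-ная винзоризация): значения ниже 5-го перцентиля заменяются значением 5-го перцентиля, а значения выше 95-го — значением 95-го перцентиля.

Исходные данные: {92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, −40, 101, 86, 85, 15, 89, 89, 28, −5, 41}  

После винзоризации (1053 → 101, −40 → −5): {92, 19, 101, 58, 101, 91, 26, 78, 10, 13, −5, 101, 86, 85, 15, 89, 89, 28, −5, 41} 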

In [45]:
def winsorization(data, col, para, strategy='both'):
    """Clip ``data[col]`` to the fences given in ``para``.

    Parameters
    ----------
    data : DataFrame to process; it is copied, never modified.
    col : label of the column to clip.
    para : ``(upper_fence, lower_fence)`` pair, in that order.
    strategy : ``'both'`` clips both tails, ``'top'`` only values above the
        upper fence, ``'bottom'`` only values below the lower fence. Any
        other value returns an unmodified copy.

    Returns
    -------
    A copy of ``data`` with the clipped column.
    """
    upper_fence, lower_fence = para[0], para[1]
    data_copy = data.copy()
    if strategy in ('both', 'top'):
        data_copy.loc[data_copy[col] > upper_fence, col] = upper_fence
    if strategy in ('both', 'bottom'):
        # Must be `<`: the original compared with `>` in the 'bottom' branch,
        # which overwrote every value above the lower fence.
        data_copy.loc[data_copy[col] < lower_fence, col] = lower_fence
    return data_copy

In [46]:
data3 = winsorization(data, 'age', para, 'both')
data3[25:35]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
25,56.0,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States
26,19.0,Private,168294,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States
27,54.0,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South
28,39.0,Private,367260,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,80,United-States
29,49.0,Private,193366,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
30,23.0,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52,United-States
31,20.0,Private,266015,Some-college,10,Never-married,Sales,Own-child,Black,Male,0,0,44,United-States
32,45.0,Private,386940,Bachelors,13,Divorced,Exec-managerial,Own-child,White,Male,0,1408,40,United-States
33,30.0,Federal-gov,59951,Some-college,10,Married-civ-spouse,Adm-clerical,Own-child,White,Male,0,0,40,United-States
34,22.0,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,15,United-States


Удаление выбросов

In [48]:
def drop_outlier(data, outlier_index):
    """Return the rows of ``data`` that are not flagged as outliers.

    ``outlier_index`` is a boolean mask (pandas Series or NumPy array) where
    True marks a row to drop.
    """
    # `~` is the logical-not operator for boolean masks; unary `-` raises
    # TypeError on NumPy boolean arrays.
    data_copy = data[~outlier_index]
    return data_copy

In [49]:
data4 = drop_outlier(data, index)
print(data4.age.max())
print(data4.age.min())

88
17


Замена выбросов средним/медианой/модой

In [50]:
def impute_outlier_with_avg(data, col, outlier_index, strategy='mean'):
    """Replace flagged values of ``col`` with a statistic of that column.

    ``strategy`` selects the statistic: ``'mean'``, ``'median'`` or ``'mode'``
    (first modal value). The statistic is computed over the whole column,
    flagged rows included. Any other strategy returns an unmodified copy.
    The input frame is never modified.
    """
    patched = data.copy()
    column = patched[col]

    if strategy == 'mean':
        replacement = column.mean()
    elif strategy == 'median':
        replacement = column.median()
    elif strategy == 'mode':
        replacement = column.mode()[0]
    else:
        return patched

    patched.loc[outlier_index, col] = replacement
    return patched

In [51]:
data5 = impute_outlier_with_avg(data, 'age', index, 'mean')
data5[25:35]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
25,56.0,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States
26,19.0,Private,168294,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States
27,54.0,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South
28,39.0,Private,367260,HS-grad,9,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,80,United-States
29,49.0,Private,193366,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
30,23.0,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52,United-States
31,20.0,Private,266015,Some-college,10,Never-married,Sales,Own-child,Black,Male,0,0,44,United-States
32,45.0,Private,386940,Bachelors,13,Divorced,Exec-managerial,Own-child,White,Male,0,1408,40,United-States
33,30.0,Federal-gov,59951,Some-college,10,Married-civ-spouse,Adm-clerical,Own-child,White,Male,0,0,40,United-States
34,22.0,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,15,United-States


# Шкалирование данных

In [53]:
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
# NOTE(review): `age` is passed as the target while it is also still a column
# of the feature frame `data`. That would leak the target into a real model —
# confirm this split is only meant for the preprocessing demos.
X_train, X_test, y_train, y_test = train_test_split(data, data.age, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

((26048, 14), (6513, 14))

# Стандартизация данных (z-оценка)

In [54]:
# Fit the scaler on the training split only, then add the scaled values as a
# new column on a copy so X_train itself stays unchanged.
ss = StandardScaler().fit(X_train[['age']])
X_train_copy = X_train.copy()
X_train_copy['age_zscore'] = ss.transform(X_train_copy[['age']])
print(X_train_copy.head(6))

       age          workclass  fnlwgt      education  education-num  \
15282   36            Private  174308           11th              7   
24870   35            Private  198202        HS-grad              9   
18822   38            Private   52963      Bachelors             13   
26404   50            Private  138270        HS-grad              9   
7842    68   Self-emp-not-inc  116903      Assoc-voc             11   
4890    51   Self-emp-not-inc  149220   Some-college             10   

            marital-status         occupation    relationship    race  \
15282             Divorced   Transport-moving   Not-in-family   White   
24870        Never-married    Exec-managerial   Not-in-family   White   
18822        Never-married       Adm-clerical   Not-in-family   White   
26404   Married-civ-spouse              Sales            Wife   Black   
7842    Married-civ-spouse     Prof-specialty         Husband   White   
4890    Married-civ-spouse    Farming-fishing         Husband   

In [55]:
print(X_train_copy['age_zscore'].mean())
print(X_train_copy['age_zscore'].std())

1.481206639733317e-16
1.000019195884404


Мин-макс шкалирование

In [56]:
# Fit the min-max scaler on the training split only, then add the scaled
# values as a new column on a fresh copy of X_train.
mms = MinMaxScaler().fit(X_train[['age']])
X_train_copy = X_train.copy()
X_train_copy['age_minmax'] = mms.transform(X_train_copy[['age']])
print(X_train_copy.head(6))

       age          workclass  fnlwgt      education  education-num  \
15282   36            Private  174308           11th              7   
24870   35            Private  198202        HS-grad              9   
18822   38            Private   52963      Bachelors             13   
26404   50            Private  138270        HS-grad              9   
7842    68   Self-emp-not-inc  116903      Assoc-voc             11   
4890    51   Self-emp-not-inc  149220   Some-college             10   

            marital-status         occupation    relationship    race  \
15282             Divorced   Transport-moving   Not-in-family   White   
24870        Never-married    Exec-managerial   Not-in-family   White   
18822        Never-married       Adm-clerical   Not-in-family   White   
26404   Married-civ-spouse              Sales            Wife   Black   
7842    Married-civ-spouse     Prof-specialty         Husband   White   
4890    Married-civ-spouse    Farming-fishing         Husband   

In [57]:
print(X_train_copy['age_minmax'].min())
print(X_train_copy['age_minmax'].max())

0.0
1.0


Робастное шкалирование

In [58]:
# Fit the robust scaler on the training split only, then add the scaled
# values as a new column on a fresh copy of X_train.
rs = RobustScaler().fit(X_train[['age']])
X_train_copy = X_train.copy()
X_train_copy['age_robust'] = rs.transform(X_train_copy[['age']])
print(X_train_copy.head(6))

       age          workclass  fnlwgt      education  education-num  \
15282   36            Private  174308           11th              7   
24870   35            Private  198202        HS-grad              9   
18822   38            Private   52963      Bachelors             13   
26404   50            Private  138270        HS-grad              9   
7842    68   Self-emp-not-inc  116903      Assoc-voc             11   
4890    51   Self-emp-not-inc  149220   Some-college             10   

            marital-status         occupation    relationship    race  \
15282             Divorced   Transport-moving   Not-in-family   White   
24870        Never-married    Exec-managerial   Not-in-family   White   
18822        Never-married       Adm-clerical   Not-in-family   White   
26404   Married-civ-spouse              Sales            Wife   Black   
7842    Married-civ-spouse     Prof-specialty         Husband   White   
4890    Married-civ-spouse    Farming-fishing         Husband   

# Энкодинг переменных

One-Hot encoding

In [77]:
data1 = pd.get_dummies(data, drop_first=True)

Порядковое кодирование (Ordinal encoding)

In [78]:
ord_enc = ce.OrdinalEncoder(cols=['sex']).fit(X_train, y_train)

In [79]:
data4 = ord_enc.transform(data)
print(data4.head(5))

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race  sex  \
0        Never-married        Adm-clerical   Not-in-family   White    1   
1   Married-civ-spouse     Exec-managerial         Husband   White    1   
2             Divorced   Handlers-cleaners   Not-in-family   White    1   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black    1   
4   Married-civ-spouse      Prof-specialty            Wife   Black    2   

   capital-gain  capital-loss  hours-per-week  native-country  
0          2174             0              40   United-States  
1             0             0       

Таргет энкодинг

In [87]:
target_enc = ce.TargetEncoder(cols=['sex']).fit(X_train, y_train)

In [88]:
data2 = target_enc.transform(data)
data2.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,39.431971,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,39.431971,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,39.431971,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,39.431971,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,36.852097,0,0,40,Cuba


WOE энкодинг

In [90]:
# woe_enc = ce.WOEEncoder(cols=['sex']).fit(X_train, y_train)

In [92]:
# data3 = woe_enc.transform(data)
# data3.head()

# Feature Selection

In [93]:
from sklearn.datasets import load_breast_cancer

# Keep the raw dataset object under its own name instead of binding `data` to
# it and then immediately rebinding `data` to a DataFrame.
cancer = load_breast_cancer()
# Features and target side by side in one frame; the last column is 'target'.
# From here on `data` refers to this frame, not to the adult dataset.
data = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
                    columns=np.append(cancer['feature_names'], ['target']))

In [94]:
data.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [95]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1),
                                                   data.target, test_size=0.2,
                                                   random_state=0)
X_train.shape, X_test.shape

((455, 30), (114, 30))

In [96]:
def constant_feature_detect(data, threshold=0.98):
    """Find constant and quasi-constant features.

    A feature is reported when its most frequent value covers at least
    ``threshold`` of all rows of ``data`` (null rows count towards the
    denominator but never as the predominant value).

    Parameters
    ----------
    data : DataFrame whose columns are inspected.
    threshold : minimum share of rows taken by the most frequent value.

    Returns
    -------
    List of column labels considered (quasi-)constant. The number found is
    printed as a side effect.
    """
    n_rows = len(data)
    quasi_constant_feature = []
    for feature in data.columns:
        counts = data[feature].value_counts()
        if counts.empty:
            # All-null column or empty frame: there is no predominant value.
            continue
        # Plain division: the `np.float` alias was removed in NumPy 1.24.
        predominant = counts.max() / n_rows
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature), 'константные переменные')
    return quasi_constant_feature

In [97]:
quasi_constant_feature = constant_feature_detect(data=X_train, threshold=0.9)

0 константные переменные


In [104]:
# Synthetic quasi-constant feature used to exercise constant_feature_detect.
X_train['dummy'] = np.floor(X_train['worst smoothness']*10)
# Share of each value; plain division because the `np.float` alias was
# removed in NumPy 1.24.
X_train.dummy.value_counts() / len(X_train)

1.0    0.923077
0.0    0.068132
2.0    0.008791
Name: dummy, dtype: float64

In [105]:
quasi_constant_feature = constant_feature_detect(data=X_train, threshold=0.9)
quasi_constant_feature

1 константные переменные


['dummy']

In [106]:
X_train.drop(labels=quasi_constant_feature, axis=1, inplace=True)
print(X_train.shape)

(455, 30)


Корреляционная фильтрация

In [107]:
def corr_feature_detect(data, threshold=0.8):
    """Group features whose absolute pairwise correlation is high.

    Pairs with ``threshold <= |corr| < 1`` are kept, sorted by descending
    correlation. Walking through the distinct ``feature1`` values in that
    order, every feature not yet assigned to a group opens a new one made of
    all its kept pairs.

    Returns a list of DataFrames with columns ``feature1``, ``feature2`` and
    ``corr``, one frame per group.
    """
    pairs = data.corr().abs().unstack().sort_values(ascending=False)
    pairs = pairs[pairs >= threshold]
    pairs = pairs[pairs < 1]  # drops self-correlations (and exact 1.0 pairs)
    pairs = pd.DataFrame(pairs).reset_index()
    pairs.columns = ['feature1', 'feature2', 'corr']

    already_grouped = set()
    correlated_groups = []
    for anchor in pairs.feature1.unique():
        if anchor in already_grouped:
            continue
        block = pairs[pairs.feature1 == anchor]
        already_grouped.update(block.feature2.unique())
        already_grouped.add(anchor)
        correlated_groups.append(block)
    return correlated_groups

In [108]:
corr = corr_feature_detect(data=X_train, threshold=0.9)
for i in corr:
    print(i, '\n')

          feature1         feature2      corr
0   mean perimeter      mean radius  0.998185
7   mean perimeter        mean area  0.986692
14  mean perimeter  worst perimeter  0.970507
19  mean perimeter     worst radius  0.969520
32  mean perimeter       worst area  0.941920 

           feature1      feature2      corr
12  perimeter error  radius error  0.978323
30  perimeter error    area error  0.944995 

               feature1              feature2      corr
36  mean concave points        mean concavity  0.914627
41  mean concave points  worst concave points  0.906312 

         feature1      feature2      corr
38  worst texture  mean texture  0.908182 



Взаимная информация

In [109]:
def mutual_info(X, y, select_k=10):
    """Select features by mutual information with the target.

    ``select_k >= 1`` keeps that many top features (``SelectKBest``);
    ``0 < select_k < 1`` keeps that fraction of features
    (``SelectPercentile`` with ``percentile=select_k*100``).

    Returns the labels of the selected columns of ``X``.
    Raises ``ValueError`` for any other ``select_k``.
    """
    if select_k >= 1:
        selector = SelectKBest(mutual_info_classif, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(mutual_info_classif, percentile=select_k*100)
    else:
        raise ValueError('select_k должно быть положительным значением')

    support_mask = selector.fit(X, y).get_support()
    return X.columns[support_mask]

In [110]:
mi = mutual_info(X=X_train, y=y_train, select_k=3)
print(mi)

Index(['mean concave points', 'worst perimeter', 'worst area'], dtype='object')


In [111]:
mi = mutual_info(X=X_train, y=y_train, select_k=0.2)
print(mi)

Index(['mean perimeter', 'mean concave points', 'worst radius',
       'worst perimeter', 'worst area', 'worst concave points'],
      dtype='object')


Хи-квадрат тест

In [112]:
def chi_square_test(X, y, select_k=10):
    """Select features by their chi-squared score against the target.

    ``select_k >= 1`` keeps that many top features (``SelectKBest``);
    ``0 < select_k < 1`` keeps that fraction of features
    (``SelectPercentile`` with ``percentile=select_k*100``).

    Returns the labels of the selected columns of ``X``.
    Raises ``ValueError`` for any other ``select_k``.
    """
    if select_k >= 1:
        selector = SelectKBest(chi2, k=select_k)
    elif 0 < select_k < 1:
        selector = SelectPercentile(chi2, percentile=select_k*100)
    else:
        raise ValueError('select_k должно быть положительным значением')

    support_mask = selector.fit(X, y).get_support()
    return X.columns[support_mask]

In [113]:
chi = chi_square_test(X_train, y_train, 3)
print(chi)

Index(['mean area', 'area error', 'worst area'], dtype='object')


In [114]:
chi = chi_square_test(X_train, y_train, 0.2)
print(chi)

Index(['mean perimeter', 'mean area', 'area error', 'worst radius',
       'worst perimeter', 'worst area'],
      dtype='object')


Одномерный ROC-AUC (если задача классификации) или MSE (если задача регрессии)

In [115]:
def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """Score every feature on its own with a single-feature decision tree.

    For each column of ``X_train`` a ``DecisionTreeClassifier`` is fitted on
    that column alone and its ROC-AUC is measured on the matching column of
    ``X_test``. All scores are printed in descending order, followed by the
    number of features above ``threshold`` and the total number of features.

    Returns a Series (indexed by feature label) holding only the scores that
    are strictly greater than ``threshold``.
    """
    scores = []
    for name in X_train.columns:
        tree = DecisionTreeClassifier()
        tree.fit(X_train[name].to_frame(), y_train)
        # Column 1 of predict_proba is used as the ranking score.
        positive_proba = tree.predict_proba(X_test[name].to_frame())[:, 1]
        scores.append(roc_auc_score(y_test, positive_proba))

    roc_values = pd.Series(scores)
    roc_values.index = X_train.columns
    print(roc_values.sort_values(ascending=False))

    keep_col = roc_values[roc_values > threshold]
    print(len(keep_col), len(X_train.columns))
    return keep_col

In [116]:
uni_roc_auc = univariate_roc_auc(X_train, y_train, X_test, y_test, threshold=0.8)
print(uni_roc_auc)

worst perimeter            0.917275
worst area                 0.895840
worst radius               0.893458
worst concave points       0.863131
mean concavity             0.856939
mean radius                0.849000
mean area                  0.839314
worst concavity            0.831375
mean perimeter             0.829628
mean concave points        0.826453
area error                 0.812321
worst compactness          0.742299
radius error               0.740235
mean compactness           0.734360
perimeter error            0.680534
worst texture              0.647666
worst fractal dimension    0.640997
concavity error            0.640203
worst symmetry             0.620991
concave points error       0.618133
compactness error          0.607336
mean symmetry              0.591775
mean texture               0.573357
texture error              0.568593
worst smoothness           0.565100
mean smoothness            0.557637
fractal dimension error    0.542077
smoothness error           0

# Oversampling/undersampling

In [117]:
len(y_train)

455

In [118]:
sm = SMOTE(random_state=42)

In [119]:
X_res, y_res = sm.fit_resample(X_train, y_train)

In [120]:
len(y_res)

580

In [121]:
cn = CondensedNearestNeighbour(random_state=42)

In [122]:
X_res, y_res = cn.fit_resample(X_train, y_train)

In [123]:
len(y_res)

205