In [82]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict 
from sklearn.model_selection import KFold 
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [83]:
adult = pd.read_csv('adult.csv')
customer_churn = pd.read_csv('Customer-Churn.csv')
employee = pd.read_csv('employee.csv')

## Adult

### FreqEncoder

In [86]:
t = pd.value_counts(adult['marital-status'])
adult['marital-status'].apply(lambda x : t[x])

0        16117
1        22379
2        22379
3        22379
4        16117
         ...  
48837    22379
48838    22379
48839     1518
48840    16117
48841    22379
Name: marital-status, Length: 48842, dtype: int64

### LabelEncoder & OneHotEncoder

In [90]:
X_features = pd.DataFrame()
X_features[['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']] =  adult[['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']]

In [107]:
X = pd.concat((X_features, pd.get_dummies(adult[['workclass','education', 
                                                     'marital-status','occupation','relationship',
                                                     'race','gender','native-country']])), axis=1)

In [113]:
y = adult['income'].map(dict(zip(pd.unique(adult['income']), range(2))))

In [115]:
cv = KFold(n_splits=5, shuffle=True, random_state=1)
cls = RandomForestClassifier(n_estimators=50, 
                             max_features=5, 
                             bootstrap=True, 
                             random_state=42)
a_rf = cross_val_predict(cls, X, y, cv=cv) # ответы rf на CV

In [118]:
roc_auc_score(a_rf.astype(int), y.astype(int))

0.8010542759522385

### Freq Encoder

In [119]:
lis = ['workclass','education', 'marital-status','occupation','relationship', 'race','gender','native-country']

In [121]:
for i in lis:
    t = pd.value_counts(adult[i])
    X_features[i] = adult[i].apply(lambda x : t[x])

In [122]:
X_features

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender,native-country
0,25,226802,7,0,0,40,33906,1812,16117,3022,7581,4685,32650,43832
1,38,89814,9,0,0,50,33906,15784,22379,1490,19716,41762,32650,43832
2,28,336951,12,0,0,40,3136,1601,22379,983,19716,41762,32650,43832
3,44,160323,10,7688,0,40,33906,10878,22379,3022,19716,4685,32650,43832
4,18,103497,10,0,0,30,2799,10878,16117,2809,7581,41762,16192,43832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,0,0,38,33906,1601,22379,1446,2331,41762,16192,43832
48838,40,154374,9,0,0,40,33906,15784,22379,3022,19716,41762,32650,43832
48839,58,151910,9,0,0,40,33906,15784,1518,5611,5125,41762,16192,43832
48840,22,201490,9,0,0,20,33906,15784,16117,5611,7581,41762,32650,43832


In [123]:
freq_encoder = cross_val_predict(cls, X_features, y, cv=cv) # ответы rf на CV

In [124]:
roc_auc_score(freq_encoder.astype(int), y.astype(int))

0.8118844604190287

#### LeaveOneOutEncoder

In [162]:
X_features = pd.DataFrame()
X_features[['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']] =  \
    adult[['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']]

In [163]:
adult['income'] = adult['income'].map(dict(zip(pd.unique(adult['income']), range(2))))
lis = ['workclass','education', 'marital-status','occupation','relationship', 'race','gender','native-country']

In [243]:
def LeaveOneOutEncoder(ser):
    step1 = adult[ser.index]
    step2 = step1.drop(ser.name)
    step3 = step2[step2[ser.index[0]]==ser.iloc[0]]
    if ser.name % 1000==0:
        print(f'iteration number: {ser.name}')
    result = pd.Series(step3['income'].sum() / step3.shape[0])
    return result

In [244]:
for bale in lis:
    print(f'Preprocessing LeaveOneOutEncoder: {bale}')
    X_features[bale] = adult[[bale, 'income']].apply(LeaveOneOutEncoder, axis=1)

Preprocessing LeaveOneOutEncoder: workclass
iteration number: 0
iteration number: 1000
iteration number: 2000
iteration number: 3000
iteration number: 4000
iteration number: 5000
iteration number: 6000
iteration number: 7000
iteration number: 8000
iteration number: 9000
iteration number: 10000
iteration number: 11000
iteration number: 12000
iteration number: 13000
iteration number: 14000
iteration number: 15000
iteration number: 16000
iteration number: 17000
iteration number: 18000
iteration number: 19000
iteration number: 20000
iteration number: 21000
iteration number: 22000
iteration number: 23000
iteration number: 24000
iteration number: 25000
iteration number: 26000
iteration number: 27000
iteration number: 28000
iteration number: 29000
iteration number: 30000
iteration number: 31000
iteration number: 32000
iteration number: 33000
iteration number: 34000
iteration number: 35000
iteration number: 36000
iteration number: 37000
iteration number: 38000
iteration number: 39000
iteration

iteration number: 39000
iteration number: 40000
iteration number: 41000
iteration number: 42000
iteration number: 43000
iteration number: 44000
iteration number: 45000
iteration number: 46000
iteration number: 47000
iteration number: 48000
Preprocessing LeaveOneOutEncoder: native-country
iteration number: 0
iteration number: 1000
iteration number: 2000
iteration number: 3000
iteration number: 4000
iteration number: 5000
iteration number: 6000
iteration number: 7000
iteration number: 8000
iteration number: 9000
iteration number: 10000
iteration number: 11000
iteration number: 12000
iteration number: 13000
iteration number: 14000
iteration number: 15000
iteration number: 16000
iteration number: 17000
iteration number: 18000
iteration number: 19000
iteration number: 20000
iteration number: 21000
iteration number: 22000
iteration number: 23000
iteration number: 24000
iteration number: 25000
iteration number: 26000
iteration number: 27000
iteration number: 28000
iteration number: 29000
iter

  result = pd.Series(step3['income'].sum() / step3.shape[0])


iteration number: 36000
iteration number: 37000
iteration number: 38000
iteration number: 39000
iteration number: 40000
iteration number: 41000
iteration number: 42000
iteration number: 43000
iteration number: 44000
iteration number: 45000
iteration number: 46000
iteration number: 47000
iteration number: 48000


In [250]:
X_features_LOU = X_features.fillna(0)

In [251]:
leave_one_out_encoder = cross_val_predict(cls, X_features_LOU, y, cv=cv) # ответы rf на CV
roc_auc_score(leave_one_out_encoder.astype(int), y.astype(int))

0.9999865432231673

### Что произошло почему проверим

In [261]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_features_LOU, y, test_size=0.33, random_state=42)

In [264]:
cls.fit(X_train, y_train)

RandomForestClassifier(max_features=5, n_estimators=50, random_state=42)

In [265]:
roc_auc_score(cls.predict(X_test), y_test)

1.0

In [266]:
from sklearn.linear_model import LogisticRegression

In [268]:
s = LogisticRegression().fit(X_train, y_train)

In [270]:
roc_auc_score(s.predict(X_test), y_test)

0.770741083094878

Как дурачок сижу радуюсь.

**Вывод**: были протестированы 3 способа кодирования, `LeaveOneOut` показал невероятный скачок, правда кодирование проводилось около 1.5 часов.