Обработка категориальных признаков 

In [46]:
import pandas as pd
import numpy as np
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics
import sklearn.preprocessing
import sklearn.feature_extraction

%matplotlib inline

import matplotlib.pyplot as plt
from pprint import pformat
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')

In [63]:
df = pd.read_csv('bank-additional-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [64]:
# Функция для замены бинарных признаков на 1 и 0
# word_binar: в зависимости от того, какое значение в столбце "column_name"
def categor_repl(data, column_name, word_binar): 
    for col in [column_name]:
        data[col] = np.where(data[col] == word_binar, 1, 0)
    labels = data[column_name]

In [65]:
categor_repl(df,'loan', 'yes')
categor_repl(df,'housing', 'yes')
categor_repl(df,'default', 'yes')
categor_repl(df,'contact', 'telephone')
labels = df['y']
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,0,0,0,1,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,0,0,0,1,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,0,1,0,1,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,0,0,0,1,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,0,0,1,1,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [66]:
print('train shape: ', df.shape)

train shape:  (41188, 21)


In [67]:
# Записываем категориальные фичи
cat_features = df.columns[df.dtypes == 'object'].tolist()

In [68]:
cat_features

['job', 'marital', 'education', 'month', 'day_of_week', 'poutcome', 'y']

In [69]:
cat_features = cat_features[0:len(cat_features)-1]

In [70]:
cat_features

['job', 'marital', 'education', 'month', 'day_of_week', 'poutcome']

In [71]:
# Записываем непрерывные фичи
numerical_feature = []

for col_name in df.columns:
    if col_name not in cat_features+['y']:
        numerical_feature = numerical_feature + [col_name]

In [72]:
numerical_feature

['age',
 'default',
 'housing',
 'loan',
 'contact',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed']

In [73]:
# Обработка категориальных признаков с помощью One Hot Encoding
df_concat_cat_features = pd.get_dummies(df[cat_features])

# Удаление категориальных переменных
df.drop(cat_features, axis = 1, inplace = True)

# Присоединение новых
df = pd.concat([df_concat_cat_features, df], axis = 1 )

In [74]:
df.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,0,0,0,1,0,0,0,0,0,0,...,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no
1,0,0,0,0,0,0,0,1,0,0,...,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no
2,0,0,0,0,0,0,0,1,0,0,...,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no
3,1,0,0,0,0,0,0,0,0,0,...,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no
4,0,0,0,0,0,0,0,1,0,0,...,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no


In [75]:
df['y']

0         no
1         no
2         no
3         no
4         no
5         no
6         no
7         no
8         no
9         no
10        no
11        no
12        no
13        no
14        no
15        no
16        no
17        no
18        no
19        no
20        no
21        no
22        no
23        no
24        no
25        no
26        no
27        no
28        no
29        no
        ... 
41158    yes
41159    yes
41160    yes
41161     no
41162     no
41163    yes
41164    yes
41165    yes
41166    yes
41167     no
41168     no
41169     no
41170     no
41171    yes
41172    yes
41173    yes
41174    yes
41175     no
41176     no
41177     no
41178    yes
41179     no
41180     no
41181    yes
41182     no
41183    yes
41184     no
41185     no
41186    yes
41187     no
Name: y, Length: 41188, dtype: object

In [76]:
print('train shape: ', df.shape)

train shape:  (41188, 57)


In [None]:
# Для сравнения кач-ва можно в линейных моделях попробовать как стандратизированные данные, так и нет

scaler=StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_feature]), columns = numerical_feature)
test_scaled = pd.DataFrame(scaler.fit_transform(test[numerical_feature]), columns = numerical_feature)

Модель

In [77]:
df.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,0,0,0,1,0,0,0,0,0,0,...,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no
1,0,0,0,0,0,0,0,1,0,0,...,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no
2,0,0,0,0,0,0,0,1,0,0,...,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no
3,1,0,0,0,0,0,0,0,0,0,...,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no
4,0,0,0,0,0,0,0,1,0,0,...,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,no


In [79]:
d = {'yes': 1, 'no': 0}
df['y'] = df['y'].map(d)   
df['y'] = df['y'].astype('int64')


In [80]:
df.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,0,0,0,1,0,0,0,0,0,0,...,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
1,0,0,0,0,0,0,0,1,0,0,...,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
2,0,0,0,0,0,0,0,1,0,0,...,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
3,1,0,0,0,0,0,0,0,0,0,...,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
4,0,0,0,0,0,0,0,1,0,0,...,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0


In [81]:
df.corr()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
job_admin.,1.0,-0.313313,-0.111417,-0.094595,-0.160892,-0.121502,-0.110021,-0.190063,-0.085748,-0.257516,...,-0.008918,0.011745,-0.025398,0.018888,-0.024572,-0.04184,0.032341,-0.023831,-0.023446,0.031426
job_blue-collar,-0.313313,1.0,-0.10305,-0.087492,-0.14881,-0.112378,-0.101759,-0.175791,-0.079308,-0.238178,...,0.012992,-0.001775,0.065335,-0.054845,0.057264,0.075322,-0.101567,0.046775,0.063921,-0.074423
job_entrepreneur,-0.111417,-0.10305,1.0,-0.031113,-0.052918,-0.039962,-0.036186,-0.062513,-0.028203,-0.084698,...,0.003679,-0.002203,0.019246,-0.013239,0.009363,0.009825,-0.032306,0.018744,0.024581,-0.016644
job_housemaid,-0.094595,-0.087492,-0.031113,1.0,-0.044929,-0.033929,-0.030723,-0.053075,-0.023945,-0.071911,...,-0.004908,0.004226,-0.001649,-0.011569,0.036367,0.028335,0.035374,0.036392,0.028105,-0.006505
job_management,-0.160892,-0.14881,-0.052918,-0.044929,1.0,-0.057708,-0.052255,-0.090272,-0.040726,-0.122309,...,-0.001308,-0.009135,0.000254,0.006735,-0.016642,-0.025268,0.000784,-0.00159,-0.001475,-0.000419
job_retired,-0.121502,-0.112378,-0.039962,-0.033929,-0.057708,1.0,-0.039462,-0.068171,-0.030756,-0.092365,...,0.012421,-0.006847,-0.072084,0.065113,-0.103678,-0.052251,0.087033,-0.102454,-0.129367,0.092221
job_self-employed,-0.110021,-0.101759,-0.036186,-0.030723,-0.052255,-0.039462,1.0,-0.06173,-0.027849,-0.083637,...,0.00427,0.006361,0.014307,-0.011231,0.001477,-0.005122,0.000592,0.007421,0.009519,-0.004663
job_services,-0.190063,-0.175791,-0.062513,-0.053075,-0.090272,-0.068171,-0.06173,1.0,-0.04811,-0.144485,...,0.000142,0.002383,0.030573,-0.011885,0.019431,0.033282,-0.055561,0.014666,0.020629,-0.032301
job_student,-0.085748,-0.079308,-0.028203,-0.023945,-0.040726,-0.030756,-0.027849,-0.04811,1.0,-0.065185,...,0.014432,-0.024657,-0.096367,0.104671,-0.139725,-0.062117,0.010029,-0.147551,-0.165365,0.093955
job_technician,-0.257516,-0.238178,-0.084698,-0.071911,-0.122309,-0.092365,-0.083637,-0.144485,-0.065185,1.0,...,-0.013742,0.001557,0.004575,-0.017142,0.054268,-0.010849,0.054973,0.050793,0.052741,-0.006149


In [82]:
X = df.drop('y', axis=1)
y = df['y']

In [83]:
def get_pred_res(X,y):
    from sklearn.model_selection import train_test_split, StratifiedKFold
    X_train, X_holdout, y_train, y_holdout = train_test_split(X.values, y, test_size=0.3,
                                                          random_state=17)

    logit = sklearn.linear_model.LogisticRegression(n_jobs=-1, random_state=7)
    logit.fit(X_train, y_train)
    
    svm = sklearn.svm.LinearSVC(random_state=7)
    svm.fit(X_train, y_train)
    
    from sklearn.metrics import accuracy_score
    logit_pred = logit.predict(X_holdout)
    svm_pred = svm.predict(X_holdout)

    # Подберем коэффициент регуляризации для логистической регрессии
    
    from sklearn.model_selection import GridSearchCV

    param_grid_logit = {'C': 
                    np.logspace(-5, 0, 6)}
    grid_logit = GridSearchCV(logit, param_grid_logit, cv=3, n_jobs=-1)

    grid_logit.fit(X_train, y_train)
    
    # То же самое для LinearSVC.
    
    param_grid_svm = {'C': np.logspace(-5, 0, 6)}
    grid_svm = GridSearchCV(svm, param_grid_svm, cv=3, n_jobs=-1)
  
    grid_svm.fit(X_train, y_train) 
    
    pred_result=pd.DataFrame()
    pred_result.loc[0,'model'] = 'SVM'
    pred_result.loc[0,'accuracy_score'] = accuracy_score(y_holdout, svm_pred)
    pred_result.loc[0,'best_param'] = grid_svm.best_params_
    pred_result.loc[0,'best_score'] = grid_svm.best_score_
    pred_result.loc[1,'model'] = 'logit'
    pred_result.loc[1,'accuracy_score'] = accuracy_score(y_holdout, logit_pred)
    pred_result.loc[1,'best_param'] = grid_logit.best_params_
    pred_result.loc[1,'best_score'] = grid_logit.best_score_
        
    return pred_result
    
    

In [84]:
pred_result = get_pred_res(X,y)

In [85]:
pred_result

Unnamed: 0,model,accuracy_score,best_param,best_score
0,SVM,0.885409,C,0.90531
1,logit,0.907745,C,0.91079
