In [142]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [None]:
# 1 - k-nearest neighbours (kNN)
# 2 - decision trees
# 3 - logistic regression

In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Global matplotlib settings: 'ggplot' style sheet and a default figure size of 18 x 12 inches.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (18,12)

from ipywidgets import interact, IntSlider, FloatSlider

In [144]:
# Read the train and test tables, using the `_id` column as the row index of each.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='_id')
    for csv_path in ('train.csv', 'test.csv')
)
# The sample submission is read with the default integer index.
df_targ = pd.read_csv('sample_submission.csv')
# Show column names, dtypes and non-null counts of the training table.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24712 entries, df7489733b004bbe40d3d37b34f82419 to 46d0d25dfd1def79632dd437261d0b5c
Data columns (total 21 columns):
age               24712 non-null int64
job               24712 non-null object
marital           24712 non-null object
education         24712 non-null object
default           24712 non-null object
housing           24712 non-null object
loan              24712 non-null object
contact           24712 non-null object
month             24712 non-null object
day_of_week       24712 non-null object
duration          24712 non-null int64
campaign          24712 non-null int64
pdays             24712 non-null int64
previous          24712 non-null int64
poutcome          24712 non-null object
emp.var.rate      24712 non-null float64
cons.price.idx    24712 non-null float64
cons.conf.idx     24712 non-null float64
euribor3m         24712 non-null float64
nr.employed       24712 non-null float64
target            24712 non-null int64

In [145]:
def preproc_data(df_input, drop_zero_duration=None):
    """One-hot encode the categorical columns of a raw data frame.

    The categorical columns are encoded directly from their raw values, so a
    dummy column such as ``job_<value>`` refers to the same category in every
    frame passed through this function. Encoding integer codes produced by
    ``pd.factorize`` instead would number the categories by order of first
    appearance, which differs between frames.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing a ``duration`` column and the categorical columns
        ``job``, ``marital``, ``education``, ``default``, ``housing``,
        ``loan``, ``contact``, ``month``, ``day_of_week`` and ``poutcome``.
        It is not modified.
    drop_zero_duration : bool or None, optional
        Whether to drop rows whose ``duration`` equals 0. With the default
        ``None`` the rows are dropped only when the frame has a ``target``
        column, so unlabeled frames keep every row (and every id) for
        prediction.

    Returns
    -------
    pandas.DataFrame
        A new frame: the non-categorical columns in their original order,
        followed by the dummy columns of each categorical column.
    """
    categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                        'loan', 'contact', 'month', 'day_of_week', 'poutcome']

    df_output = df_input.copy()

    if drop_zero_duration is None:
        # Only labeled frames are filtered; dropping rows of an unlabeled
        # frame would leave those rows without a prediction.
        drop_zero_duration = 'target' in df_output.columns
    if drop_zero_duration:
        df_output = df_output[df_output['duration'] != 0]

    # Dummy columns are named after the category values themselves, which
    # keeps their meaning identical across frames.
    df_output = pd.get_dummies(df_output, prefix=categorical_cols, columns=categorical_cols)

    return df_output

In [256]:
# Run both raw frames through the same preprocessing function.
df_train_preproc = df_train.pipe(preproc_data)
df_test_preproc = df_test.pipe(preproc_data)

# Hold out 20% of the labeled rows for validation. Features and labels are
# split in a single call so each row stays paired with its label without
# relying on two separate calls sharing a seed.
X, X_test, y, y_test = train_test_split(
    df_train_preproc.drop(['target'], axis=1),
    df_train_preproc['target'],
    test_size=0.2,
    random_state=123,
)

# Give the competition test matrix exactly the training columns, in the same
# order: dummy columns missing from the test frame are added as all-zero and
# columns unknown to the training frame are dropped.
df_test_preproc = df_test_preproc.reindex(columns=X.columns, fill_value=0)

In [257]:
# Model 1: k-nearest neighbours with 17 uniformly weighted neighbours and Manhattan distance.
knn = KNeighborsClassifier(metric='manhattan', weights='uniform', n_neighbors=17)
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
           metric_params=None, n_jobs=1, n_neighbors=17, p=2,
           weights='uniform')

In [258]:
# Model 1 (kNN): ROC AUC on the hold-out split, scored with the second
# column of predict_proba.
knn_y_hat = knn.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=knn_y_hat[:, 1])

0.91478787648199

In [172]:
# 2 - decision trees
import subprocess
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [259]:
# Model 2: decision tree with a fixed seed (123); all other hyperparameters are left at their defaults.
df_tree = DecisionTreeClassifier(random_state = 123)

In [260]:
# Fit the tree on the training part of the hold-out split.
df_tree.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [261]:
# Class probabilities predicted by the tree for the hold-out rows.
tree_y_hat = df_tree.predict_proba(X_test)

In [262]:
# Hold-out ROC AUC, scored with the second column of predict_proba.
roc_auc_score(y_test, tree_y_hat[:, 1]) # decision trees

0.721840341409901

In [219]:
# 3 - logistic regression
from sklearn.linear_model import LogisticRegression


In [263]:
# Model 3: L2-penalised logistic regression (C=1.0) with an intercept term.
df_regress = LogisticRegression(penalty='l2', C=1.0, fit_intercept=True)
df_regress.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [264]:
# Class probabilities predicted by the logistic regression for the hold-out rows.
reg_y_hat = df_regress.predict_proba(X_test)

In [265]:
# Hold-out ROC AUC of the logistic regression, scored with the second column of predict_proba.
roc_auc_score(y_test, reg_y_hat[:, 1])

0.9249502744680266

In [266]:
# Build the submission in one constructor call rather than enlarging an
# empty frame column by column with .loc: one row per row of
# df_test_preproc, carrying its index value as `_id` and the second column
# of the logistic regression's predict_proba as `target`.
df_submit = pd.DataFrame({
    '_id': df_test_preproc.index,
    'target': df_regress.predict_proba(df_test_preproc)[:, 1],
})
df_submit.to_csv('submit.csv', sep=',', index=False)