In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,8)

from ipywidgets import interact, IntSlider, FloatSlider

In [2]:
df_train = pd.read_csv('data/train.csv', sep=',', encoding='utf8')
df_test = pd.read_csv('data/test.csv', sep=',', encoding='utf8')
df_target = pd.read_csv('data/sample_submission.csv', sep=',', encoding='utf8')

In [3]:
def prepTrainXY ( dfIn, clrLvl ):
    dfXOut = dfIn.copy()
    # деперсонифицируем выборку
    dfXOut = dfXOut.drop(['_id', 'contact', 'month', 'day_of_week', 'poutcome', 'pdays'], axis=1)
    if clrLvl['1'][0]  == 1:
        dfXOut = dfXOut.drop(['marital'], axis=1)
    if clrLvl['2'][0]  == 1:
        dfXOut = dfXOut.drop(['education'], axis=1)
    if clrLvl['3'][0]  == 1:
        dfXOut = dfXOut.drop(['job'], axis=1)
    if clrLvl['4'][0]  == 1:
        dfXOut = dfXOut.drop(dfXOut[dfXOut['default']=='unknown'].index, axis=0)
        dfXOut = dfXOut.drop(dfXOut[dfXOut['housing']=='unknown'].index, axis=0)
        dfXOut = dfXOut.drop(dfXOut[dfXOut['loan']=='unknown'].index, axis=0)
        #dfXOut = dfXOut.drop(['housing']=='unknown', axis=0)
        #dfXOut = dfXOut.drop(['loan']=='unknown', axis=0)
    if clrLvl['5'][0]  == 1:
        dfXOut = dfXOut.drop(['emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed' ], axis=1)
    if clrLvl['6'][0]  == 1:
        dfXOut = dfXOut.drop(['campaign'], axis=1)
    if clrLvl['7'][0]  == 1:
        dfXOut = dfXOut.drop(['age'], axis=1)
    
    # удаляю оставшиеся записи с NaN значениями
    dfXOut = dfXOut.dropna(axis=0)    
    
    try:
        yOut = dfXOut['target']
        dfXOut = dfXOut.drop(['target'], axis=1)
    except Exception:
        yOut = []
        
        
    # Кодировка категориальных признаков
    dfXOut = pd.get_dummies(dfXOut)
    
    return dfXOut, yOut

In [4]:
clrLvl = pd.DataFrame([[0, 1, 1, 0, 1, 1, 1] ], columns=list('1234567'))
X, y = prepTrainXY ( df_train, clrLvl )

In [5]:
# clrLvl = pd.DataFrame([[1, 1, 1, 0, 1, 1, 1] ], columns=list('1234567'))
tmpX_test = df_test.join(df_target.set_index('_id'), on='_id')
X_test, y_test = prepTrainXY ( tmpX_test, clrLvl )

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [7]:
model = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(C=1.0, fit_intercept=True, penalty='l2'))
])

In [8]:
model.fit( X, y )

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [9]:
knn_predict = model.predict(X_test)

In [10]:
knn_predict_proba = model.predict_proba(X_test)
knn_predict_proba.shape

(16476, 2)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('precision', precision_score(y_test, knn_predict))
print('recall', recall_score(y_test, knn_predict))
print('accuracy', accuracy_score(y_test, knn_predict))
print('f1', f1_score(y_test, knn_predict))

precision 0.0
recall 0.0
accuracy 0.9588492352512746
f1 0.0


In [12]:
from sklearn.metrics import roc_auc_score, roc_curve

In [13]:
roc_auc_score(y_test, knn_predict)

0.479453717754173

In [14]:
knn_predict_df = pd.DataFrame(data=knn_predict)
knn_predict_df.to_csv('knn_predict_df.csv', sep='\t', index=False)

In [168]:
knn_predict_df

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [162]:
knn_predict_df['_id'] = df_target['_id']

In [174]:
df_final = df_target.copy()
df_final['target'] = knn_predict_df[0]
df_final.to_csv('knn_submission.csv', sep=',', index=False)

In [169]:
df_target

Unnamed: 0,_id,target
0,66810d8e6bf2b41c880a7bc6c8a1e295,1
1,ccac3879652b08cb8b44c1920fd93afa,0
2,fcccab4d7a76f70647f015f2c84c2af8,0
3,ed8399278c30678dab739045fa12b440,0
4,1d4d62ac5cabcb48bac7112813f290cb,0
5,aba2dec4c5cab88824f36babd24b986f,0
6,06f318f1dd178e738f675bb88a5adb84,0
7,d5036f5956e42ee6207296238fc4bc1d,0
8,fe0cc8933698ad4046ff2b82f65756eb,0
9,70190122f4ebf196535e11b33eb95b81,0


In [170]:
knn_predict_df

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
