In [1]:
import os
import time
import pickle
import itertools
import json

import pandas as pd
import numpy as np

from copy import copy

from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from bvkpy.ml_utils import cm_score
from bvkpy.ml_utils import GridSearcher

In [2]:
# Root folder that holds the output of every experiment family.
experiments_dir = 'experiments'

# Folder for the LogisticRegression runs. The variable is called `app_knn_dir`
# even though it points at the logistic-regression folder; the name is kept
# because later cells refer to it.
app_knn_dir = os.path.join(experiments_dir, 'app_logistic_regression')

# makedirs creates the parent folder as well and tolerates an already existing
# directory, so re-running this cell is safe and there is no
# check-then-create race.
os.makedirs(app_knn_dir, exist_ok=True)

msg = (
    'WARNING this file is generated automatically. Do not edit it.\n\n'
    'This directory contains experiment with scikit-learn LogisticRegression model'
    '\n'
)

# The note is rewritten on every run of the cell.
with open(os.path.join(app_knn_dir, 'note.txt'), 'w') as fp:
    fp.write(msg)

In [3]:
train_file = 'data/adult.data.clean'
# The data file is read without a header row; the column names are stored in
# a separate comma-separated text file.
df = pd.read_csv(train_file, header=None)

with open('data/col_names.txt', 'r') as fp:
    # Strip every name so that a trailing newline or padding around the commas
    # in the names file does not end up inside a column label (e.g. 'income\n'),
    # which would break label-based lookups later on.
    df.columns = [name.strip() for name in fp.read().split(',')]

In [4]:
# Preview the first rows to confirm the column names were applied.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [5]:
# Label-based slice: every column from the first one through 'native_country'
# (inclusive) is a feature; 'income' is the target.
X = df.loc[:, :'native_country']
y = df['income']

In [6]:
# Train/test split configuration: RNG seed and the fraction of rows held out.
seed, test_size = 7, 0.33

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)

# Hyper-parameter grid: both penalties crossed with ten evenly spaced C values
# from 0.1 to 1.0. Kept under its own name so that `opts` below only ever
# means "the best parameter set".
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': list(np.linspace(0.1, 1, 10)),
}

data = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}

# The grid contains the 'l1' penalty, which not every solver supports. Name
# liblinear explicitly instead of relying on the library default, which
# differs between scikit-learn releases.
estimator = LogisticRegression(solver='liblinear')

# NOTE(review): GridSearcher receives the test split as well. If it scores
# candidates on X_test/y_test, test metrics reported afterwards are optimistic
# -- confirm against bvkpy.ml_utils.
gs = GridSearcher(estimator, data)

# `opts` is the winning parameter set; later cells pass it to run_experiment.
opts, score = gs.search(param_grid)

print('best score', score)
print('best params', opts)

best score 0.6918588701914204
best params {'penalty': 'l1', 'C': 0.2}


In [8]:
def _json_default(value):
    """Turn NumPy scalars/arrays into plain Python objects for json.dump."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(
        'Object of type %s is not JSON serializable' % type(value).__name__
    )


def run_experiment(opts, data, intel=None):
    """Fit a LogisticRegression, score it and persist the run on disk.

    A new sub-directory of the module-level ``app_knn_dir`` is created, named
    after the UTC end timestamp of the run, and receives three files:
    ``model.pkl`` (the fitted estimator), ``opts.json`` (the parameters) and
    ``results.cvs`` (the one-row results table).

    Parameters
    ----------
    opts : dict
        Keyword parameters applied to the estimator through ``set_params``.
    data : dict
        Must provide the keys 'X_train', 'y_train', 'X_test' and 'y_test'.
    intel : dict, optional
        Extra key/value pairs copied into the results row. The caller's dict
        is not modified.

    Returns
    -------
    pandas.DataFrame
        A single-row frame holding the ``intel`` entries plus 'accuracy',
        'cm_accuracy', 'time_start' and 'time_end'.

    Raises
    ------
    FileExistsError
        If a directory for the same end timestamp already exists, i.e. two
        runs finished within the same second.
    """
    timestamp_format = "%Y-%m-%d-%H-%M-%S"
    experiment_start = time.strftime(timestamp_format, time.gmtime())

    # Name the solver explicitly: `opts` may request the 'l1' penalty, which
    # the default solver of newer scikit-learn releases rejects. A 'solver'
    # entry in `opts` still takes precedence through set_params.
    estimator = LogisticRegression(solver='liblinear')
    estimator.set_params(**opts)
    estimator.fit(data['X_train'], data['y_train'])

    # Start from a shallow copy so the caller's `intel` stays untouched.
    res = copy(intel) if intel else {}
    res['accuracy'] = estimator.score(data['X_test'], data['y_test'])

    y_pred = estimator.predict(data['X_test'])
    cm = confusion_matrix(data['y_test'], y_pred)
    res['cm_accuracy'] = cm_score(cm)

    experiment_end = time.strftime(timestamp_format, time.gmtime())
    res['time_start'] = experiment_start
    res['time_end'] = experiment_end

    results_df = pd.DataFrame(res, index=[0])

    # The directory name equals the 'time_end' value, which is how callers
    # locate the saved artefacts afterwards.
    experiment_dir = os.path.join(app_knn_dir, experiment_end)
    os.mkdir(experiment_dir)

    joblib.dump(estimator, os.path.join(experiment_dir, 'model.pkl'))

    with open(os.path.join(experiment_dir, 'opts.json'), 'w') as fp:
        # Grid values frequently come from NumPy (np.linspace, np.arange);
        # the default hook keeps integer scalars and arrays serializable.
        json.dump(opts, fp, default=_json_default)

    # NOTE(review): 'results.cvs' looks like a typo for 'results.csv'. The
    # name is left unchanged here because other tooling may already read it.
    results_df.to_csv(os.path.join(experiment_dir, 'results.cvs'), index=False)

    return results_df

In [9]:
# Extra metadata stored alongside the metrics in the results row
# (presumably 0 means the features were not normalized -- confirm).
intel = dict(data_normalized=0)

res = run_experiment(opts, data, intel)
res

Unnamed: 0,accuracy,cm_accuracy,data_normalized,time_end,time_start
0,0.820957,0.687129,0,2018-03-02-22-34-10,2018-03-02-22-34-10


In [11]:
# Sanity check: reload the model saved by the last run and score it on the
# test split. The run directory is named after the run's 'time_end' value.

saved_model_path = os.path.join(app_knn_dir, res.loc[0, 'time_end'], 'model.pkl')
estimator2 = joblib.load(saved_model_path)
estimator2.score(X_test, y_test)

0.8209566350269868