In [2]:
from lib.project_5 import load_data_from_database, make_data_dict, general_model, general_transformer

# Step 3 - Build Model

**NOTE: EACH OF THESE SHOULD BE WRITTEN SOLELY WITH REGARD TO STEP 3 - Build Model**

### Domain and Data

**TODO:** Write a simple statement about the domain of your problem and the dataset upon which you will be working. 

### Problem Statement

**TODO:** At this point we want the best-performing model that uses as few features as possible, without harming accuracy or risking underfitting. 

### Solution Statement

**TODO:** We will implement a kNN classifier along with a SelectKBest transformer. For the sake of experimentation, we will also run a logistic regression model through the same grid search to compare the scores.

### Metric

**TODO**: Accuracy score. 

### Benchmark

**TODO**: We started from a benchmark accuracy score of 0.53. The final accuracy score is 0.882.

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5.py`.

<img src="assets/build_model.png" width="600px">

	1. construct a Pipeline that uses SelectKBest to transform data
	2. construct a Pipeline that uses LogisticRegression to model data
	3. construct a Pipeline that uses KNearestNeighbors to model data
	4. Gridsearch optimal parameters for logistic regression and KNN

In [3]:
import numpy as np
import pandas as pd

from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier


In [4]:
# Load the data through the project helper and discard the stored 'index' column.
# Arguments are presumably host, port, database, table, user, password --
# TODO confirm against lib/project_5.py.
# NOTE(review): the connection settings, including the password, are hardcoded
# in the notebook; consider reading them from the environment instead.
db_settings = ('joshuacook.me', '5432',
               'dsi', 'madelon', 'dsi_student',
               'correct horse battery staple')
madelon_df = load_data_from_database(*db_settings).drop('index', axis=1)

# Separate the target column from the remaining feature columns.
y = madelon_df['label']
X = madelon_df.drop('label', axis=1)

# Build the data dictionary with a fixed seed, then pass it through a
# StandardScaler via the project's generic transformer helper.
split_data_dict = make_data_dict(X, y, random_state=42)
scaled_data = general_transformer(StandardScaler(), split_data_dict)

### Gridsearch | KBest: 
#####        Logistic Regression Model 

In [5]:
# Sweep the number of features kept by SelectKBest; for every k, run a grid
# search over the logistic-regression penalty type and regularization strength.
feat_sel_params = range(3,30)
gridsearch_lr_params = dict(
    penalty=["l1", "l2"],
    C=np.linspace(0.001,1,100),
)

# One result per k, in the same order as feat_sel_params.
gridsearch_result_list_lr = []

for k in feat_sel_params:
    # Reduce the already-scaled data to the k best features.
    final_data = general_transformer(SelectKBest(k=k), scaled_data)
    gridsearch_result = GridSearchCV(estimator=LogisticRegression(),
                                     param_grid=gridsearch_lr_params)
    # After the loop, gridsearch_lr still refers to the result for the last k.
    gridsearch_lr = general_model(gridsearch_result, final_data)
    gridsearch_result_list_lr += [gridsearch_lr]

In [8]:
# Display every collected logistic-regression result (one entry per k).
# NOTE(review): this dumps the full repr of each GridSearchCV object; a compact
# table of the scores would be easier to read.
gridsearch_result_list_lr

[{'model': GridSearchCV(cv=None, error_score='raise',
         estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False),
         fit_params={}, iid=True, n_jobs=1,
         param_grid={'penalty': ['l1', 'l2'], 'C': array([ 0.001  ,  0.01109, ...,  0.98991,  1.     ])},
         pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
         scoring=None, verbose=0),
  'test_score': 0.59333333333333338,
  'train_score': 0.62285714285714289},
 {'model': GridSearchCV(cv=None, error_score='raise',
         estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=

In [9]:
# Summarize the grid search held in `gridsearch_lr`.
# NOTE: `gridsearch_lr` is the result left behind by the last iteration of the
# feature-selection loop (the largest k), not the best result across all k.
# str.format is used so the output is a plain line of text under both
# Python 2 and Python 3; `print ("label", value)` prints a tuple repr on Python 2.
lr_search = gridsearch_lr["model"]
print("Best parameters : {}".format(lr_search.best_params_))
print("Best score : {}".format(lr_search.best_score_))
print("Best estimator : {}".format(lr_search.best_estimator_))

('Best parameters : ', {'penalty': 'l2', 'C': 0.021181818181818184})
('Best score : ', 0.64642857142857146)
('Best estimator : ', LogisticRegression(C=0.021181818181818184, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))


In [10]:
# Tabulate the cross-validation results of the grid search held in
# `gridsearch_lr` and show the five best-ranked parameter combinations.
gridsearch_lr_df = pd.DataFrame(gridsearch_lr['model'].cv_results_)
# DataFrame.sort(columns=...) is deprecated and removed in later pandas
# releases; sort_values(by=...) is the supported equivalent.
gridsearch_lr_df.sort_values(by='rank_test_score').head(5)

  from ipykernel import kernelapp as app


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_penalty,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
5,0.010333,0.0,0.646429,0.667854,0.0211818,l2,"{u'penalty': u'l2', u'C': 0.0211818181818}",1,0.638116,0.676313,0.642398,0.649518,0.658798,0.67773,0.007318,0.0,0.008911,0.012978
26,0.010333,0.000333,0.642143,0.670356,0.132182,l1,"{u'penalty': u'l1', u'C': 0.132181818182}",2,0.635974,0.681672,0.642398,0.65702,0.648069,0.672377,0.000943,0.000471,0.00494,0.010165
7,0.006333,0.0,0.642143,0.66964,0.0312727,l2,"{u'penalty': u'l2', u'C': 0.0312727272727}",2,0.62955,0.676313,0.642398,0.653805,0.654506,0.678801,0.008957,0.0,0.010188,0.011243
21,0.004667,0.000333,0.642143,0.671781,0.101909,l2,"{u'penalty': u'l2', u'C': 0.101909090909}",2,0.627409,0.676313,0.640257,0.653805,0.658798,0.685225,0.000471,0.000471,0.012882,0.013221
19,0.004667,0.000333,0.642143,0.672495,0.0918182,l2,"{u'penalty': u'l2', u'C': 0.0918181818182}",2,0.627409,0.678457,0.640257,0.653805,0.658798,0.685225,0.000471,0.000471,0.012882,0.013502


#####        kNN Model 

In [11]:
# Sweep the number of features kept by SelectKBest; for every k, grid-search
# the number of neighbors of a kNN classifier.
featsel_kn_params = range(3, 20)
gridSch_params = {
    # list(...) keeps the grid a plain list on both Python 2 and Python 3.
    'n_neighbors' : list(range(3,10,2))
}
# One result per k, in the same order as featsel_kn_params.
gridsearch_result_list_kNN = []

# Iterate over the kNN-specific range defined above. The previous version
# looped over the logistic-regression cell's `feat_sel_params`, so the list
# did not line up with `featsel_kn_params`, which later code uses to index it.
for k in featsel_kn_params:
    final_data = general_transformer(SelectKBest(k=k), scaled_data)
    gridsearch_result = GridSearchCV(KNeighborsClassifier(),
                                     param_grid=gridSch_params)
    # Use a kNN-specific name so the logistic-regression result stored in
    # `gridsearch_lr` is not overwritten.
    gridsearch_knn = general_model(gridsearch_result, final_data)
    gridsearch_result_list_kNN.append(gridsearch_knn)

gridsearch_result_list_kNN

[{'model': GridSearchCV(cv=None, error_score='raise',
         estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=5, p=2,
             weights='uniform'),
         fit_params={}, iid=True, n_jobs=1,
         param_grid={'n_neighbors': [3, 5, 7, 9]}, pre_dispatch='2*n_jobs',
         refit=True, return_train_score=True, scoring=None, verbose=0),
  'test_score': 0.56833333333333336,
  'train_score': 0.68857142857142861},
 {'model': GridSearchCV(cv=None, error_score='raise',
         estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=5, p=2,
             weights='uniform'),
         fit_params={}, iid=True, n_jobs=1,
         param_grid={'n_neighbors': [3, 5, 7, 9]}, pre_dispatch='2*n_jobs',
         refit=True, return_train_score=True, scoring=None, verbose=0),
  'test_score': 0.57166666666666666,
  'train_score': 0.

In [21]:
# Pick the number of selected features whose kNN grid search scored highest
# on the test split, then report that result.
# NOTE(review): choosing k by test score makes the reported test score an
# optimistic estimate; a separate validation split would avoid that.
k_min = min(featsel_kn_params)

# Pair each candidate k with its stored result. zip stops at the shorter
# sequence, so only results that correspond to featsel_kn_params are considered.
knn_results_by_k = list(zip(featsel_kn_params, gridsearch_result_list_kNN))

# max() returns the first maximum, matching the earlier "strictly greater"
# update rule on ties, and always binds best_k even if every score is 0.
best_k, best_kNN_result = max(knn_results_by_k,
                              key=lambda pair: pair[1]['test_score'])
best_kNN_score = best_kNN_result['test_score']

# str.format keeps the output identical in shape on Python 2 and Python 3;
# the form `print ("label"), value` drops the value under Python 3.
print("best model test score : {}".format(best_kNN_score))
print("best model train score : {}".format(best_kNN_result['train_score']))
print("best k : {}".format(best_k))
print("best model : {}".format(best_kNN_result['model']))

best model test score :  0.881666666667
best model train score :  0.917142857143
best k :  13
best model :  GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [3, 5, 7, 9]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)
