In [2]:
from lib.project_5 import load_data_from_database, make_data_dict, general_model, general_transformer

# Step 2 - Identify Salient Features Using $\ell1$-penalty

**NOTE: EACH OF THESE SHOULD BE WRITTEN SOLELY WITH REGARD TO STEP 2 - Identify Features**

### Domain and Data

**TODO:** Write a simple statement about the domain of your problem and the dataset upon which you will be working. 

### Problem Statement

**TODO:** At this part we want to scale and transorm the model so that we have feature reduction, in order to distinguish salient features from noise.

### Solution Statement

**TODO:** We apply the l1 penatly in the model to reduce the coeficient of insignificant features. We want to find the salient features.

### Metric

**TODO**: Again we will use the accuracy score as the main indication if the model is performing will, but in addition, we want to monitor the number of features and optimally reduce the number of features.

### Benchmark

**TODO**: Introducing the l1 penalty actually harmed the accuracy scored, although not too much. However it did have a positive effect on the number of features and reduced it to 453.

By adding a k Best transformer and appling a grid search, we were able to reduce the number of features to two. The final accuracy score is 0.62 and the AUC is 0.593. Despite the feature reduction the model still needs to be improved.  

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5.py`.

<img src="assets/identify_features.png" width="600px">

In [3]:
import numpy as np
import pandas as pd

from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

In [4]:
madelon_df = load_data_from_database('joshuacook.me', '5432',
                           'dsi', 'madelon', 'dsi_student',
                           'correct horse battery staple')
madelon_df = madelon_df.drop('index', axis = 1)
X = madelon_df.drop('label', axis = 1)
y = madelon_df.label
split_data_dict = make_data_dict(X, y, random_state=42)
final_data = general_transformer(StandardScaler(), split_data_dict)
logisticRegression_l1_C1 = general_model(LogisticRegression(penalty='l1', C=1), final_data)

In [5]:
logisticRegression_l1_C1

{'model': LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'test_score': 0.52833333333333332,
 'train_score': 0.8035714285714286}

In [6]:
X_ = final_data['X_train']
y_ = final_data['y_train']
X__ = final_data['X_test']
y__ = final_data['y_test']

##### Also manauly we can check:

In [7]:
lr = LogisticRegression(penalty='l1').fit(X_,y_)
tr = lr.score(X_, y_)
ts = lr.score(X__,y__)
print (tr,ts)

(0.8035714285714286, 0.52833333333333332)


### Top features for Log. Regr. with penalty l1 and c=1

In [8]:
feature_co = []
for i,j in enumerate (X.columns):
    feature_co.append([j, (logisticRegression_l1_C1['model'].coef_[0][i]),
                       abs(logisticRegression_l1_C1['model'].coef_[0][i])])
    
log_reg_coef_l1_C1 = pd.DataFrame(feature_co,columns=['feat.','coef.','abscoef.'])
log_reg_coef_l1_C1.sort_values(['abscoef.'],ascending=False).head(15)

Unnamed: 0,feat.,coef.,abscoef.
442,feat_442,-0.793853,0.793853
153,feat_153,-0.653517,0.653517
241,feat_241,0.626621,0.626621
472,feat_472,0.430815,0.430815
424,feat_424,0.319375,0.319375
46,feat_046,0.302231,0.302231
56,feat_056,0.298208,0.298208
48,feat_048,0.295484,0.295484
493,feat_493,0.285366,0.285366
494,feat_494,0.276396,0.276396


In [9]:
len([coef for coef in logisticRegression_l1_C1['model'].coef_[0] if abs(coef) > 0])

453

### Gridsearch to find if l1 or l2 gives better result with what values of C

In [10]:
gridsearch_lr_params = {
    'penalty' : ["l1", "l2"],
    'C' : [0.001,0.01,0.1,1,10,100,1000]    
}

gridsearch_result = GridSearchCV(LogisticRegression(), 
                                 param_grid=gridsearch_lr_params)
gridsearch_lr = general_model(gridsearch_result,final_data)
gridsearch_lr

{'model': GridSearchCV(cv=None, error_score='raise',
        estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
        fit_params={}, iid=True, n_jobs=1,
        param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
        pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
        scoring=None, verbose=0),
 'test_score': 0.59333333333333338,
 'train_score': 0.62285714285714289}

In [11]:
gridsearch_lr["model"].best_params_

{'C': 0.01, 'penalty': 'l1'}

In [11]:
gridsearch_lr["model"].best_score_

0.62214285714285711

In [12]:
gridsearch_lr["model"].best_estimator_

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
gridsearch_lr_df = pd.DataFrame(gridsearch_lr['model'].cv_results_)
gridsearch_lr_df.head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_penalty,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.031667,0.001333,0.503571,0.503572,0.001,l1,"{u'penalty': u'l1', u'C': 0.001}",14,0.503212,0.503751,0.503212,0.503751,0.504292,0.503212,0.005735,0.0004713704,0.000509,0.000254
1,0.048,0.001,0.602143,0.773927,0.001,l2,"{u'penalty': u'l2', u'C': 0.001}",2,0.586724,0.789925,0.631692,0.753483,0.587983,0.778373,0.001633,1.123916e-07,0.020912,0.015206
2,0.028667,0.001,0.622143,0.623928,0.01,l1,"{u'penalty': u'l1', u'C': 0.01}",1,0.631692,0.619507,0.616702,0.62701,0.618026,0.625268,0.000943,1.123916e-07,0.006777,0.003206
3,0.089,0.001333,0.569286,0.849283,0.01,l2,"{u'penalty': u'l2', u'C': 0.01}",4,0.56531,0.859593,0.5803,0.830654,0.562232,0.857602,0.004967,0.0004714827,0.007893,0.013198
4,0.127,0.001333,0.577857,0.80642,0.1,l1,"{u'penalty': u'l1', u'C': 0.1}",3,0.571734,0.801715,0.59743,0.78671,0.564378,0.830835,0.041045,0.0004714266,0.01417,0.018319


### Top features for Log. Regr. for best estimator (Penalty = l1 and C=0.01)

In [17]:
feature_co = []
for i,j in enumerate (X.columns):
    feature_co.append([j, (logisticRegressionn_l1_C01['model'].coef_[0][i]),
                       abs(logisticRegressionn_l1_C01['model'].coef_[0][i])])
    
log_reg_coef_l1_C01 = pd.DataFrame(feature_co,columns=['feat.','coef.','abscoef.'])
log_reg_coef_l1_C01.sort_values(['abscoef.'],ascending=False).head(15)

Unnamed: 0,feat.,coef.,abscoef.
475,feat_475,0.102169,0.102169
241,feat_241,0.076752,0.076752
0,feat_000,0.0,0.0
329,feat_329,0.0,0.0
341,feat_341,0.0,0.0
340,feat_340,0.0,0.0
339,feat_339,0.0,0.0
338,feat_338,0.0,0.0
337,feat_337,0.0,0.0
336,feat_336,0.0,0.0


In [28]:
len([coef for coef in logisticRegressionn_l1_C01['model'].coef_[0] if abs(coef) > 0])

2

In [22]:
y_pred = logisticRegressionn_l1_C01['model'].predict(X__)


In [23]:
confusion_mtx = confusion_matrix(y__,y_pred)
confusion_mtx

array([[174, 121],
       [123, 182]])

In [25]:
type_1_error, true_positive, _ = roc_curve(y__,y_pred)
area_under_curve_lr = auc(type_1_error, true_positive)
area_under_curve_lr

0.59327590997499313