# Logistic Regression Comparison Exercise
Compare three versions of logistic regression

Selected dataset from UC Irvine dataset repository:
https://archive.ics.uci.edu/ml/datasets/Epileptic+Seizure+Recognition

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import confusion_matrix

In [3]:
df = pd.read_csv('epileptic_seizures.csv')

## Exploring the data

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y
0,X21.V1.791,135,190,229,223,192,125,55,-9,-33,...,-17,-15,-31,-77,-103,-127,-116,-83,-51,4
1,X15.V1.924,386,382,356,331,320,315,307,272,244,...,164,150,146,152,157,156,154,143,129,1
2,X8.V1.1,-32,-39,-47,-37,-32,-36,-57,-73,-85,...,57,64,48,19,-12,-30,-35,-35,-36,5
3,X16.V1.60,-105,-101,-96,-92,-89,-95,-102,-100,-87,...,-82,-81,-80,-77,-85,-77,-72,-69,-65,5
4,X20.V1.54,-9,-65,-98,-102,-78,-48,-16,0,-21,...,4,2,-12,-32,-41,-65,-83,-89,-73,5


In [5]:
df.tail()

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y
11495,X22.V1.114,-22,-22,-23,-26,-36,-42,-45,-42,-45,...,15,16,12,5,-1,-18,-37,-47,-48,2
11496,X19.V1.354,-47,-11,28,77,141,211,246,240,193,...,-65,-33,-7,14,27,48,77,117,170,1
11497,X8.V1.28,14,6,-13,-16,10,26,27,-9,4,...,-65,-48,-61,-62,-67,-30,-2,-1,-8,5
11498,X10.V1.932,-40,-25,-9,-12,-2,12,7,19,22,...,121,135,148,143,116,86,68,59,55,3
11499,X16.V1.210,29,41,57,72,74,62,54,43,31,...,-59,-25,-4,2,5,4,-2,2,20,4


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11500 entries, 0 to 11499
Columns: 180 entries, Unnamed: 0 to y
dtypes: int64(179), object(1)
memory usage: 15.8+ MB


In [7]:
df.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,y
count,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,...,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0
mean,-11.581391,-10.911565,-10.18713,-9.143043,-8.009739,-7.003478,-6.502087,-6.68713,-6.558,-6.168435,...,-10.145739,-11.630348,-12.943478,-13.66887,-13.363304,-13.045043,-12.70513,-12.426,-12.195652,3.0
std,165.626284,166.059609,163.524317,161.269041,160.998007,161.328725,161.467837,162.11912,162.03336,160.436352,...,164.652883,166.14979,168.554058,168.556486,167.25729,164.241019,162.895832,162.886311,164.852015,1.414275
min,-1839.0,-1838.0,-1835.0,-1845.0,-1791.0,-1757.0,-1832.0,-1778.0,-1840.0,-1867.0,...,-1867.0,-1865.0,-1642.0,-1723.0,-1866.0,-1863.0,-1781.0,-1727.0,-1829.0,1.0
25%,-54.0,-55.0,-54.0,-54.0,-54.0,-54.0,-54.0,-55.0,-55.0,-54.0,...,-55.0,-56.0,-56.0,-56.0,-55.0,-56.0,-55.0,-55.0,-55.0,2.0
50%,-8.0,-8.0,-7.0,-8.0,-8.0,-8.0,-8.0,-8.0,-7.0,-7.0,...,-9.0,-10.0,-10.0,-10.0,-10.0,-9.0,-9.0,-9.0,-9.0,3.0
75%,34.0,35.0,36.0,36.0,35.0,36.0,35.0,36.0,36.0,35.25,...,34.0,34.0,33.0,33.0,34.0,34.0,34.0,34.0,34.0,4.0
max,1726.0,1713.0,1697.0,1612.0,1518.0,1816.0,2047.0,2047.0,2047.0,2047.0,...,1777.0,1472.0,1319.0,1436.0,1733.0,1958.0,2047.0,2047.0,1915.0,5.0


In [9]:
df.isna().sum().sum()

0

In [14]:
df.dtypes

Unnamed: 0    object
X1             int64
X2             int64
X3             int64
X4             int64
X5             int64
X6             int64
X7             int64
X8             int64
X9             int64
X10            int64
X11            int64
X12            int64
X13            int64
X14            int64
X15            int64
X16            int64
X17            int64
X18            int64
X19            int64
X20            int64
X21            int64
X22            int64
X23            int64
X24            int64
X25            int64
X26            int64
X27            int64
X28            int64
X29            int64
               ...  
X150           int64
X151           int64
X152           int64
X153           int64
X154           int64
X155           int64
X156           int64
X157           int64
X158           int64
X159           int64
X160           int64
X161           int64
X162           int64
X163           int64
X164           int64
X165           int64
X166         

In [44]:
X = df.drop(columns=['Unnamed: 0', 'y']).values
y = np.where(df['y'] == 1, 1, 0).ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [62]:
print('The percentage of epilepsy occurances is {}%'.format(y.sum()/len(y)*100))

The percentage of epilepsy occurances is 20.0%


## Regular Logistic Regression

In [46]:
logreg = LogisticRegression(solver='liblinear', max_iter=200)
logreg.fit(X_train, y_train)
logreg.score(X_test, y_test)


0.8217391304347826

In [65]:
y_pred = logreg.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[2754    4]
 [ 611   81]]


## Ridge Logistic Regression

In [51]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
ridge = LogisticRegression(solver='liblinear', penalty='l2', max_iter=200)
grid_ridge = GridSearchCV(ridge, param_grid, cv=3)

In [52]:
grid_ridge.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
grid_ridge.best_params_

{'C': 0.01}

In [54]:
grid_ridge.best_score_

0.8248447204968944

In [55]:
grid_ridge.score(X_test, y_test)

0.8214492753623188

In [67]:
y_pred = grid_ridge.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[2752    6]
 [ 610   82]]


## Lasso Logistic Regression

In [56]:
lasso = LogisticRegression(solver='liblinear', penalty='l1', max_iter=200)
grid_lass = GridSearchCV(lasso, param_grid, cv=3)
grid_lass.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [57]:
grid_lass.best_params_

{'C': 10}

In [59]:
grid_lass.best_score_

0.8240993788819876

In [60]:
grid_lass.score(X_test, y_test)

0.8217391304347826

In [66]:
y_pred = grid_lass.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[2754    4]
 [ 611   81]]
