## __Logistic Regression Classifier__

---

<br>

Author:      Tyler J. Brough <br>
Last Update: March 1, 2022 <br>

---

<br>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the S&P market data set from the notebook-relative data directory.
data = pd.read_csv(filepath_or_buffer='data/islr2_smarket.csv')

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [4]:
# Keep the original string labels ('Up' / 'Down') in a separate column. The guard
# makes the cell safe to re-run: once 'Direction' has been encoded, copying it
# again would overwrite the strings with numbers.
if 'Label' not in data.columns:
    data['Label'] = data['Direction']

# Encode the target as floats (Up -> 1.0, Down -> 0.0) and assign the result back
# to the frame. Calling `replace(..., inplace=True)` on `data['Direction']` mutates
# a temporary under pandas Copy-on-Write and would leave `data` unchanged.
# Deriving from 'Label' keeps the cell idempotent; any unexpected label becomes
# NaN instead of passing through silently.
data['Direction'] = data['Label'].map({'Up': 1.0, 'Down': 0.0})

In [5]:
# Preview the first five rows again to confirm the encoded target and the kept labels.
data.head(n=5)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction,Label
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,1.0,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,1.0,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,0.0,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,1.0,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,1.0,Up


In [7]:
# Training set: the first 1000 rows (selected by position).
# Columns are selected by name rather than by position ([1, 6] and [8]) so the
# split does not silently pick the wrong columns if the frame's column order
# changes (e.g. an extra index column in the CSV).
X_train = data[['Lag1', 'Volume']].iloc[:1000].to_numpy()
# 1-D float target vector (1.0 = Up, 0.0 = Down).
y_train = data['Direction'].iloc[:1000].to_numpy()

### __Using Scikit-Learn__

In [8]:
## Scikit-learn 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [10]:
# Logistic-regression classifier using the liblinear solver with a fixed seed.
model = LogisticRegression(
    random_state=0,
    solver='liblinear',
)

In [11]:
# Fit the classifier on the training split; the fitted estimator is the cell output.
model.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [12]:
# Hold-out set: every row after the first 1000 (selected by position).
# Columns are selected by name rather than by position ([1, 6] and [8]) so the
# split does not silently pick the wrong columns if the frame's column order
# changes (e.g. an extra index column in the CSV).
X_test = data[['Lag1', 'Volume']].iloc[1000:].to_numpy()
# 1-D float target vector (1.0 = Up, 0.0 = Down).
y_test = data['Direction'].iloc[1000:].to_numpy()

In [13]:
# Score the fitted classifier on the hold-out split.
model.score(X=X_test, y=y_test)

0.536

In [14]:
# Predict hard class labels for the hold-out split, then score them explicitly.
y_pred = model.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.536

In [15]:
# Confusion matrix of actual versus predicted labels on the hold-out split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[77, 32],
       [84, 57]])

### __Using Statsmodels (Logit)__

In [16]:
## Statsmodels
import statsmodels.api as sm

In [17]:
# Prepend an intercept column to both design matrices for the statsmodels fits below.
# NOTE(review): this rebinds X_train / X_test, so re-running the scikit-learn cells
# after this one would feed them the augmented matrices.
X_train, X_test = (
    sm.add_constant(design, prepend=True) for design in (X_train, X_test)
)

In [18]:
# Fit the logit model on the training split.
log_reg = sm.Logit(endog=y_train, exog=X_train).fit()

Optimization terminated successfully.
         Current function value: 0.692435
         Iterations 3


In [19]:
# Print the plain-text rendering of the logit summary table.
print(log_reg.summary().as_text())

                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      997
Method:                           MLE   Df Model:                            2
Date:                Wed, 02 Mar 2022   Pseudo R-squ.:               0.0008860
Time:                        10:13:25   Log-Likelihood:                -692.44
converged:                       True   LL-Null:                       -693.05
Covariance Type:            nonrobust   LLR p-value:                    0.5412
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1575      0.330      0.478      0.633      -0.489       0.804
x1            -0.0522      0.052     -1.009      0.313      -0.154       0.049
x2            -0.0947      0.237     -0.400      0.6

In [20]:
# Logit model predictions for the hold-out split.
y_probs_sm_log = log_reg.predict(exog=X_test)
y_probs_sm_log

array([0.51383657, 0.50293011, 0.49763997, 0.50624271, 0.49962146,
       0.51207323, 0.4971928 , 0.51486163, 0.49992797, 0.4889516 ,
       0.51625667, 0.50945245, 0.50881467, 0.5085855 , 0.49602271,
       0.49432331, 0.50093692, 0.50404317, 0.48856393, 0.49055905,
       0.49824468, 0.50617694, 0.48595017, 0.50889283, 0.50521394,
       0.51477896, 0.49856104, 0.49334096, 0.50791081, 0.49890915,
       0.50385418, 0.51228374, 0.5017288 , 0.51697052, 0.49651205,
       0.49312049, 0.49115221, 0.50521551, 0.49154519, 0.50258316,
       0.50068384, 0.48806935, 0.50071507, 0.50956671, 0.51228707,
       0.49896479, 0.5149247 , 0.49800685, 0.51333865, 0.51076245,
       0.49956627, 0.48447229, 0.50271789, 0.50260292, 0.48525507,
       0.49984175, 0.49484029, 0.49663539, 0.47178188, 0.48783919,
       0.49649309, 0.48656447, 0.48923029, 0.49384986, 0.48658221,
       0.51090867, 0.50324192, 0.48525483, 0.50617045, 0.49664388,
       0.49747515, 0.48390294, 0.48090641, 0.50415097, 0.45904

In [21]:
# Turn the logit predictions into hard labels by rounding to the nearest of 0.0 / 1.0.
y_pred_sm_log = np.round(y_probs_sm_log, decimals=0)
y_pred_sm_log

array([1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1.,
       1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 1.,
       0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1.,
       0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [22]:
# Hold-out accuracy of the logit labels.
accuracy_score(y_true=y_test, y_pred=y_pred_sm_log)

0.528

### __The Probit Model__

In [23]:
# Fit the probit model on the same training design matrix.
prob_reg = sm.Probit(endog=y_train, exog=X_train).fit()

Optimization terminated successfully.
         Current function value: 0.692435
         Iterations 3


In [24]:
# Print the plain-text rendering of the probit summary table.
print(prob_reg.summary().as_text())

                          Probit Regression Results                           
Dep. Variable:                      y   No. Observations:                 1000
Model:                         Probit   Df Residuals:                      997
Method:                           MLE   Df Model:                            2
Date:                Wed, 02 Mar 2022   Pseudo R-squ.:               0.0008860
Time:                        10:16:45   Log-Likelihood:                -692.44
converged:                       True   LL-Null:                       -693.05
Covariance Type:            nonrobust   LLR p-value:                    0.5412
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0989      0.207      0.478      0.632      -0.306       0.504
x1            -0.0327      0.032     -1.010      0.313      -0.096       0.031
x2            -0.0595      0.148     -0.401      0.6

In [25]:
# Probit predictions for the hold-out split, rounded to hard 0.0 / 1.0 labels.
y_pred_sm_prob = np.round(prob_reg.predict(exog=X_test), decimals=0)

In [26]:
# Hold-out accuracy of the probit labels.
accuracy_score(y_true=y_test, y_pred=y_pred_sm_prob)

0.528

### __The Linear Probability Model__

In [27]:
# Linear probability model: ordinary least squares on the 0/1 target.
lpm = sm.OLS(endog=y_train, exog=X_train).fit()

In [28]:
# Print the plain-text rendering of the OLS summary table.
print(lpm.summary().as_text())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.6124
Date:                Wed, 02 Mar 2022   Prob (F-statistic):              0.542
Time:                        10:16:59   Log-Likelihood:                -725.08
No. Observations:                1000   AIC:                             1456.
Df Residuals:                     997   BIC:                             1471.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5393      0.082      6.544      0.0

In [29]:
# Fitted values of the linear probability model on the hold-out split.
y_probs_sm_lpm = lpm.predict(exog=X_test)
# Show the first ten fitted values together with their minimum and maximum.
(y_probs_sm_lpm[:10], np.min(y_probs_sm_lpm), np.max(y_probs_sm_lpm))

(array([0.51382373, 0.50293711, 0.49765705, 0.50624122, 0.49963382,
        0.51206142, 0.49721078, 0.51484605, 0.49993865, 0.48898559]),
 0.4540219517688243,
 0.519452154020871)

In [30]:
# Classify as 1.0 when the fitted value exceeds 0.5, otherwise 0.0.
# An explicit threshold replaces np.round because OLS fitted values are not
# confined to [0, 1]: rounding e.g. 1.6 or -0.7 would produce the invalid
# labels 2.0 or -1.0. For fitted values in [-0.5, 1.5) the result matches np.round,
# including the tie at exactly 0.5, which np.round (half-to-even) also maps to 0.0.
y_pred_sm_lpm = (y_probs_sm_lpm > 0.5).astype(float)

In [31]:
# Hold-out accuracy of the linear-probability-model labels.
accuracy_score(y_true=y_test, y_pred=y_pred_sm_lpm)

0.528