# Lab 2: classification methods

This lab is due by midnight Saturday Feb 19th

In [1]:
pip install -U scikit-learn scipy matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import neighbors
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split


  from pandas import Int64Index as NumericIndex


In [3]:
# You will need to change this for your environment
DATA_ROOT = ''

In [4]:
# Note the 'index_col' argument here, which makes slicing easier below.
market = pd.read_csv(DATA_ROOT + 'Smarket.csv', index_col=0, parse_dates=True)
market.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2001-01-01,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2001-01-01,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
2001-01-01,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
2001-01-01,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


## Logistic Regression


In [5]:
# We will re-use this formula with other learning methods below
all_lags = 'Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume'

marklr = smf.glm(formula=all_lags, data=market, family=sm.families.Binomial())
mlr_res = marklr.fit()
print(mlr_res.summary())

# The predicted values are probabilities
mlr_prob = mlr_res.predict()
print('predicted probabilities:', mlr_prob[0:10])

# Here we create a set of qualitative predictions by thresholding on the probabilities
predictions_nominal = ["Up" if x < 0.5 else "Down" for x in mlr_prob]
print('qualitative predictions:', predictions_nominal[0:10])

# Note: the '.T' here to take the transpose so that the true classes are columns and the predicted classes are rows,
# matching the class slides
print('confusion matrix:\n', confusion_matrix(market["Direction"], predictions_nominal).T)

print(classification_report(market["Direction"], predictions_nominal, digits=3))

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                 1250
Model:                                              GLM   Df Residuals:                     1243
Model Family:                                  Binomial   Df Model:                            6
Link Function:                                    Logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -863.79
Date:                                  Fri, 18 Feb 2022   Deviance:                       1727.6
Time:                                          21:46:00   Pearson chi2:                 1.25e+03
No. Iterations:                                       4   Pseudo R-squ. (CS):           0.002868
Covariance Type:                              nonrobust                                         
                 coef    std e

In [6]:
# Split the data into training and test sets, training on everything up to and including 2004 data
# and testing on 2005 and later data:
x_train = market[:'2004'][:]
y_train = market[:'2004']['Direction']

x_test = market['2005':][:]
y_test = market['2005':]['Direction']

In [7]:
# Fit a logistic regression to the training data and (below) evaluate it using the test data
mlr_04 = smf.glm(formula=all_lags, data=x_train, family=sm.families.Binomial())
res_04 = mlr_04.fit()
print(res_04.summary())

# Build predictions of the test data using a 0.5 threshold
prob_04 = res_04.predict(x_test)
pred_04 = ['Up' if x < 0.5 else 'Down' for x in prob_04]

print('confusion matrix:\n', confusion_matrix(y_test, pred_04).T)
print(classification_report(y_test, pred_04))

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                  998
Model:                                              GLM   Df Residuals:                      991
Model Family:                                  Binomial   Df Model:                            6
Link Function:                                    Logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -690.55
Date:                                  Fri, 18 Feb 2022   Deviance:                       1381.1
Time:                                          21:46:00   Pearson chi2:                     998.
No. Iterations:                                       4   Pseudo R-squ. (CS):           0.002162
Covariance Type:                              nonrobust                                         
                 coef    std e

## Your job: build and test a LR model with only the two predictors with the best p-values above

Looking at the model summary above, that will be Lag1 and Lag2.

Build the new model below, and generate a new confusion matrix and classification report as above.

In [8]:
# Build a model using just lag1 and lag2 and test it (skip the code for the lab)

slr = smf.glm(formula='Direction ~ Lag1 + Lag2', data=x_train, family=sm.families.Binomial())
slr_fit = slr.fit()
print(slr_fit.summary())
prob_slr = slr_fit.predict(x_test)
pred_slr = ['Up' if x < 0.5 else 'Down' for x in prob_slr]
print('confusion matrix:\n', confusion_matrix(y_test, pred_slr).T)
print(classification_report(y_test, pred_slr))

                          Generalized Linear Model Regression Results                           
Dep. Variable:     ['Direction[Down]', 'Direction[Up]']   No. Observations:                  998
Model:                                              GLM   Df Residuals:                      995
Model Family:                                  Binomial   Df Model:                            2
Link Function:                                    Logit   Scale:                          1.0000
Method:                                            IRLS   Log-Likelihood:                -690.70
Date:                                  Fri, 18 Feb 2022   Deviance:                       1381.4
Time:                                          21:46:00   Pearson chi2:                     998.
No. Iterations:                                       4   Pseudo R-squ. (CS):           0.001865
Covariance Type:                              nonrobust                                         
                 coef    std e

## Questions 1 - 3

Question 1: How does the overall accuracy of this smaller model compare

Question 2: Show how to use the confusion matrix to derive the overall accuracy as shown in the classification report.
(The calculations can be typed here and do not have to be shown with code.)

Question 3: How does the interpretability of the second model compare with the first in your opinion? Justify your answer.


1) The overall accuracy of the model with just Lag1 and Lag2 is better than the model with all of the variables. Once we split the data into training and testing sets, the accuracy of the first model with all of the lags and Volume is 0.48 while the model with only lag 1 and lag 2 has an accuracy of 0.56. 

2) To get overall accuracy, you divide the total number of correct classifications (35+106=141) by the total number of observations (35+35+76+106=252). So, you get 141/252 which is 0.56. We can test this formula on the first regression with all of the lags and Volume as well: (77+44)/(77+44+97+34) = 0.48.

3) The interpretability does not change much between the two models because they are both logistic regression models. However, you could say the second model is easier to interpret because there are less variables.

## K-Nearest Neighbors

We now build a model for the same data with K-Nearest neighbors

In [9]:
knn = neighbors.KNeighborsClassifier(n_neighbors=1)

# Restrict the training and test data to only have the 'Lag1' and 'Lag2' predictor variables.
# (This code fits the model and makes predictions in one line.)
pred = knn.fit(x_train[['Lag1', 'Lag2']], y_train).predict(x_test[['Lag1', 'Lag2']])

print('KNN confusion matrix:\n', confusion_matrix(y_test, pred).T)
print(classification_report(y_test, pred))

KNN confusion matrix:
 [[43 58]
 [68 83]]
              precision    recall  f1-score   support

        Down       0.43      0.39      0.41       111
          Up       0.55      0.59      0.57       141

    accuracy                           0.50       252
   macro avg       0.49      0.49      0.49       252
weighted avg       0.50      0.50      0.50       252



In [10]:
# KNN with K of 1 performed poorly, let's try K of 3

knn = neighbors.KNeighborsClassifier(n_neighbors=3)
pred = knn.fit(x_train[['Lag1', 'Lag2']], y_train).predict(x_test[['Lag1', 'Lag2']])

print('KNN confusion matrix:\n', confusion_matrix(y_test, pred).T)
print(classification_report(y_test, pred))

KNN confusion matrix:
 [[48 55]
 [63 86]]
              precision    recall  f1-score   support

        Down       0.47      0.43      0.45       111
          Up       0.58      0.61      0.59       141

    accuracy                           0.53       252
   macro avg       0.52      0.52      0.52       252
weighted avg       0.53      0.53      0.53       252



## Your task: try some more values for K (number of neighbors) and report on which has best overall accuracy

In [11]:
# That was an improvement, try some other values to compare

for k in range(10):
    # Your code here (and delete the 'pass' line)
    knn = neighbors.KNeighborsClassifier(n_neighbors=k+1)
    pred = knn.fit(x_train[['Lag1', 'Lag2']], y_train).predict(x_test[['Lag1', 'Lag2']])
    print('KNN confusion matrix with k=', (k+1), ':\n', confusion_matrix(y_test, pred).T)
    print(classification_report(y_test, pred))
    # pass

KNN confusion matrix with k= 1 :
 [[43 58]
 [68 83]]
              precision    recall  f1-score   support

        Down       0.43      0.39      0.41       111
          Up       0.55      0.59      0.57       141

    accuracy                           0.50       252
   macro avg       0.49      0.49      0.49       252
weighted avg       0.50      0.50      0.50       252

KNN confusion matrix with k= 2 :
 [[74 93]
 [37 48]]
              precision    recall  f1-score   support

        Down       0.44      0.67      0.53       111
          Up       0.56      0.34      0.42       141

    accuracy                           0.48       252
   macro avg       0.50      0.50      0.48       252
weighted avg       0.51      0.48      0.47       252

KNN confusion matrix with k= 3 :
 [[48 55]
 [63 86]]
              precision    recall  f1-score   support

        Down       0.47      0.43      0.45       111
          Up       0.58      0.61      0.59       141

    accuracy           

## Question 4:

Question 4: Which of the other K values that you tried for K-Nearest neighbors worked the best, based on overall accuracy?

4) Based on the K vaues I tried (1-10), K=3 had the best overall accuracy with 0.53 (53% accuracy).

# Linear discriminant analysis

In [12]:
lda = LinearDiscriminantAnalysis()
ldm = lda.fit(x_train[['Lag1', 'Lag2']], y_train)

print('Priors:', ldm.priors_)
print('Means:', ldm.means_)
print('Coefficients:', ldm.coef_)

pred = ldm.predict(x_test[['Lag1', 'Lag2']])
print(confusion_matrix(pred, y_test).T)
print(classification_report(y_test, pred))


Priors: [0.49198397 0.50801603]
Means: [[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]
Coefficients: [[-0.05544078 -0.0443452 ]]
[[ 35  76]
 [ 35 106]]
              precision    recall  f1-score   support

        Down       0.50      0.32      0.39       111
          Up       0.58      0.75      0.66       141

    accuracy                           0.56       252
   macro avg       0.54      0.53      0.52       252
weighted avg       0.55      0.56      0.54       252



## Quadratic discriminant analysis

In [13]:
qda = QuadraticDiscriminantAnalysis()
qdm = qda.fit(x_train[['Lag1', 'Lag2']], y_train)

print('Priors:', qdm.priors_)
print('Means:', qdm.means_)

q_pred = qdm.predict(x_test[['Lag1', 'Lag2']])
print(confusion_matrix(q_pred, y_test).T)
print(classification_report(y_test, q_pred))

Priors: [0.49198397 0.50801603]
Means: [[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]
[[ 30  81]
 [ 20 121]]
              precision    recall  f1-score   support

        Down       0.60      0.27      0.37       111
          Up       0.60      0.86      0.71       141

    accuracy                           0.60       252
   macro avg       0.60      0.56      0.54       252
weighted avg       0.60      0.60      0.56       252



## Question 5

Question 5: which of the methods that you tried produced the best results for predicting Direction from Lag1 and Lag2?

5) Quadratic discriminant analysis produced the best results for predicting Direction from Lag1 and Lag2. LDA did better than k-nearest neighbors because the overall accuracy was 0.56, higher than any of the KNN models we tried. LDA had the same accuracy as the logistic regression with 0.56. But, QDA beat them all with an overall accuracy of 0.60.

# Carseats data

Now load the carseats data and try to predict whether the store is located in the US from the other predictor variables.

Report below on your findings about (at least) three different learning approaches, comparing their overall accuracy.

If you use K-nearest neighbors, be sure to try a few different values for K and report on the best one, showing your work.

If you use logistic regression, try to find a simple model with good accuracy by dropping predictors with high p-values.

In [14]:
seats = pd.read_csv(DATA_ROOT + 'Carseats.csv')
seats.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [15]:
# Pick random training and test sets for your analysis:
x_train, x_test, y_train, y_test = train_test_split(seats, seats['US'],
                                                    train_size=0.8, test_size=0.2)

# Hint: if you need to remove some predictors for training or testing in any of the learning methods,
# you can use the pandas 'drop' function to drop the corresponding columns, e.g.
x_train.drop(columns=['US']).head()

# Hint 2: if you want to write a formula and include a lot of columns, you could use the method
# that was shown in lab 1, e.g.:
#sm.OLS.from_formula('medv ~ ' + '+'.join(df.columns.difference(['medv', 'age', 'indus'])), df)


Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban
308,9.24,126,80,19,436,126,Medium,52,10,Yes
366,5.98,124,56,11,447,134,Medium,53,12,No
284,6.97,106,46,11,414,96,Bad,79,17,No
21,12.13,134,29,12,239,109,Good,62,18,No
31,8.25,136,58,16,241,131,Medium,44,18,Yes


In [16]:
# Your code goes here. I would recommend using a different cell for each learning method:

# learning method 1: Logistic Regression (LR)
all_vars = 'US ~ Sales+CompPrice+Income+Advertising+Population+Price+ShelveLoc+Age+Education+Urban'

slr = smf.glm(formula=all_vars, data=x_train, family=sm.families.Binomial())
slr_fit = slr.fit()
print(slr_fit.summary())
prob_slr = slr_fit.predict(x_test)
pred_slr = ['Yes' if x < 0.5 else 'No' for x in prob_slr]
print('confusion matrix:\n', confusion_matrix(y_test, pred_slr).T)
print(classification_report(y_test, pred_slr))

                   Generalized Linear Model Regression Results                   
Dep. Variable:     ['US[No]', 'US[Yes]']   No. Observations:                  320
Model:                               GLM   Df Residuals:                      308
Model Family:                   Binomial   Df Model:                           11
Link Function:                     Logit   Scale:                          1.0000
Method:                             IRLS   Log-Likelihood:                -83.021
Date:                   Fri, 18 Feb 2022   Deviance:                       166.04
Time:                           21:46:01   Pearson chi2:                     236.
No. Iterations:                        8   Pseudo R-squ. (CS):             0.5449
Covariance Type:               nonrobust                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
Inte

In [17]:
# Now take only the 4 variables with the lowest p-values
better_vars = 'US ~ Income+Advertising+Population+Price'

slr = smf.glm(formula=better_vars, data=x_train, family=sm.families.Binomial())
slr_fit = slr.fit()
print(slr_fit.summary())
prob_slr = slr_fit.predict(x_test)
pred_slr = ['Yes' if x < 0.5 else 'No' for x in prob_slr]
print('confusion matrix:\n', confusion_matrix(y_test, pred_slr).T)
print(classification_report(y_test, pred_slr))

                   Generalized Linear Model Regression Results                   
Dep. Variable:     ['US[No]', 'US[Yes]']   No. Observations:                  320
Model:                               GLM   Df Residuals:                      315
Model Family:                   Binomial   Df Model:                            4
Link Function:                     Logit   Scale:                          1.0000
Method:                             IRLS   Log-Likelihood:                -85.633
Date:                   Fri, 18 Feb 2022   Deviance:                       171.27
Time:                           21:46:01   Pearson chi2:                     341.
No. Iterations:                        8   Pseudo R-squ. (CS):             0.5374
Covariance Type:               nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       2.44

In [18]:
# We still get an accuracy of 0.86. Try only the lowest 2.
best_vars = 'US ~ Advertising+Population'

slr = smf.glm(formula=best_vars, data=x_train, family=sm.families.Binomial())
slr_fit = slr.fit()
print(slr_fit.summary())
prob_slr = slr_fit.predict(x_test)
pred_slr = ['Yes' if x < 0.5 else 'No' for x in prob_slr]
print('confusion matrix:\n', confusion_matrix(y_test, pred_slr).T)
print(classification_report(y_test, pred_slr))

                   Generalized Linear Model Regression Results                   
Dep. Variable:     ['US[No]', 'US[Yes]']   No. Observations:                  320
Model:                               GLM   Df Residuals:                      317
Model Family:                   Binomial   Df Model:                            2
Link Function:                     Logit   Scale:                          1.0000
Method:                             IRLS   Log-Likelihood:                -88.776
Date:                   Fri, 18 Feb 2022   Deviance:                       177.55
Time:                           21:46:01   Pearson chi2:                     438.
No. Iterations:                        8   Pseudo R-squ. (CS):             0.5282
Covariance Type:               nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.68

In [19]:
# learning method 2: LDA (use only the best 2 predictors: Advertising and Population)
lda = LinearDiscriminantAnalysis()
ldm = lda.fit(x_train[['Advertising', 'Population']], y_train)

print('Priors:', ldm.priors_)
print('Means:', ldm.means_)
print('Coefficients:', ldm.coef_)

pred = ldm.predict(x_test[['Advertising', 'Population']])
print(confusion_matrix(pred, y_test).T)
print(classification_report(y_test, pred))

Priors: [0.359375 0.640625]
Means: [[  0.51304348 245.37391304]
 [  9.84878049 267.91707317]]
Coefficients: [[ 0.42006905 -0.00359525]]
[[26  1]
 [ 6 47]]
              precision    recall  f1-score   support

          No       0.81      0.96      0.88        27
         Yes       0.98      0.89      0.93        53

    accuracy                           0.91        80
   macro avg       0.90      0.92      0.91        80
weighted avg       0.92      0.91      0.91        80



In [20]:
# learning method 3: QDA (also use Advertising and Population as predictors)
qda = QuadraticDiscriminantAnalysis()
qdm = qda.fit(x_train[['Advertising', 'Population']], y_train)

print('Priors:', qdm.priors_)
print('Means:', qdm.means_)

q_pred = qdm.predict(x_test[['Advertising', 'Population']])
print(confusion_matrix(q_pred, y_test).T)
print(classification_report(y_test, q_pred))


Priors: [0.359375 0.640625]
Means: [[  0.51304348 245.37391304]
 [  9.84878049 267.91707317]]
[[26  1]
 [ 5 48]]
              precision    recall  f1-score   support

          No       0.84      0.96      0.90        27
         Yes       0.98      0.91      0.94        53

    accuracy                           0.93        80
   macro avg       0.91      0.93      0.92        80
weighted avg       0.93      0.93      0.93        80



In [21]:
# I also tested knn to double check that I had the best model

for k in range(10):
    # Your code here (and delete the 'pass' line)
    knn = neighbors.KNeighborsClassifier(n_neighbors=k+1)
    pred = knn.fit(x_train[['Advertising', 'Population']], y_train).predict(x_test[['Advertising', 'Population']])
    print('KNN confusion matrix with k=', (k+1), ':\n', confusion_matrix(y_test, pred).T)
    print(classification_report(y_test, pred))

KNN confusion matrix with k= 1 :
 [[24  9]
 [ 3 44]]
              precision    recall  f1-score   support

          No       0.73      0.89      0.80        27
         Yes       0.94      0.83      0.88        53

    accuracy                           0.85        80
   macro avg       0.83      0.86      0.84        80
weighted avg       0.87      0.85      0.85        80

KNN confusion matrix with k= 2 :
 [[26 12]
 [ 1 41]]
              precision    recall  f1-score   support

          No       0.68      0.96      0.80        27
         Yes       0.98      0.77      0.86        53

    accuracy                           0.84        80
   macro avg       0.83      0.87      0.83        80
weighted avg       0.88      0.84      0.84        80

KNN confusion matrix with k= 3 :
 [[21  7]
 [ 6 46]]
              precision    recall  f1-score   support

          No       0.75      0.78      0.76        27
         Yes       0.88      0.87      0.88        53

    accuracy           

## Questions 6-9

(Each of the three questions below carries the same weight as the earlier questions.)

Question 6: What was the first method you tried, and what was its best overall accuracy?

Question 7: What was the second method you tried, and what was its best overall accuracy?

Question 8: What was the third method you tried, and what was its best overall accuracy?


6) The first method I tried was logistic regression. The overall accuracy with all of the variables was 0.91. When I narrowed down the number of variables to 4 and 2, the overall accuracy was 0.93.

7) The second method I tried was Least Discriminant Analysis (LDA). The overall accuracy was 0.91, so it was a good model but not as good as the logistic regression.

8) The third method I tried was Quadratic Discriminant Analysis (QDA). The overall accuracy was the same as LR, 0.93. So, LR and QDA were the best models.

Note) I also tried kNN for K=1-10, and got the best overall accuracy to be 0.85 at K=1. This was still lower than any of the other methods I tried.