# Chapter 4 - Classification

- [Load dataset](#Load-dataset)
- [4.4 Linear Discriminant Analysis](#4.4-Linear-Discriminant-Analysis)
- [Lab: 4.6.3 Linear Discriminant Analysis](#4.6.3-Linear-Discriminant-Analysis)
- [Lab: 4.6.4 Quadratic Discriminant Analysis](#4.6.4-Quadratic-Discriminant-Analysis)

In [24]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing

%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever spelling this installation
# provides (falling back to the default style if neither is present).
_white_style = next(
    (name for name in ('seaborn-v0_8-white', 'seaborn-white')
     if name in plt.style.available),
    'default',
)
plt.style.use(_white_style)

### Load dataset

In [25]:
# The Default dataset was exported from the R package 'ISLR' to an Excel file.
# Local copy: pd.read_excel('Data/Default.xlsx')
df = pd.read_excel('https://github.com/borisgarbuzov/schulich_data_science_1/blob/master/Data/Default.xlsx?raw=true')

# Encode the two Yes/No columns as 0/1 integers for the classifiers.
# An explicit comparison is used instead of Series.factorize(): factorize()
# numbers the categories in order of first appearance, so 'Yes' would only
# map to 1 as long as the first row of the file happens to be 'No'.
df['default2'] = (df['default'] == 'Yes').astype(int)
df['student2'] = (df['student'] == 'Yes').astype(int)
df.head(3)

Unnamed: 0.1,Unnamed: 0,default,student,balance,income,default2,student2
0,1,No,No,729.526495,44361.625074,0,0
1,2,No,Yes,817.180407,12106.1347,0,1
2,3,No,No,1073.549164,31767.138947,0,0


## 4.4 Linear Discriminant Analysis
### Table 4.4 


In [26]:
# B
# Experiment: extract the three predictors as a raw NumPy array.
# to_numpy() returns a plain ndarray, so only the values survive -- the
# column labels are not carried over (see the output below).
X1 = df.loc[:, ['balance', 'income', 'student2']].to_numpy()
X1


array([[7.29526495e+02, 4.43616251e+04, 0.00000000e+00],
       [8.17180407e+02, 1.21061347e+04, 1.00000000e+00],
       [1.07354916e+03, 3.17671389e+04, 0.00000000e+00],
       ...,
       [8.45411989e+02, 5.86361570e+04, 0.00000000e+00],
       [1.56900905e+03, 3.66691124e+04, 0.00000000e+00],
       [2.00922183e+02, 1.68629523e+04, 1.00000000e+00]])

In [27]:
# Predictors and response as plain NumPy arrays
# (to_numpy() replaces the older DataFrame.as_matrix()).
X = df[['balance', 'income', 'student2']].to_numpy()
y = df['default2'].to_numpy()

# Fit LDA on the full data set and predict on the same observations.
# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(X, y)
y_pred = lda.predict(X)

# True vs. predicted labels side by side, recoded from 0/1 to 'No'/'Yes'.
df_ = pd.DataFrame({'True default status': y,
                    'Predicted default status': y_pred}).replace({0: 'No', 1: 'Yes'})

# Tall format: one row per (predicted, true) combination with its count.
tall_format_table = df_.groupby(['Predicted default status', 'True default status']).size()
tall_format_table

Predicted default status  True default status
No                        No                     9645
                          Yes                     254
Yes                       No                       22
                          Yes                      79
dtype: int64

In [28]:
# Pivot
wide_format_table = tall_format_table.unstack('True default status')
wide_format_table

True default status,No,Yes
Predicted default status,Unnamed: 1_level_1,Unnamed: 2_level_1
No,9645,254
Yes,22,79


The same confusion matrix, this time computed with scikit-learn's `confusion_matrix` function. It is the transpose of the previous table: following the scikit-learn convention, rows are the true classes and columns are the predicted classes.

In [29]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted ones
confusion_matrix(y_true=y, y_pred=y_pred)


array([[9645,   22],
       [ 254,   79]])

In [30]:
# B
import collections

# The labels are 0/1, so the sum of an array is its number of ones.
oneCountY_pred = y_pred.sum()
allCountY_pred = y_pred.shape[0]
oneCountY = y.sum()
allCountY = y.shape[0]

print("y_pred:\n", y_pred)
print("oneCountY_pred =", oneCountY_pred)
print("allCountY_pred =", allCountY_pred)

print("y:\n", y)
print("oneCountY =", oneCountY)
print("allCountY =", allCountY)
# (y is a NumPy array, so the pandas-only y.describe() is not available here.)

# Full frequency tables: true labels first, predicted labels second
collections.Counter(y), collections.Counter(y_pred)


y_pred:
 [0 0 0 ... 0 0 0]
oneCountY_pred = 101
allCountY_pred = 10000
y:
 [0 0 0 ... 0 0 0]
oneCountY = 333
allCountY = 10000


(Counter({0: 9667, 1: 333}), Counter({0: 9899, 1: 101}))

In [31]:
# Frequencies once more, this time via NumPy: np.unique with
# return_counts=True yields the distinct values and how often each occurs.
values, frequencies = np.unique(y, return_counts=True)
pair = (values, frequencies)
pair

(array([0, 1]), array([9667,  333]))

In [32]:
# Repack as {value: count}, unpacking the (values, frequencies) tuple into zip()
{value: count for value, count in zip(*pair)}

{0: 9667, 1: 333}

In [33]:
# Same mapping, built from the two separately named arrays
{value: count for value, count in zip(values, frequencies)}

{0: 9667, 1: 333}

In [34]:
# Per-class precision / recall / F1 at the default 50% threshold
print(classification_report(y_true=y, y_pred=y_pred, target_names=['No', 'Yes']))

              precision    recall  f1-score   support

          No       0.97      1.00      0.99      9667
         Yes       0.78      0.24      0.36       333

    accuracy                           0.97     10000
   macro avg       0.88      0.62      0.67     10000
weighted avg       0.97      0.97      0.97     10000



### Table 4.5
Instead of using a posterior probability of 50% as the decision threshold, we now classify an observation as 'Yes' whenever its predicted probability of default exceeds 20%.

In [35]:
# An observation is flagged as a defaulter once its posterior probability
# of default is strictly above this cut-off.
decision_prob = 0.2

lda.fit(X, y)
y_prob = lda.predict_proba(X)

# Second column of predict_proba = probability of class 1 (default)
trueFalseArray = y_prob[:, 1] > decision_prob
trueFalseArray, y

(array([False, False, False, ..., False, False, False]),
 array([0, 0, 0, ..., 0, 0, 0]))

In [36]:
# Build the 'No'/'Yes' labels explicitly instead of relying on
# DataFrame.replace(): the predicted column is boolean, so string keys such
# as 'True'/'False' can never match it, and recoding booleans through the
# integer keys 0/1 depends on the installed pandas version.
df_ = pd.DataFrame({'True default status': np.where(y == 1, 'Yes', 'No'),
                    'Predicted default status': np.where(trueFalseArray, 'Yes', 'No')})

# Confusion table: predicted status in the rows, true status in the columns
df_.groupby(['Predicted default status','True default status']).size().unstack('True default status')

True default status,No,Yes
Predicted default status,Unnamed: 1_level_1,Unnamed: 2_level_1
No,9435,140
Yes,232,193


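Overall accuracy got slightly worse: (9435 + 193) / 10000 = 96.3%, versus (9645 + 79) / 10000 = 97.2% with the 50% threshold. In exchange, many more of the actual defaulters are now caught (193 of 333 instead of 79).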

#  Lab

### 4.6.3 Linear Discriminant Analysis

In [37]:

# Local copy: pd.read_csv('Data/Smarket.csv', usecols=range(1,10), index_col=0, parse_dates=True)
# Columns 1-9 of the file are read; the first of them (Year) becomes a parsed date index.
df = pd.read_csv(
    'https://raw.githubusercontent.com/borisgarbuzov/schulich_data_science_1/master/Data/Smarket.csv',
    usecols=range(1, 10),
    index_col=0,
    parse_dates=True,
)
df

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,0.381,-0.192,-2.624,-1.055,5.010,1.19130,0.959,Up
2001-01-01,0.959,0.381,-0.192,-2.624,-1.055,1.29650,1.032,Up
2001-01-01,1.032,0.959,0.381,-0.192,-2.624,1.41120,-0.623,Down
2001-01-01,-0.623,1.032,0.959,0.381,-0.192,1.27600,0.614,Up
2001-01-01,0.614,-0.623,1.032,0.959,0.381,1.20570,0.213,Up
...,...,...,...,...,...,...,...,...
2005-01-01,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
2005-01-01,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
2005-01-01,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
2005-01-01,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [38]:
# Time-based split on the date index: everything up to and including 2004
# is used for training, 2005 onwards is held out for testing.
X_train = df.loc[:'2004', ['Lag1', 'Lag2']]
y_train = df.loc[:'2004', 'Direction']

X_test = df.loc['2005':, ['Lag1', 'Lag2']]
y_test = df.loc['2005':, 'Direction']

# Fit on the training years, then predict the direction for the test period
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
pred = lda.predict(X_test)

In [39]:
# Class prior probabilities of the fitted model (one entry per class)
lda.priors_

array([0.49198397, 0.50801603])

In [40]:
# Per-class means of the predictors: one row per class, one column per feature (Lag1, Lag2)
lda.means_
# The same numbers can be obtained by hand: group X_train by y_train and take the mean() of each group.

array([[ 0.04279022,  0.03389409],
       [-0.03954635, -0.03132544]])

In [41]:
# NOTE(review): these values do not match the coefficients shown in the book's R output --
# presumably a different scaling/normalisation is used there; TODO confirm.
lda.coef_
# One row with two coefficients, one per feature (Lag1, Lag2).
# Although there are two classes, there is only a single row: the coefficients belong to
# the decision function, which is the difference of the two discriminant scores
# (log-likelihood numerators), delta2(x) - delta1(x).

array([[-0.05544078, -0.0443452 ]])

In [42]:
# The scikit-learn API takes the true labels first and the predictions second,
# giving rows = true class and columns = predicted class.
# Transposing is equivalent to swapping the two arguments; it is done here so the
# table can be compared with the manually constructed one (rows = predicted).
confusion_matrix(y_test, pred).transpose()

array([[ 35,  35],
       [ 76, 106]])

In [43]:
# Test-set precision / recall / F1 for the LDA predictions, to three decimals
print(classification_report(y_true=y_test, y_pred=pred, digits=3))

              precision    recall  f1-score   support

        Down      0.500     0.315     0.387       111
          Up      0.582     0.752     0.656       141

    accuracy                          0.560       252
   macro avg      0.541     0.534     0.522       252
weighted avg      0.546     0.560     0.538       252



In [44]:
# Posterior class probabilities for every test observation (one column per class);
# show only the first five rows.
pred_p = lda.predict_proba(X_test)
pred_p[:5, :]

array([[0.49017925, 0.50982075],
       [0.4792185 , 0.5207815 ],
       [0.46681848, 0.53318152],
       [0.47400107, 0.52599893],
       [0.49278766, 0.50721234]])

In [45]:
# How many test observations have a second-column probability above 50%?
np.unique(np.greater(pred_p[:, 1], 0.5), return_counts=True)

(array([False,  True]), array([ 70, 182]))

In [46]:
# Same count with a 90% cut-off: no observation reaches it, so only False appears.
np.unique(np.greater(pred_p[:, 1], 0.9), return_counts=True)

(array([False]), array([252]))

In [47]:
# Range of the posterior probabilities, looking at each class column separately:
# first line = minima of column 0 and column 1, second line = their maxima.
print(pred_p[:, 0].min(), pred_p[:, 1].min())
print(pred_p[:, 0].max(), pred_p[:, 1].max())

0.4577867445481022 0.4797650494643845
0.5202349505356155 0.5422132554518978


In [48]:
# The same minima / maxima in one call each, reducing over the rows (axis=0)
print(pred_p.min(axis=0))
print(pred_p.max(axis=0))

[0.45778674 0.47976505]
[0.52023495 0.54221326]


### 4.6.4 Quadratic Discriminant Analysis

In [49]:
# Quadratic discriminant analysis on the same training / test split
# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
pred = qda.predict(X_test)
pred

array(['Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up',
       'Down', 'Up', 'Up', 'Up', 'Up', 'Up', 'Down', 'Up', 'Up', 'Up',
       'Down', 'Down', 'Up', 'Down', 'Down', 'Up', 'Up', 'Up', 'Down',
       'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Down', 'Down', 'Up',
       'Up', 'Up', 'Up', 'Down', 'Down', 'Up', 'Up', 'Up', 'Up', 'Up',
       'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Down',
       'Down', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up',
       'Up', 'Up', 'Down', 'Up', 'Down', 'Down', 'Up', 'Up', 'Up', 'Up',
       'Up', 'Down', 'Up', 'Down', 'Down', 'Up', 'Up', 'Up', 'Up', 'Up',
       'Up', 'Up', 'Down', 'Down', 'Down', 'Up', 'Up', 'Up', 'Up', 'Up',
       'Up', 'Up', 'Up', 'Down', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up',
       'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Up', 'Down',
       'Up', 'Up', 'Up', 'Down', 'Up', 'Up', 'Down', 'Down', 'Up', 'Up',
       'Up', 'Up', 'Up', 'Up', 'Down', 'Up', 'Up', 

In [50]:
# Class prior probabilities of the fitted QDA model (same values as shown for LDA above)
qda.priors_

array([0.49198397, 0.50801603])

In [51]:
# Per-class means of the predictors: one row per class, one column per feature (Lag1, Lag2)
qda.means_

array([[ 0.04279022,  0.03389409],
       [-0.03954635, -0.03132544]])

In [52]:
# Transposed so that rows = predicted class and columns = true class
confusion_matrix(y_test, pred).transpose()

array([[ 30,  20],
       [ 81, 121]])

In [53]:
# Test-set precision / recall / F1 for the QDA predictions, to three decimals
print(classification_report(y_true=y_test, y_pred=pred, digits=3))

              precision    recall  f1-score   support

        Down      0.600     0.270     0.373       111
          Up      0.599     0.858     0.706       141

    accuracy                          0.599       252
   macro avg      0.600     0.564     0.539       252
weighted avg      0.599     0.599     0.559       252

