In [23]:
import os

import numpy as np
import pandas as pd

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix, precision_score

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Location of the Smarket CSV. Set the SMARKET_CSV environment variable to
# point at your own copy; the fallback is the original hard-coded local path,
# which will only resolve on the machine it was written on.
SMARKET_CSV = os.environ.get('SMARKET_CSV', '/Users/areum/Documents/ISLR/Smarket.csv')

# Read only CSV columns 1-9 (column 0 is skipped), use the first of those
# (`Year`) as the index and parse it as dates, which yields a DatetimeIndex.
df = pd.read_csv(SMARKET_CSV, index_col=0, usecols=range(1, 10), parse_dates=True)

In [3]:
# Preview the first five rows to check the columns and the date-parsed `Year` index.
df.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2001-01-01,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2001-01-01,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
2001-01-01,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
2001-01-01,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [4]:
# Summary statistics for the numeric columns (the string `Direction` column is not included).
df.describe()

Unnamed: 0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
count,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0
mean,0.003834,0.003919,0.001716,0.001636,0.00561,1.478305,0.003138
std,1.136299,1.13628,1.138703,1.138774,1.14755,0.360357,1.136334
min,-4.922,-4.922,-4.922,-4.922,-4.922,0.35607,-4.922
25%,-0.6395,-0.6395,-0.64,-0.64,-0.64,1.2574,-0.6395
50%,0.039,0.039,0.0385,0.0385,0.0385,1.42295,0.0385
75%,0.59675,0.59675,0.59675,0.59675,0.597,1.641675,0.59675
max,5.733,5.733,5.733,5.733,5.733,3.15247,5.733


In [5]:
# Inspect the index type, per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1250 entries, 2001-01-01 to 2005-01-01
Data columns (total 8 columns):
Lag1         1250 non-null float64
Lag2         1250 non-null float64
Lag3         1250 non-null float64
Lag4         1250 non-null float64
Lag5         1250 non-null float64
Volume       1250 non-null float64
Today        1250 non-null float64
Direction    1250 non-null object
dtypes: float64(7), object(1)
memory usage: 87.9+ KB


## Linear Discriminant Analysis

In [6]:
# Chronological hold-out split: everything up to and including 2004 is used
# for fitting, everything from 2005 onwards for evaluation.
# Rows and columns are selected in a single `.loc` call (no chained indexing);
# string slices on the DatetimeIndex are label-based and cover the whole year.
FEATURES = ['Lag1', 'Lag2']

X_train = df.loc[:'2004', FEATURES]
y_train = df.loc[:'2004', 'Direction']

X_test = df.loc['2005':, FEATURES]
y_test = df.loc['2005':, 'Direction']

# Sanity check: every row of `df` must fall in exactly one of the two splits.
assert len(X_train) + len(X_test) == len(df)

In [7]:
# Build the LDA estimator and fit it on the training split.
# scikit-learn's fit() returns the estimator itself, so `model` is simply a
# second name for the fitted `lda` object.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
model = lda

In [8]:
# Class priors of the fitted LDA model (no priors were passed to the constructor).
# Presumably ordered like the sorted class labels ('Down', 'Up') — confirm with model.classes_.
print(model.priors_)

[ 0.49198397  0.50801603]


In [9]:
# Per-class feature means: one row per class, one column per feature (Lag1, Lag2).
print(model.means_)

[[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]


In [10]:
# Coefficients of the linear decision function; a single row with one entry per feature (Lag1, Lag2).
print(model.coef_)

[[-0.05544078 -0.0443452 ]]


In [11]:
# Coefficient on Lag1 (first feature) at full precision.
model.coef_[0, 0]

-0.055440778179257286

In [12]:
# Coefficient on Lag2 (second feature) at full precision.
model.coef_[0, 1]

-0.044345199897942451

In [29]:
# LDA class-label predictions for the hold-out (2005) rows.
pred = model.predict(X_test)

In [30]:
# How many hold-out rows LDA assigns to each class.
pred_labels, pred_counts = np.unique(pred, return_counts=True)
print((pred_labels, pred_counts))

(array(['Down', 'Up'],
      dtype='<U4'), array([ 70, 182]))


In [38]:
# Per-class precision / recall / F1 for LDA on the hold-out set, to 3 decimals.
lda_report = classification_report(y_test, pred, digits=3)
print(lda_report)

             precision    recall  f1-score   support

       Down      0.500     0.315     0.387       111
         Up      0.582     0.752     0.656       141

avg / total      0.546     0.560     0.538       252



In [39]:
# sklearn's signature is confusion_matrix(y_true, y_pred), which puts the
# actual classes on the rows. Pass the arguments in that order and transpose
# explicitly, so the printed table keeps its layout:
# rows = predicted class, columns = actual class, both ordered Down, Up.
print(confusion_matrix(y_test, pred, labels=['Down', 'Up']).T)

[[ 35  35]
 [ 76 106]]


In [41]:
# LDA posterior class probabilities for the hold-out rows, one column per class.
prob = model.predict_proba(X_test)

In [45]:
# Thresholding the second probability column at 0.5 should reproduce the
# class counts returned by predict().
up_above_half = prob[:, 1] > 0.5
np.unique(up_above_half, return_counts=True)

(array([False,  True], dtype=bool), array([ 70, 182]))

In [54]:
# Side-by-side view of rows 10-19: second-column probability next to the
# predicted label. Mixing floats and strings in one array turns the
# probabilities into strings in the printed result.
prob_slice = prob[10:20, 1]
pred_slice = pred[10:20]
print(np.column_stack((prob_slice, pred_slice)))

[['0.5093037238790318' 'Up']
 ['0.4880011537380811' 'Down']
 ['0.510484773063352' 'Up']
 ['0.5293238777881214' 'Up']
 ['0.5255407143881711' 'Up']
 ['0.5200416608518921' 'Up']
 ['0.5064224705341396' 'Up']
 ['0.4969106228816935' 'Down']
 ['0.5021193878585957' 'Up']
 ['0.5113669134834818' 'Up']]


In [55]:
# Count hold-out rows whose second-column probability exceeds 0.9.
# NOTE(review): column 1 lines up with the 'Up' predictions shown above; if the
# intent was "very confident of a *decrease*", column 0 would be the one to
# threshold — confirm.
up_above_090 = prob[:, 1] > 0.9
print(np.unique(up_above_090, return_counts=True))

(array([False], dtype=bool), array([252]))


In [57]:
# Largest second-column (P('Up')) probability over the hold-out set.
# Uses the vectorised ndarray reduction instead of the Python builtin max(),
# which iterates element by element and is order-dependent if a NaN is present.
prob[:, 1].max()

0.5422132554518978

## Quadratic Discriminant Analysis

In [58]:
# QDA estimator with default settings (not yet fitted).
qda = QuadraticDiscriminantAnalysis()

In [60]:
# Fit QDA on the same training split used for LDA; the result is bound to `model2`.
model2 = qda.fit(X_train, y_train)

In [63]:
# QDA class priors; the displayed values equal the LDA priors printed earlier.
model2.priors_

array([ 0.49198397,  0.50801603])

In [66]:
# QDA per-class means of Lag1 / Lag2; the displayed values equal the LDA class means.
model2.means_

array([[ 0.04279022,  0.03389409],
       [-0.03954635, -0.03132544]])

In [62]:
# QDA class-label predictions for the hold-out rows.
pred2 = model2.predict(X_test)

In [67]:
# How many hold-out rows QDA assigns to each class.
pred2_labels, pred2_counts = np.unique(pred2, return_counts=True)
print((pred2_labels, pred2_counts))

(array(['Down', 'Up'], dtype=object), array([ 50, 202]))


In [68]:
# sklearn's signature is confusion_matrix(y_true, y_pred); pass the arguments
# in that order, pin the label order explicitly, and transpose so the printed
# table keeps its layout: rows = predicted class, columns = actual class
# (Down, Up), matching the LDA confusion matrix in this notebook.
print(confusion_matrix(y_test, pred2, labels=['Down', 'Up']).T)

[[ 30  20]
 [ 81 121]]


In [69]:
# Per-class precision / recall / F1 for QDA on the hold-out set.
# NOTE(review): the LDA report is printed with digits=3 while this one uses the
# default precision, so the two tables are not directly comparable digit-for-digit.
print(classification_report(y_test, pred2))

             precision    recall  f1-score   support

       Down       0.60      0.27      0.37       111
         Up       0.60      0.86      0.71       141

avg / total       0.60      0.60      0.56       252

