In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
market_path="../../Data/Smarket.csv"
market=pd.read_csv(market_path)
market.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [3]:
market.describe()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
count,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0
mean,2003.016,0.003834,0.003919,0.001716,0.001636,0.00561,1.478305,0.003138
std,1.409018,1.136299,1.13628,1.138703,1.138774,1.14755,0.360357,1.136334
min,2001.0,-4.922,-4.922,-4.922,-4.922,-4.922,0.35607,-4.922
25%,2002.0,-0.6395,-0.6395,-0.64,-0.64,-0.64,1.2574,-0.6395
50%,2003.0,0.039,0.039,0.0385,0.0385,0.0385,1.42295,0.0385
75%,2004.0,0.59675,0.59675,0.59675,0.59675,0.597,1.641675,0.59675
max,2005.0,5.733,5.733,5.733,5.733,5.733,3.15247,5.733


In [4]:
market.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Year       1250 non-null   int64  
 1   Lag1       1250 non-null   float64
 2   Lag2       1250 non-null   float64
 3   Lag3       1250 non-null   float64
 4   Lag4       1250 non-null   float64
 5   Lag5       1250 non-null   float64
 6   Volume     1250 non-null   float64
 7   Today      1250 non-null   float64
 8   Direction  1250 non-null   object 
dtypes: float64(7), int64(1), object(1)
memory usage: 88.0+ KB


In [5]:
market.corr()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0


### Logistic Regression

In [6]:
# Encode the response as 0,1 for down/up
market['DirCoded'] = [0 if d == "Down" else 1 for d in market.Direction]
market.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction,DirCoded
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up,1
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up,1
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down,0
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up,1
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up,1


In [7]:
from sklearn.linear_model import LogisticRegression

X=market.drop(["Year","Today","Direction","DirCoded"],axis=1)
Y=market["Direction"]

lr=LogisticRegression()
lr.fit(X,Y)
print("Intercept:\t"+ str(lr.intercept_) + "\nCoefficient:\t" + str(lr.coef_))

Intercept:	[-0.12108463]
Coefficient:	[[-0.07284345 -0.04223481  0.0110163   0.00928427  0.01026372  0.13211221]]


In [8]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

model=smf.logit("DirCoded ~ " + "+".join(X.columns),data=market)

print("Degrees of freedom for the model:\t"+str(model.df_model))
print("Degrees of freedom for residual:\t"+str(model.df_resid))
print("Y:\t"+str(model.endog_names))
print("X:\t"+str(model.exog_names))

Degrees of freedom for the model:	6.0
Degrees of freedom for residual:	1243.0
Y:	DirCoded
X:	['Intercept', 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']


In [9]:
result=model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:               DirCoded   No. Observations:                 1250
Model:                          Logit   Df Residuals:                     1243
Method:                           MLE   Df Model:                            6
Date:                Sat, 12 Jun 2021   Pseudo R-squ.:                0.002074
Time:                        16:09:41   Log-Likelihood:                -863.79
converged:                       True   LL-Null:                       -865.59
Covariance Type:            nonrobust   LLR p-value:                    0.7319
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1260      0.241     -0.523      0.601      -0.598       0.346
Lag1          -0.0731      0.

In [10]:
probs=result.predict(exog=X)
probs = ["Up" if p>0.5 else "Down" for p in probs]
probs[:10]

['Up', 'Down', 'Down', 'Up', 'Up', 'Up', 'Down', 'Up', 'Up', 'Down']

In [11]:
from sklearn.metrics import confusion_matrix

print(pd.DataFrame(confusion_matrix(Y,probs), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

             Predicted Down  Predicted Up
Actual Down             145           457
Actual Up               141           507


In [12]:
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score,classification_report

print("Precision: " + str(precision_score(Y,probs,average="macro")))
print("Recall: " + str(recall_score(Y,probs,average="macro")))
print("Accuracy: " + str(accuracy_score(Y,probs,normalize=True)))
print("F1: " +str(f1_score(Y,probs,average="macro")))

Precision: 0.5164633084757566
Recall: 0.5116355973914114
Accuracy: 0.5216
F1: 0.4778044173205464


In [13]:
print(classification_report(Y, probs))

              precision    recall  f1-score   support

        Down       0.51      0.24      0.33       602
          Up       0.53      0.78      0.63       648

    accuracy                           0.52      1250
   macro avg       0.52      0.51      0.48      1250
weighted avg       0.52      0.52      0.48      1250



### Linear Discriminant Ananlysis

In [14]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X=market[["Lag1","Lag2"]]
Y=market["DirCoded"]
lda = LinearDiscriminantAnalysis()
lda.fit(X,Y)

print("Intercept:\t" + str(lda.intercept_))
print("Coefficient:\t" + str(lda.coef_))
print("Priors:\t" + str(lda.priors_))
print("Means:\t" + str(lda.means_))

Intercept:	[0.0742434]
Coefficient:	[[-0.07126095 -0.04433204]]
Priors:	[0.4816 0.5184]
Means:	[[ 0.05068605  0.03229734]
 [-0.03969136 -0.02244444]]


In [15]:
print(pd.DataFrame(confusion_matrix(Y,lda.predict(X)), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

             Predicted Down  Predicted Up
Actual Down             114           488
Actual Up               102           546


In [16]:
print("Precision: " + str(precision_score(Y,lda.predict(X),average="macro")))
print("Recall: " + str(recall_score(Y,lda.predict(X),average="macro")))
print("Accuracy: " + str(accuracy_score(Y,lda.predict(X),normalize=True)))
print("F1: " +str(f1_score(Y,lda.predict(X),average="macro")))

Precision: 0.5279120997206104
Recall: 0.5159806816783561
Accuracy: 0.528
F1: 0.46397785846980394


In [17]:
print(classification_report(Y, lda.predict(X)))

              precision    recall  f1-score   support

           0       0.53      0.19      0.28       602
           1       0.53      0.84      0.65       648

    accuracy                           0.53      1250
   macro avg       0.53      0.52      0.46      1250
weighted avg       0.53      0.53      0.47      1250



### Quadratic Discriminant Analysis

In [18]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda = QuadraticDiscriminantAnalysis()
qda.fit(X,Y)

print("Priors:\t" + str(qda.priors_))
print("Means:\t" + str(qda.means_))

Priors:	[0.4816 0.5184]
Means:	[[ 0.05068605  0.03229734]
 [-0.03969136 -0.02244444]]


In [19]:
print(pd.DataFrame(confusion_matrix(Y,qda.predict(X)), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

             Predicted Down  Predicted Up
Actual Down             109           493
Actual Up                94           554


In [20]:
print("Precision: " + str(precision_score(Y,qda.predict(X),average="macro")))
print("Recall: " + str(recall_score(Y,qda.predict(X),average="macro")))
print("Accuracy: " + str(accuracy_score(Y,qda.predict(X),normalize=True)))
print("F1: " +str(f1_score(Y,qda.predict(X),average="macro")))

Precision: 0.5330383314278186
Recall: 0.5180006972642631
Accuracy: 0.5304
F1: 0.46224738452518366


In [21]:
print(classification_report(Y, qda.predict(X)))

              precision    recall  f1-score   support

           0       0.54      0.18      0.27       602
           1       0.53      0.85      0.65       648

    accuracy                           0.53      1250
   macro avg       0.53      0.52      0.46      1250
weighted avg       0.53      0.53      0.47      1250



### K-Nearest Neighbors

In [22]:
from sklearn.neighbors import KNeighborsClassifier as KNNC

knnc=KNNC(n_neighbors=1)
knnc.fit(X,Y)

KNeighborsClassifier(n_neighbors=1)

In [23]:
print(pd.DataFrame(confusion_matrix(Y,knnc.predict(X)), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

             Predicted Down  Predicted Up
Actual Down             602             0
Actual Up                 0           648


In [24]:
print("Precision: " + str(precision_score(Y,knnc.predict(X),average="macro")))
print("Recall: " + str(recall_score(Y,knnc.predict(X),average="macro")))
print("Accuracy: " + str(accuracy_score(Y,knnc.predict(X),normalize=True)))
print("F1: " +str(f1_score(Y,knnc.predict(X),average="macro")))

Precision: 1.0
Recall: 1.0
Accuracy: 1.0
F1: 1.0


In [25]:
print(classification_report(Y, knnc.predict(X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       602
           1       1.00      1.00      1.00       648

    accuracy                           1.00      1250
   macro avg       1.00      1.00      1.00      1250
weighted avg       1.00      1.00      1.00      1250



In [26]:
knnc=KNNC(n_neighbors=3)
knnc.fit(X,Y)

KNeighborsClassifier(n_neighbors=3)

In [27]:
print(pd.DataFrame(confusion_matrix(Y,knnc.predict(X)), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

             Predicted Down  Predicted Up
Actual Down             428           174
Actual Up               137           511


In [28]:
print("Precision: " + str(precision_score(Y,knnc.predict(X),average="macro")))
print("Recall: " + str(recall_score(Y,knnc.predict(X),average="macro")))
print("Accuracy: " + str(accuracy_score(Y,knnc.predict(X),normalize=True)))
print("F1: " +str(f1_score(Y,knnc.predict(X),average="macro")))

Precision: 0.7517537626768297
Recall: 0.749771851031541
Accuracy: 0.7512
F1: 0.7500981929286948


In [29]:
print(classification_report(Y, knnc.predict(X)))

              precision    recall  f1-score   support

           0       0.76      0.71      0.73       602
           1       0.75      0.79      0.77       648

    accuracy                           0.75      1250
   macro avg       0.75      0.75      0.75      1250
weighted avg       0.75      0.75      0.75      1250



### Application to Caravan Insurance Data

In [30]:
caravan_data="../../Data/Caravan.csv"
caravan=pd.read_csv(caravan_data)
caravan.head()

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,Purchase
0,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,No
1,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
2,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,No
3,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,No
4,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,No


In [31]:
caravan.describe()

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,ALEVEN,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND
count,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,...,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0,5822.0
mean,24.253349,1.110615,2.678805,2.99124,5.773617,0.696496,4.626932,1.069907,3.258502,6.183442,...,0.076606,0.005325,0.006527,0.004638,0.570079,0.000515,0.006012,0.031776,0.007901,0.014256
std,12.846706,0.405842,0.789835,0.814589,2.85676,1.003234,1.715843,1.017503,1.597647,1.909482,...,0.377569,0.072782,0.080532,0.077403,0.562058,0.022696,0.081632,0.210986,0.090463,0.119996
min,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,1.0,2.0,2.0,3.0,0.0,4.0,0.0,2.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,30.0,1.0,3.0,3.0,7.0,0.0,5.0,1.0,3.0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,35.0,1.0,3.0,3.0,8.0,1.0,6.0,2.0,4.0,7.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,41.0,10.0,5.0,6.0,10.0,9.0,9.0,5.0,9.0,9.0,...,8.0,1.0,1.0,2.0,7.0,1.0,2.0,3.0,2.0,2.0


In [32]:
caravan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5822 entries, 0 to 5821
Data columns (total 86 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MOSTYPE   5822 non-null   int64 
 1   MAANTHUI  5822 non-null   int64 
 2   MGEMOMV   5822 non-null   int64 
 3   MGEMLEEF  5822 non-null   int64 
 4   MOSHOOFD  5822 non-null   int64 
 5   MGODRK    5822 non-null   int64 
 6   MGODPR    5822 non-null   int64 
 7   MGODOV    5822 non-null   int64 
 8   MGODGE    5822 non-null   int64 
 9   MRELGE    5822 non-null   int64 
 10  MRELSA    5822 non-null   int64 
 11  MRELOV    5822 non-null   int64 
 12  MFALLEEN  5822 non-null   int64 
 13  MFGEKIND  5822 non-null   int64 
 14  MFWEKIND  5822 non-null   int64 
 15  MOPLHOOG  5822 non-null   int64 
 16  MOPLMIDD  5822 non-null   int64 
 17  MOPLLAAG  5822 non-null   int64 
 18  MBERHOOG  5822 non-null   int64 
 19  MBERZELF  5822 non-null   int64 
 20  MBERBOER  5822 non-null   int64 
 21  MBERMIDD  5822

In [33]:
X=caravan.drop(["Purchase"],axis=1)
Y=caravan["Purchase"]

X_test=X[:1000]
X_train=X[1000:]

Y_test=Y[:1000]
Y_train=Y[1000:]

In [34]:
#standardizing data
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

numerical_pipeline = Pipeline([("standardization",StandardScaler())])
numerical_columns = X.columns

full_pipeline = ColumnTransformer([("numerical",numerical_pipeline,numerical_columns)])

X_train_final = full_pipeline.fit_transform(X_train)
X_test_final = full_pipeline.fit_transform(X_test)

In [35]:
#using K=1
knnc=KNNC(n_neighbors=1)
knnc.fit(X_train_final,Y_train)

KNeighborsClassifier(n_neighbors=1)

In [36]:
#metrics on the training data
print(pd.DataFrame(confusion_matrix(Y_train,knnc.predict(X_train_final)), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

print("Precision: " + str(precision_score(Y_train,knnc.predict(X_train_final),average="macro")))
print("Recall: " + str(recall_score(Y_train,knnc.predict(X_train_final),average="macro")))
print("Accuracy: " + str(accuracy_score(Y_train,knnc.predict(X_train_final),normalize=True)))
print("F1: " +str(f1_score(Y_train,knnc.predict(X_train_final),average="macro")))

print(classification_report(Y_train, knnc.predict(X_train_final)))

             Predicted Down  Predicted Up
Actual Down            4514            19
Actual Up                19           270
Precision: 0.9650322853476658
Recall: 0.9650322853476658
Accuracy: 0.9921194525093322
F1: 0.9650322853476658
              precision    recall  f1-score   support

          No       1.00      1.00      1.00      4533
         Yes       0.93      0.93      0.93       289

    accuracy                           0.99      4822
   macro avg       0.97      0.97      0.97      4822
weighted avg       0.99      0.99      0.99      4822



In [37]:
#metrics on the training data
print(pd.DataFrame(confusion_matrix(Y_test,knnc.predict(X_test_final)), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

print("Precision: " + str(precision_score(Y_test,knnc.predict(X_test_final),average="macro")))
print("Recall: " + str(recall_score(Y_test,knnc.predict(X_test_final),average="macro")))
print("Accuracy: " + str(accuracy_score(Y_test,knnc.predict(X_test_final),normalize=True)))
print("F1: " +str(f1_score(Y_test,knnc.predict(X_test_final),average="macro")))

print(classification_report(Y_test, knnc.predict(X_test_final)))

             Predicted Down  Predicted Up
Actual Down             872            69
Actual Up                51             8
Precision: 0.5243207496728624
Recall: 0.5311334858336786
Accuracy: 0.88
F1: 0.5266346882100479
              precision    recall  f1-score   support

          No       0.94      0.93      0.94       941
         Yes       0.10      0.14      0.12        59

    accuracy                           0.88      1000
   macro avg       0.52      0.53      0.53      1000
weighted avg       0.90      0.88      0.89      1000



In [38]:
#using K=3
knnc=KNNC(n_neighbors=3)
knnc.fit(X_train_final,Y_train)

KNeighborsClassifier(n_neighbors=3)

In [39]:
#metrics on the training data
print(pd.DataFrame(confusion_matrix(Y_train,knnc.predict(X_train_final)), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

print("Precision: " + str(precision_score(Y_train,knnc.predict(X_train_final),average="macro")))
print("Recall: " + str(recall_score(Y_train,knnc.predict(X_train_final),average="macro")))
print("Accuracy: " + str(accuracy_score(Y_train,knnc.predict(X_train_final),normalize=True)))
print("F1: " +str(f1_score(Y_train,knnc.predict(X_train_final),average="macro")))

print(classification_report(Y_train, knnc.predict(X_train_final)))

             Predicted Down  Predicted Up
Actual Down            4502            31
Actual Up               229            60
Precision: 0.8054682582266602
Recall: 0.6003868593024472
Accuracy: 0.9460804645375362
F1: 0.643861921643487
              precision    recall  f1-score   support

          No       0.95      0.99      0.97      4533
         Yes       0.66      0.21      0.32       289

    accuracy                           0.95      4822
   macro avg       0.81      0.60      0.64      4822
weighted avg       0.93      0.95      0.93      4822



In [40]:
#metrics on the training data
print(pd.DataFrame(confusion_matrix(Y_test,knnc.predict(X_test_final)), ['Actual Down','Actual Up'], ['Predicted Down','Predicted Up']))

print("Precision: " + str(precision_score(Y_test,knnc.predict(X_test_final),average="macro")))
print("Recall: " + str(recall_score(Y_test,knnc.predict(X_test_final),average="macro")))
print("Accuracy: " + str(accuracy_score(Y_test,knnc.predict(X_test_final),normalize=True)))
print("F1: " +str(f1_score(Y_test,knnc.predict(X_test_final),average="macro")))

print(classification_report(Y_test, knnc.predict(X_test_final)))

             Predicted Down  Predicted Up
Actual Down             921            20
Actual Up                54             5
Precision: 0.5723076923076923
Recall: 0.531745888794827
Accuracy: 0.926
F1: 0.5402127448056467
              precision    recall  f1-score   support

          No       0.94      0.98      0.96       941
         Yes       0.20      0.08      0.12        59

    accuracy                           0.93      1000
   macro avg       0.57      0.53      0.54      1000
weighted avg       0.90      0.93      0.91      1000

