In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
Smarket = pd.read_csv("/Users/arpanganguli/Documents/Finance/ISLR/Datasets/Smarket.csv", index_col = 'SlNo')

In [3]:
Smarket.head()

Unnamed: 0_level_0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
SlNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
3,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
4,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
5,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X = np.array(Smarket[['Lag1', 'Lag2']])
y = np.array(Smarket['Direction'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2016, random_state=101)

**K-Means without standardisation (K = 1)**

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
knn_1 = KNeighborsClassifier(n_neighbors=1)

In [18]:
knn_1.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [19]:
knn_1_pred = knn_1.predict(X_test)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix

In [21]:
print(confusion_matrix(y_test, knn_1_pred))

[[54 64]
 [68 66]]


In [23]:
print(classification_report(y_test, knn_1_pred))

              precision    recall  f1-score   support

        Down       0.44      0.46      0.45       118
          Up       0.51      0.49      0.50       134

   micro avg       0.48      0.48      0.48       252
   macro avg       0.48      0.48      0.47       252
weighted avg       0.48      0.48      0.48       252



**K-Means without standardisation (K = 3)**

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
knn_3 = KNeighborsClassifier()

In [27]:
knn_3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [28]:
knn_3_pred = knn_3.predict(X_test)

In [29]:
from sklearn.metrics import classification_report, confusion_matrix

In [30]:
print(confusion_matrix(y_test, knn_3_pred))

[[51 67]
 [58 76]]


In [31]:
print(classification_report(y_test, knn_3_pred))

              precision    recall  f1-score   support

        Down       0.47      0.43      0.45       118
          Up       0.53      0.57      0.55       134

   micro avg       0.50      0.50      0.50       252
   macro avg       0.50      0.50      0.50       252
weighted avg       0.50      0.50      0.50       252



*As we can see, increase the number of K marginally improves the precision of the model.*

**K-Means with standardisation (K = 1)**

**Why standardise?**
*Because KNN classifier classifies variables of different sizes, in which distances may vary on an absolute scale (e.g.
we might be classifying a variable based on house prices (where the distances could be in '000s of £ and age, where the distances could be a few years). Standardisation ensures that these distances are accounted for and there "standardised".*

In [34]:
from sklearn.preprocessing import StandardScaler

In [41]:
scaler_1 = StandardScaler()

In [42]:
scaler_1.fit(Smarket.drop(columns = 'Direction', axis = 1))

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [43]:
scaled_features_1 = scaler_1.transform(Smarket.drop(columns = 'Direction', axis = 1))

  """Entry point for launching an IPython kernel.


In [44]:
df_1 = pd.DataFrame(scaled_features_1, columns = Smarket.columns[:-1] )

In [68]:
df_1.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
0,-1.431356,0.332058,-0.172491,-2.306806,-0.928243,4.362679,-0.796765,0.841517
1,-1.431356,0.84093,0.331988,-0.170188,-2.306592,-0.924608,-0.504715,0.905784
2,-1.431356,0.905199,0.840869,0.333218,-0.170107,-2.292416,-0.186293,-0.551237
3,-1.431356,-0.551867,0.90514,0.841016,0.333267,-0.17227,-0.561626,0.537787
4,-1.431356,0.537191,-0.55195,0.905149,0.841034,0.327254,-0.756789,0.184757


In [46]:
from sklearn.model_selection import train_test_split

In [50]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features_1,Smarket['Direction'],
                                                    test_size=0.30)

In [51]:
from sklearn.neighbors import KNeighborsClassifier

In [52]:
knn_s_1 = KNeighborsClassifier(n_neighbors=1)

In [53]:
knn_s_1.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [54]:
knn_s_1_pred = knn_s_1.predict(X_test)

In [55]:
from sklearn.metrics import classification_report, confusion_matrix

In [56]:
print(confusion_matrix(y_test, knn_s_1_pred))

[[151  40]
 [ 24 160]]


In [57]:
print(classification_report(y_test, knn_s_1_pred))

              precision    recall  f1-score   support

        Down       0.86      0.79      0.83       191
          Up       0.80      0.87      0.83       184

   micro avg       0.83      0.83      0.83       375
   macro avg       0.83      0.83      0.83       375
weighted avg       0.83      0.83      0.83       375



**K-Means with standardisation (K = 3)**

In [59]:
from sklearn.preprocessing import StandardScaler

In [60]:
scaler_3 = StandardScaler()

In [61]:
scaler_3.fit(Smarket.drop(columns='Direction', axis = 1))

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [67]:
scaled_features_3 = scaler_3.transform(Smarket.drop(columns='Direction', axis = 1))

  """Entry point for launching an IPython kernel.


In [69]:
df_3 = pd.DataFrame(scaled_features_3, columns = Smarket.columns[:-1] )

In [70]:
df_3.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
0,-1.431356,0.332058,-0.172491,-2.306806,-0.928243,4.362679,-0.796765,0.841517
1,-1.431356,0.84093,0.331988,-0.170188,-2.306592,-0.924608,-0.504715,0.905784
2,-1.431356,0.905199,0.840869,0.333218,-0.170107,-2.292416,-0.186293,-0.551237
3,-1.431356,-0.551867,0.90514,0.841016,0.333267,-0.17227,-0.561626,0.537787
4,-1.431356,0.537191,-0.55195,0.905149,0.841034,0.327254,-0.756789,0.184757


In [71]:
from sklearn.model_selection import train_test_split

In [72]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features_3,Smarket['Direction'],
                                                    test_size=0.30)

In [73]:
from sklearn.neighbors import KNeighborsClassifier

In [76]:
knn_s_3 = KNeighborsClassifier(n_neighbors=3)

In [77]:
knn_s_3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [79]:
knn_s_3_pred = knn_s_3.predict(X_test)

In [80]:
from sklearn.metrics import classification_report, confusion_matrix

In [81]:
print(confusion_matrix(y_test, knn_s_3_pred))

[[145  34]
 [ 23 173]]


In [82]:
print(classification_report(y_test, knn_s_3_pred))

              precision    recall  f1-score   support

        Down       0.86      0.81      0.84       179
          Up       0.84      0.88      0.86       196

   micro avg       0.85      0.85      0.85       375
   macro avg       0.85      0.85      0.85       375
weighted avg       0.85      0.85      0.85       375



**As we can see, there is a significant improvement in results with standardisation (precision rate of 85% in models with standardisation as opposed to 47%-48% in models without standardisation).**