### Importing Libraries

In [2]:
import numpy as np
import pandas as pd

In [8]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

### Importing the data

In [11]:
# Read the SPY price history; the path is relative to the notebook's working directory.
spy_csv_path = "../data/SPY.csv"
df = pd.read_csv(spy_csv_path)

In [12]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-01-04,112.370003,113.389999,111.510002,113.330002,92.246048,118944600
1,2010-01-05,113.260002,113.68,112.849998,113.629997,92.490204,111579900
2,2010-01-06,113.519997,113.989998,113.43,113.709999,92.555328,116074400
3,2010-01-07,113.5,114.330002,113.18,114.190002,92.94606,131091100
4,2010-01-08,113.889999,114.620003,113.660004,114.57,93.255348,126402800


In [15]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2263 entries, 0 to 2262
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2263 non-null   object 
 1   Open       2263 non-null   float64
 2   High       2263 non-null   float64
 3   Low        2263 non-null   float64
 4   Close      2263 non-null   float64
 5   Adj Close  2263 non-null   float64
 6   Volume     2263 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 123.9+ KB


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,2263.0,2263.0,2263.0,2263.0,2263.0,2263.0
mean,185.670013,186.539434,184.69863,185.686465,167.857557,133202100.0
std,51.939383,52.050626,51.784357,51.904678,55.222908,75945940.0
min,103.110001,103.419998,101.129997,102.199997,83.926636,27856500.0
25%,135.18,135.805,134.345001,135.360001,114.648251,80729450.0
50%,190.460007,191.830002,188.860001,190.350006,171.772522,114695600.0
75%,216.684998,217.195,215.754997,216.644997,201.767876,162599200.0
max,293.089996,293.940002,291.809998,293.579987,283.4935,717828700.0


### Generating Naive Forecast

In [17]:
# Naive (persistence) forecast: each row's predicted close is the previous row's close.
# The first row has no predecessor, so its prediction is NaN.
previous_close = df['Close'].shift(periods=1)
df['ClosePrediction'] = previous_close

In [18]:
# Check the new column: ClosePrediction should equal the Close of the row above.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,ClosePrediction
0,2010-01-04,112.370003,113.389999,111.510002,113.330002,92.246048,118944600,
1,2010-01-05,113.260002,113.68,112.849998,113.629997,92.490204,111579900,113.330002
2,2010-01-06,113.519997,113.989998,113.43,113.709999,92.555328,116074400,113.629997
3,2010-01-07,113.5,114.330002,113.18,114.190002,92.94606,131091100,113.709999
4,2010-01-08,113.889999,114.620003,113.660004,114.57,93.255348,126402800,114.190002


In [20]:
# Skip the first row: its prediction is NaN because there is no previous close.
scored_rows = df.iloc[1:]
y_true, y_pred = scored_rows['Close'], scored_rows['ClosePrediction']

### Metrics

#### Sum of Squared Errors

In [21]:
# Sum of squared errors: the residual vector dotted with itself.
residuals = y_true - y_pred
residuals.dot(residuals)

6330.3742894926045

#### Mean Squared Error

In [27]:
# Mean squared error via scikit-learn: the average of the squared residuals.
mean_squared_error(y_true=y_true, y_pred=y_pred)

2.798573956451196

In [28]:
# Calculating it manually: sum of squared residuals divided by the number of predictions
squared_error_sum = (y_true - y_pred).dot(y_true - y_pred)
squared_error_sum / len(y_pred)

2.7985739564511958

#### Root Mean Squared Error

In [29]:
# Root mean squared error: the square root of the MSE.
# The `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the root explicitly; this form works on every version.
np.sqrt(mean_squared_error(y_true, y_pred))

1.672893886787562

In [30]:
# Calculating manually: square root of the mean of the squared residuals
mean_squared_residual = (y_true - y_pred).dot(y_true - y_pred) / len(y_pred)
np.sqrt(mean_squared_residual)

1.6728938867875618

#### Mean Absolute Error

In [33]:
# Mean absolute error: the average magnitude of the residuals, in price units.
mean_absolute_error(y_true=y_true, y_pred=y_pred)

1.1457559803120336

#### R-squared

In [35]:
# Coefficient of determination; the metric is not symmetric, so name the arguments explicitly.
r2_score(y_true=y_true, y_pred=y_pred)

0.9989603259063914

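This is a near-perfect R² score, even though our model is a naive one that simply repeats the previous day's close. The lesson is that, no matter how good our metrics look, there should be sound reasoning behind them: here the score is high only because consecutive closing prices are very similar, not because the model has learned anything. Such a model will still fail miserably on a test set.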

#### MAPE (Mean Absolute Percentage Error)

In [36]:
# MAPE via scikit-learn; the result is a fraction (multiply by 100 for a percentage).
mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)

0.006494073151422373