In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
data=pd.read_csv("Amazon.csv",parse_dates=["Date"])

In [3]:
data.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1997-05-15,2.4375,2.5,1.927083,1.958333,1.958333,72156000
1,1997-05-16,1.96875,1.979167,1.708333,1.729167,1.729167,14700000
2,1997-05-19,1.760417,1.770833,1.625,1.708333,1.708333,6106800
3,1997-05-20,1.729167,1.75,1.635417,1.635417,1.635417,5467200
4,1997-05-21,1.635417,1.645833,1.375,1.427083,1.427083,18853200
5,1997-05-22,1.4375,1.447917,1.3125,1.395833,1.395833,11776800
6,1997-05-23,1.40625,1.520833,1.333333,1.5,1.5,15937200
7,1997-05-27,1.510417,1.645833,1.458333,1.583333,1.583333,8697600
8,1997-05-28,1.625,1.635417,1.53125,1.53125,1.53125,4574400
9,1997-05-29,1.541667,1.541667,1.479167,1.505208,1.505208,3472800


In [4]:
data.tail(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
6150,2021-10-21,3414.25,3440.280029,3403.0,3435.01001,3435.01001,1881400
6151,2021-10-22,3421.0,3429.840088,3331.300049,3335.550049,3335.550049,3133800
6152,2021-10-25,3335.0,3347.800049,3297.699951,3320.370117,3320.370117,2226000
6153,2021-10-26,3349.51001,3416.120117,3343.97998,3376.070068,3376.070068,2693700
6154,2021-10-27,3388.0,3412.0,3371.453369,3396.189941,3396.189941,1080291


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6155 entries, 0 to 6154
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6155 non-null   datetime64[ns]
 1   Open       6155 non-null   float64       
 2   High       6155 non-null   float64       
 3   Low        6155 non-null   float64       
 4   Close      6155 non-null   float64       
 5   Adj Close  6155 non-null   float64       
 6   Volume     6155 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 336.7 KB


In [6]:
data.shape

(6155, 7)

In [7]:
data.isnull().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [8]:
cols=["Open","Close","High","Low","Adj Close"]
for col in cols:
    data[col]=data[col].astype(float)


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6155 entries, 0 to 6154
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       6155 non-null   datetime64[ns]
 1   Open       6155 non-null   float64       
 2   High       6155 non-null   float64       
 3   Low        6155 non-null   float64       
 4   Close      6155 non-null   float64       
 5   Adj Close  6155 non-null   float64       
 6   Volume     6155 non-null   int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 336.7 KB


In [10]:
data.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [11]:
data.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
dtype: object

Feature Engineering

In [12]:
#Return features
data["daily_return"]=data["Adj Close"].pct_change()
data["log_return"]=np.log(data["Adj Close"]/ data["Adj Close"].shift(1))


In [13]:
#Price Ratios
data["close_open_ratio"]=data["Close"]/data["Open"]
data["high_low_ratio"]=data["High"]/data["Low"]

In [14]:
#moving averages
data["simple_avg_5"]=data["Adj Close"].rolling(window=5).mean()
data["simple_avg_10"]=data["Adj Close"].rolling(window=10).mean()
data["exponential_avg_10"]=data["Adj Close"].ewm(span=10,adjust=False).mean()

In [15]:
#Volatility measures
data["volatility_5"]=data["daily_return"].rolling(window=5).std()
data["volatility_10"]=data["daily_return"].rolling(window=10).std()

In [16]:
#lag features
data["adj_close_lag1"]=data["Adj Close"].shift(1)
data["adj_close_lag5"]=data["Adj Close"].shift(5)
data["daily_return_lag1"]=data["daily_return"].shift(1)

In [17]:
data.head(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,daily_return,log_return,close_open_ratio,high_low_ratio,simple_avg_5,simple_avg_10,exponential_avg_10,volatility_5,volatility_10,adj_close_lag1,adj_close_lag5,daily_return_lag1
0,1997-05-15,2.4375,2.5,1.927083,1.958333,1.958333,72156000,,,0.803419,1.297298,,,1.958333,,,,,
1,1997-05-16,1.96875,1.979167,1.708333,1.729167,1.729167,14700000,-0.117021,-0.124454,0.878307,1.158537,,,1.916666,,,1.958333,,
2,1997-05-19,1.760417,1.770833,1.625,1.708333,1.708333,6106800,-0.012049,-0.012122,0.970414,1.089743,,,1.878788,,,1.729167,,-0.117021
3,1997-05-20,1.729167,1.75,1.635417,1.635417,1.635417,5467200,-0.042683,-0.04362,0.945783,1.070063,,,1.834538,,,1.708333,,-0.012049
4,1997-05-21,1.635417,1.645833,1.375,1.427083,1.427083,18853200,-0.127389,-0.136265,0.872611,1.196969,1.691667,,1.760456,,,1.635417,,-0.042683
5,1997-05-22,1.4375,1.447917,1.3125,1.395833,1.395833,11776800,-0.021898,-0.022141,0.971014,1.103175,1.579167,,1.694161,0.054211,,1.427083,1.958333,-0.127389
6,1997-05-23,1.40625,1.520833,1.333333,1.5,1.5,15937200,0.074627,0.071974,1.066667,1.140625,1.533333,,1.658859,0.072276,,1.395833,1.729167,-0.021898
7,1997-05-27,1.510417,1.645833,1.458333,1.583333,1.583333,8697600,0.055555,0.054067,1.048275,1.128571,1.508333,,1.645127,0.081273,,1.5,1.708333,0.074627
8,1997-05-28,1.625,1.635417,1.53125,1.53125,1.53125,4574400,-0.032895,-0.033448,0.942308,1.068027,1.4875,,1.624422,0.080474,,1.583333,1.635417,0.055555
9,1997-05-29,1.541667,1.541667,1.479167,1.505208,1.505208,3472800,-0.017007,-0.017153,0.976351,1.042254,1.503125,1.597396,1.602747,0.04956,,1.53125,1.427083,-0.032895


In [18]:
data[data.isnull().any(axis=1)]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,daily_return,log_return,close_open_ratio,high_low_ratio,simple_avg_5,simple_avg_10,exponential_avg_10,volatility_5,volatility_10,adj_close_lag1,adj_close_lag5,daily_return_lag1
0,1997-05-15,2.4375,2.5,1.927083,1.958333,1.958333,72156000,,,0.803419,1.297298,,,1.958333,,,,,
1,1997-05-16,1.96875,1.979167,1.708333,1.729167,1.729167,14700000,-0.117021,-0.124454,0.878307,1.158537,,,1.916666,,,1.958333,,
2,1997-05-19,1.760417,1.770833,1.625,1.708333,1.708333,6106800,-0.012049,-0.012122,0.970414,1.089743,,,1.878788,,,1.729167,,-0.117021
3,1997-05-20,1.729167,1.75,1.635417,1.635417,1.635417,5467200,-0.042683,-0.04362,0.945783,1.070063,,,1.834538,,,1.708333,,-0.012049
4,1997-05-21,1.635417,1.645833,1.375,1.427083,1.427083,18853200,-0.127389,-0.136265,0.872611,1.196969,1.691667,,1.760456,,,1.635417,,-0.042683
5,1997-05-22,1.4375,1.447917,1.3125,1.395833,1.395833,11776800,-0.021898,-0.022141,0.971014,1.103175,1.579167,,1.694161,0.054211,,1.427083,1.958333,-0.127389
6,1997-05-23,1.40625,1.520833,1.333333,1.5,1.5,15937200,0.074627,0.071974,1.066667,1.140625,1.533333,,1.658859,0.072276,,1.395833,1.729167,-0.021898
7,1997-05-27,1.510417,1.645833,1.458333,1.583333,1.583333,8697600,0.055555,0.054067,1.048275,1.128571,1.508333,,1.645127,0.081273,,1.5,1.708333,0.074627
8,1997-05-28,1.625,1.635417,1.53125,1.53125,1.53125,4574400,-0.032895,-0.033448,0.942308,1.068027,1.4875,,1.624422,0.080474,,1.583333,1.635417,0.055555
9,1997-05-29,1.541667,1.541667,1.479167,1.505208,1.505208,3472800,-0.017007,-0.017153,0.976351,1.042254,1.503125,1.597396,1.602747,0.04956,,1.53125,1.427083,-0.032895


In [19]:
data=data.dropna()

In [20]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,daily_return,log_return,close_open_ratio,high_low_ratio,simple_avg_5,simple_avg_10,exponential_avg_10,volatility_5,volatility_10,adj_close_lag1,adj_close_lag5,daily_return_lag1
10,1997-05-30,1.5,1.510417,1.479167,1.5,1.5,2594400,-0.00346,-0.003466,1.0,1.021127,1.523958,1.551562,1.584065,0.04706,0.063633,1.505208,1.395833,-0.017007
11,1997-06-02,1.510417,1.53125,1.5,1.510417,1.510417,591600,0.006945,0.006921,1.0,1.020833,1.526042,1.529687,1.570675,0.033545,0.055091,1.5,1.5,-0.00346
12,1997-06-03,1.53125,1.53125,1.479167,1.479167,1.479167,1183200,-0.02069,-0.020907,0.965987,1.035211,1.505208,1.506771,1.554037,0.015483,0.055159,1.510417,1.583333,0.006945
13,1997-06-04,1.479167,1.489583,1.395833,1.416667,1.416667,3080400,-0.042254,-0.043172,0.957746,1.067164,1.482292,1.484896,1.529061,0.018665,0.055134,1.479167,1.53125,-0.02069
14,1997-06-05,1.416667,1.541667,1.375,1.541667,1.541667,5672400,0.088235,0.084557,1.088235,1.121212,1.489584,1.496354,1.531353,0.049733,0.046911,1.416667,1.505208,-0.042254


In [30]:

x= data.drop(["Date","Close","Open"], axis=1, inplace=False) 
y=data["Adj Close"]  
#train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler=MinMaxScaler()
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.fit_transform(x_test)                                                 



In [38]:
#reshape the data for LSTM
timesteps=60
def create_sequence(x, y, timesteps):
    x = np.array(x)
    y = np.array(y)
    x_seq, y_seq = [], []
    for i in range(len(x) - timesteps):
        x_seq.append(x[i:i + timesteps])
        y_seq.append(y[i + timesteps])   
    return np.array(x_seq), np.array(y_seq)
x_train_seq,y_train_seq= create_sequence(x_train_scaled,y_train, timesteps)
x_test_seq,y_test_seq= create_sequence(x_test_scaled,y_test, timesteps)

In [39]:
x.shape

(6145, 16)

In [40]:
x_test_seq.shape,x_train_seq.shape

((1169, 60, 16), (4856, 60, 16))

In [41]:
model1_lstm = Sequential()
model1_lstm.add(LSTM(64, return_sequences=True, input_shape=(x_train_seq.shape[1], x_train_seq.shape[2])))
model1_lstm.add(LSTM(64, return_sequences=False,))
model1_lstm.add(Dense(128, activation='relu'))
model1_lstm.add(Dropout(0.5))
model1_lstm.add(Dense(1))
model1_lstm.compile(optimizer='adam', loss='mean_squared_error')

  super().__init__(**kwargs)


In [42]:
#train the model
model1_lstm.fit(x_train_seq, y_train_seq, epochs=50, batch_size=32, validation_split=0.2)
y_pred_lstm = model1_lstm.predict(x_test_seq)


Epoch 1/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - loss: 974802.1875 - val_loss: 862911.6875
Epoch 2/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 67ms/step - loss: 836279.1875 - val_loss: 732287.8750
Epoch 3/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 66ms/step - loss: 765230.1250 - val_loss: 698786.1250
Epoch 4/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 66ms/step - loss: 751664.6250 - val_loss: 695967.0000
Epoch 5/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 67ms/step - loss: 750776.0000 - val_loss: 695503.1875
Epoch 6/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 66ms/step - loss: 754585.3125 - val_loss: 695448.6250
Epoch 7/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 71ms/step - loss: 753509.1250 - val_loss: 695457.5625
Epoch 8/50
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 70