<a href="https://colab.research.google.com/github/adityasengar/LSTM_Stock_Prediction/blob/main/LSTM_stock_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## connect google drive

## import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from datetime import datetime, timedelta

## get data

Download the CSV file with the historical prices used throughout this notebook.

In [None]:
# Download the raw price history; the next cell reads it back as 'data.csv'.
# NOTE(review): the URL has an empty path segment ("transfer.sh//data.csv"), so the upload id
# appears to be missing — confirm the correct link before running.
!wget https://transfer.sh//data.csv ##get data



## read dataset

In [None]:
df = pd.read_csv('data.csv') #data as date, price, open, high, low, vol., change %
df.head()

<h1> <a> data exploration </a> </h1>

In [None]:
# Report the dataset dimensions: rows are samples, columns are features.
row_count, column_count = df.shape
print("number of samples is {}".format(row_count))
print("number of features is {}".format(column_count))

In [None]:
df.columns

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.describe() #describe the data in more detail. freq is the frequence of most common data

In [None]:
df.isnull().sum()

### data types

In [None]:
# Show the Python type of the value stored at index label 0 of every column.
for name, series in df.items():
  print('{} data type is {}'.format(name, type(series[0])))


<h1> <a> data preprocessing </a> </h1>

* <a> drop unnecessary data </a>

In [None]:
# Drop the 'Change %' column. Rebinding `df` with errors='ignore' (instead of inplace=True)
# keeps this cell safe to re-run after the column is already gone.
df = df.drop(columns=['Change %'], errors='ignore')
df.head()

* <a> remove "," from number </a>

In [None]:
# Replace the comma inside the Date strings with '-'. Only the Date column is modified here.
df['Date']   =df['Date'].str.replace(',','-')
# The same kind of comma removal for the price columns is left disabled:
#df['Price']  =df['Price'].str.replace(',','')
#df['Open']   =df['Open'].str.replace(',','')
#df['High']   =df['High'].str.replace(',','')
#df['Low']    =df['Low'].str.replace(',','')

In [None]:
df.head(2)

* <a> change datatype of features: Remove "," from columns, change data type of columns like price, open, high, low, vol. to float </a>

In [None]:
df.columns

In [None]:
df = df.rename(columns={"Vol.": "Vol"})

# Volumes are text with an optional K/M suffix (e.g. '1.2K', '3.4M'). Working on a string view
# keeps the cell re-runnable: already-converted floats carry no suffix and get multiplier 1.
vol_text = df['Vol'].astype(str)
vol_multiplier = (vol_text.str.extract(r'([KM])$', expand=False)
                  .map({'K': 10**3, 'M': 10**6})
                  .fillna(1))
df['Vol'] = vol_text.str.replace(r'[KM]+$', '', regex=True).astype(float) * vol_multiplier
df.head(2)

In [None]:
# Cast the four price columns to float in one call; all other columns keep their dtype.
df = df.astype({name: float for name in ('Price', 'Open', 'High', 'Low')})


In [None]:
df.head(2)

In [None]:
for column in df.columns:
  print('{} data type is {}'.format(column, type(df[column][0])))

In [None]:
df.head(5)

In [None]:
df.head(5)

In [None]:
df.tail(5)

In [None]:
for column in df.columns:
  print('{} data type is {}'.format(column, type(df[column][0])))

In [None]:
import calendar

**The Date column has to be parsed properly (month abbreviations converted, separators removed), so an additional column named `nDate` is created.**

**The month abbreviations in the Date column (Jan, Feb, Mar, ...) are converted to the numbers 01, 02, 03, ...**

In [None]:
# Reverse lookup built from the calendar module: month abbreviation -> month number.
# Empty entries of calendar.month_abbr are filtered out.
month_number = dict(
    (abbr, position)
    for position, abbr in enumerate(calendar.month_abbr)
    if abbr
)

In [None]:
month_number

In [None]:
# Zero-pad single-digit month numbers into two-character strings ('01' .. '09');
# entries whose string form is already longer than one character are left untouched.
month_number.update({
    abbr: '0' + str(number)
    for abbr, number in month_number.items()
    if len(str(number)) == 1
})

In [None]:
month_number

In [None]:
# Preview the month conversion on one row. Month and remainder are taken from the SAME row;
# mixing two rows would show a date that exists in neither.
sample_date = df['Date'][0]
str(month_number[sample_date[:3]]) + '-' + sample_date[3:]

In [None]:
df['nDate']= '_'

In [None]:
# Build nDate for the whole column at once: the 3-letter month abbreviation is replaced by its
# number and the rest of the string is kept. A single vectorised assignment avoids the per-row
# chained indexing (df['nDate'][i] = ...), which pandas flags with SettingWithCopyWarning and
# which does not write through when copy-on-write is enabled.
# An unknown abbreviation still raises KeyError rather than producing a silent NaN.
df['nDate'] = df['Date'].str[:3].map(lambda abbr: str(month_number[abbr])) + df['Date'].str[3:]

In [None]:
df

In [None]:
df['nDate'] = df['nDate'].str.replace('-','')

In [None]:
df['nDate'] = df['nDate'].str.replace(' ','')

In [None]:
df.tail()

In [None]:
df.head()

In [None]:
df['nDate'] = pd.to_datetime(df['nDate'], format='%m%d%Y')

In [None]:
df.head()

In [None]:
df['Date'] = df['nDate'] #replace Date with nDate

In [None]:
df.drop(columns=['nDate'], inplace=True)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Training split: the first 2097 rows of df, all columns.
train_df = df.head(2097)
train_df.shape

In [None]:
# Test split: every row from position 2097 onwards. An explicit copy is taken because the
# 'Date' column of test_df is reassigned later in the notebook; assigning into a bare slice
# of df raises SettingWithCopyWarning.
test_df = df.iloc[2097:, :].copy()
test_df.shape

In [None]:
for column in df.columns:
  print('{} data type is {}'.format(column, type(df[column][0])))

In [None]:
df.isnull().sum()

In [None]:
df['Date'] = pd.to_datetime(df['Date']).dt.date


In [None]:
# Mid price (average of Low and High) over the row index, with every 50th date as tick label.
fig, ax = plt.subplots(figsize=(20, 5))
mid_price = (df['Low'] + df['High']) / 2.0
ax.plot(range(df.shape[0]), mid_price, color='blue')
ax.set_xticks(range(0, df.shape[0], 50))
ax.set_xticklabels(df['Date'].loc[::50], rotation=90)
ax.set_yticks(range(0, 16000, 1000))
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel('Mid Price (high + low)/2', fontsize=18)
ax.grid()
plt.show()

In [None]:
# Price over the row index, with every 50th date as tick label.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(range(df.shape[0]), df['Price'], color='red')
ax.set_xticks(range(0, df.shape[0], 50))
ax.set_xticklabels(df['Date'].loc[::50], rotation=90)
ax.set_yticks(range(0, 16000, 1000))
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel('Price', fontsize=18)
ax.grid()
plt.show()

<a> <h1> Feature Engineering </h1> </a>

In [None]:
# Keep only the column at position 1 as a 2-D array of shape (n_rows, 1).
# NOTE(review): position 1 is presumably 'Price' (column order noted when the CSV is read) — confirm.
training_data = train_df.iloc[: , 1:2].values
training_data

In [None]:
# Same extraction for the test split: column position 1 as a 2-D array of shape (n_rows, 1).
# NOTE(review): position 1 is presumably 'Price' — confirm.
test_data = test_df.iloc[: , 1:2].values
test_data.shape

* <a> scale all values of train and test data between 0 and 1. </a>

In [None]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(training_data)

In [None]:
# Scale the test set with the scaler that was fitted on the training data. Refitting here
# (fit_transform) would put train and test on different scales and overwrite the parameters
# that the later inverse_transform calls rely on.
test_set_scaled = sc.transform(test_data)


## Creating Data with Timesteps


LSTMs expect our data to be in a specific format, usually a 3D array. We start by creating windows of 60 timesteps (the `length` variable below) and converting them into an array using NumPy.

In [None]:
length = 60

In [None]:
# Slide a window of `length` scaled prices over the training series: each window is one input
# sample and the value right after it is its target. The upper bound follows the array itself
# instead of a hard-coded row count, so the cell keeps working if the split point changes.
n_train = len(training_set_scaled)
X_train = np.array([training_set_scaled[i-length:i, 0] for i in range(length, n_train)])
y_train = np.array(training_set_scaled[length:n_train, 0])

# Keras LSTM layers expect (samples, timesteps, features)
X_train = X_train.reshape(-1, length, 1)

In [None]:
# Same windowing for the test series. The bound is the length of the scaled test array rather
# than a hard-coded 400, so no trailing rows are dropped and the number of samples always
# equals len(test_set_scaled) - length.
n_test = len(test_set_scaled)
X_test = np.array([test_set_scaled[i-length:i, 0] for i in range(length, n_test)])
y_test = np.array(test_set_scaled[length:n_test, 0])

# Keras LSTM layers expect (samples, timesteps, features)
X_test = X_test.reshape(-1, length, 1)

In [None]:
print("x_train shape is : ",X_train.shape )
print("y_train shape is : ",y_train.shape )
print("X_test shape is : ",X_test.shape )
print("y_test shape is : ",y_test.shape )

In [None]:
X_train[2]

## train_test_split

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
print("X_train shape is : " ,X_train.shape )
print("X_test shape is : "  , X_test.shape )
print("y_train shape is : " , y_train.shape)
print("y_test shape is : "  ,y_test.shape )

## build lstm using keras
The network is built from the following pieces:
* Sequential model: a plain stack of layers where each layer has exactly one input tensor and one output tensor.
* LSTM layers
* Dropout layers to help prevent overfitting
* Dense layer: a fully connected layer with a single output unit (the predicted value).



In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from keras.layers import LeakyReLU

In [None]:
def build_model(timesteps=None, n_features=1):
  '''
  Build the stacked LSTM regressor (not compiled).

  Architecture: three LSTM(50) layers that return sequences, each followed by Dropout(0.1),
  then LSTM(10) + Dropout(0.2) and a single-unit Dense output with ReLU activation.

  Input:  - timesteps: number of timesteps per input window. When omitted, it is taken from
            the notebook-level X_train (X_train.shape[1]), as before.
          - n_features: number of values per timestep (default 1).

  Return: the keras Sequential model. model.summary() is printed as a side effect.
  '''
  if timesteps is None:
    # backward-compatible default: infer the window length from the training array
    timesteps = X_train.shape[1]

  model = Sequential()

  model.add(LSTM(units = 50, return_sequences = True, activation='tanh', input_shape = (timesteps, n_features)))
  model.add(Dropout(0.1))

  model.add(LSTM(units = 50, return_sequences = True, activation='tanh'))
  model.add(Dropout(0.1))

  model.add(LSTM(units = 50, return_sequences = True, activation='tanh'))
  model.add(Dropout(0.1))

  # last recurrent layer returns only its final state, which feeds the Dense output
  model.add(LSTM(units = 10, activation='tanh'))
  model.add(Dropout(0.2))

  model.add(Dense(units = 1, activation='relu'))

  model.summary()

  return model


In [None]:
model = build_model()

In [None]:
BATCH_SIZE = 5
EPOCHS = 100
LR = 0.01

In [None]:
# The configured optimizer must actually be handed to compile(); the string 'adam' would build
# a fresh Adam with default settings and silently ignore LR and the decay.
# The `decay = LR/EPOCHS` intent (lr / (1 + decay * step)) is expressed as an InverseTimeDecay
# schedule, because the `lr=` / `decay=` constructor arguments are deprecated or rejected by
# current Keras optimizers.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=LR,
    decay_steps=1,
    decay_rate=LR/EPOCHS,
)
optim = Adam(learning_rate=lr_schedule)

# stop once the training loss has not improved for 3 consecutive epochs
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

model.compile(optimizer = optim, loss = 'mean_squared_error')

history = model.fit(    X_train, y_train,
                        batch_size = BATCH_SIZE ,
                        steps_per_epoch=len(X_train)//BATCH_SIZE,
                        validation_data=(X_test, y_test),
                        validation_steps=len(X_test)//BATCH_SIZE,
                        epochs=EPOCHS,
                        callbacks=[callback],
                        verbose=1
          )

In [None]:
model.save('model.h5')


## evaluate model

In [None]:
model.evaluate(X_test, y_test)

In [None]:
print(history.history.keys())

In [None]:

#Loss plot
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()



## predict test data and transform it

In [None]:
X_test.shape

In [None]:
predicted_stock_price = model.predict(X_test)
predicted_stock_price = sc.inverse_transform(predicted_stock_price)


In [None]:
predicted_stock_price = np.array(predicted_stock_price).reshape(-1,1)
predicted_stock_price.shape

In [None]:
y_test = y_test.reshape(-1,1)


In [None]:
y_test.shape

## transform Y_label

In [None]:
y_test = sc.inverse_transform(y_test)

y_test = np.array(y_test).reshape(-1,1)
y_test.shape

In [None]:
predicted_stock_price[:10]

In [None]:
print(np.min(y_test))

## plot predict test with actual



In [None]:
window_size = 100
N = df.shape[0]

In [None]:
str(df['Date'][0])

In [None]:
# assign() returns a new frame, so the Date conversion never writes into a slice of df
# (a direct column assignment on the slice raises SettingWithCopyWarning).
test_df = test_df.assign(Date=pd.to_datetime(test_df['Date']).dt.date)


In [None]:
# Point k of y_test / predicted_stock_price belongs to test row `length + k` (the first `length`
# rows only serve as the input window), so the date labels have to start at row `length` too.
# Taking the x range from len(y_test) also keeps x and y the same size.
n_points = len(y_test)
plot_dates = test_df['Date'].iloc[length:length + n_points]

plt.figure(figsize = (20,5))
plt.plot(range(n_points), y_test, color='blue')
plt.plot(range(n_points), predicted_stock_price, color='red')

plt.xticks(range(0, n_points, length), plot_dates.iloc[::length], rotation=90)
plt.yticks(range(0,16000,1000))
plt.xlabel('Date',fontsize=18)
plt.ylabel('Price',fontsize=18)
plt.legend(['true_value', 'Predicted'], loc='upper right')
plt.grid()
plt.show()

In [None]:
df.Price.min()

## Week Prediction

In [None]:

def weekly_prediction(input_data, days_to_predict):
  '''
  Predict prices for the coming days, feeding every prediction back into the input window.

  Input:  - input_data: DataFrame with 'Price' and 'Date' columns; the last `length` rows are
            used as the initial window (rows are assumed to be in chronological order and
            'Date' values must support `+ timedelta`).
          - days_to_predict: number of future days to predict (at least 1).

  Return: list with one predicted price per day (len == days_to_predict).

  Side effects: draws the forecast and saves it as 'sample.png'.
  Relies on the notebook-level `model`, `sc` (already fitted scaler) and `length`.

  Raises: ValueError if days_to_predict < 1 or input_data has fewer than `length` rows.
  '''
  if days_to_predict < 1:
    raise ValueError('days_to_predict must be at least 1')
  if len(input_data) < length:
    raise ValueError('input_data needs at least {} rows, got {}'.format(length, len(input_data)))

  # the most recent `length` rows form the first model input
  window_df = input_data.tail(length)
  last_known_date = window_df['Date'].iloc[-1]

  # Scale with the already-fitted scaler. Refitting on these few rows would change the scale
  # the model was trained on and overwrite the scaler shared with the rest of the notebook.
  window_prices = window_df['Price'].to_numpy(dtype=float).reshape(-1, 1)
  window_scaled = sc.transform(window_prices).reshape(1, length, 1)

  coming_week_Prices = []
  dates = []
  for day in range(1, days_to_predict + 1):
    next_scaled = model.predict(window_scaled)
    coming_week_Prices.append(sc.inverse_transform(next_scaled)[0][0])
    # calendar days after the last known date; weekends/holidays are not skipped
    dates.append(last_known_date + timedelta(days=day))
    # slide the window: drop the oldest value, append the newest (still scaled) prediction
    window_scaled = np.concatenate(
        [window_scaled[:, 1:, :], next_scaled.reshape(1, 1, 1)], axis=1)

  plt.figure(figsize = (20,10))
  plt.plot(range(days_to_predict),coming_week_Prices, color='red')

  plt.xticks(range(0,days_to_predict,1),dates,rotation=90)
  plt.yticks(range(int(np.min(coming_week_Prices) - 30), int(np.max(coming_week_Prices) + 30), 20))
  plt.xlabel('Date',fontsize=18)
  plt.ylabel('Price',fontsize=18)
  # only the forecast is drawn, so the legend has a single entry
  plt.legend(['Predicted'], loc='upper right')
  plt.grid()
  plt.savefig('sample.png')


  return coming_week_Prices

## Predict the 2 days following 20/2/2021

In [None]:
weekly_prediction(df, 2)