<a href="https://colab.research.google.com/github/abhinandankatoch/Stock-Market-Prediction/blob/master/Stock_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Stock Market Prediction Model using TensorFlow**

## FETCHING DATA

In [1]:
!pip install yfinance

Collecting yfinance
  Downloading yfinance-0.1.67-py2.py3-none-any.whl (25 kB)
Collecting lxml>=4.5.1
  Downloading lxml-4.6.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 7.2 MB/s 
Installing collected packages: lxml, yfinance
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfully installed lxml-4.6.4 yfinance-0.1.67


In [2]:
#Importing libraries
import yfinance as yf
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
#Downloading the dataset
data = yf.download("GOOGL" , start = "2018-01-01" , interval = '1d')

[*********************100%***********************]  1 of 1 completed


In [4]:
data.shape

(989, 6)

In [19]:
data.head()

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,1073.209961,1588300
2018-01-03,1091.52002,1565900
2018-01-04,1095.76001,1302600
2018-01-05,1110.290039,1512500
2018-01-08,1114.209961,1232200


In [6]:
#Indexing the data
data.sort_index(inplace = True)

In [7]:
#Removing any duplicate index
data = data.loc[~data.index.duplicated(keep='first')]

In [18]:
data.tail()

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-11-29,2910.610107,1629800
2021-11-30,2837.949951,2103400
2021-12-01,2821.030029,1701000
2021-12-02,2859.320068,1459700
2021-12-03,2840.030029,2060800


In [10]:
#Checking for missing value
data.isnull().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 989 entries, 2018-01-02 to 2021-12-03
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       989 non-null    float64
 1   High       989 non-null    float64
 2   Low        989 non-null    float64
 3   Close      989 non-null    float64
 4   Adj Close  989 non-null    float64
 5   Volume     989 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 54.1 KB


In [13]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,989.0,989.0,989.0,989.0,989.0,989.0
mean,1542.927482,1558.544123,1527.537111,1543.733215,1543.733215,1778496.0
std,558.887048,562.31925,555.009829,558.943359,558.943359,833034.5
min,984.320007,1012.119995,977.659973,984.669983,984.669983,465600.0
25%,1141.77002,1151.579956,1130.199951,1143.5,1143.5,1239800.0
50%,1275.0,1290.0,1263.0,1277.420044,1277.420044,1567200.0
75%,1759.859985,1769.109985,1736.680054,1757.189941,1757.189941,2046200.0
max,2999.51001,3019.330078,2977.97998,2996.77002,2996.77002,6658900.0


In [14]:
import plotly.graph_objects as go

#Check the trend in closing values
fig = go.Figure()
fig.add_trace(go.Scatter(x = data.index , y = data['Close'] , mode = 'lines'))
fig.update_layout(height = 500 , width = 900, xaxis_title='Date' , yaxis_title='Close')
fig.show()

In [15]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = data.index , y = data['Volume'] , mode = 'lines'))
fig.update_layout(height = 500 , width = 900, xaxis_title='Date' , yaxis_title='Volume')
fig.show()

## PROCESSING DATA

In [16]:
#Importing libraries
from sklearn.preprocessing import MinMaxScaler 
import pickle 
from tqdm.notebook import tnrange

In [17]:
#Filtering required data
data = data[['Close' , 'Volume']]
data.head()

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,1073.209961,1588300
2018-01-03,1091.52002,1565900
2018-01-04,1095.76001,1302600
2018-01-05,1110.290039,1512500
2018-01-08,1114.209961,1232200


In [20]:
#Confirm the testing set length
test_length = data[(data.index >= '2020-09-01')].shape[0]

In [21]:
def CreateFeatures_and_Targets(data, feature_length):
    X = []
    Y = []

    for i in tnrange(len(data) - feature_length): 
        X.append(data.iloc[i : i + feature_length,:].values)
        Y.append(data["Close"].values[i+feature_length])

    X = np.array(X)
    Y = np.array(Y)

    return X , Y

In [22]:
X , Y = CreateFeatures_and_Targets(data , 32)

  0%|          | 0/957 [00:00<?, ?it/s]

In [23]:
#Check the shapes
X.shape , Y.shape

((957, 32, 2), (957,))

In [24]:
Xtrain , Xtest , Ytrain , Ytest = X[:-test_length] , X[-test_length:] , Y[:-test_length] , Y[-test_length:]

In [25]:
Xtrain.shape , Ytrain.shape

((639, 32, 2), (639,))

In [26]:
Xtest.shape , Ytest.shape

((318, 32, 2), (318,))

In [27]:
#Scalers to scale Vectors with Multiple Dimensions
class MultiDimensionScaler():
    def __init__(self):
        self.scalers = []

    def fit_transform(self , X):
        total_dims = X.shape[2]
        for i in range(total_dims):
            Scaler = MinMaxScaler()
            X[:, :, i] = Scaler.fit_transform(X[:, :, i])
            self.scalers.append(Scaler)
        return X

    def transform(self , X):
        for i in range(X.shape[2]):
            X[:, :, i] = self.scalers[i].transform(X[:,:,i])
        return X 

In [28]:
Feature_Scaler = MultiDimensionScaler()
Xtrain = Feature_Scaler.fit_transform(Xtrain)
Xtest = Feature_Scaler.transform(Xtest)

In [29]:
Target_Scaler = MinMaxScaler()
Ytrain = Target_Scaler.fit_transform(Ytrain.reshape(-1,1))
Ytest = Target_Scaler.transform(Ytest.reshape(-1,1))

In [31]:
def save_object(obj , name : str):
    pickle_out = open(f"{name}.pck","wb")
    pickle.dump(obj, pickle_out)
    pickle_out.close()

def load_object(name : str):
    pickle_in = open(f"{name}.pck","rb")
    data = pickle.load(pickle_in)
    return data

In [32]:
#Saving Objects

save_object(Feature_Scaler , "Feature_Scaler")
save_object(Target_Scaler , "Target_Scaler")

## Model

In [34]:
from tensorflow.keras.callbacks import ModelCheckpoint , ReduceLROnPlateau

save_best = ModelCheckpoint("best_weights.h5", monitor='val_loss', save_best_only=True, save_weights_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.25, patience=4, min_lr=0.00001, verbose = 1)

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional

model = Sequential()

model.add(Bidirectional(LSTM(512 ,return_sequences = True , recurrent_dropout = 0.1, input_shape = (32, 2))))
model.add(LSTM(256,recurrent_dropout = 0.1))
model.add(Dropout(0.3))
model.add(Dense(64, activation = 'elu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation = 'elu'))
model.add(Dense(1, activation = 'linear'))



In [36]:
optimizer = tf.keras.optimizers.SGD(learning_rate = 0.002)
model.compile(loss='mse', optimizer=optimizer)

In [37]:
history = model.fit(Xtrain, Ytrain,
            epochs = 10,
            batch_size = 1,
            verbose = 1,
            shuffle = False ,
            validation_data = (Xtest , Ytest),
            callbacks = [reduce_lr , save_best])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (1, 32, 1024)            2109440   
 l)                                                              
                                                                 
 lstm_1 (LSTM)               (1, 256)                  1311744   
                                                                 
 dropout (Dropout)           (1, 256)                  0         
                                                                 
 dense (Dense)               (1, 64)                   16448     
                                                                 
 dropout_1 (Dropout)         (1, 64)                   0         
                                                                 
 dense_1 (Dense)             (1, 32)                   2080      
                                                        

In [39]:
model.load_weights("best_weights.h5")

##VISUALIZING PREDICTIONS

In [40]:
Predictions = model.predict(Xtest)

In [41]:
Predictions = Target_Scaler.inverse_transform(Predictions)
Actual = Target_Scaler.inverse_transform(Ytest)

In [42]:
Predictions = np.squeeze(Predictions , axis = 1)
Actual = np.squeeze(Actual , axis = 1)

In [44]:
#Sample Data Frame
test_dataframe_dict = {'Actual' : list(Actual) , 'Predicted' : list(Predictions)}
test_df = pd.DataFrame.from_dict(test_dataframe_dict)
test_df.index = data.index[-test_length:]
test_df.head()

Unnamed: 0_level_0,Actual,Predicted
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-09-01,1655.079956,1622.489868
2020-09-02,1717.390015,1629.148682
2020-09-03,1629.51001,1635.114136
2020-09-04,1581.209961,1633.932617
2020-09-08,1523.599976,1630.109741


In [45]:
fig = go.Figure()

fig.add_trace(go.Scatter(x = test_df.index , y = Actual , mode = 'lines' , name='Actual'))
fig.add_trace(go.Scatter(x = test_df.index , y = Predictions , mode = 'lines' , name='Predicted'))
fig.show()

## VISUALIZING PREDICTION FOR WHOLE DATA

In [46]:
Total_features = np.concatenate((Xtrain , Xtest) , axis = 0)

In [47]:
Total_Targets = np.concatenate((Ytrain , Ytest) , axis = 0)

In [48]:
Predictions = model.predict(Total_features)

In [49]:
Predictions = Target_Scaler.inverse_transform(Predictions)
Actual = Target_Scaler.inverse_transform(Total_Targets)

In [50]:
Predictions = np.squeeze(Predictions , axis = 1)
Actual = np.squeeze(Actual , axis = 1)

In [51]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = data.index , y = Actual , mode = 'lines' , name='Actual'))
fig.add_trace(go.Scatter(x = data.index , y = Predictions , mode = 'lines' , name='Predicted'))
fig.show()