In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import keras.backend as K
from keras.optimizers import Adam
from keras.models import load_model
from keras.layers import LSTM
# Fix NumPy's global random state so NumPy-based randomness repeats across runs.
# NOTE(review): this presumably does not seed the Keras/TensorFlow backend — confirm.
SEED = 7
np.random.seed(SEED)

In [None]:
# Netflix daily prices from Yahoo Finance, 1 Aug 2003 to 28 Aug 2020.
# Rows are ordered by their index label after loading.
# NOTE(review): 'Date' is not parsed here, so it stays a dd-mm-yyyy string column.
df = (
    pd.read_csv("/content/stock_prediction_netflix.csv", header=0)
    .sort_index(axis=0, ascending=True)
)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,01-08-2003,1.851429,1.857143,1.782143,1.782143,1.782143,6339200
1,04-08-2003,1.782143,1.797143,1.735714,1.782143,1.782143,3676400
2,05-08-2003,1.792857,1.817857,1.732857,1.736429,1.736429,3084200
3,06-08-2003,1.742857,1.75,1.633571,1.668571,1.668571,8113000
4,07-08-2003,1.650714,1.667857,1.614286,1.649286,1.649286,7893200


In [None]:
# Display (rows, columns) of the loaded frame.
df.shape

(4299, 7)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,4299.0,4299.0,4299.0,4299.0,4299.0,4299.0
mean,85.137456,86.506299,83.731468,85.187151,85.187151,18163320.0
std,123.252283,125.243608,121.193015,123.33787,123.33787,20033040.0
min,1.3,1.317143,1.272857,1.29,1.29,1493800.0
25%,4.143572,4.22,4.067858,4.142857,4.142857,7197400.0
50%,22.384285,22.858572,21.722857,22.245714,22.245714,11964400.0
75%,105.120003,107.36,102.685001,105.280002,105.280002,21656600.0
max,567.97998,575.369995,521.25,548.72998,548.72998,323414000.0


In [None]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

Date         False
Open         False
High         False
Low          False
Close        False
Adj Close    False
Volume       False
dtype: bool

In [None]:
# Line chart of the closing price against the Date column.
fig = px.line(data_frame=df, x='Date', y='Close')
fig.show()

In [None]:
# Candidate indicators for prediction:
#   ohlc_avg - row-wise mean of Open, High, Low and Close
#   hlc_avg  - row-wise mean of High, Low and Close
#   close    - the raw Close column
# Only ohlc_avg is used for modelling in the rest of the notebook.
# Columns are selected by name (not by position) so a change in column order
# cannot silently alter which prices are averaged.
ohlc_data = df[['Open', 'High', 'Low', 'Close']]
ohlc_avg = ohlc_data.mean(axis=1)
hlc_avg = df[['High', 'Low', 'Close']].mean(axis=1)
close = df['Close']

In [None]:
# Overlay the three candidate indicators on one figure, plotted against the row index.
indicator_traces = [
    go.Scatter(x=df.index, y=values, name=label)
    for label, values in [
        ('OHLC avg', ohlc_avg),
        ('HLC avg', hlc_avg),
        ('close column data', close),
    ]
]
fig1 = go.Figure(data=indicator_traces)
fig1.show()

In [None]:
pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [None]:
# Ensure the output directory for exported figures exists.
# exist_ok=True makes this a single idempotent call with no check-then-create gap.
os.makedirs("images", exist_ok=True)


In [None]:
# Build a two-column frame (Date, ohlc_avg) holding only what the model needs.
# Constructed in one vectorised step: the previous element-by-element chained
# assignment (new_data['Date'][i] = ...) is slow, leaves ohlc_avg as object dtype,
# and does not write through when pandas copy-on-write is enabled.
# to_numpy() drops the source index so the result gets a fresh 0..len(df)-1 index.
new_data = pd.DataFrame({
    'Date': df['Date'].to_numpy(),
    'ohlc_avg': ohlc_avg.to_numpy(),
})

In [None]:
# Preview the first five rows of the modelling frame.
new_data.head()

Unnamed: 0,Date,ohlc_avg
0,01-08-2003,1.818214
1,04-08-2003,1.774286
2,05-08-2003,1.77
3,06-08-2003,1.69875
4,07-08-2003,1.645536


In [None]:
# Use the Date column as the row index and remove it from the columns.
new_data = new_data.set_index('Date')

In [None]:
# Number of rows available for modelling.
print(new_data.shape[0])

4299


In [None]:
# Raw 2-D array view of the modelling frame (one column: ohlc_avg).
ds = new_data.to_numpy()

In [None]:
# Chronological 80/20 split: the first 80% of rows form the training set,
# the remaining rows are held out for testing.
split_at = int(len(new_data) * 0.8)
train = new_data.iloc[:split_at, :]
test = new_data.iloc[split_at:, :]

In [None]:
# Display (rows, columns) of the training split.
train.shape

(3439, 1)

In [None]:
# Prices span several orders of magnitude, so rescale them for the network.
# The scaler is fitted on the training rows only: fitting on the full series would
# leak the test period's min/max into training. Because of that, scaled test values
# may fall outside [0, 1]; the same fitted scaler is used for the inverse transform.
scalar = MinMaxScaler(feature_range=(0, 1))
scalar.fit(train.values)
scaled_data = scalar.transform(ds)

In [None]:
# Build sliding-window training samples from the scaled series:
# each sample is the 60 values preceding position i, and its target is the value at i.
# Windows advance one step at a time and stay within the training rows.
x_train = np.array([scaled_data[i - 60:i, 0] for i in range(60, len(train))])
y_train = np.array([scaled_data[i, 0] for i in range(60, len(train))])

In [None]:
# Display (number of windows, window length) of the training inputs.
x_train.shape

(3379, 60)

In [None]:
# Define and compile the network: two stacked 50-unit LSTM layers with dropout
# in between, followed by a single linear output unit, trained on mean squared error.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    Dropout(0.25),
    LSTM(units=50),
    Dense(1),
    Activation('linear'),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Train the network. The model declares input_shape=(timesteps, 1), so pass the
# windows explicitly as 3-D (samples, timesteps, 1) rather than relying on implicit
# rank adjustment; this matches how x_test is shaped before prediction.
model.fit(x_train.reshape(len(x_train), -1, 1), y_train, epochs=50, batch_size=32, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ff788e99ba0>

In [None]:
# predicting 920 values, using past 60 from the train data
inputs = new_data[len(new_data)-len(test) - 60:].values
inputs = inputs.reshape(-1,1)
inputs = scalar.transform(inputs)

In [None]:
# Display (rows, columns) of the scaled test-period inputs.
inputs.shape

(920, 1)

In [None]:
# One 60-step window per test row: the 60 scaled values preceding position i.
x_test = np.array([inputs[i - 60:i, 0] for i in range(60, inputs.shape[0])])

In [None]:
# Add a trailing feature axis so the windows are (samples, timesteps, 1).
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

In [None]:
# Predict in scaled space, then map the predictions back to the original price scale.
scaled_predictions = model.predict(x_test)
predicted_price = scalar.inverse_transform(scaled_predictions)



In [None]:
# Root-mean-square error between actual and predicted ohlc_avg on the test rows.
# Both sides are flattened to 1-D float arrays so the result is a plain scalar
# (no DataFrame.mean FutureWarning), and the actual column is selected by name so
# the cell still works after other columns are added to `test`.
actual_price = test['ohlc_avg'].to_numpy(dtype=float)
rms = float(np.sqrt(np.mean((actual_price - predicted_price.ravel()) ** 2)))
rms


In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'



ohlc_avg    14.587015
dtype: float64

In [None]:
# Attach the predictions as a new column. `test` is a slice of new_data, so assigning
# into it directly raises SettingWithCopyWarning; assign() returns a new frame instead.
# ravel() flattens the (n, 1) prediction array to one value per row.
test = test.assign(Prediction=predicted_price.ravel())
test.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,ohlc_avg,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
30-03-2017,147.422497,148.298859
31-03-2017,147.842499,149.292694
03-04-2017,146.842499,149.512726
04-04-2017,146.079998,148.479065
05-04-2017,144.829998,147.777405


In [None]:
# Compare the model's predictions with the actual values.
# The predicted series is labelled 'test_prediction' (it was previously labelled
# 'test', which read as if it were the actual test data).
fig2 = go.Figure(data=[
    go.Scatter(x=train.index, y=train.ohlc_avg, name='train'),
    go.Scatter(x=test.index, y=test.ohlc_avg, name='test_ohlc_avg'),
    go.Scatter(x=test.index, y=test.Prediction, name='test_prediction'),
])
fig2.update_layout(
    title='OHLC average: actual vs. predicted',
    xaxis_title='Date',
    yaxis_title='OHLC average',
)
fig2.show()

In [None]:
pip install -U kaleido



In [None]:
# Line chart of the closing price against the Date column.
fig3 = px.line(data_frame=df, x='Date', y='Close')
fig3.show()