# LSTM Time Series Forecasting for NVIDIA Stock Prices

## Data Extraction

In [None]:
pip install yfinance



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM, Dense
from datetime import datetime, timedelta
import math

import yfinance as yf

In [None]:
# For the Exploratory Data Analysis (EDA), we will focus on Apple and Nvidia stocks.
stocks_list = ['AAPL', 'NVDA']

# Two-year look-back window ending now. When "today" is Feb 29 the same
# calendar day does not exist two years earlier, so fall back to Feb 28
# instead of letting datetime() raise ValueError.
end = datetime.now()
try:
    start = datetime(end.year - 2, end.month, end.day)
except ValueError:
    start = datetime(end.year - 2, end.month, 28)

# Download each ticker into an explicit mapping rather than injecting
# variables through globals().
price_frames = {}
for stock in stocks_list:
    frame = yf.download(stock, start, end)
    # Recent yfinance releases label columns with a (Price, Ticker) MultiIndex.
    # Keeping the ticker level makes the per-company frames share no columns,
    # so the concat below would produce NaN-padded, side-by-side blocks.
    # Keep only the price level ('Close', 'Open', ...) so the frames line up.
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = frame.columns.get_level_values(0)
    price_frames[stock] = frame

# The per-ticker names are still used by later cells.
AAPL = price_frames['AAPL']
NVDA = price_frames['NVDA']

company_list = [AAPL, NVDA]
company_name = ["Apple", "NVIDIA"]

# Tag every row with its company so the stacked frame stays distinguishable.
for company, com_name in zip(company_list, company_name):
    company["company_name"] = com_name

# Stack both companies row-wise and preview the most recent rows.
df = pd.concat(company_list, axis=0)
df.tail(10)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Price,Adj Close,Close,High,Low,Open,Volume,company_name,Adj Close,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 7_level_1,NVDA,NVDA,NVDA,NVDA,NVDA,NVDA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
2024-10-15 00:00:00+00:00,,,,,,,NVIDA,131.600006,131.600006,138.570007,128.740005,137.869995,377831000.0
2024-10-16 00:00:00+00:00,,,,,,,NVIDA,135.720001,135.720001,136.619995,131.580002,133.979996,264879700.0
2024-10-17 00:00:00+00:00,,,,,,,NVIDA,136.929993,136.929993,140.889999,136.869995,139.339996,306435900.0
2024-10-18 00:00:00+00:00,,,,,,,NVIDA,138.0,138.0,138.899994,137.279999,138.669998,176090200.0
2024-10-21 00:00:00+00:00,,,,,,,NVIDA,143.710007,143.710007,143.710007,138.0,138.130005,264554500.0
2024-10-22 00:00:00+00:00,,,,,,,NVIDA,143.589996,143.589996,144.419998,141.779999,142.910004,226311600.0
2024-10-23 00:00:00+00:00,,,,,,,NVIDA,139.559998,139.559998,142.429993,137.460007,142.029999,285930000.0
2024-10-24 00:00:00+00:00,,,,,,,NVIDA,140.410004,140.410004,141.350006,138.460007,140.820007,172354900.0
2024-10-25 00:00:00+00:00,,,,,,,NVIDA,141.539993,141.539993,144.130005,140.800003,140.929993,205122100.0
2024-10-28 00:00:00+00:00,,,,,,,NVIDA,140.520004,140.520004,143.139999,140.050003,143.0,173031000.0


## Data Pre-Processing

In [None]:
# Keep only the 'Close' column of the NVIDIA frame, and expose the same
# values as a plain NumPy array for the scaling and windowing steps.
nvidia_data = NVDA.filter(items=['Close'])
data = nvidia_data.to_numpy()

In [None]:
# Standardize the data (standardization is less sensitive to outliers than
# min-max normalization).
# The mean and standard deviation are estimated from the training portion
# only -- the first 80% of rows, the same boundary the train/test split
# uses -- so statistics of the held-out prices do not leak into
# preprocessing. The whole series is then transformed with those statistics.
scaler = StandardScaler()
scaler.fit(data[:int(np.ceil(len(data) * .8))])
scaled_data = scaler.transform(data)

In [None]:
# The first 80% of the scaled series (rounded up) is the training set.
training_data_len = int(np.ceil(len(scaled_data) * .8))
train_data = scaled_data[:training_data_len, :]

# Each sample is the 60 values preceding position `stop`;
# its target is the value at `stop` itself.
window_size = 60
window_stops = range(window_size, len(train_data))
x_train = np.array([train_data[stop - window_size:stop, 0] for stop in window_stops])
y_train = np.array([train_data[stop, 0] for stop in window_stops])

# Append a trailing feature axis of size 1: (samples, timesteps, 1).
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_train.shape

## Long Short-Term Memory (LSTM)

In [None]:
# Three stacked LSTM layers followed by a single-unit dense output.
# The first two LSTMs return full sequences so the next LSTM receives
# one vector per timestep; the last one returns only its final state.
model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)),
    LSTM(units=50, return_sequences=True),
    LSTM(units=25),
    Dense(units=1),
])
model.summary()

In [None]:
# Configure the model with the Adam optimizer and a mean-squared-error
# loss, then fit it for 20 epochs using batches of a single sample.
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=1,
)

In [None]:
# The test slice starts 60 rows before the split point so that the first
# test window has a full 60-step history.
test_data = scaled_data[training_data_len - 60:, :]

# Ground truth comes from the unscaled series, from the split point onward.
y_test = data[training_data_len:, :]

# One 60-step input window per test target.
x_test = [test_data[stop - 60:stop, 0] for stop in range(60, len(test_data))]

In [None]:
# Stack the windows into one array and append a trailing feature axis
# of size 1, giving (samples, timesteps, 1).
x_test = np.array(x_test)
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))

In [None]:
# Predict in scaled space, then map the outputs back through the scaler's
# inverse transform so they are comparable with the unscaled y_test.
predictions = scaler.inverse_transform(model.predict(x_test))

In [None]:
# Root Mean Squared Error (RMSE) between the actual test values and the
# inverse-transformed predictions.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

In [None]:
# Plot the predicted stock prices against actual prices
dataset = nvidia_data.filter(['Close'])
train = dataset[:training_data_len]
# Take an explicit copy: `test` gets a new column below, and assigning into
# a slice of `dataset` is ambiguous in pandas (SettingWithCopyWarning).
test = dataset[training_data_len:].copy()
test['Predictions'] = predictions

# Visualize the data: training prices, then actual vs. predicted test prices
plt.figure(figsize=(16,6))
plt.title('Model')
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.plot(train['Close'])
plt.plot(test[['Close', 'Predictions']])
plt.legend(['Train', 'Test', 'Predictions'], loc='lower right')
plt.show()

In [None]:
# Show the test-period closing prices next to the model's predictions
# (the 'Close' and 'Predictions' columns of `test`)
test

In [None]:
# Check how closely the LSTM predictions track the actual prices.
# Perfect predictions would place every point on a straight line of slope 1.

actual_values = y_test
predicted_values = predictions

# New figure; the axes occupy the first slot of a 2x2 grid.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(2, 2, 1)

# Predicted values on x, actual values on y
ax.scatter(predicted_values, actual_values, alpha=0.7)
ax.set_title('Predicted vs. Actual Values')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Actual Values')

In [None]:
# Inspect the prediction errors against the predicted values.
# Residuals scattered randomly around y = 0 suggest the model captured the
# structure of the data; visible patterns or trends suggest it did not.

# Residual = actual - predicted
residuals = actual_values - predicted_values

# New figure; the axes occupy the second slot of a 2x2 grid.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(2, 2, 2)

# Residuals against predictions, with a dashed zero-error reference line
ax.scatter(predicted_values, residuals, alpha=0.7)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_title('Residuals vs. Predicted Values')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')

In [None]:
# New figure; the axes occupy the third slot of a 2x2 grid.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(2, 2, 3)

# Distribution of the residuals across 20 bins
ax.hist(residuals, bins=20, edgecolor='black')
ax.set_title('Residuals Histogram')
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')