In [None]:
# Stock market prediction: helps you estimate the future value of a company's stock and other financial instruments traded on an exchange
# Linear regression: a supervised learning algorithm used to predict the outcome of a continuous variable
    # Y = mx + c
    # Regression line: the best-fit line, i.e. the line with the least squared distance between the original data points and the predicted values.

### Stock market prediction: 5 YEAR S&P and NASDAQ 100 - Using Linear Regression

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import chart_studio.plotly as py
import plotly.graph_objs as go

In [None]:
# Read the stock price history from the CSV file into a pandas DataFrame
snp = pd.read_csv(filepath_or_buffer='ES_5Years_8_11_2024.csv')
# Preview the first five rows as a quick sanity check of the load
snp.head(n=5)

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes
snp.info()

In [None]:
# Parse the 'Time' column into pandas datetime values
# (the column is still called 'Time' here; it is renamed to 'Date' further down)
snp['Time'] = pd.to_datetime(snp['Time'])

In [None]:
# Remove the time-of-day part, keeping only the calendar date.
# .dt.normalize() resets every timestamp to midnight and keeps the column as a
# vectorised datetime64 dtype, whereas .dt.date would turn it into an object
# column of Python `date` objects.
# tz_localize(None) first drops any timezone (a no-op for timezone-naive data),
# so the resulting dates are always timezone-naive local dates.
snp['Time'] = snp['Time'].dt.tz_localize(None).dt.normalize()

In [None]:
# Give the 'Time' column its final name, 'Date' (all other columns are left as they are)
snp = snp.rename(columns={'Time': 'Date'})

In [None]:
# Inspect the DataFrame summary to confirm that the column has been renamed to 'Date'
snp.info()

In [None]:
# Make sure the 'Date' column is stored with a pandas datetime dtype
snp['Date'] = pd.to_datetime(snp['Date'])

In [None]:
# Confirm that 'Date' now has a datetime dtype and look at the first rows.
# info() prints its summary directly, so it can run first; head() is only rendered
# when it is the last expression of the cell, so it has to come last for both
# outputs to be shown.
snp.info()
snp.head()

In [None]:
# Report the date range covered by the data and its length in days.
# min/max are computed once and reused by both messages; the dates are formatted
# as YYYY-MM-DD and separated by "and" so the sentence reads correctly.
first_day, last_day = snp['Date'].min(), snp['Date'].max()
print(f'DataFrame contains stock prices between {first_day:%Y-%m-%d} and {last_day:%Y-%m-%d}')
print(f'Total Days = {(last_day - first_day).days} days')

In [None]:
# Summary statistics of the DataFrame's columns via describe()
snp.describe()

In [None]:
# Box plot of the four price columns, used to check for outliers
price_columns = ['Open', 'High', 'Low', 'Close']
snp[price_columns].plot.box()

In [None]:
# Set up the plot layout with go.Layout().
# The layout object defines the overall structure, appearance and labels of the plot,
# including the titles of the plot itself and of both axes.
# Axis titles use the nested title=dict(text=..., font=...) form: the flat `titlefont`
# attribute is deprecated and is no longer accepted by plotly 6.
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

# NOTE(review): the plot title mentions NASDAQ100, but only the `snp` Close series is
# plotted in this cell — confirm whether a second trace was intended.
layout = go.Layout(
    title='Stock Prices of S&P500 and NASDAQ100',
    xaxis=dict(
        title=dict(text='Date', font=axis_title_font)
    ),
    yaxis=dict(
        title=dict(text='Price', font=axis_title_font),
        dtick=500  # spacing between y-axis ticks
    )
)

# Single trace: closing price against date
snp_data = [{'x': snp['Date'], 'y': snp['Close']}]
plot = go.Figure(data=snp_data, layout=layout)
plot

In [None]:
# Building the regression model
from sklearn.model_selection import train_test_split

# For preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# For model evaluation
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
# Build the feature matrix and the target vector, then hold out 30% of the rows for testing.
# NOTE(review): train_test_split shuffles rows by default, so test rows end up interleaved
# in time with training rows — confirm this is acceptable for time-series price data.
feature_columns = ['Open', 'High', 'Low', 'Volume']
X = snp[feature_columns].values  # 2-D numpy array, one column per feature
Y = snp['Close'].values          # target: the 'Close' price

# random_state is fixed so that the same split is produced on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=101
)

# Show the shapes of the four resulting arrays
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

In [None]:
# Feature scaling: fit a StandardScaler on the training features only.
# NOTE(review): the fitted scaler is not applied anywhere in the cells visible here
# (the model is trained on the unscaled X_train) — confirm whether
# scaler.transform(...) was meant to be used.
scaler = StandardScaler()
scaler = scaler.fit(X_train)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create the linear regression model and fit it on the training split
lm = LinearRegression().fit(X_train, Y_train)
# Show the fitted estimator as the cell output
lm

In [None]:
# Make predictions on the test set
Y_test_pred = lm.predict(X_test)
# Preview only the first few predictions instead of dumping the whole array
Y_test_pred[:10]

In [None]:
# Make predictions on the training set (used by the actual-vs-predicted plot further down)
Y_train_pred = lm.predict(X_train)

In [None]:
# Actual vs predicted prices on the training split, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(Y_train, Y_train_pred, alpha=0.5, label='Data Points')

# Degree-1 least-squares fit of the predicted values on the actual values (y = m*x + b)
m, b = np.polyfit(Y_train, Y_train_pred, deg=1)
ax.plot(Y_train, m * Y_train + b, color='red', label='Regression Line')

# Axis labels, title and legend
ax.set_xlabel('Actual Prices (Train)')
ax.set_ylabel('Predicted Prices (Train)')
ax.set_title('Actual vs Predicted Stock Prices (Train Dataset)')
ax.legend()
plt.show()

In [None]:
# Evaluate the model on the test split

# Predictions for the test rows
Y_test_pred = lm.predict(X_test)

# Compute all four scores first ...
mse_value = mse(Y_test, Y_test_pred)                  # Mean Squared Error
rmse_value = np.sqrt(mse_value)                       # Root Mean Squared Error = sqrt(MSE)
mae_value = mean_absolute_error(Y_test, Y_test_pred)  # Mean Absolute Error
r2_value = r2_score(Y_test, Y_test_pred)              # R-squared

# ... then report them in a single pass, one "<name>: <value>" line per metric
for label, value in (
    ('Mean Squared Error (MSE)', mse_value),
    ('Root Mean Squared Error (RMSE)', rmse_value),
    ('Mean Absolute Error (MAE)', mae_value),
    ('R-squared (R²)', r2_value),
):
    print(f'{label}: {value}')