In [None]:
import os
import pandas as pd
import numpy as np
import requests
import json
import yfinance as yf
import seaborn as sns
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Load variables from a .env file so that the os.getenv() lookups in the
# following cells can find the API keys.
load_dotenv()

In [None]:
# Read the OpenAI API key from the environment and fail fast when it is
# unusable. An empty or whitespace-only value is rejected as well, because it
# would otherwise pass this check and only fail later at request time.
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None or not api_key.strip():
    raise ValueError("The environment variable 'OPENAI_API_KEY' is not set or is empty.")

In [None]:
# Read the Alpha Vantage API key from the environment and fail fast when it is
# unusable. An empty or whitespace-only value is rejected as well, because it
# would otherwise be sent to the API as a blank key.
van_api_key = os.getenv("VANTAGE_API_KEY")
if van_api_key is None or not van_api_key.strip():
    raise ValueError("The environment variable 'VANTAGE_API_KEY' is not set or is empty.")

In [None]:
# Stock symbol for S&P 500
symbol = "SPY"

# API URL for the full daily price history of `symbol`
stock_url = f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={symbol}&outputsize=full&apikey={van_api_key}"

# Fetch stock data. A timeout keeps the cell from hanging forever, and
# transport errors are re-raised without the original message because that
# message can contain the request URL, which embeds the API key.
try:
    response = requests.get(stock_url, timeout=30)
except requests.RequestException as exc:
    raise RuntimeError(
        f"Could not reach Alpha Vantage for {symbol}: {type(exc).__name__}"
    ) from None

if not response.ok:
    raise RuntimeError(
        f"Alpha Vantage request for {symbol} failed with HTTP status {response.status_code}."
    )

stock_data = response.json()

# A JSON body without the daily series means the request did not return price
# data. Surface whatever the payload says, with the key redacted in case the
# message echoes it back.
if "Time Series (Daily)" not in stock_data:
    api_message = " ".join(str(value) for value in stock_data.values())
    raise RuntimeError(
        f"Alpha Vantage returned no daily series for {symbol}: "
        + api_message.replace(van_api_key, "<redacted>")
    )

# Show only the response metadata; the full payload holds the whole history.
print(stock_data.get("Meta Data"))

In [None]:
# Extract time series data, with a readable error when the response does not
# contain it (the bare lookup would only report the missing key).
if "Time Series (Daily)" not in stock_data:
    raise KeyError(
        "'Time Series (Daily)' is missing from the API response; "
        f"received keys: {list(stock_data)}"
    )
time_series = stock_data["Time Series (Daily)"]

# Print the record count and a short preview instead of the entire history
print(f"{len(time_series)} daily records")
print(dict(list(time_series.items())[:3]))

In [None]:
# Convert to DataFrame: orient="index" turns each key of time_series into a
# row label and each inner field into a column.
df = pd.DataFrame.from_dict(time_series, orient="index")
df

In [None]:
# Rename columns for readability.
# NOTE(review): the assignment is positional, so it assumes the frame has
# exactly five columns in open/high/low/close/volume order -- confirm against
# the API payload.
df.columns = ["open", "high", "low", "close", "volume"]
df

In [None]:
# Inspect column dtypes before the conversion below
df.info()

In [None]:
# Convert every column to float
df = df.astype(float)

In [None]:
# Re-inspect dtypes to confirm the conversion
df.info()

In [None]:
# Reset index to make the date labels an ordinary column
df = df.reset_index()

In [None]:
# Rename the column produced by reset_index() from "index" to "date"
df = df.rename(columns={"index": "date"})
df

In [None]:
# Parse the "date" column into datetime values
df["date"] = pd.to_datetime(df["date"])

# Display the DataFrame
df

In [None]:
# Column dtypes and non-null counts after the date conversion
df.info()

In [None]:
# Count of missing values per column
df.isnull().sum()

In [None]:
# Summary statistics of the columns
df.describe()

In [None]:
# Pairwise correlation between the price and volume columns
corr_matrix = df[['open', 'high', 'low', 'close', 'volume']].corr()
corr_matrix

In [None]:
# Visualize the correlation matrix as a heatmap with the coefficient
# written in each cell (annot=True)
plt.figure(figsize=(5,5))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Line chart of the closing price against the date.
# The title is built from `symbol`, so it follows the ticker that was fetched.
plt.figure(figsize=(10,10))
plt.title(f'Closing price history of {symbol}')
plt.plot(df['date'], df['close'])
plt.xlabel('Date')
plt.ylabel('Closing price (USD)')
plt.show()

In [None]:
# Preview the first rows of the prepared frame
df.head()

In [None]:
# Build the model inputs: features are the open, high, low and volume values
# of a row, and the target is the close value of the same row.
# NOTE(review): the features come from the same day as the target, so this is
# not a forecast of a future close -- confirm that is the intent.
X = df[['open', 'high', 'low', 'volume']].values
y = df['close'].values

In [None]:
# Split into 80% train / 20% test with a fixed seed for repeatability.
# NOTE(review): train_test_split shuffles rows by default, so the test rows
# are presumably interleaved in time with the training rows -- confirm this
# is acceptable for time-series data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Inspect the training features before scaling
X_train

In [None]:
# Feature scaling with StandardScaler (mean of 0 and standard deviation of 1).
# The scaler is fitted on the training rows only, and the same fitted
# transform is then applied to both splits.
# Note: X_train and X_test are overwritten with their scaled versions, so the
# cells below receive standardized features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Multiple linear regression fitted on the training split
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [None]:
# Predict the closing price for the test split
y_pred_mlr = lin_reg.predict(X_test)

In [None]:
# Show actual and predicted values side by side
pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})

In [None]:
# Model evaluation: LinearRegression
# R2: higher is better, 1.0 is a perfect fit.
# MAE, MSE and RMSE are error measures: lower is better, 0 is a perfect fit.
# RMSE is the square root of the MSE computed just above it.
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(meanSqErr)
meanABerr = metrics.mean_absolute_error(y_test, y_pred_mlr)
r2 = metrics.r2_score(y_test, y_pred_mlr)

for metric_label, metric_value in (
    ('R2:', r2),
    ('Mean Absolute Error:', meanABerr),
    ('Mean Square Error:', meanSqErr),
    ('Root Mean Square Error:', rootMeanSqErr),
):
    print(metric_label, metric_value)

In [None]:
# Support vector regression
# Uses a linear kernel (kernel='linear'), not a radial basis function kernel
svr = SVR(kernel = 'linear')
svr.fit(X_train, y_train)

In [None]:
# Predict the closing price for the test split
y_pred_svr = svr.predict(X_test)

In [None]:
# Show actual and predicted values side by side
pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_svr})

In [None]:
# Model evaluation: SVR
# R2: higher is better, 1.0 is a perfect fit.
# MAE, MSE and RMSE are error measures: lower is better, 0 is a perfect fit.
# RMSE is the square root of the MSE computed just above it.
meanSqErr = metrics.mean_squared_error(y_test, y_pred_svr)
rootMeanSqErr = np.sqrt(meanSqErr)
meanABerr = metrics.mean_absolute_error(y_test, y_pred_svr)
r2 = metrics.r2_score(y_test, y_pred_svr)

for metric_label, metric_value in (
    ('R2:', r2),
    ('Mean Absolute Error:', meanABerr),
    ('Mean Square Error:', meanSqErr),
    ('Root Mean Square Error:', rootMeanSqErr),
):
    print(metric_label, metric_value)

In [None]:
# Random forest regression with 10 trees.
# random_state is fixed so that re-running the notebook reproduces the same
# forest; it matches the seed used for the random forest in the scaler/model
# comparison further down.
reg = RandomForestRegressor(n_estimators=10, random_state=42)
reg.fit(X_train, y_train)

In [None]:
# Predict the closing price for the test split
y_pred_rf = reg.predict(X_test)

In [None]:
# Show actual and predicted values side by side
pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_rf })

In [None]:
# Model evaluation: RandomForestRegressor
# R2: higher is better, 1.0 is a perfect fit.
# MAE, MSE and RMSE are error measures: lower is better, 0 is a perfect fit.
# RMSE is the square root of the MSE computed just above it.
meanSqErr = metrics.mean_squared_error(y_test, y_pred_rf)
rootMeanSqErr = np.sqrt(meanSqErr)
meanABerr = metrics.mean_absolute_error(y_test, y_pred_rf)
r2 = metrics.r2_score(y_test, y_pred_rf)

for metric_label, metric_value in (
    ('R2:', r2),
    ('Mean Absolute Error:', meanABerr),
    ('Mean Square Error:', meanSqErr),
    ('Root Mean Square Error:', rootMeanSqErr),
):
    print(metric_label, metric_value)

In [None]:


# Compare every (scaler, model) combination on the same 80/20 split and
# report R², MSE and RMSE per combination.

# Re-create the split from the unscaled X / y with the same parameters used
# earlier. X_train / X_test were overwritten with standardized values by the
# feature-scaling cell, and each pipeline below should apply its own scaler
# to raw features rather than depend on that earlier in-place step.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Define scalers
scalers = {
    "Standard Scaler": StandardScaler(),
    "Min-Max Scaler": MinMaxScaler(),
    "Robust Scaler": RobustScaler()
}

# Initialize models (seeded where the estimator is stochastic).
# XGBRegressor is seeded through random_state, the scikit-learn style keyword.
models = {
    "Linear Regression": LinearRegression(),
    "Support Vector Regression": SVR(kernel='linear'),
    "RandomForestRegression": RandomForestRegressor(n_estimators=10, random_state=42),
    "XGBoost Regression": XGBRegressor(n_estimators=100, random_state=42)
}

# Dictionary to store all results: scaler name -> metric name -> model name
all_results = {}

# Iterate through each scaler and evaluate models
for scaler_name, scaler in scalers.items():
    results = {"R² Score": {}, "MSE": {}, "RMSE": {}}

    for model_name, model in models.items():
        # The scaler is fitted on the training rows inside the pipeline and
        # re-applied to the test rows at predict time.
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])
        pipeline.fit(X_train_raw, y_train_raw)
        predictions = pipeline.predict(X_test_raw)

        # Compute metrics
        r2 = r2_score(y_test_raw, predictions)
        mse = mean_squared_error(y_test_raw, predictions)
        rmse = np.sqrt(mse)

        # Store results
        results["R² Score"][model_name] = r2
        results["MSE"][model_name] = mse
        results["RMSE"][model_name] = rmse

    all_results[scaler_name] = results

# One DataFrame per metric: rows are models, columns are scalers
r2_df = pd.DataFrame({scaler: all_results[scaler]["R² Score"] for scaler in scalers})
mse_df = pd.DataFrame({scaler: all_results[scaler]["MSE"] for scaler in scalers})
rmse_df = pd.DataFrame({scaler: all_results[scaler]["RMSE"] for scaler in scalers})

# Print DataFrames
print("\nR² Scores:\n", r2_df)
print("\nMean Squared Errors (MSE):\n", mse_df)
print("\nRoot Mean Squared Errors (RMSE):\n", rmse_df)

# Plotting R² Scores
ax = r2_df.plot(kind='bar', figsize=(15, 6), colormap='viridis')
plt.title('Model R² Scores with Different Scalers', fontsize=14)
plt.ylabel('R² Score', fontsize=12)
plt.xlabel('Models', fontsize=12)
plt.xticks(rotation=0)
plt.legend(title="Scaler", fontsize=10)

# Add text labels on top of each bar
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', label_type='edge', fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
# Plotting MSE Scores: one group of bars per model, one bar per scaler
ax = mse_df.plot(kind='bar', figsize=(15, 6), colormap='viridis')
plt.title('Model MSE with Different Scalers', fontsize=14)
plt.ylabel('MSE', fontsize=12)
plt.xlabel('Models', fontsize=12)
plt.xticks(rotation=0)
plt.legend(title="Scaler", fontsize=10)

# Add text labels on top of each bar
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', label_type='edge', fontsize=10)

# Finish the figure the same way as the R² chart
plt.tight_layout()
plt.show()

In [None]:
# Plotting RMSE Scores: one group of bars per model, one bar per scaler
ax = rmse_df.plot(kind='bar', figsize=(15, 6), colormap='viridis')
plt.title('Model RMSE with Different Scalers', fontsize=14)
plt.ylabel('RMSE', fontsize=12)
plt.xlabel('Models', fontsize=12)
plt.xticks(rotation=0)
plt.legend(title="Scaler", fontsize=10)

# Add text labels on top of each bar
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', label_type='edge', fontsize=10)

# Finish the figure the same way as the R² chart
plt.tight_layout()
plt.show()