<a href="https://colab.research.google.com/github/btimper-du/3009-final-project/blob/main/stock_preds_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
def load_df_from_xlsx(url):
    """Load an Excel workbook into a DataFrame.

    Args:
        url: Location of the workbook; handed to ``pd.read_excel`` unchanged.

    Returns:
        The DataFrame ``pd.read_excel`` builds from its default sheet
        (the first one in the workbook).
    """
    return pd.read_excel(url)

In [4]:
# Raw GitHub link to the NVDA price-history workbook used throughout the notebook.
url = "https://github.com/btimper-du/3009-final-project/raw/main/Data/Stocks/stockHistoryNvda.xlsx"
# Ticker symbol; the comparison-plot helpers read this global when building chart titles.
ticker = 'NVDA'
df = load_df_from_xlsx(url)
df

Unnamed: 0,symbol,date,open,high,low,close,pre_market,after_hours,volume,status
0,NVDA,2025-11-03,208.0800,211.3350,205.5600,206.8800,203.9900,206.6000,180159632,OK
1,NVDA,2025-10-31,206.4500,207.9700,202.0700,202.4900,205.8900,202.6200,179798344,OK
2,NVDA,2025-10-30,205.1500,206.1600,201.4100,202.8900,207.0400,204.5499,178859778,OK
3,NVDA,2025-10-29,207.9800,212.1899,204.7750,207.0400,206.0200,207.6500,308822352,OK
4,NVDA,2025-10-28,193.0500,203.1500,191.9100,201.0300,191.2000,204.4300,297970590,OK
...,...,...,...,...,...,...,...,...,...,...
1250,NVDA,2020-11-10,13.6065,13.6065,12.6283,12.7700,13.7000,12.7745,644067200,OK
1251,NVDA,2020-11-09,14.5525,14.6915,13.6108,13.6308,14.8255,13.7500,582977880,OK
1252,NVDA,2020-11-06,14.1098,14.5888,13.8950,14.5620,13.8500,14.5498,383348600,OK
1253,NVDA,2020-11-05,14.1625,14.3390,13.9910,14.1600,14.0028,13.9250,319028320,OK


In [None]:
def sort_df_by_date(df):
    """Sort ``df`` chronologically, in place, and renumber its rows from 0.

    The 'date' column is first converted with ``pd.to_datetime`` so the sort
    is by actual date rather than by string. The call is idempotent: running
    it again leaves the frame unchanged.

    Args:
        df: DataFrame with a 'date' column. Modified in place.

    Returns:
        The same ``df`` object, so the call can also be used in an assignment
        or at the end of a method chain.
    """
    df['date'] = pd.to_datetime(df['date'])
    df.sort_values(by='date', ascending=True, inplace=True)
    # drop=True discards the old row labels. Without it they are inserted as
    # an extra 'index' column, and repeated calls pile up 'level_0' and then
    # raise because the column already exists.
    df.reset_index(drop=True, inplace=True)
    return df


In [None]:
# Put the rows in chronological order (the helper mutates df in place), then display the frame.
sort_df_by_date(df)
df

In [None]:
def add_simple_moving_averages(df, windows=(5, 100)):
    """Add simple-moving-average columns of the closing price to ``df``.

    For every entry in ``windows`` a column named ``SMA_<window>`` is written,
    holding the rolling mean of ``df['close']`` over that many consecutive
    rows (in the frame's current row order). The first ``window - 1`` rows of
    each column are NaN because a full window is not yet available.

    Args:
        df: DataFrame with a numeric 'close' column. Modified in place.
        windows: Iterable of positive integer window lengths. The default
            ``(5, 100)`` produces the 'SMA_5' and 'SMA_100' columns.

    Returns:
        The same ``df`` object with the new columns attached.
    """
    closes = df['close']
    for window in windows:
        df[f'SMA_{window}'] = closes.rolling(window=window).mean()
    return df

In [None]:
# Attach the SMA_5 and SMA_100 columns, then display the frame.
df = add_simple_moving_averages(df)
df

In [None]:
# Drop rows containing any NaN. The rolling means are undefined until a full
# window of rows is available, so this removes the earliest rows.
df = df.dropna()
df

In [None]:
# Sanity check: every column should report False now that the NaN rows are gone.
df.isnull().any()

In [None]:
def plot_column(df, column_name):
    """Draw one column of ``df`` against its 'date' column as a line chart.

    Args:
        df: DataFrame containing a 'date' column and ``column_name``.
        column_name: Column to plot; also used as the chart title and the
            y-axis label.
    """
    _, ax = plt.subplots(figsize=(30, 10), dpi=80, facecolor='w', edgecolor='k')
    ax.set_title(column_name, fontsize=30)
    ax.plot(df['date'], df[column_name])
    ax.set_xlabel('Date')
    ax.set_ylabel(column_name)
    plt.show()

In [None]:
# Closing price over time.
plot_column(df, 'close')

In [None]:
# 5-row simple moving average of the close.
plot_column(df, 'SMA_5')

In [None]:
# 100-row simple moving average of the close.
plot_column(df, 'SMA_100')

In [None]:
# Traded volume over time.
plot_column(df, 'volume')

In [None]:
# Input columns fed to every model.
# NOTE(review): SMA_100 is computed earlier but is not listed here — confirm the omission is intentional.
features = ['open', 'high', 'low', 'close', 'volume', 'SMA_5']

In [None]:
# Fixed seed so the stochastic estimators (network weight initialisation,
# tree split tie-breaking, forest bootstrapping) produce the same results on
# every Restart & Run All.
RANDOM_STATE = 42

# Estimators to compare, keyed by the display name used in chart titles.
models = {
    'Linear Regression': LinearRegression(),
    'Neural Network': MLPRegressor(random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
}

In [None]:
# Build the prediction target: the closing price `days_to_predict` rows ahead.
days_to_predict = 1
# shift() with a negative period pulls later rows up, so flip the sign. Note
# that days_to_predict is left holding the negative value after this cell.
days_to_predict *= -1

# Work on a copy before adding the column — presumably to avoid pandas'
# SettingWithCopyWarning on the frame returned by the earlier dropna().
df = df.copy()
# The last |days_to_predict| rows have no later close to point at and get NaN here.
df.loc[:, 'target'] = df['close'].shift(days_to_predict)
df

In [None]:
# Drop rows with any NaN — in particular the final row(s), whose shifted
# target has no later close to point at.
df = df.dropna()

In [None]:
# Sanity check: no column should contain NaN after the drop.
df.isnull().any()

In [None]:
# X is your input matrix
# X is the input matrix: one column per entry in `features`.
X = df[features]
X

In [None]:
# y is your label
# y is the label: the 'target' column (the close shifted by days_to_predict).
y = df['target']
y

In [None]:
def train_and_predict(model, X_train, y_train, X_test, scaler):
    """Scale the features, fit ``model`` on the training split, predict the test split.

    The scaler is fitted on the training features only and then reused to
    transform the test features, so no test-set statistics reach training.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. Fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict for.
        scaler: Transformer exposing ``fit_transform`` and ``transform``.
            Fitted in place.

    Returns:
        Tuple ``(model, y_pred)``: the fitted estimator and its predictions
        for the scaled ``X_test``.
    """
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)
    model.fit(scaled_train, y_train)
    return model, model.predict(scaled_test)

In [None]:
def compute_metrics(y_true, y_pred):
    """Summarise regression error between true and predicted values.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        Dict with keys 'mse' (mean squared error), 'rmse' (square root of
        the mse), 'mae' (mean absolute error) and 'r2' (R^2 score).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'mse': squared_error,
        'rmse': np.sqrt(squared_error),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }

In [None]:
def plot_actual_vs_pred(index, y_true, y_pred, model_name, date_range_str=None):
    """Plot actual and predicted target values on one chart.

    The title starts with the module-level ``ticker`` followed by
    ``model_name``; when ``date_range_str`` is given it is appended as the
    test date range.

    Args:
        index: X-axis values shared by both lines.
        y_true: Actual target values.
        y_pred: Predicted target values.
        model_name: Name of the model, shown in the title.
        date_range_str: Optional text describing the test date range.
    """
    plt.figure(figsize=(14, 6))
    for series, legend_name in ((y_true, 'Actual'), (y_pred, 'Predicted')):
        plt.plot(index, series, label=legend_name, linewidth=2)
    plt.xlabel('Date')
    plt.ylabel('Target Value')
    suffix = '' if date_range_str is None else f' - Test Dates: {date_range_str}'
    plt.title(f"{ticker}: {model_name} Actual vs. Predicted" + suffix)
    plt.legend()
    plt.show()

In [None]:
def evaluate_model(df, X, y, model_name, model, test_size=0.2, plot=True):
    """Fit ``model`` on the leading rows and score it on the trailing rows.

    The split is positional and unshuffled: the first
    ``int(len(X) * (1 - test_size))`` rows form the training set and the rest
    form the test set. ``df``, ``X`` and ``y`` are sliced by position only,
    so they must be row-aligned with one another.
    NOTE(review): the rows are assumed to be in chronological order so the
    test set is the most recent period — confirm against the caller.

    Args:
        df: DataFrame with a 'date' column, row-aligned with ``X`` and ``y``.
        X: Feature DataFrame.
        y: Label Series.
        model_name: Display name used in the chart title.
        model: Estimator exposing ``fit`` / ``predict``. Fitted in place.
        test_size: Fraction of rows held out for testing.
        plot: When True, draw the actual-vs-predicted chart for the test set.

    Returns:
        Dict with keys 'metrics' (output of ``compute_metrics``),
        'last_prediction' (final test prediction as a float), 'predictions'
        (all test predictions), 'scaler' (the fitted StandardScaler) and
        'model' (the fitted estimator).

    Raises:
        ValueError: If the split would leave the training or the test set
            empty (``test_size`` outside the open interval (0, 1), or too
            few rows).
    """
    n_rows = len(X)
    split_idx = int(n_rows * (1 - test_size))
    # Fail early with a clear message; otherwise an empty split surfaces later
    # as an unrelated error from date formatting or from the scaler.
    if split_idx <= 0 or split_idx >= n_rows:
        raise ValueError(
            f"test_size={test_size!r} leaves an empty train or test set for {n_rows} rows"
        )

    test_dates = df['date'].iloc[split_idx:]
    date_min = pd.to_datetime(test_dates.min()).strftime('%Y-%m-%d')
    date_max = pd.to_datetime(test_dates.max()).strftime('%Y-%m-%d')
    date_range_str = f"{date_min} to {date_max}"

    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    # A fresh scaler per call; train_and_predict fits it on X_train only.
    scaler = StandardScaler()
    fitted_model, y_pred = train_and_predict(model, X_train, y_train, X_test, scaler)

    metrics = compute_metrics(y_test, y_pred)

    results = {
        'metrics': metrics,
        # The test set is guaranteed non-empty by the check above.
        'last_prediction': float(y_pred[-1]),
        'predictions': y_pred,
        'scaler': scaler,
        'model': fitted_model
    }

    if plot:
        plot_actual_vs_pred(test_dates, y_test, y_pred, model_name, date_range_str)

    return results

In [None]:
# Shared settings passed to every evaluate_model call below.
# Fraction of rows (the trailing ones) held out for testing.
test_size = 0.2
# Whether evaluate_model draws the actual-vs-predicted chart.
plotting = True

In [None]:
# Evaluate linear regression on the unshuffled train/test split.
model_name = 'Linear Regression'
linear_regression_results = evaluate_model(df, X, y, model_name, models[model_name], test_size, plotting)

In [None]:
# Test-set error metrics (mse, rmse, mae, r2) for linear regression.
linear_regression_results['metrics']

In [None]:
# Evaluate the neural network as configured in the `models` dict.
model_name = 'Neural Network'
neural_network_results = evaluate_model(df, X, y, model_name, models[model_name], test_size, plotting)

In [None]:
# Test-set error metrics (mse, rmse, mae, r2) for the neural network run above.
neural_network_results['metrics']

In [None]:
# Re-run the neural network with a much higher iteration cap — presumably so
# the optimiser has room to converge. This overwrites the earlier
# neural_network_results.
# model_name is set here explicitly so the cell does not depend on whichever
# cell last assigned it, and the estimator is seeded so the run is repeatable.
model_name = 'Neural Network'
neural_network_results = evaluate_model(
    df, X, y, model_name,
    MLPRegressor(max_iter=10000, random_state=42),
    test_size, plotting,
)

In [None]:
# Test-set error metrics for the re-run with max_iter raised to 10000.
neural_network_results['metrics']

In [None]:
# Evaluate the decision tree on the same split.
model_name = 'Decision Tree'
decision_tree_results = evaluate_model(df, X, y, model_name, models[model_name], test_size, plotting)

In [None]:
# Test-set error metrics (mse, rmse, mae, r2) for the decision tree.
decision_tree_results['metrics']

In [None]:
# Evaluate the random forest on the same split.
model_name = 'Random Forest'
random_forest_results = evaluate_model(df, X, y, model_name, models[model_name], test_size, plotting)

In [None]:
# Test-set error metrics (mse, rmse, mae, r2) for the random forest.
random_forest_results['metrics']

In [None]:
def plot_actual_vs_multiple_preds(index, y_true, preds_dict, date_range_str=None):
    """Plot the actual target alongside predictions from several models.

    Relies on the notebook's top-level ``matplotlib.pyplot`` import (``plt``)
    and on the module-level ``ticker`` for the chart title.

    Args:
        index: X-axis values shared by every line.
        y_true: Actual target values.
        preds_dict: Mapping of model name to its predictions; each entry is
            drawn as its own line labelled 'Predicted: <name>'.
        date_range_str: Optional text describing the test date range,
            appended to the title when given.
    """
    plt.figure(figsize=(14, 6))
    plt.plot(index, y_true, label='Actual', linewidth=2)
    for name, y_pred in preds_dict.items():
        plt.plot(index, y_pred, label=f'Predicted: {name}', linewidth=2)
    plt.xlabel('Date')
    plt.ylabel('Target Value')
    # f-string formatting matches plot_actual_vs_pred's title construction.
    title = f"{ticker}: Actual vs. Multiple Predicted"
    if date_range_str is not None:
        title += f' - Test Dates: {date_range_str}'
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
# Test-set predictions from each evaluated model, keyed by the name shown in
# the chart legend. neural_network_results holds whichever neural-network run
# executed last.
preds_dict = {
    'Linear Regression': linear_regression_results['predictions'],
    'Neural Network': neural_network_results['predictions'],
    'Decision Tree': decision_tree_results['predictions'],
    'Random Forest': random_forest_results['predictions'],
}

In [None]:
# Recreate the test split used inside evaluate_model (same split_idx formula)
# so the combined chart gets matching dates and actual values.
split_idx = int(len(X) * (1 - test_size))
test_dates = df['date'].iloc[split_idx:]
# First and last test dates, formatted for the chart title.
date_min = pd.to_datetime(test_dates.min()).strftime('%Y-%m-%d')
date_max = pd.to_datetime(test_dates.max()).strftime('%Y-%m-%d')
date_range_str = f"{date_min} to {date_max}"
y_test = y.iloc[split_idx:]

In [None]:
# Overlay every model's predictions on the actual test-set target.
plot_actual_vs_multiple_preds(test_dates, y_test, preds_dict, date_range_str)