In [36]:
#Import dependencies
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [37]:
# Define the data_load function to read and process the data
def data_load(data_path):
    data_list = []
    for file in os.listdir(data_path):
        if file.endswith('.csv'):
            file_path = os.path.join(data_path, file)
            data = pd.read_csv(file_path, parse_dates=['Date'])
            data_list.append(data)
    return pd.concat(data_list, ignore_index=True)

# Set path to Data and define a data variable
data_path = os.path.join(os.getcwd(), 'Data')
data = data_load(data_path)

  return pd.concat(data_list, ignore_index=True)


In [38]:
# Filter our new data to only use entries from January 1st, 2020 onward

start_date = datetime(2020, 1, 1)
data_filtered_2020s = data[data['Date'] >= start_date]

In [39]:
# Define new features in the data for the previous day's close and previous day's volume. Then use 'dropna()' to drop non-values.

data_filtered_2020s['Prev_Day_Close'] = data_filtered_2020s['Close'].shift(1)
data_filtered_2020s['Prev_Day_Vol'] = data_filtered_2020s['Volume'].shift(1)
data_filtered_2020s = data_filtered_2020s.dropna()

# Make sure data is sorted by Date
data_filtered_2020s = data_filtered_2020s.sort_values(by='Date')

# Define features (X) and target (y) for training purposes
X = data_filtered_2020s[['Prev_Day_Close', 'Prev_Day_Vol']]
y = data_filtered_2020s['Close']

# Split data into test and training sets chronologically
cutoff_date = datetime(2023, 12, 31)
X_train = X[data_filtered_2020s['Date'] <= cutoff_date]
y_train = y[data_filtered_2020s['Date'] <= cutoff_date]
X_test = X[data_filtered_2020s['Date'] > cutoff_date]
y_test = y[data_filtered_2020s['Date'] > cutoff_date]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered_2020s['Prev_Day_Close'] = data_filtered_2020s['Close'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered_2020s['Prev_Day_Vol'] = data_filtered_2020s['Volume'].shift(1)


In [40]:
# Train the stock_model
stock_model = RandomForestRegressor(n_estimators=100, random_state=42)
stock_model.fit(X_train, y_train)

In [41]:
# Evaluate stock_model using mse (mean squared error)
y_pred = stock_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1692.47385425496


In [42]:
#Predict future stock prices for the remainder of 2024 (Q3 to Q4)

future_dates = pd.date_range(start='2024-07-01', end='2024-12-31', freq='B') # freq = B restricts to Business Days
q3q4_data = pd.DataFrame({
    'Date': future_dates,
    'Prev_Day_Close': np.random.uniform(low=min(data_filtered_2020s['Close']), high=max(data_filtered_2020s['Close']), size=len(future_dates)),
    'Prev_Day_Vol': np.random.uniform(low=min(data_filtered_2020s['Volume']), high=max(data_filtered_2020s['Volume']), size=len(future_dates))
})

q3q4_X = q3q4_data[['Prev_Day_Close', 'Prev_Day_Vol']]
q3q4_predictions = stock_model.predict(q3q4_X)

# Add predictions to the q3q4_data DF
q3q4_data['Predicted_Close'] = q3q4_predictions
print(q3q4_data)

          Date  Prev_Day_Close  Prev_Day_Vol  Predicted_Close
0   2024-07-01     2054.426798  6.678763e+08      2066.759948
1   2024-07-02      544.307303  1.546090e+08       536.391608
2   2024-07-03      147.454241  4.253801e+08       155.818170
3   2024-07-04     2306.909771  5.037887e+08      2284.892109
4   2024-07-05     3458.272334  4.545665e+08      3498.300154
..         ...             ...           ...              ...
127 2024-12-25     2974.216398  3.395765e+08      2960.774961
128 2024-12-26      683.685167  6.817872e+08       672.819907
129 2024-12-27     2207.334955  3.306476e+08      2209.594961
130 2024-12-30     3759.727478  8.222475e+08      3505.165137
131 2024-12-31     2351.022471  5.402364e+08      2357.292424

[132 rows x 4 columns]
