In [1]:
# Import dependencies
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
from sklearn.exceptions import DataConversionWarning

# Filter out warnings. These filters are installed process-wide, so they stay
# in effect for every cell executed after this one.
# NOTE(review): ignoring every UserWarning is very broad -- it can also hide
# warnings from pandas / scikit-learn that point at real problems. Consider
# narrowing the filter (e.g. by message or module) once the noisy call sites are fixed.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

In [2]:
# Define the data_load function to read and process the data for a single stock
def data_load(file_path):
    """Read one stock's CSV into a DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Location of the CSV; passed straight to ``pd.read_csv``. The file
        must contain a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The file's contents with the ``Date`` column parsed as datetimes.
    """
    return pd.read_csv(file_path, parse_dates=['Date'])

# Set path to Data folder. This is a relative path, so it is resolved against
# the notebook's current working directory. Each *.csv file found in this
# folder is loaded as one stock's history by the modelling loop.
data_path = 'Data'

In [3]:
# Create a list to store all predictions (one DataFrame per stock)
stock_predictions = []

# Model input columns, in the order the model is fit on them
feature_cols = ['Prev_Day_Close', 'Prev_Day_Vol']

# Settings that do not change between stocks, hoisted out of the loop
start_date = datetime(2020, 1, 1)      # only use entries from January 1st, 2020 onward
cutoff_date = datetime(2023, 12, 31)   # train on dates <= cutoff, evaluate on dates after it
future_dates = pd.date_range(start='2024-07-01', end='2029-7-01', freq='B')  # 'B' restricts to business days

# Seeded generator so the simulated volumes (and therefore the exported
# projections) are identical on every Restart & Run All
rng = np.random.default_rng(42)

# Iterate over each CSV in the Data folder to get individual results for each stock.
# os.listdir() returns entries in arbitrary order; sorting keeps the output order
# and the random draws assigned to each stock stable between runs.
for stock in sorted(os.listdir(data_path)):
    if not stock.endswith('.csv'):
        continue

    file_path = os.path.join(data_path, stock)
    data = data_load(file_path)

    # Filter this stock's data to the 2020s and sort by date BEFORE building the
    # lag features: shift(1) means "previous row", which is only "previous day"
    # when the rows are in chronological order.
    data_filtered_2020s = data[data['Date'] >= start_date].copy().sort_values(by='Date')

    # Define new features in the data for the previous day's close and previous day's volume
    data_filtered_2020s['Prev_Day_Close'] = data_filtered_2020s['Close'].shift(1)
    data_filtered_2020s['Prev_Day_Vol'] = data_filtered_2020s['Volume'].shift(1)

    # Drop rows with NaN values (at minimum the first row, which has no previous day)
    data_filtered_2020s = data_filtered_2020s.dropna()

    # Define features (X) and target (y) for training purposes
    X = data_filtered_2020s[feature_cols]
    y = data_filtered_2020s['Close']

    # Split data into test and training sets chronologically
    is_train = data_filtered_2020s['Date'] <= cutoff_date
    X_train, y_train = X[is_train], y[is_train]
    X_test, y_test = X[~is_train], y[~is_train]

    # A stock without rows on both sides of the cutoff can be neither fit nor
    # evaluated; skip it instead of crashing the whole run.
    if X_train.empty or X_test.empty:
        print(f'{stock}: skipped (needs data both before and after {cutoff_date:%Y-%m-%d})')
        continue

    # Train stock_model
    stock_model = RandomForestRegressor(n_estimators=100, random_state=42)
    stock_model.fit(X_train, y_train)

    # Evaluate the stock_model using mse (mean squared error)
    y_pred = stock_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'{stock}: Mean Squared Error: {mse}')

    # Initialize with the last known close (latest date, since the frame is sorted)
    last_known_close = data_filtered_2020s['Close'].iloc[-1]

    # Define the range for random volume generation
    min_vol = data_filtered_2020s['Volume'].min()
    max_vol = data_filtered_2020s['Volume'].max()

    # Make five-year projections for this stock: iteratively predict each business
    # day's close, feeding the previous day's projected close back in as a feature.
    # Values are collected in lists and the DataFrame is built once afterwards,
    # which avoids enlarging the frame row by row.
    prev_closes = []
    prev_vols = []
    projected_closes = []
    prev_close = last_known_close
    for _ in range(len(future_dates)):
        prev_vol = rng.uniform(low=min_vol, high=max_vol)

        # Predict the closing price. A one-row DataFrame with the training column
        # names is used so the model sees the same feature names it was fit on.
        next_features = pd.DataFrame([[prev_close, prev_vol]], columns=feature_cols)
        pred_close = stock_model.predict(next_features)[0]

        prev_closes.append(prev_close)
        prev_vols.append(prev_vol)
        projected_closes.append(pred_close)
        prev_close = pred_close

    future_data = pd.DataFrame({
        'Date': future_dates,
        'Prev_Day_Close': prev_closes,
        'Prev_Day_Vol': prev_vols,
        'Projected_Close': projected_closes,
    })
    future_data['Stock'] = os.path.splitext(stock)[0]  # Add stock identifier (file name without extension)
    stock_predictions.append(future_data)

# pd.concat() fails with an opaque message on an empty list; report the real cause
if not stock_predictions:
    raise ValueError(f'No usable stock CSV files found in {data_path!r}')

# Concatenate all predictions into one DataFrame
stock_predictions_df = pd.concat(stock_predictions, ignore_index=True)

print(stock_predictions_df.head())

# Save predictions to a CSV file
stock_predictions_df.to_csv('fiveyr_future_stock_predictions.csv', index=False)
print('Stock projections exported to fiveyr_future_stock_predictions.csv')

AAPL.csv: Mean Squared Error: 7.545793172651929
ADBE.csv: Mean Squared Error: 237.13527766031015
ADI.csv: Mean Squared Error: 9.899333582396139
ADP.csv: Mean Squared Error: 8.453156352424996
ALGN.csv: Mean Squared Error: 59.8948731034754
AMD.csv: Mean Squared Error: 643.3912711906563
AMGN.csv: Mean Squared Error: 171.67951722131193
AMZN.csv: Mean Squared Error: 10.328291441020705
ANSS.csv: Mean Squared Error: 35.80984141449033
ASML.csv: Mean Squared Error: 4897.237497126069
AVGO.csv: Mean Squared Error: 24244.281594918208
BIIB.csv: Mean Squared Error: 211.36837458100948
BKNG.csv: Mean Squared Error: 9714.20489668023
CDNS.csv: Mean Squared Error: 879.2300271701076
CDW.csv: Mean Squared Error: 280.4203220611254
CHKP.csv: Mean Squared Error: 83.21595483387132
CHTR.csv: Mean Squared Error: 985.3039215071694
CMCSA.csv: Mean Squared Error: 0.4730639760639107
COST.csv: Mean Squared Error: 2982.711720673167
CPRT.csv: Mean Squared Error: 13.387377569404103
CSCO.csv: Mean Squared Error: 0.480066