In [6]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

df = pd.read_csv('all_stocks_5yr.csv')

df = df.dropna()

X = df[['open', 'high', 'low', 'volume']]
y = df['close']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Random Forest Model Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")
print(f"Mean Absolute Percentage Error: {mape:.4f}%")

feature_importances = model.feature_importances_
feature_names = X.columns
sorted_indices = np.argsort(feature_importances)[::-1]

print("\nFeature Importance:")
for i in sorted_indices:
    print(f"{feature_names[i]}: {feature_importances[i]:.4f}")

random_indices = random.sample(range(len(y_test)), 5)

print("\nActual vs Predicted (Random Forest):")
print("Date\tActual\tPredicted")
for i in random_indices:
    print(f"{df.iloc[i+len(df)-len(y_test)]['date']}\t{y_test.iloc[i]:.2f}\t{y_pred[i]:.2f}")

Random Forest Model Performance:
Mean Squared Error: 0.5758
R-squared: 0.9999
Mean Absolute Percentage Error: 0.0044%

Feature Importance:
low: 0.5184
high: 0.4813
open: 0.0003
volume: 0.0000

Actual vs Predicted (Random Forest):
Date	Actual	Predicted
2014-01-10	55.14	55.25
2015-05-22	79.23	78.71
2014-04-07	70.16	70.50
2015-11-18	14.56	14.66
2017-03-08	63.21	63.51
