# **Stock Random Forest Model**

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import statsmodels as smt
import statsmodels.api as sm
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Read the dataset from a CSV file that sits next to the notebook.
stock = pd.read_csv(
    "stockffads.CSV",
)

# Bare last expression so the notebook renders the loaded frame.
stock

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Mkt-RF,SMB,HML,...,SMA_50,EMA_12,EMA_26,MACD,RSI,ADS_Index,GM Yest_Close,TM Yest_Close,F Yest_Close,RACE Yest_Close
0,2017-01-04,14.316667,15.200000,14.287333,15.132667,15.132667,168202500,0.79,0.95,-0.16,...,15.132667,15.132667,15.132667,0.000000,100.000000,0.174783,30.369259,118.550003,9.082818,55.978443
1,2017-01-05,15.094667,15.165333,14.796667,15.116667,15.116667,88675500,-0.21,-0.89,-0.79,...,15.124667,15.130205,15.131481,-0.001276,0.000000,0.154241,32.045399,121.190002,9.501246,56.424824
2,2017-01-06,15.128667,15.354000,15.030000,15.267333,15.267333,82918500,0.29,-0.66,-0.31,...,15.172222,15.151302,15.141545,0.009757,91.024248,0.134146,31.440609,120.440002,9.212674,56.377335
3,2017-01-09,15.264667,15.461333,15.200000,15.418667,15.418667,59692500,-0.37,-0.30,-1.03,...,15.233833,15.192435,15.162072,0.030363,95.477309,0.076169,31.095015,120.129997,9.205460,55.978443
4,2017-01-10,15.466667,15.466667,15.126000,15.324667,15.324667,54900000,0.16,0.89,0.43,...,15.252000,15.212778,15.174116,0.038662,71.686831,0.057208,31.112295,119.739998,9.111673,55.351604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1648,2023-07-25,272.380005,272.899994,265.000000,265.279999,265.279999,112757300,0.25,-0.04,-0.79,...,239.121800,272.029478,264.035489,7.993989,51.187379,0.310831,39.194069,164.529999,13.960000,317.160004
1649,2023-07-26,263.250000,268.040009,261.750000,264.350006,264.350006,95856200,0.02,0.68,1.03,...,241.049200,270.848021,264.058787,6.789234,50.639496,0.307146,37.817787,165.559998,13.580000,316.109985
1650,2023-07-27,268.309998,269.130005,255.300003,255.710007,255.710007,103697300,-0.74,-0.90,0.27,...,242.836400,268.519095,263.440359,5.078737,45.741140,0.302536,38.186794,165.699997,13.670000,317.350006
1651,2023-07-28,259.859985,267.250000,258.230011,266.440002,266.440002,111446000,1.14,0.53,-0.33,...,244.834800,268.199235,263.662555,4.536680,51.956498,0.296997,38.864960,165.429993,13.730000,317.269989


In [3]:
# Target variable: the adjusted closing price.
y = stock['Adj Close']

# Features: every column except both close-price columns and the date.
excluded_columns = ['Adj Close', 'Close', 'Date']
X = stock.drop(columns=excluded_columns)

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so the test rows are
# interleaved in time with the training rows rather than forming a later window.
# Confirm that a random (non-chronological) split is what is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)

In [5]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based accuracy score for a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(test_features)``.
    test_features : array-like
        Passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values, in the same row order as ``test_features``.
        They are used as the divisor of the percentage error, so they
        must be non-zero.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean of ``|prediction - label| / label``
        expressed in percent.
    """
    # Compare by position: converting both sides to plain arrays avoids any
    # pandas index alignment between the predictions and the labels.
    predictions = np.asarray(model.predict(test_features), dtype=float)
    actual = np.asarray(test_labels, dtype=float)

    errors = np.abs(predictions - actual)
    mape = 100 * np.mean(errors / actual)
    accuracy = 100 - mape

    print('Model Performance')
    # The error is in the units of the target, not "degrees".
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

In [6]:
# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features considered at each split.
# 1.0 means "all features", which is what 'auto' meant for a regressor.
# 'auto' itself is no longer accepted by current scikit-learn releases, and
# candidates using it would fail to fit.
max_features = [1.0, 'sqrt']

# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)

# Minimum number of samples required to split an internal node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at a leaf node
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample or on the full training set
bootstrap = [True, False]

# Candidate values that the randomized search samples from
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [7]:
# Use the random grid to search for the best hyperparameters.
# The base estimator is seeded: the search's own random_state only fixes which
# parameter combinations are sampled, not the randomness inside each forest,
# so an unseeded estimator would give different results on every run.
rf = RandomForestRegressor(random_state=42)

# Random search of parameters, using 3-fold cross validation,
# across 100 sampled combinations, on all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [8]:
# Reference point: a small 10-tree forest with default settings and a fixed seed,
# fitted on the full feature set and scored on the held-out rows.
base_model = RandomForestRegressor(random_state=42, n_estimators=10)
base_model.fit(X=X_train, y=y_train)

base_accuracy = evaluate(
    base_model,
    X_test,
    y_test,
)

Model Performance
Average Error: 1.8703 degrees.
Accuracy = 98.64%.


In [9]:
# Take the refitted winner of the randomized search and score it on the
# same held-out rows as the baseline.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(
    best_random,
    X_test,
    y_test,
)

Model Performance
Average Error: 1.8551 degrees.
Accuracy = 98.64%.


In [10]:
# Predictions of the tuned model for the held-out rows, in X_test row order.
y_pred = best_random.predict(X_test)

# Wrap them in a one-column frame with a fresh 0..n-1 index.
df_pred = pd.DataFrame({'Predicted': y_pred})
df_pred

Unnamed: 0,Predicted
0,160.726034
1,315.032464
2,237.368078
3,278.924148
4,53.304087
...,...
491,250.471665
492,148.208558
493,22.882340
494,307.093441


In [11]:
# Mean squared error of the tuned model on the held-out rows.
mse = mean_squared_error(
    y_true=y_test,
    y_pred=y_pred,
)
print("Mean Squared Error:", mse)

Mean Squared Error: 11.535560401256596


In [12]:
# Importances reported by the tuned forest, one value per column of X.
feature_importances = best_random.feature_importances_

# Pair each feature name with its importance (as a percentage) and rank them
# from most to least important.
importance_pct = feature_importances * 100
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importance_pct})
    .sort_values(by='Importance', ascending=False)
)
feature_importance_df

Unnamed: 0,Feature,Importance
1,High,20.83827
2,Low,16.61515
0,Open,15.022998
12,EMA_12,10.196005
8,Yest_Close,9.977285
11,SMA_50,6.584013
13,EMA_26,5.974966
10,SMA_20,5.310201
20,RACE Yest_Close,3.427831
18,TM Yest_Close,2.118075


In [13]:
# How many of the highest-ranked features to keep.
k = 10

# feature_importance_df is already sorted in descending order, so the first
# k rows are the k most important features.
significant_features = feature_importance_df['Feature'].head(k).values

# Restrict both splits to those columns.
X_significant_training, X_significant_testing = (
    X_train[significant_features],
    X_test[significant_features],
)

In [15]:
# Second randomized search, this time on the reduced feature set:
# 6-fold cross validation over 100 sampled combinations, on all available cores.
# It reuses the `rf` estimator and `random_grid` created in earlier cells.
rf_random_optimitized = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=6,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Run the search on the selected features only.
rf_random_optimitized.fit(X_significant_training, y_train)

Fitting 6 folds for each of 100 candidates, totalling 600 fits


In [16]:
# Same 10-tree baseline as before, but trained and scored on the selected
# features only.
base_model_optimized = RandomForestRegressor(random_state=42, n_estimators=10)
base_model_optimized.fit(X=X_significant_training, y=y_train)

base_accuracy_optimized = evaluate(
    base_model_optimized,
    X_significant_testing,
    y_test,
)

Model Performance
Average Error: 2.1678 degrees.
Accuracy = 98.46%.


In [17]:
# Winner of the reduced-feature search, scored on the held-out rows.
best_random_optimized = rf_random_optimitized.best_estimator_
random_accuracy_optimized = evaluate(
    best_random_optimized,
    X_significant_testing,
    y_test,
)

Model Performance
Average Error: 2.2690 degrees.
Accuracy = 98.39%.


In [18]:
# Predictions of the reduced-feature model, in X_significant_testing row order.
y_pred_optimized = best_random_optimized.predict(X_significant_testing)

# One-column frame with a fresh 0..n-1 index.
df_pred_optimized = pd.DataFrame({'Optimized Predicted': y_pred_optimized})
df_pred_optimized

Unnamed: 0,Optimized Predicted
0,160.298916
1,313.778818
2,237.905443
3,281.090406
4,53.243531
...,...
491,250.330935
492,149.382780
493,22.754246
494,306.564748


In [19]:
# Mean squared error of the reduced-feature model on the held-out rows.
mse_optimized = mean_squared_error(
    y_true=y_test,
    y_pred=y_pred_optimized,
)
print("Mean Squared Error:", mse_optimized)

Mean Squared Error: 16.013243865095987


In [20]:
# Actual target values of the held-out rows as a one-column frame.
# The original row labels are discarded in favour of a fresh 0..n-1 index so
# the frame lines up positionally with the prediction frames.
df_test = y_test.reset_index(drop=True).to_frame(name='Adj Close')
df_test

Unnamed: 0,Adj Close
0,161.199997
1,319.503326
2,236.473328
3,273.843323
4,54.085999
...,...
491,251.943329
492,153.766663
493,22.933332
494,307.333344


In [21]:
# Put actual values and both sets of predictions side by side, one column each.
comparison_frames = [df_test, df_pred, df_pred_optimized]
merged_df = pd.concat(comparison_frames, axis='columns')
merged_df

Unnamed: 0,Adj Close,Predicted,Optimized Predicted
0,161.199997,160.726034,160.298916
1,319.503326,315.032464,313.778818
2,236.473328,237.368078,237.905443
3,273.843323,278.924148,281.090406
4,54.085999,53.304087,53.243531
...,...,...,...
491,251.943329,250.471665,250.330935
492,153.766663,148.208558,149.382780
493,22.933332,22.882340,22.754246
494,307.333344,307.093441,306.564748


In [23]:
import plotly.graph_objects as go

# Recover the date of every test row. merged_df was assembled from y_test and
# the X_test predictions in their split order with a fresh 0..n-1 index, so
# the dates looked up through y_test.index line up with it row by row.
test_dates = pd.to_datetime(stock.loc[y_test.index, 'Date']).to_numpy()

# Sort chronologically so the lines run left to right in time. Plotting against
# the positional index of the shuffled test set would connect rows in random
# order and contradict the 'Date' axis title.
plot_df = merged_df.assign(Date=test_dates).sort_values('Date')

# (column in plot_df, legend label, line colour)
series_to_plot = [
    ('Adj Close', 'Original Close Price', 'red'),
    ('Predicted', 'Predicted Close Price', 'blue'),
    ('Optimized Predicted', 'Optimized Predicted Close Price', 'green'),
]

fig = go.Figure()

for column, label, colour in series_to_plot:
    fig.add_trace(go.Scatter(
        x=plot_df['Date'],
        y=plot_df[column],
        name=label,
        line=dict(color=colour)
    ))

fig.update_layout(
    title='Actual vs Predicted Close Price',
    xaxis_title='Date',
    yaxis_title='Close Price',
    legend_title='Legend'
)

fig.show()