<a href="https://colab.research.google.com/github/amannain122/stock_price_analysis/blob/main/notebooks/models_stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

In [2]:
sp500 = pd.read_csv("S&P_500_data_cleaned.csv", index_col=0)
sp500.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,50_MA,Open_Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1928-03-13,17.92,17.92,17.92,17.92,17.92,17.473,17.93
1928-03-14,17.93,17.93,17.93,17.93,17.93,17.4784,18.07
1928-03-15,18.07,18.07,18.07,18.07,18.07,17.4846,18.26
1928-03-16,18.26,18.26,18.26,18.26,18.26,17.4954,18.360001
1928-03-19,18.360001,18.360001,18.360001,18.360001,18.360001,17.5116,18.459999


In [9]:
latest_data = pd.read_csv("S&P_500_data_last_row.csv", index_col = 0)
latest_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,50_MA,Open_Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-11-11,6008.859863,6017.310059,5986.689941,6001.350098,6001.350098,5735.436768,


In [None]:
latest_data = latest_data.drop(columns=["Open_Target"])

In [3]:
X = sp500.drop(columns=["Open_Target"])
y = sp500["Open_Target"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression


model = LinearRegression()

cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores2 = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print(-cv_scores.mean())
print(cv_scores2.mean())

56.429810460510964
0.9999515095730087


In [60]:
model.fit(X_train, y_train)

#Make Predictions and Evaluate
train_prediction = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(train_prediction, y_train))
print(f"Train RMSE: {rmse_train}")
r2_score(train_prediction, y_train)

Train RMSE: 7.481171820420204


0.9999517515505896

In [61]:
#Make Predictions and Evaluate
test_prediction = model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(test_prediction, y_test))
print(f"Test RMSE: {rmse_test}")
r2_score(test_prediction, y_test)

Test RMSE: 8.200421975123794


0.999943067334076

In [65]:
lr_predict = model.predict(latest_data)
lr_predict

array([6012.19581703])

## Ridge/Lasso Regularization

In [50]:
from sklearn.linear_model import Ridge, Lasso

# Train a Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Make predictions
train_prediction = ridge_model.predict(X_train)
y_pred = ridge_model.predict(X_test)

# Calculate RMSE
rmse_train = np.sqrt(mean_squared_error(train_prediction, y_train))
print(f"Train RMSE: {rmse_train}")
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Ridge Regression RMSE: {ridge_rmse}")

# Alternatively, try Lasso Regression
lasso_model = Lasso(alpha=0.1)  # You can tune alpha here too
lasso_model.fit(X_train, y_train)

# Make predictions for Lasso
train_prediction_lasso = lasso_model.predict(X_train)
y_pred_lasso = lasso_model.predict(X_test)

# Calculate RMSE for Lasso
lasso_rmse_train = np.sqrt(mean_squared_error(train_prediction_lasso, y_train))
print(f"Train RMSE Lasso: {lasso_rmse_train}")
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
print(f"Lasso Regression RMSE: {lasso_rmse}")

Train RMSE: 7.481171820431594
Ridge Regression RMSE: 8.200421886895045
Train RMSE Lasso: 12.491640177638834
Lasso Regression RMSE: 12.48418432006406


In [64]:
ridge_predict = ridge_model.predict(latest_data)
print(ridge_predict)
lasso_predict = lasso_model.predict(latest_data)
print(lasso_predict)

[6012.19582768]
[6011.06632819]


##XGBoost

In [44]:
import xgboost as xgb
from xgboost import XGBRegressor


# Define hyperparameter grid for XGBoost
xgb_param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Perform RandomizedSearchCV for XGBoost
xgb_random_search = RandomizedSearchCV(XGBRegressor(objective='reg:squarederror', eval_metric='rmse'),
                                       param_distributions=xgb_param_dist, n_iter=10, cv=5,
                                       verbose=2, n_jobs=-1)

# Fit the model with early stopping by using validation data
xgb_random_search.fit(X_train, y_train,
                      eval_set=[(X_test, y_test)], # Set up the evaluation set (test data)
                      verbose=False)                 # Show progress

# Get the best XGBoost model from RandomizedSearchCV
best_xgb_model = xgb_random_search.best_estimator_

# Make predictions and calculate RMSE for the best model
train_prediction_xgb = best_xgb_model.predict(X_train)
test_prediction_xgb = best_xgb_model.predict(X_test)

# Calculate RMSE for the best model
xgb_train_rmse = np.sqrt(mean_squared_error(y_train, train_prediction_xgb))
print(f"Train RMSE XGBoost: {xgb_train_rmse}")
best_xgb_rmse = np.sqrt(mean_squared_error(y_test, test_prediction_xgb))
print(f"Test RMSE XGBoost: {best_xgb_rmse}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Train RMSE XGBoost: 12.662294147182667
Test RMSE XGBoost: 14.88757792453262


In [70]:
xgb_predict = best_xgb_model.predict(latest_data)
xgb_predict

array([5698.06], dtype=float32)

## RandomForest Regressor

In [46]:
from sklearn.ensemble import RandomForestRegressor


# # Hyperparameter tuning for RandomForest
# param_dist = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20, 30, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# rf_random_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), param_distributions=param_dist,warm_start = True, n_iter=10, cv=5, verbose=2, n_jobs=-1)
# rf_random_search.fit(X_train, y_train)

# # Get the best estimator
# best_rf_model = rf_random_search.best_estimator_

# Make predictions
train_prediction_rf = best_rf_model.predict(X_train)
test_prediction_rf = best_rf_model.predict(X_test)

#Calculate RMSE
rf_train_rmse = np.sqrt(mean_squared_error(y_train, train_prediction_rf))
print(f"Train RMSE Random Forest: {rf_train_rmse}")
test_rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_best_rf))
print(f"Test RMSE Random Forest: {best_rf_rmse}")

Train RMSE Random Forest: 3.2043208843157864
Test RMSE Random Forest: 7.198437386736223


In [48]:
rf_predict = best_rf_model.predict(latest_data)
rf_predict

array([5992.20144775])

## Stacked Model (Random -> Linear)

In [54]:
# Stack the Random Forest predictions with the original data features
X_train_stack = np.column_stack((train_prediction_rf, X_train))
X_test_stack = np.column_stack((test_prediction_rf, X_test))

In [55]:
stacked_model = LinearRegression().fit(X_train_stack, y_train)

# Use the Linear Regression model to predict based on the stacked features
train_pred_stack = stacked_model.predict(X_train_stack)
test_pred_stack = stacked_model.predict(X_test_stack)

rmse_train = np.sqrt(mean_squared_error(train_pred_stack, y_train))
print(f"Train RMSE Stacked Model: {rmse_train}")
r2_score(train_pred_stack, y_train)

Train RMSE Stacked Model: 2.9749353208056015


0.9999923707557702

In [56]:
rmse_test = np.sqrt(mean_squared_error(test_pred_stack, y_test))
print(f"Test RMSE Stacked Model: {rmse_test}")
r2_score(test_pred_stack, y_test)

Test RMSE Stacked Model: 7.460890120999841


0.9999529118065856

In [58]:
latest_stack_data = np.column_stack((rf_predict, latest_data))
stack_predict = stacked_model.predict(latest_stack_data)
stack_predict

array([5988.4585344])

## Model Dump

In [82]:
import joblib

# Save the Linear Regression model (assuming it's already trained)
joblib.dump(model, 'linear_regression_model.pkl')

# Save the Ridge Regression model
joblib.dump(ridge_model, 'ridge_regression_model.pkl')

# Save the Lasso Regression model
joblib.dump(lasso_model, 'lasso_regression_model.pkl')

# Save the XGBoost model
joblib.dump(best_xgb_model, 'xgboost_model.pkl')

# Save the trained Random Forest model
joblib.dump(best_rf_model, 'random_forest_model.pkl')

# Save the Stacked Model
joblib.dump(stacked_model, 'stacked_model.pkl')

print("Models saved successfully!")

Models saved successfully!


## Predictions to check

In [83]:
from datetime import datetime

# Step 1: Use the current date (today's date)
current_date = datetime.today().strftime('%Y-%m-%d')

# Step 1: Store the predictions in a DataFrame
predictions_df = pd.DataFrame({
    'Date': [current_date],
    'Linear_Regression_Prediction': lr_predict,
    'Random_Forest_Prediction': rf_predict,
    'Stacked_Model_Prediction': stack_predict,
    'XGBoost_Prediction': xgb_predict,
    'Ridge_Regression_Prediction': ridge_predict,
    'Lasso_Regression_Prediction': lasso_predict,
})

predictions_df.set_index("Date", inplace=True)
predictions_df.to_csv("predictions_to_check.csv")

## Update Dependencies

In [85]:
# Get the installed packages
installed_packages = !pip freeze

# Append to the existing requirements.txt
with open('requirements.txt', 'a') as f:
    for package in installed_packages:
        f.write(package + '\n')

print("requirements.txt updated!")

requirements.txt updated!
