Baseline models - consisting of Linear Regression, RFRegressor, XGBoostRegressor fed into a linear regression ensemble. 

In [308]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import xgboost as xg 


In [309]:
df_news = pd.read_csv('/Users/oryining/Documents/NUS MComp/Semester 2/CS5344 - Big Data Analytics Technology/CS5344_Project/1. Data Retrieval and Processing of News and Stock Prices/final_sample_finbert.csv')
columns_to_select = ['Date_hourly', 'weighted_positive_fb', 'weighted_negative_fb', 'weighted_neutral_fb','weighted_DocTone']
df_news = df_news[columns_to_select]
df_news['Date_hourly'] = df_news['Date_hourly'].astype(str).str.strip()

In [310]:
df_stock_price = pd.read_csv('/Users/oryining/Documents/NUS MComp/Semester 2/CS5344 - Big Data Analytics Technology/CS5344_Project/1. Data Retrieval and Processing of News and Stock Prices/us_2024_ticker combined versions/SPY_Combined_WithPriceChange.csv',index_col=0)

df_stock_price['<DATE>'] = df_stock_price['<DATE>'].astype(str)
df_stock_price['<TIME>'] = df_stock_price['<TIME>'].astype(str).str.zfill(6)

df_stock_price['Date_hourly'] = '20' + df_stock_price['<DATE>'] + df_stock_price['<TIME>'].str[:2]
df_stock_price=df_stock_price.iloc[:, -2:]

In [311]:
# Merge the datasets on the 'Date_hourly' column using an inner join to keep only matching rows
merged_df = pd.merge(df_news, df_stock_price, on='Date_hourly', how='inner')
merged_df.sort_values("weighted_negative_fb",ascending=False)

Unnamed: 0,Date_hourly,weighted_positive_fb,weighted_negative_fb,weighted_neutral_fb,weighted_DocTone,PriceChange
846,2024090311,0.360609,0.975117,0.142975,0.099508,0.048618
285,2024032613,0.346028,0.923799,0.126045,0.201941,-0.044156
999,2024101015,0.358249,0.897877,0.114422,0.048538,0.144273
31,2024011011,0.270661,0.858400,0.171439,0.212053,-0.071543
141,2024021411,0.358656,0.843256,0.181197,0.116001,-0.125013
...,...,...,...,...,...,...
766,2024081212,0.521942,0.093247,0.677781,0.518017,-0.375841
683,2024071714,0.443518,0.084362,0.636333,0.490017,0.030504
257,2024031915,0.762578,0.083681,0.411282,1.000000,0.172859
643,2024070814,0.285955,0.081419,0.618869,0.455164,-0.037826


In [312]:
# Train Test Split and creating of x_train, x_val, x and y_train, y_val, y_test
Y = merged_df[['PriceChange']]  
X = merged_df[['weighted_positive_fb', 'weighted_negative_fb', 'weighted_neutral_fb','weighted_DocTone']]  
x_train_val, x_test, y_train_val, y_test = train_test_split(X, Y, test_size=0.2, random_state=1010)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.25, random_state=1010)  

Linear Regression in baseline form. 

In [313]:
def LinearRegressionBaseLine(x_train,x_test,y_train,y_test):
    
    #Building model with Train data
    reg_model= LinearRegression().fit(x_train, y_train)
    y_pred = reg_model.predict(x_test)
    val_rmse_reg_model = root_mean_squared_error(y_test, y_pred)
    r2_reg_model = r2_score(y_test,y_pred)
    
    print(f'Test RMSE: {val_rmse_reg_model}')
    print("Model details are coefficients of "+str(reg_model.coef_)+" intercept of "+str(reg_model.intercept_))

    return y_pred,reg_model


In [314]:
LinearRegressionBaseLine(x_train,x_test,y_train,y_test)

Test RMSE: 0.23882848229907336
Model details are coefficients of [[0.16457467 0.29276254 0.26588566 0.0468652 ]] intercept of [-0.31996411]


(array([[ 0.03764725],
        [-0.03417316],
        [-0.05020504],
        [ 0.00825133],
        [-0.02400353],
        [-0.00510704],
        [-0.0507307 ],
        [-0.02246767],
        [-0.0126114 ],
        [-0.04023939],
        [ 0.04848146],
        [-0.00148743],
        [-0.00339184],
        [ 0.01151228],
        [-0.00534724],
        [-0.02492132],
        [-0.01872636],
        [ 0.00311616],
        [-0.00153707],
        [-0.07215731],
        [-0.04021136],
        [-0.01614673],
        [-0.02327478],
        [ 0.01660807],
        [ 0.06045055],
        [-0.0327711 ],
        [-0.02378497],
        [-0.0195446 ],
        [ 0.00508203],
        [ 0.0503086 ],
        [ 0.00390766],
        [ 0.04740031],
        [-0.07470229],
        [-0.00850609],
        [ 0.01119912],
        [-0.02189475],
        [-0.03887898],
        [-0.01034819],
        [ 0.01380964],
        [-0.02207267],
        [-0.00858485],
        [ 0.01345361],
        [ 0.00992979],
        [ 0

Random Forest Regressor in baseline form

In [315]:
# TODO : Incoporating K Fold Cross Validation to Further Tune

In [316]:
def RandomForestRegressorBaseline(x_train,x_test,y_train,y_test):
    
    rf_model = RandomForestRegressor(random_state=30)
    rf_model.fit(x_train, y_train.values.ravel())
    y_pred = rf_model.predict(x_test)
    val_rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Test RMSE: {val_rmse}')

    return y_pred,rf_model

In [317]:
RandomForestRegressorBaseline(x_train,x_test,y_train,y_test)

Test RMSE: 0.23880072727449703


(array([-6.44100742e-02,  1.13194206e-02, -1.82287694e-02,  7.87493504e-02,
         1.36563090e-01,  9.08558774e-02, -2.09001306e-01, -1.24367097e-01,
        -5.47231258e-01, -4.34651084e-02,  7.45358927e-02,  1.00868412e-02,
         5.41275817e-02,  4.26267830e-02, -4.27154924e-02, -1.73633979e-01,
        -1.09051462e-01,  5.37475753e-02, -4.93037259e-02, -3.87593399e-02,
        -1.63378428e-01,  1.44148855e-03, -9.81491177e-02, -1.37192095e-01,
         9.31399615e-02,  3.20685548e-02,  4.23365884e-02,  8.05829358e-03,
         1.84681114e-02, -1.60837671e-02, -1.74240874e-01,  7.18618319e-02,
        -4.27099920e-03,  1.06345210e-01, -7.43130403e-02,  4.52580453e-02,
         1.94289393e-02,  1.28560192e-02,  8.91036143e-02, -8.46595933e-02,
        -1.36364738e-02, -2.33305180e-02, -2.65531585e-01,  3.08331681e-02,
         3.72911734e-02,  1.93796910e-02,  1.11587766e-02,  6.53535269e-02,
        -7.78647417e-02, -4.54532183e-02, -7.96928298e-02,  1.04582462e-02,
         6.7

XGBoost Regressor in baseline form 

In [318]:
def XGBoostRegressorBaseline(x_train,x_test,y_train,y_test):
    
    xgb_model = xg.XGBRegressor(random_state=42)
    xgb_model.fit(x_train, y_train.values.ravel())
    y_pred = xgb_model.predict(x_test)
    val_rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Test RMSE: {val_rmse}')

    return y_pred,xgb_model

Ensemble model taking in variables from 3 individual models to combine outputs into a linear regression model

In [319]:
def EnsembleBaseLine():

    #This section calls the initial models to return a series of predictions
    print("\n")
    print("===============Linear Regresion BaseLine Outputs=====================")
    y_pred_linear_regression_input, input_reg_model= LinearRegressionBaseLine(x_train,x_test,y_train,y_test)
    print("===============RandomForest Regressor Baseline Outputs================")
    y_pred_random_forest_regressor,input_rf_model = RandomForestRegressorBaseline(x_train,x_test,y_train,y_test)
    print("===============XGBoost Regressor Baseline Outputs================")
    y_pred_xgb_regressor,input_xgb_model = XGBoostRegressorBaseline(x_train,x_test,y_train,y_test)

    #Model predictions are combined to be passed into the ensemble
    y_pred_stacked_predictions = np.column_stack((y_pred_linear_regression_input, y_pred_random_forest_regressor,y_pred_xgb_regressor))
    y_pred_stacked_predictions_df = pd.DataFrame(y_pred_stacked_predictions, columns=['reg_pred1', 'rf_pred2','xgboost_pred3'])
    # print("\n")
    # print("===============Predicted Y Values as inputs to Ensemble===============")
    # print(y_pred_stacked_predictions_df)
    # print("===============y_test to be fitted against============================")
    # print(y_test)

    #Processing of data to be passed in as ensemble prediction using x_val 
    y_val_input_reg_model = input_reg_model.predict(x_val)
    y_val_input_rf_model = input_rf_model.predict(x_val)
    y_val_input_xgb_model = input_xgb_model.predict(x_val)

    y_val_input_model_predictions = np.column_stack((y_val_input_reg_model, y_val_input_rf_model,y_val_input_xgb_model))
    y_val_input_model_predictions_df = pd.DataFrame(y_val_input_model_predictions, columns=['reg_pred1', 'rf_pred2','xgboost_pred3'])
    
    
    #Ensemble Model 
    print("\n")
    print("===============Ensemble Baseline Model============================")
    ensemble_model= LinearRegression().fit(y_pred_stacked_predictions_df,y_test) # Predicted Values of the Models, and y_test is the Y variable
    y_ensemble = ensemble_model.predict(y_val_input_model_predictions_df) 

    val_rmse = root_mean_squared_error(y_ensemble, y_val)
    r2 = r2_score(y_ensemble, y_val)
    print(f'Validation RMSE: {val_rmse}')
    print("Model details are coefficients of "+str(ensemble_model.coef_)+" intercept of "+str(ensemble_model.intercept_))

    # print(y_ensemble)
    # print(y_val)

In [320]:
EnsembleBaseLine()



Test RMSE: 0.23882848229907336
Model details are coefficients of [[0.16457467 0.29276254 0.26588566 0.0468652 ]] intercept of [-0.31996411]
Test RMSE: 0.23880072727449703
Test RMSE: 0.2605630146533584


Validation RMSE: 0.2330173939102011
Model details are coefficients of [[-0.53368585  0.57707588 -0.06641947]] intercept of [0.02409318]
