In [13]:
import pandas as pd
import numpy as np
import math
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pickle
from scipy.special import expit
from linearmodels import PanelOLS
from dm_test import dm_test


In [14]:
# Dataset identifiers: each one is passed to fetch(), which reads
# ../data/<name>/<name>_INPUT.csv, and is used as the panel entity label.
files = [
    'ICLN',
    'PBD',
    'QCLN',
]

# Label written to the 'Model' column of every results row.
modelName = 'MR'

# Specification names; not referenced by the code visible in this notebook.
mtypes = [
    'SENT TWO',
    'GSENT',
    'ISENT',
    'NO SENT',
]

# Number of rows the target is shifted ahead; one results row is produced per value.
lags = [1, 3, 5, 7]
# predType = 'Garchvol'


In [15]:
# ['Date', 'log_ovx', 'log_return', 'log_volume', 'ovx_vol', 'log_navR',
#        'Garchvol', 'MAvol', 'GT Sent', 'INV Sent', 'GT_VAL_SENT',
#        'INV_VAL_SENT', 'GT_VAL_SENT_GVOL', 'INV_VAL_SENT_GVOL',
#        'GT_VAL_SENT_MVOL', 'INV_VAL_SENT_MVOL', 'ahead_Return']

In [16]:
# Regressor columns per specification when the target is GARCH volatility.
# 'd1' / 'd2' / 'diff' are the helper columns added by fetch().
# 'NO SENT' is the sentiment-free baseline the other specifications are compared with.
cols_gvol = {
    'SENT TWO': ['Garchvol', 'GT_VAL_SENT_GVOL', 'INV_VAL_SENT_GVOL'],
    'ISENTVAL': ['Garchvol', 'INV_VAL_SENT_GVOL', 'd1'],
    'GSENTVAL': ['Garchvol', 'GT_VAL_SENT_GVOL', 'd2'],
    'GSENT': ['Garchvol', 'GT Sent', 'd2'],
    'ISENT': ['Garchvol', 'INV Sent', 'd1'],
    'DIFF': ['Garchvol', 'diff'],
    'NO SENT': ['Garchvol'],
}


# Regressor columns per specification when the target is the log return.
# 'd1' / 'd2' / 'diff' are the helper columns added by fetch().
# 'NO SENT' is the sentiment-free baseline the other specifications are compared with.
cols_ret = {
    'SENT TWO': ['log_return', 'log_navR', 'GT_VAL_SENT', 'INV_VAL_SENT'],
    'ISENTVAL': ['log_return', 'log_navR', 'INV_VAL_SENT', 'd1'],
    'GSENTVAL': ['log_return', 'log_navR', 'GT_VAL_SENT', 'd2'],
    'GSENT': ['log_return', 'log_navR', 'GT Sent', 'd2'],
    'ISENT': ['log_return', 'log_navR', 'INV Sent', 'd1'],
    'DIFF': ['log_return', 'log_navR', 'diff'],
    'NO SENT': ['log_return', 'log_navR'],
}

In [17]:
def fetch(file, lag, predType):
    """Load one dataset's input CSV and add the derived modelling columns.

    Reads ``../data/{file}/{file}_INPUT.csv`` and adds:

    * ``diff``  - first difference of ``predType``
    * ``d1``    - 1 where ``'INV Sent'`` is positive, else 0
    * ``d2``    - 1 where ``'GT Sent'`` is positive, else 0
    * ``ahead`` - ``predType`` taken ``lag`` rows further down (the target)

    The first row (whose ``diff`` is NaN) and the last ``lag`` rows (whose
    ``ahead`` is NaN) are dropped, and the index is renumbered from 0.

    Parameters
    ----------
    file : str
        Dataset identifier used for both the folder and the file name.
    lag : int
        Non-negative number of rows to look ahead.
    predType : str
        Name of the column that is differenced and shifted.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``lag`` is negative.
    """
    if lag < 0:
        raise ValueError(f"lag must be non-negative, got {lag}")

    data = pd.read_csv(f"../data/{file}/{file}_INPUT.csv")
    data['diff'] = data[predType] - data[predType].shift(1)
    data['d1'] = np.where(data['INV Sent'] > 0, 1, 0)
    data['d2'] = np.where(data['GT Sent'] > 0, 1, 0)
    data['ahead'] = data[predType].shift(-lag)

    # Use an explicit stop position: the previous `data[1:-lag]` evaluated to
    # `data[1:0]` (an empty frame) when lag == 0.
    stop = max(len(data) - lag, 0)
    data = data[1:stop]

    return data.reset_index(drop=True)

In [18]:
def decide_run_model(y_train, x_train):
    """Fit an OLS regression of ``y_train`` on ``x_train``, print its summary, and return the fit."""
    ols = sm.OLS(y_train, x_train)
    fitted = ols.fit()
    print(fitted.summary())
    return fitted

In [19]:
def calc_rmse(actual, pred):
    """Root-mean-squared error between ``actual`` and ``pred``.

    Both inputs are converted to arrays and flattened to one dimension first,
    so column vectors and flat sequences can be mixed.
    """
    observed = np.asarray(actual).reshape(-1)
    forecast = np.asarray(pred).reshape(-1)
    mse = mean_squared_error(observed, forecast)
    return np.sqrt(mse)

def percentage_change(sent_rmse, rmse):
    """Change of ``sent_rmse`` relative to ``rmse``.

    Despite the name, the result is a fraction (0.1 means +10%), not a value
    multiplied by 100. Negative values mean ``sent_rmse`` is the smaller one.
    """
    delta = sent_rmse - rmse
    return delta / rmse

In [53]:


def create_lagged_features(df, cols, window):
    """Return a copy of ``df`` extended with lagged versions of ``cols``.

    For every column in ``cols`` and every lag from 1 to ``window`` a column
    named ``{col}_lag{lag}`` is appended, holding that column shifted down by
    ``lag`` rows. Every row that contains a NaN in any column is then dropped.
    The input frame is left untouched.
    """
    # Build all shifted series first, keeping the column-major / lag-minor order.
    shifted = {
        f'{name}_lag{step}': df[name].shift(step)
        for name in cols
        for step in range(1, window + 1)
    }

    # assign() works on a copy of df; dropna() then removes the incomplete rows.
    return df.assign(**shifted).dropna()


def _upsert_result_row(outputfile, newrow):
    """Store ``newrow`` in the results CSV, replacing the row with the same Model/shift if any."""
    try:
        output = pd.read_csv(outputfile)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable results file yet: start one whose columns are this row's keys.
        # Any other failure (permissions, malformed CSV) is raised instead of
        # silently overwriting the existing file.
        output = pd.DataFrame(columns=list(newrow))
        output.loc[len(output)] = newrow
    else:
        exist = output.index[(output['Model'] == newrow['Model']) & (output['shift'] == newrow['shift'])]
        if len(exist):
            output.loc[exist[0]] = newrow
        else:
            output.loc[len(output)] = newrow

    output.to_csv(outputfile, index=False)


def do_panel_ols(cols, outputfile, pred, predType, verbose=True):
    """Fit entity-effects panel regressions for every lag and record their scores.

    For each value in the module-level ``lags``, the frames returned by
    ``fetch`` for every entry of the module-level ``files`` are stacked into
    one panel indexed by (entity, Date). One ``PanelOLS`` model with
    ``entity_effects=True`` is fitted per entry of ``cols``. Each model is
    scored against the 'NO SENT' specification and the scores are written to
    ``outputfile`` as one row per (``modelName``, lag), replacing an existing
    row for the same pair.

    Row layout: ``Model``, ``shift``, then for every key of ``cols`` in order
    ``rmse <key>``, ``pc <key>`` (relative RMSE change vs. 'NO SENT'),
    ``DM <key>`` and ``pval <key>`` (``dm_test`` of 'NO SENT' vs. ``<key>``).

    Parameters
    ----------
    cols : dict[str, list[str]]
        Specification name -> regressor column names. Must contain 'NO SENT'.
    outputfile : str
        Path of the CSV that accumulates the result rows.
    pred : str
        Name of the target column in the panel.
    predType : str
        Column name forwarded to ``fetch``.
    verbose : bool, default True
        Print every fitted model and every result row.

    Returns
    -------
    pandas.DataFrame
        The contents of ``outputfile`` after all lags have been processed.

    Raises
    ------
    KeyError
        If ``cols`` has no 'NO SENT' entry.
    """
    baseline = 'NO SENT'
    if baseline not in cols:
        raise KeyError(f"cols must contain a '{baseline}' specification to compare against")

    for lag in lags:
        frames = []
        for file in files:
            frame = fetch(file, lag, predType)
            frame["entity"] = file
            frame["Date"] = pd.to_datetime(frame.Date)
            # Kept from the original; none of the regressor lists visible in
            # this file select the added constant column.
            frames.append(sm.add_constant(frame))

        panel_data = pd.concat(frames).set_index(['entity', 'Date'])
        panel_Y = panel_data[[pred]]
        actual = panel_Y[pred].values.reshape(-1,)

        # Fit each specification once and keep its flattened predictions, so the
        # scoring below never has to call predict() again.
        forecasts = {}
        for key, regressors in cols.items():
            panelX = panel_data[regressors]
            panel_results = PanelOLS(panel_Y, panelX, entity_effects=True).fit()
            if verbose:
                print(panel_results)
            # NOTE(review): confirm against the linearmodels documentation whether
            # predict(exog) includes the estimated entity effects in these values.
            forecasts[key] = panel_results.predict(panelX).values.reshape(-1,)

        # The baseline is looked up by name; it used to be taken as the last
        # fitted model, which silently depended on the key order of `cols`.
        baseline_forecast = forecasts[baseline]

        # One Diebold-Mariano test per specification (previously run twice each).
        dm_results = {
            key: dm_test(actual, baseline_forecast, forecasts[key], h=lag, crit="MSE")
            for key in cols
        }

        # Insertion order fixes the CSV column order when the file is created.
        newrow = {'Model': modelName, 'shift': lag}
        for key in cols:
            newrow[f"rmse {key}"] = calc_rmse(actual=actual, pred=forecasts[key])
        for key in cols:
            newrow[f'pc {key}'] = percentage_change(newrow[f"rmse {key}"], newrow[f'rmse {baseline}'])
        for key in cols:
            newrow[f"DM {key}"] = dm_results[key]['DM']
        for key in cols:
            newrow[f"pval {key}"] = dm_results[key]['p_value']

        if verbose:
            print(newrow)

        _upsert_result_row(outputfile, newrow)

    return pd.read_csv(outputfile)



In [58]:
# Score every log-return specification for each lag, then show the
# comparison columns (relative RMSE change, DM statistic, DM p-value).
output = do_panel_ols(
    cols=cols_ret,
    outputfile='outputRet.csv',
    pred='ahead',
    predType='log_return',
    verbose=False,
)
output.loc[:, [
    'Model', 'shift',
    'pc SENT TWO', 'pc ISENTVAL', 'pc GSENTVAL', 'pc GSENT', 'pc ISENT', 'pc DIFF',
    'DM SENT TWO', 'DM ISENTVAL', 'DM GSENTVAL', 'DM GSENT', 'DM ISENT', 'DM DIFF',
    'pval SENT TWO', 'pval ISENTVAL', 'pval GSENTVAL', 'pval GSENT', 'pval ISENT', 'pval DIFF',
]]

Unnamed: 0,Model,shift,pc SENT TWO,pc ISENTVAL,pc GSENTVAL,pc GSENT,pc ISENT,pc DIFF,DM SENT TWO,DM ISENTVAL,DM GSENTVAL,DM GSENT,DM ISENT,DM DIFF,pval SENT TWO,pval ISENTVAL,pval GSENTVAL,pval GSENT,pval ISENT,pval DIFF
0,MR,7,-4.3e-05,0.000106,-0.000176,-0.000931,-0.000241,-1.9e-05,0.00013,-0.000324,0.000536,0.002839,0.000734,5.8e-05,0.999896,0.999742,0.999572,0.997735,0.999415,0.999954
1,MR,1,-0.000431,-0.000429,2.6e-05,9.7e-05,-8.7e-05,-0.001305,0.00128,0.001274,-7.8e-05,-0.000289,0.000259,0.003875,0.998979,0.998984,0.999938,0.99977,0.999794,0.996908
2,MR,3,-0.000502,-0.000201,-9e-05,-0.000125,-0.000291,-0.000877,0.001534,0.000616,0.000277,0.000382,0.00089,0.002683,0.998776,0.999508,0.999779,0.999695,0.99929,0.997859
3,MR,5,-0.000245,0.000166,-0.000313,-0.000134,-0.000387,-2.9e-05,0.000748,-0.000506,0.000955,0.00041,0.001183,8.9e-05,0.999404,0.999596,0.999238,0.999673,0.999057,0.999929


In [59]:
# Score every GARCH-volatility specification for each lag, then show the
# comparison columns (relative RMSE change, DM statistic, DM p-value).
output = do_panel_ols(
    cols=cols_gvol,
    outputfile='outputGvol.csv',
    pred='ahead',
    predType='Garchvol',
    verbose=False,
)
output.loc[:, [
    'Model', 'shift',
    'pc SENT TWO', 'pc ISENTVAL', 'pc GSENTVAL', 'pc GSENT', 'pc ISENT', 'pc DIFF',
    'DM SENT TWO', 'DM ISENTVAL', 'DM GSENTVAL', 'DM GSENT', 'DM ISENT', 'DM DIFF',
    'pval SENT TWO', 'pval ISENTVAL', 'pval GSENTVAL', 'pval GSENT', 'pval ISENT', 'pval DIFF',
]]

Unnamed: 0,Model,shift,pc SENT TWO,pc ISENTVAL,pc GSENTVAL,pc GSENT,pc ISENT,pc DIFF,DM SENT TWO,DM ISENTVAL,DM GSENTVAL,DM GSENT,DM ISENT,DM DIFF,pval SENT TWO,pval ISENTVAL,pval GSENTVAL,pval GSENT,pval ISENT,pval DIFF
0,MR,1,0.008881,0.001514,-0.000111,0.003236,0.00598,0.010831,-0.000123,-2.1e-05,2e-06,-4.5e-05,-8.3e-05,-0.00015,0.999902,0.999983,0.999999,0.999964,0.999934,0.99988
1,MR,3,0.000298,-0.019783,-0.013069,-0.002361,-0.000686,-0.001592,-1.6e-05,0.001032,0.000684,0.000124,3.6e-05,8.4e-05,0.999987,0.999177,0.999454,0.999901,0.999971,0.999933
2,MR,5,-0.003696,-0.023355,-0.004983,-0.000942,-0.001533,-0.006919,0.000308,0.001927,0.000415,7.9e-05,0.000128,0.000576,0.999754,0.998463,0.999669,0.999937,0.999898,0.999541
3,MR,7,-0.007638,-0.022325,-0.008943,-0.00767,-0.007556,-0.009687,0.00083,0.002409,0.000972,0.000834,0.000821,0.001052,0.999338,0.998078,0.999225,0.999335,0.999345,0.999161
