In [76]:
import pandas as pd
import numpy as np
import math
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from scipy.special import expit

In [77]:
files = ['ICLN','PBD','QCLN']


In [78]:
T = 12

In [79]:

def calculate_rsi(data, period=14):
    rsi_values = []
    for i in range(len(data)):
        start = i - period
        if start < 0 : 
            rsi_values.append(-1)
            continue
        window = data[start:i]
        diffs = np.diff(window)
        gain = np.where(diffs > 0, diffs, 0)
        loss = -np.where(diffs < 0, diffs, 0)

        avg_gain = np.mean(gain)
        avg_loss = np.mean(loss)
        
        if avg_loss == 0:
            rs = np.inf  
        else:
            rs = avg_gain / avg_loss

        if np.isinf(rs):
            rsi = 100
        else:
            rsi = 100 - (100 / (1 + rs))

        rsi_values.append(rsi)

    return np.asarray(rsi_values)

def calculate_psy(data, period = 14):
    psy_values = []
    for i in range(len(data)):
        start = i - period
        if start < 0 : 
            psy_values.append(-1)
            continue
        window = np.array(data[start:i])
#         print(window[0])
        up_periods = sum(1 for j in range((len(window)-1)) if (window[j+1] > window[j]))
        psy = (up_periods / period) * 100
        psy_values.append(psy)

    return np.asarray(psy_values)




In [82]:


for file in files :
    stock = pd.read_csv(f'{file}/{file}.csv')
    nav = pd.read_csv(f'{file}/nav{file}.csv')
    Garchvol = pd.read_csv(f'{file}/Garch{file}.csv')
    MAvol = pd.read_csv(f'{file}/Ma{file}.csv')
    GTpca = pd.read_csv(f'{file}/{file}_GTI.csv')[['Date', 'first_component']]
    ovx = pd.read_csv(f'OVX.csv')
    ovxvol = pd.read_csv(f'GarchOVX.csv')


    data = pd.DataFrame()
    data['Date'] = pd.to_datetime(ovx['Date'])
    data['log_ovx'] = (ovx['Close']/ovx['Close'].shift(1)).apply(lambda x: math.log(x))
    data['log_return'] = (stock['Close']/stock['Close'].shift(1)).apply(lambda x: math.log(x))
    data['log_navR'] = (nav['NAV']/nav['NAV'].shift(1)).apply(lambda x: math.log(x))
    data['rsi'] = (calculate_rsi(stock.Close,T))
    data['psy'] = (calculate_psy(stock.Close,T))
    data['volume'] = (stock['Volume'].replace(0, 1)).apply(lambda x: math.log(x))

    Garchvol['Date'] = pd.to_datetime(Garchvol.Date)
    MAvol['Date'] = pd.to_datetime(MAvol.Date)
    GTpca['Date'] = pd.to_datetime(GTpca.Date)
    ovxvol['Date'] = pd.to_datetime(ovxvol.Date)

    result = pd.merge(data, ovxvol[['Date','vol']], on='Date', how='inner')
    result = pd.merge(result, Garchvol, on='Date', how='inner')
    result = pd.merge(result, MAvol, on='Date', how='inner')
    result = pd.merge(result, GTpca, on='Date', how='inner')

    sent1 = result[['rsi','psy','volume']]
    pcas = np.asarray(sent1)

    pca = PCA(n_components=1)
    invsent = pca.fit_transform(pcas)

    result['investor_sent'] = invsent

    print(result.columns) 
    # ['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy', 'volume',
       # 'vol_x', 'vol_y', 'Returns', 'vol', 'first_component', 'investor_sent']
    # break

    result.columns = ['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy','log_volume', 'ovx_vol',
           'Garchvol', 'Returns', 'MAvol','GT Sent','INV Sent']

    final = pd.DataFrame(result[['Date', 'log_ovx', 'log_return','log_volume','ovx_vol' ,'log_navR','Garchvol', 'MAvol','GT Sent','INV Sent']])

    final.loc[:,'d1-inv'] = np.where(result.loc[:,'INV Sent'] > 0, 1, 0)
    final.loc[:,'d2-gt'] = np.where(result.loc[:,'GT Sent'] > 0, 1, 0)

    final = final.assign(GT_VAL_SENT=((final['log_return'] - final['log_return'].shift(1)) *
                                       ((final['GT Sent']  - final['GT Sent'].shift(1))).apply(lambda x: expit(x))))

    final = final.assign(INV_VAL_SENT=((final['log_return'] - final['log_return'].shift(1)) *
                                       ((final['INV Sent'] - final['INV Sent'].shift(1))).apply(lambda x: expit(x))))

    final = final.assign(GT_VAL_SENT_GVOL=((final['Garchvol'] - final['Garchvol'].shift(1)) * 
                                       ((final['GT Sent']  - final['GT Sent'].shift(1)))))

    final = final.assign(INV_VAL_SENT_GVOL=((final['Garchvol'] - final['Garchvol'].shift(1)) * 
                                            (final['INV Sent'] - final['INV Sent'].shift(1)).apply(lambda x: expit(x))))

    final = final.assign(GT_VAL_SENT_MVOL=((final['MAvol'] - final['MAvol'].shift(1)) * 
                                       ((final['GT Sent']  - final['GT Sent'].shift(1))).apply(lambda x: expit(x))))

    final = final.assign(INV_VAL_SENT_MVOL=((final['MAvol'] - final['MAvol'].shift(1)) * 
                                            (final['INV Sent'] - final['INV Sent'].shift(1)).apply(lambda x: expit(x))))


    final = final[1:]

    final.to_csv(f"{file}/{file}_INPUT.csv",index=False)

Index(['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy', 'volume',
       'vol_x', 'vol_y', 'Returns', 'vol', 'first_component', 'investor_sent'],
      dtype='object')
Index(['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy', 'volume',
       'vol_x', 'vol_y', 'Returns', 'vol', 'first_component', 'investor_sent'],
      dtype='object')
Index(['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy', 'volume',
       'vol_x', 'vol_y', 'Returns', 'vol', 'first_component', 'investor_sent'],
      dtype='object')


In [81]:
final

Unnamed: 0,Date,log_ovx,log_return,log_volume,ovx_vol,log_navR,Garchvol,MAvol,GT Sent,INV Sent,d1-inv,d2-gt,GT_VAL_SENT,INV_VAL_SENT,GT_VAL_SENT_GVOL,INV_VAL_SENT_GVOL,GT_VAL_SENT_MVOL,INV_VAL_SENT_MVOL
1,2014-07-22,0.007631,0.008532,11.156251,0.044016,0.007532,0.001346,0.012197,0.098332,-13.671786,0,1,0.000495,8.483304e-04,-0.000069,-1.183488e-04,0.000047,8.022346e-05
2,2014-07-23,-0.021875,-0.019173,11.536642,0.042924,-0.018683,0.001427,0.012784,0.475580,1.471055,1,1,-0.016435,-2.770523e-02,0.000048,8.144223e-05,0.000348,5.864657e-04
3,2014-07-24,-0.047745,-0.004084,10.257659,0.042126,-0.004086,0.001402,0.012716,-0.015085,1.655881,1,0,0.005730,8.239988e-03,-0.000010,-1.374220e-05,-0.000026,-3.692361e-05
4,2014-07-25,0.044152,-0.011317,11.905643,0.043819,-0.010288,0.001276,0.012231,-0.011562,-7.774020,0,0,-0.003623,-5.806882e-07,-0.000063,-1.014815e-08,-0.000243,-3.893704e-08
5,2014-07-28,-0.018160,0.008243,10.457373,0.044036,0.008239,0.001359,0.011978,-0.045499,-8.103942,0,0,0.009614,8.181225e-03,0.000041,3.498888e-05,-0.000125,-1.060010e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1367,2019-12-23,-0.016980,0.008120,9.883285,0.046024,0.008127,0.001621,0.008297,-0.252914,49.789289,1,0,0.000756,1.576056e-03,-0.000009,-1.886229e-05,-0.000314,-6.553823e-04
1368,2019-12-24,0.013842,0.000808,10.016816,0.044675,0.002021,0.001435,0.008206,0.035069,49.697339,1,1,-0.004179,-3.487926e-03,-0.000106,-8.877531e-05,-0.000052,-4.375791e-05
1369,2019-12-26,-0.055727,0.008449,9.487972,0.043369,0.009246,0.001427,0.007985,0.010421,58.391240,1,1,0.003773,7.639327e-03,-0.000004,-8.278193e-06,-0.000109,-2.201777e-04
1370,2019-12-27,0.043874,-0.006431,9.786954,0.045291,-0.007229,0.001335,0.007757,-0.165073,58.390674,1,0,-0.006789,-7.437848e-03,-0.000042,-4.569103e-05,-0.000104,-1.142843e-04


In [60]:
x = pd.read_csv(f"ICLN/ICLN_INPUT.csv")

In [61]:
x.columns

Index(['Date', 'log_ovx', 'log_return', 'log_volume', 'ovx_vol', 'log_navR',
       'Garchvol', 'MAvol', 'GT Sent', 'INV Sent', 'd1-inv', 'd2-gt',
       'GT_VAL_SENT', 'INV_VAL_SENT', 'GT_VAL_SENT_GVOL', 'INV_VAL_SENT_GVOL',
       'GT_VAL_SENT_MVOL', 'INV_VAL_SENT_MVOL'],
      dtype='object')