In [35]:
import pandas as pd
import numpy as np
import math
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from scipy.special import expit

In [36]:
files = ['ICLN','PBD','QCLN']


In [37]:
T = 12

In [38]:

def calculate_rsi(data, period=14):
    rsi_values = []
    for i in range(len(data)):
        start = i - period
        if start < 0 : 
            rsi_values.append(-1)
            continue
        window = data[start:i]
        diffs = np.diff(window)
        gain = np.where(diffs > 0, diffs, 0)
        loss = -np.where(diffs < 0, diffs, 0)

        avg_gain = np.mean(gain)
        avg_loss = np.mean(loss)
        
        if avg_loss == 0:
            rs = np.inf  
        else:
            rs = avg_gain / avg_loss

        if np.isinf(rs):
            rsi = 100
        else:
            rsi = 100 - (100 / (1 + rs))

        rsi_values.append(rsi)

    return np.asarray(rsi_values)

def calculate_psy(data, period = 14):
    psy_values = []
    for i in range(len(data)):
        start = i - period
        if start < 0 : 
            psy_values.append(-1)
            continue
        window = np.array(data[start:i])
#         print(window[0])
        up_periods = sum(1 for j in range((len(window)-1)) if (window[j+1] > window[j]))
        psy = (up_periods / period) * 100
        psy_values.append(psy)

    return np.asarray(psy_values)




In [41]:


for file in files :
    stock = pd.read_csv(f'{file}/{file}.csv')
    nav = pd.read_csv(f'{file}/nav{file}.csv')
    Garchvol = pd.read_csv(f'{file}/Garch{file}.csv')
    MAvol = pd.read_csv(f'{file}/Ma{file}.csv')
    GTpca = pd.read_csv(f'{file}/{file}_GTI.csv')[['Date', 'first_component']]
    ovx = pd.read_csv(f'OVX.csv')
    ovxvol = pd.read_csv(f'GarchOVX.csv')


    data = pd.DataFrame()
    data['Date'] = pd.to_datetime(ovx['Date'])
    data['log_ovx'] = (ovx['Close']/ovx['Close'].shift(1)).apply(lambda x: math.log(x))
    data['log_return'] = (stock['Close']/stock['Close'].shift(1)).apply(lambda x: math.log(x))
    data['log_navR'] = (nav['NAV']/nav['NAV'].shift(1)).apply(lambda x: math.log(x))
    data['rsi'] = (calculate_rsi(stock.Close,T))
    data['psy'] = (calculate_psy(stock.Close,T))
    data['volume'] = (stock['Volume'].replace(0, 1)).apply(lambda x: math.log(x))

    Garchvol['Date'] = pd.to_datetime(Garchvol.Date)
    MAvol['Date'] = pd.to_datetime(MAvol.Date)
    GTpca['Date'] = pd.to_datetime(GTpca.Date)
    ovxvol['Date'] = pd.to_datetime(ovxvol.Date)

    result = pd.merge(data, ovxvol[['Date','vol']], on='Date', how='inner')
    result = pd.merge(result, Garchvol, on='Date', how='inner')
    result = pd.merge(result, MAvol, on='Date', how='inner')
    result = pd.merge(result, GTpca, on='Date', how='inner')

    sent1 = result[['rsi','psy','volume']]
    pcas = np.asarray(sent1)

    pca = PCA(n_components=1)
    invsent = pca.fit_transform(pcas)

    result['investor_sent'] = invsent

    print(result.columns) 
    # ['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy', 'volume',
       # 'vol_x', 'vol_y', 'Returns', 'vol', 'first_component', 'investor_sent']
    # break

    result.columns = ['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy','log_volume', 'ovx_vol',
           'Garchvol', 'Returns', 'MAvol','GT Sent','INV Sent']

    final = pd.DataFrame(result[['Date', 'log_ovx', 'log_return','log_volume','ovx_vol' ,'log_navR','Garchvol', 'MAvol','GT Sent','INV Sent']])

    final.loc[:,'d1-inv'] = np.where(result.loc[:,'INV Sent'] > 0, 1, 0)
    final.loc[:,'d2-gt'] = np.where(result.loc[:,'GT Sent'] > 0, 1, 0)

    final = final.assign(GT_VAL_SENT=((final['log_return'] - final['log_return'].shift(1)) *
                                       ((final['GT Sent']  - final['GT Sent'].shift(1))).apply(lambda x: expit(x))))

    final = final.assign(INV_VAL_SENT=((final['log_return'] - final['log_return'].shift(1)) *
                                       ((final['INV Sent'] - final['INV Sent'].shift(1))).apply(lambda x: expit(x))))

    final = final.assign(GT_VAL_SENT_GVOL=((final['Garchvol'] - final['Garchvol'].shift(1)) * 
                                       ((final['GT Sent']  - final['GT Sent'].shift(1))).apply(lambda x: expit(x))))

    final = final.assign(INV_VAL_SENT_GVOL=((final['Garchvol'] - final['Garchvol'].shift(1)) * 
                                            (final['INV Sent'] - final['INV Sent'].shift(1)).apply(lambda x: expit(x))))

    final = final.assign(GT_VAL_SENT_MVOL=((final['MAvol'] - final['MAvol'].shift(1)) * 
                                       ((final['GT Sent']  - final['GT Sent'].shift(1))).apply(lambda x: expit(x))))

    final = final.assign(INV_VAL_SENT_MVOL=((final['MAvol'] - final['MAvol'].shift(1)) * 
                                            (final['INV Sent'] - final['INV Sent'].shift(1)).apply(lambda x: expit(x))))


    final = final[1:]

    final.to_csv(f"{file}/{file}_INPUT.csv",index=False)

Index(['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy', 'volume',
       'vol_x', 'vol_y', 'Returns', 'vol', 'first_component', 'investor_sent'],
      dtype='object')
Index(['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy', 'volume',
       'vol_x', 'vol_y', 'Returns', 'vol', 'first_component', 'investor_sent'],
      dtype='object')
Index(['Date', 'log_ovx', 'log_return', 'log_navR', 'rsi', 'psy', 'volume',
       'vol_x', 'vol_y', 'Returns', 'vol', 'first_component', 'investor_sent'],
      dtype='object')


In [40]:
final

Unnamed: 0,Date,log_ovx,log_return,log_volume,ovx_vol,log_navR,Garchvol,MAvol,GT Sent,INV Sent,d1-inv,d2-gt,GT_VAL_SENT,INV_VAL_SENT,GT_VAL_SENT_GVOL,INV_VAL_SENT_GVOL,GT_VAL_SENT_MVOL,INV_VAL_SENT_MVOL
1,2014-02-03,0.061925,-0.042467,12.752748,0.042425,-0.041445,0.002070,0.020894,42.683307,-9.826803,0,1,-3.980527e-02,-5.462728e-06,2.474160e-04,3.395446e-08,2.281010e-03,3.130374e-07
2,2014-02-04,-0.008349,0.008308,10.988677,0.044839,0.009421,0.001943,0.020818,6.624959,-26.153249,0,1,1.110982e-17,4.122526e-09,-2.777595e-20,-1.030683e-11,-1.653943e-20,-6.137293e-12
3,2014-02-05,-0.038941,-0.013885,11.265745,0.043534,-0.011651,0.001798,0.020935,-1.666746,-17.591436,0,0,-5.559929e-06,-2.218895e-02,-3.650263e-08,-1.456773e-04,2.932795e-08,1.170440e-04
4,2014-02-06,-0.049638,0.012229,10.797533,0.043237,0.009995,0.001867,0.021019,9.935652,-25.674090,0,1,2.611424e-02,8.062973e-06,6.881136e-05,2.124604e-08,8.385135e-05,2.588975e-08
5,2014-02-07,-0.032061,0.027787,11.329003,0.043830,0.025635,0.001996,0.021470,-13.526468,-26.938888,0,0,1.005735e-12,3.425070e-03,8.396385e-15,2.859423e-05,2.911950e-14,9.916764e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1484,2019-12-23,-0.016980,0.008120,9.883285,0.046024,0.008127,0.001621,0.008297,42.911986,49.704119,1,1,1.576192e-03,1.576050e-03,-1.886393e-05,-1.886223e-05,-6.554391e-04,-6.553800e-04
1485,2019-12-24,0.013842,0.000808,10.016816,0.044675,0.002021,0.001435,0.008206,-15.384126,49.611615,1,0,-3.518380e-28,-3.486917e-03,-8.955042e-30,-8.874961e-05,-4.413996e-30,-4.374524e-05
1486,2019-12-26,-0.055727,0.008449,9.487972,0.043369,0.009246,0.001427,0.007985,30.822334,58.261265,1,1,7.640607e-03,7.639269e-03,-8.279581e-06,-8.278131e-06,-2.202146e-04,-2.201760e-04
1487,2019-12-27,0.043874,-0.006431,9.786954,0.045291,-0.007229,0.001335,0.007757,11.543422,58.260764,1,1,-6.307756e-11,-7.438089e-03,-3.874882e-13,-4.569251e-05,-9.692018e-13,-1.142880e-04


In [7]:
x = pd.read_csv(f"ICLN/ICLN_INPUT.csv")

In [41]:
x.columns

Index(['Date', 'log_ovx', 'log_return', 'log_volume', 'ovx_vol', 'log_navR',
       'Garchvol', 'MAvol', 'GT Sent', 'INV Sent', 'd1-inv', 'd2-gt',
       'GT_VAL_SENT', 'INV_VAL_SENT', 'GT_VAL_SENT_GVOL', 'INV_VAL_SENT_GVOL',
       'GT_VAL_SENT_MVOL', 'INV_VAL_SENT_MVOL'],
      dtype='object')