In [None]:
# --- Imports (stdlib, then third-party) -------------------------------------
import datetime as dt
import importlib
import os
import pickle
import random
import subprocess
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Every relative path used below (indicator_files/...) is resolved against
# this project directory, so the working directory is switched first.
# NOTE(review): absolute, user-specific path -- consider making it configurable.
os.chdir("/home/mohit/strategy_generation/rl_strategy")

# pyrcca provides the `rcca` module. Install it only when it is missing, and
# install it with the interpreter that runs this kernel: a bare `pip` on PATH
# may belong to another environment, and os.system() ignored a failed install.
try:
    import rcca
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pyrcca"])
    importlib.invalidate_caches()
    import rcca

# Seed both RNGs for reproducibility.
# NOTE(review): pyrcca's cross-validation presumably draws its folds from
# NumPy's global RNG rather than `random` -- confirm against the installed version.
random.seed(2)
np.random.seed(2)

# Directory (with trailing slash) that the result CSVs are written into.
to_csv_path = "/home/mohit/cca_output_pharma/"


## LOADING THE INDICATOR DATA (STRATEGY GENERATION)

In [None]:
# Column dtypes were pickled alongside the CSV so the big file parses with
# known types.
# NOTE(review): pickle.load executes arbitrary code -- only load trusted files.
with open('indicator_files/pharma_data/tdata_dtypes.pkl', 'rb') as f:
    dts = pickle.load(f)

# The gzip-compressed CSV is read in fixed-size chunks and stitched back into
# a single frame with a fresh integer index.
chunksize = 30000
tfr = pd.read_csv(
    'indicator_files/pharma_data/total_data.csv',
    chunksize=chunksize,
    iterator=True,
    dtype=dts,
    compression="gzip",
)
total_data = pd.concat(tfr, ignore_index=True)

# Keep only the columns whose label ends in a digit, plus the timestamp
# column, which then becomes the index.
keep_cols = [label for label in total_data.columns if label[-1].isdigit()]
keep_cols = keep_cols + ["DateTime"]

total_data = total_data[keep_cols].set_index("DateTime")


In [None]:
# Column labels are "_"-separated: show the distinct first tokens and the
# distinct second tokens. Two separate statements are used on purpose: the
# former `print(...), print(...)` built a tuple, so the cell also echoed a
# meaningless `(None, None)` as its output.
print({x.split("_")[0] for x in total_data.columns if x != "date"})
print({x.split("_")[1] for x in total_data.columns if x != "date"})

In [None]:
# Bounds of the analysis window; both comparisons below are strict, so the
# two boundary days themselves are excluded.
non_train_start = dt.date(2016, 1, 1)
non_train_end = dt.date(2021, 1, 1)

# The index holds "<date> <time>" strings: take the part before the first
# space and parse it into a proper datetime column.
day_strings = total_data.index.map(lambda stamp: stamp.split(" ")[0])
total_data["date"] = pd.to_datetime(day_strings, format="%Y-%m-%d")

# Keep only the rows whose calendar day lies strictly inside the window.
row_days = total_data["date"].dt.date
inside_window = (row_days > non_train_start) & (row_days < non_train_end)
total_data = total_data[inside_window]


In [None]:
# Distinct calendar days present in the data, in ascending order.
date_ls = sorted(set(total_data["date"].dt.date))

# Chronological split (no shuffling): the first `split_` share of the days is
# used for training, the remaining days for validation.
split_ = 0.8
n_train_days = int(len(date_ls) * split_)
train_dates = date_ls[:n_train_days]
valid_dates = date_ls[n_train_days:]


# BEGINNING CCA

## INDIVIDUAL ASSET AND INDICATOR


In [None]:
# Parameters of this single-pair walkthrough. They used to be picked up
# implicitly from the all-assets loop at the end of the notebook, so this cell
# raised NameError on a fresh kernel (Restart & Run All). The values below are
# the ones that loop leaves behind when it finishes.
# NOTE(review): edit them to analyse another asset / indicator.
traded_asset = "SBI"
indicator = "STOCH"
remove_cols = ["PHARMAFF33INDEX"]

# X: the same indicator on every *other* instrument, minus excluded prefixes.
# NOTE(review): matching is by substring (`in`), so an asset name contained in
# another instrument's name would also match -- confirm against the columns.
X_cols = [x for x in total_data.columns if (traded_asset not in x) and (indicator in x) and (x.split("_")[0] not in remove_cols)] + ["date"]

# Y: the indicator on the traded asset, restricted to labels whose third
# "_"-token is an integer below 3 (the all-assets loop keeps every label).
Y_cols = [x for x in total_data.columns if (traded_asset in x) and (indicator in x) and (int(x.split("_")[2]) < 3)] + ["date"]

# "date" is carried along so the frames can be split by day afterwards.
Y_data = total_data[Y_cols]
X_data = total_data[X_cols]

print(len(Y_data))


In [None]:
# Row masks: does the row's day belong to the training / validation days?
y_is_train = Y_data["date"].isin(train_dates)
y_is_valid = Y_data["date"].isin(valid_dates)
x_is_train = X_data["date"].isin(train_dates)
x_is_valid = X_data["date"].isin(valid_dates)

# Apply the masks and drop the helper "date" column so that only indicator
# columns remain in the matrices handed to CCA.
Y_data_train = Y_data.loc[y_is_train].drop(columns="date")
Y_data_test = Y_data.loc[y_is_valid].drop(columns="date")

X_data_train = X_data.loc[x_is_train].drop(columns="date")
X_data_test = X_data.loc[x_is_valid].drop(columns="date")

print(Y_data_train.shape, X_data_test.shape)


In [None]:
## CROSS-VALIDATE

# Non-kernel CCA; the number of components (1..3) and the regularization
# coefficient are selected from the grids below with numCV=10.
# (Smaller grids such as [0., 1e2, 1e4, 1e6] were tried previously.)
ccaCV = rcca.CCACrossValidate(
    kernelcca=False,
    numCV=10,
    numCCs=list(range(1, 4)),
    regs=[1e6, 1e8, 1e10, 1e14, 1e18],
)

# Fit on the training split; the datasets are passed as the pair [X, Y].
ccaCV.train([X_data_train.values, Y_data_train.values])


In [None]:
# Score the fitted model on the held-out split, passed as [X, Y] like in train().
# Entry 0 of the result is later labelled with the X column names, i.e. it is
# treated as one value per X column (presumably a prediction correlation --
# confirm against the pyrcca docs).
testcorrsCV = ccaCV.validate([X_data_test.values, Y_data_test.values])


In [None]:
# The regularization value is formatted with %g instead of %d: %d silently
# truncates floats, so a fractional coefficient such as 0.1 was reported as 0.
print('Optimal number of components: %d\nOptimal regularization coefficient: %g' % (ccaCV.best_numCC, ccaCV.best_reg))


In [None]:
## CORRELATION BETWEEN COMPONENTS
# Display the `cancorrs` attribute of the cross-validated model
# (presumably the canonical correlations found during training -- TODO confirm).
ccaCV.cancorrs


In [None]:
# EXPLAINED VARIANCE
# Computed on the held-out split, passed as [X, Y]. Downstream, `ev[0][i]` is
# read as one value per X column for component i.
ev = ccaCV.compute_ev([X_data_test.values, Y_data_test.values])


### TESTING CORRELATION ON PREDICTED VARIABLES

In [None]:
# One test value per X column -> tag each column with its instrument (the
# label's first "_"-token) -> average per instrument -> strongest first.
df_corr_test = (
    pd.DataFrame(testcorrsCV[0], index=X_data_train.columns, columns=["corr"])
    .assign(stock=lambda frame: frame.index.map(lambda label: label.split("_")[0]))
    .groupby("stock")
    .mean()
    .sort_values(by="corr", ascending=False)
)

print(df_corr_test)


In [None]:
TOTAL_COMPONENTS = ccaCV.best_numCC

# Per-component results are collected in lists and concatenated once after the
# loop, instead of growing a DataFrame with pd.concat on every iteration.
ev_frames = []
wts_frames = []
for i in range(TOTAL_COMPONENTS):

    col_ev = "ev_"+str(i)
    col_wts = "wts_"+str(i)

    # Explained variance of component i: one value per X column, summed per
    # instrument (first "_"-token of the column label).
    df_ev = pd.DataFrame(ev[0][i], index=X_data_test.columns, columns=[col_ev]).sort_values(by=col_ev)
    df_ev["stock"] = df_ev.index.map(lambda x: x.split("_")[0])
    st_ev_df = df_ev.groupby("stock").sum().sort_index()
    ev_frames.append(st_ev_df)

    # Absolute X-side weights of component i, summed per instrument and
    # normalised so that the instruments' shares add up to 1.
    df_loading = pd.DataFrame(ccaCV.ws[0].T[i], index=X_data_test.columns, columns=[col_wts])
    df_loading = df_loading.abs()
    df_loading["stock"] = df_loading.index.map(lambda x: x.split("_")[0])

    stock_abs_wts = df_loading.groupby("stock").sum()  # computed once, reused below
    st_wts_df = stock_abs_wts/stock_abs_wts.sum()
    st_wts_df = st_wts_df.sort_index()
    wts_frames.append(st_wts_df)

# One column per component; an empty frame is kept for the zero-component case.
all_ev = pd.concat(ev_frames, axis=1) if ev_frames else pd.DataFrame()
all_wts = pd.concat(wts_frames, axis=1) if wts_frames else pd.DataFrame()
    


In [None]:
# Share of absolute X-side weight per *third* "_"-token of the column label,
# for the component the preceding loop ended on (`i` and `col_wts` are the
# values left over from that loop).
# NOTE(review): the grouping column is still called "stock" although it holds
# the third token here.
df_loading = pd.DataFrame(ccaCV.ws[0].T[i], index=X_data_test.columns, columns=[col_wts]).abs()
df_loading["stock"] = df_loading.index.map(lambda label: label.split("_")[2])

abs_wts_by_token = df_loading.groupby("stock").sum()
abs_wts_by_token / abs_wts_by_token.sum()


## CCA ON ALL THE ASSETS AND INDICATORS

In [None]:
# Every traded asset is paired with every indicator; for each pair a
# cross-validated CCA is fitted and two CSVs are written to `to_csv_path`:
#   <asset>_<indicator>_test_corr_<suffix>.csv  mean test value per instrument
#   <asset>_<indicator>_wts_<suffix>.csv        mean weight share per instrument
traded_asset_ls =  ["AXISBANK", "BANKBARODA", "CANARABANK", "FEDERALBANK", "HDFCBANK", "ICICIBANK", 
                 "INDUSINDBANK", "KOTAKBANK",  "PNB", "SBI"]

indicator_ls = ['RSI', 'MACD.MACD', 'BBANDSSTDDIST', 'PEAKMINUSTROUGH', 'STOCH']
remove_cols = ["PHARMAFF33INDEX"]  # instrument prefixes excluded from X
suffix = ""  # appended to the output file names, e.g. "wo_pharmaff"

# Create the output directory up front so that a missing folder cannot make
# the first to_csv() fail after an expensive fit.
os.makedirs(to_csv_path, exist_ok=True)

###
for traded_asset in traded_asset_ls:
    print("\n", traded_asset)
    for indicator in indicator_ls:
        print(indicator)

        # Y: the indicator on the traded asset (all labels);
        # X: the same indicator on the other, non-excluded instruments.
        Y_cols = [x for x in total_data.columns if (traded_asset in x) and (indicator in x)] + ["date"]
        X_cols = [x for x in total_data.columns if (traded_asset not in x) and (indicator in x) and (x.split("_")[0] not in remove_cols)] + ["date"]

        Y_data = total_data[Y_cols]
        X_data = total_data[X_cols]

        # Split by calendar day, then drop the helper "date" column.
        Y_data_train = Y_data[Y_data["date"].isin(train_dates)].drop(["date"], axis=1)
        Y_data_test = Y_data[Y_data["date"].isin(valid_dates)].drop(["date"], axis=1)

        X_data_train = X_data[X_data["date"].isin(train_dates)].drop(["date"], axis=1)
        X_data_test = X_data[X_data["date"].isin(valid_dates)].drop(["date"], axis=1)

        ### CV
        ccaCV = rcca.CCACrossValidate(kernelcca=False,
                                      numCV=10,
                                      numCCs=list(range(1, 4)),
                                      regs=[1e6, 1e8, 1e10, 1e14, 1e18])

        ccaCV.train([X_data_train.values, Y_data_train.values])

        testcorrsCV = ccaCV.validate([X_data_test.values, Y_data_test.values])
        ev = ccaCV.compute_ev([X_data_test.values, Y_data_test.values])

        ## Test value per X column, averaged per instrument, strongest first.
        df_corr_test = pd.DataFrame(testcorrsCV[0], index=X_data_train.columns, columns=["corr"])
        df_corr_test["stock"] = df_corr_test.index.map(lambda x: x.split("_")[0])
        df_corr_test = df_corr_test.groupby("stock").mean().sort_values(by="corr", ascending=False)
        df_corr_test.to_csv(to_csv_path+"{}_{}_test_corr_{}.csv".format(traded_asset, indicator, suffix))

        TOTAL_COMPONENTS = ccaCV.best_numCC

        # Collect the per-component frames and concatenate once after the loop
        # instead of growing a DataFrame inside it.
        ev_frames = []
        wts_frames = []
        for i in range(TOTAL_COMPONENTS):

            col_ev = "ev_"+str(i)
            col_wts = "wts_"+str(i)

            # Explained variance of component i, summed per instrument.
            df_ev = pd.DataFrame(ev[0][i], index=X_data_test.columns, columns=[col_ev]).sort_values(by=col_ev)
            df_ev["stock"] = df_ev.index.map(lambda x: x.split("_")[0])
            st_ev_df = df_ev.groupby("stock").sum().sort_index()
            ev_frames.append(st_ev_df)

            # Absolute X-side weights of component i as a share per instrument.
            df_loading = pd.DataFrame(ccaCV.ws[0].T[i], index=X_data_test.columns, columns=[col_wts])
            df_loading = df_loading.abs()
            df_loading["stock"] = df_loading.index.map(lambda x: x.split("_")[0])

            stock_abs_wts = df_loading.groupby("stock").sum()  # computed once, reused below
            st_wts_df = stock_abs_wts/stock_abs_wts.sum()
            st_wts_df = st_wts_df.sort_index()
            wts_frames.append(st_wts_df)

        all_ev = pd.concat(ev_frames, axis=1) if ev_frames else pd.DataFrame()
        all_wts = pd.concat(wts_frames, axis=1) if wts_frames else pd.DataFrame()

        # Average the weight shares over the selected components and store
        # them in a single column named after the indicator.
        all_wts = all_wts.mean(axis=1).to_frame().rename(columns={0:indicator})
        all_wts.to_csv(to_csv_path+"{}_{}_wts_{}.csv".format(traded_asset, indicator, suffix))
    
    
