In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from Beta_VAE import VAE_Factor_Inference, QVAE_Factor_Inference
from AE import AE_Factor_Inference, QAE_Factor_Inference

In [None]:
def load_mat(factor_name, period, load_path):
    """Read `<load_path>/<factor_name>.csv` and keep only the rows inside `period`.

    Parameters
    ----------
    factor_name : str
        File stem of the CSV to read.
    period : sequence
        `period[0]` and `period[1]` are the first and last `date` labels to keep;
        the selection is a label-based `.loc` slice.
    load_path : str
        Directory that contains the CSV.

    Returns
    -------
    pandas.DataFrame
        The CSV content indexed by its `date` column, restricted to the period.
    """
    start, end = period[0], period[1]
    frame = pd.read_csv(f'{load_path}/{factor_name}.csv', index_col='date')
    return frame.loc[start:end, :]

In [None]:
def obtain_permno_list(factor_name, period, load_path, na_percent = 0.75):
    """Return the column labels whose NaN count over `period` is below a threshold.

    The matrix is loaded with `load_mat`; a column is kept when its number of
    missing rows is strictly less than `na_percent` times the number of rows.

    Parameters
    ----------
    factor_name, period, load_path
        Forwarded unchanged to `load_mat`.
    na_percent : float, default 0.75
        Fraction of rows used to build the NaN-count threshold.

    Returns
    -------
    numpy.ndarray
        Labels of the retained columns, in their original column order.
    """
    panel = load_mat(factor_name, period, load_path)
    max_missing = panel.shape[0] * na_percent
    keep = panel.isna().sum(axis=0) < max_missing
    return panel.columns[keep.to_numpy()].values

In [None]:
def rank_normalize_C(C):
    """Rank-transform a 3-D array in place so values lie strictly inside (-1, 1).

    For every index `j` along axis 1, the 2-D slice `C[:, j, :]` is ranked down
    axis 0 (each column of the slice separately, pandas default ranking) and
    rescaled as `2 * rank / (n_rows + 1) - 1`, where `n_rows = C.shape[0]`.

    The input is overwritten and also returned.
    """
    n_rows = C.shape[0]
    for col in range(C.shape[1]):
        ranks = pd.DataFrame(C[:, col, :]).rank()
        C[:, col, :] = (2 * ranks / (n_rows + 1) - 1).values
    return C

In [None]:
def mean_matrix_dict(dic):
    """Return the arithmetic mean of all values stored in `dic`.

    Values are accumulated with `+=` starting from the integer 0 and the sum is
    divided by the number of entries, so any type supporting addition and
    division by an int works (scalars, arrays, DataFrames).

    An empty dict raises ZeroDivisionError (0 / 0).
    """
    total = 0
    for value in dic.values():
        total += value
    return total / len(dic)

In [None]:
def fill_matrix(matrix, permno_list):
    """Widen `matrix` to the column set `permno_list`, padding missing columns with NaN.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Frame whose index holds the trading dates.
    permno_list : sequence
        Column labels the result must contain, in this order. Any sequence is
        accepted (list, tuple, ndarray, Index).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as `matrix`, index named `date`, and
        columns `permno_list` followed by any columns of `matrix` that are not
        listed there. Padding columns are float NaN.
    """
    # Materialise as a plain list: `['date'] + ndarray` would broadcast and
    # concatenate strings element-wise instead of extending the list.
    wanted = list(permno_list)
    known = set(wanted)
    # Columns present in the data but absent from the requested universe are
    # appended rather than dropped, so no computed values are lost.
    extras = [col for col in matrix.columns if col not in known]
    # One reindex replaces thousands of single-column insertions and keeps
    # the padding numeric (NaN) instead of object dtype.
    filled = matrix.reindex(columns=wanted + extras)
    # rename_axis returns a new frame, so the caller's index name is untouched.
    return filled.rename_axis(index='date')

In [None]:
# Shared configuration handed to every *_Factor_Inference constructor below.
factor_matrix_path = '/data/QAE/xiu_factor_matrix'
f_hidden_dim = 64
bandwidth = 10

# Names of the 94 characteristic matrices (one CSV per name under factor_matrix_path).
factor_list = [
    'absacc', 'acc', 'aeavol', 'age', 'agr', 'baspread', 'beta', 'betasq',
    'bm', 'bm_ia', 'cash', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia', 'chatoia', 'chcsho', 'chempia',
    'chinv', 'chmom', 'chpmia', 'chtx', 'cinvest', 'convind', 'currat', 'depr', 'divi',
    'divo', 'dolvol', 'dy', 'ear', 'egr', 'ep', 'gma', 'grcapx', 'grltnoa', 'herf', 'hire',
    'idiovol', 'ill', 'indmom', 'invest', 'lev', 'lgr', 'maxret', 'mom12m', 'mom1m', 'mom36m',
    'mom6m', 'ms', 'mvel1', 'mve_ia', 'nincr', 'operprof', 'orgcap', 'pchcapx_ia', 'pchcurrat',
    'pchdepr', 'pchgm_pchsale', 'pchquick', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchsale_pchxsga',
    'pchsaleinv', 'pctacc', 'pricedelay', 'ps', 'quick', 'rd', 'rd_mve', 'rd_sale',
    'realestate', 'retvol', 'roaq', 'roavol', 'roeq', 'roic', 'rsup', 'salecash',
    'saleinv', 'salerec', 'secured', 'securedind', 'sgr', 'sin', 'sp', 'std_dolvol',
    'std_turn', 'stdacc', 'stdcf', 'tang', 'tb', 'turn', 'zerotrade',
]

In [None]:
# Only the header is needed to learn the full permno universe, so read zero
# data rows instead of parsing the whole characteristic matrix.
full_permno_list = pd.read_csv(
    f'{factor_matrix_path}/absacc.csv', index_col='date', nrows=0
).columns.to_list()

In [1]:
# Five-year grid of YYYYMMDD boundaries: 19570101, 19620101, ..., 20170101.
time_set = [int(f'{year}0101') for year in range(1957, 2018, 5)]

# Training windows: each spans six consecutive grid intervals; the last
# candidate window is dropped.
period_set = [(time_set[k], time_set[k + 6]) for k in range(len(time_set) - 6)][:-1]

# Evaluation windows: single grid intervals, skipping the first six so that
# log_set[i] starts where period_set[i] ends.
log_set = list(zip(time_set[:-1], time_set[1:]))[6:]

In [2]:
# Training windows; period_set[i] is paired with log_set[i] in the inference loops.
period_set

[(19570101, 19870101),
 (19620101, 19920101),
 (19670101, 19970101),
 (19720101, 20020101),
 (19770101, 20070101),
 (19820101, 20120101)]

In [3]:
# Evaluation windows; these tuples are the keys of y_total_dict / y_pred_dict.
log_set

[(19870101, 19920101),
 (19920101, 19970101),
 (19970101, 20020101),
 (20020101, 20070101),
 (20070101, 20120101),
 (20120101, 20170101)]

## AE

In [None]:
# Select CUDA device index 2 for this section; K is passed to the model constructor.
torch.cuda.set_device(2)
K = 6

# One slot per evaluation window; 0 is a placeholder until the inference loop fills it.
y_total_dict = {window: 0 for window in log_set}
y_pred_dict = {window: 0 for window in log_set}

In [None]:
# Build the AE inference object from the shared configuration values.
infer_net = AE_Factor_Inference(
    factor_matrix_path=factor_matrix_path,
    model_path='logs_real_data/AE/K6',
    K=K,
    f_hidden_dim=f_hidden_dim,
    bandwidth=bandwidth,
    factor_list=factor_list,
)

In [None]:
# For every evaluation window: select the permnos that pass the NaN filter on
# the matching training window, run inference, average the returned dicts and
# widen each result to the full permno column set.
for i, log_period in enumerate(tqdm(log_set, desc='inference')):
    train_period = period_set[i]
    period_permno_list = obtain_permno_list(
        'RET', train_period, '/data/QAE/xiu_factor_matrix_with_na'
    )
    y_totals, y_preds = infer_net.inference(log_period, period_permno_list)
    y_total = fill_matrix(mean_matrix_dict(y_totals), full_permno_list)
    y_pred = fill_matrix(mean_matrix_dict(y_preds), full_permno_list)
    y_total_dict[log_period] = y_total
    y_pred_dict[log_period] = y_pred

In [None]:
# Stack the per-window "total" frames row-wise, in dict insertion order.
r_total = pd.concat([*y_total_dict.values()], axis=0)

In [None]:
# Realized returns (NaNs kept), restricted to the dates present in r_total.
r_true = (
    pd.read_csv('/data/QAE/xiu_factor_matrix_with_na/RET.csv', index_col='date')
    .loc[r_total.index, :]
)

In [None]:
# R^2 without demeaning: 1 - sum((r - r_hat)^2) / sum(r^2). NaNs are skipped by
# the pandas sums, and the denominator is masked to cells where r_total is present.
R_total = 1 - (
    ((r_true - r_total) ** 2).sum().sum()
    / (r_true[r_total.notna()] ** 2).sum().sum()
)

In [None]:
# Display the R_total value computed for this section.
R_total

In [None]:
# Stack the per-window "pred" frames row-wise, in dict insertion order.
r_pred = pd.concat([*y_pred_dict.values()], axis=0)

In [None]:
# Same R^2 formula as R_total, applied to the "pred" output:
# 1 - sum((r - r_pred)^2) / sum(r^2) over cells where r_pred is present.
R_pred = 1 - (
    ((r_true - r_pred) ** 2).sum().sum()
    / (r_true[r_pred.notna()] ** 2).sum().sum()
)

In [None]:
# Display the R_pred value computed for this section.
R_pred

## VAE

In [None]:
# Select CUDA device index 2 for this section; K is passed to the model constructor.
torch.cuda.set_device(2)
K = 6

# One slot per evaluation window; 0 is a placeholder until the inference loop fills it.
y_total_dict = {window: 0 for window in log_set}
y_pred_dict = {window: 0 for window in log_set}

In [None]:
# Build the VAE inference object from the shared configuration values.
infer_net = VAE_Factor_Inference(
    factor_matrix_path=factor_matrix_path,
    model_path='logs_real_data/VAE/K6',
    K=K,
    f_hidden_dim=f_hidden_dim,
    bandwidth=bandwidth,
    factor_list=factor_list,
)

In [None]:
# For every evaluation window: select the permnos that pass the NaN filter on
# the matching training window, run inference, average the returned dicts and
# widen each result to the full permno column set.
for i, log_period in enumerate(tqdm(log_set, desc='inference')):
    train_period = period_set[i]
    period_permno_list = obtain_permno_list(
        'RET', train_period, '/data/QAE/xiu_factor_matrix_with_na'
    )
    y_totals, y_preds = infer_net.inference(log_period, period_permno_list)
    y_total = fill_matrix(mean_matrix_dict(y_totals), full_permno_list)
    y_pred = fill_matrix(mean_matrix_dict(y_preds), full_permno_list)
    y_total_dict[log_period] = y_total
    y_pred_dict[log_period] = y_pred

In [None]:
# Stack the per-window "total" frames row-wise, in dict insertion order.
r_total = pd.concat([*y_total_dict.values()], axis=0)

In [None]:
# Realized returns (NaNs kept), restricted to the dates present in r_total.
r_true = (
    pd.read_csv('/data/QAE/xiu_factor_matrix_with_na/RET.csv', index_col='date')
    .loc[r_total.index, :]
)

In [None]:
# R^2 without demeaning: 1 - sum((r - r_hat)^2) / sum(r^2). NaNs are skipped by
# the pandas sums, and the denominator is masked to cells where r_total is present.
R_total = 1 - (
    ((r_true - r_total) ** 2).sum().sum()
    / (r_true[r_total.notna()] ** 2).sum().sum()
)

In [None]:
# Display the R_total value computed for this section.
R_total

In [None]:
# Persist the stacked "total" frame. The target directory is created first so
# that a missing folder cannot discard the results of the inference run above.
vae_total_path = Path('y_total/character/VAE_total.csv')
vae_total_path.parent.mkdir(parents=True, exist_ok=True)
r_total.to_csv(vae_total_path)

In [None]:
# Stack the per-window "pred" frames row-wise, in dict insertion order.
r_pred = pd.concat([*y_pred_dict.values()], axis=0)

In [None]:
# Same R^2 formula as R_total, applied to the "pred" output:
# 1 - sum((r - r_pred)^2) / sum(r^2) over cells where r_pred is present.
R_pred = 1 - (
    ((r_true - r_pred) ** 2).sum().sum()
    / (r_true[r_pred.notna()] ** 2).sum().sum()
)

In [None]:
# Display the R_pred value computed for this section.
R_pred

## QAE

In [None]:
# Select CUDA device index 2 for this section; K is passed to the model constructor
# (note: 3 here, unlike the other sections which use 6).
torch.cuda.set_device(2)
K = 3

# One slot per evaluation window; 0 is a placeholder until the inference loop fills it.
y_total_dict = {window: 0 for window in log_set}
y_pred_dict = {window: 0 for window in log_set}

In [None]:
# Build the QAE inference object from the shared configuration values.
infer_net = QAE_Factor_Inference(
    factor_matrix_path=factor_matrix_path,
    model_path='logs_real_data/QAE/K3',
    K=K,
    f_hidden_dim=f_hidden_dim,
    bandwidth=bandwidth,
    factor_list=factor_list,
)

In [None]:
# For every evaluation window: select the permnos that pass the NaN filter on
# the matching training window, run inference, average the returned dicts and
# widen each result to the full permno column set.
for i, log_period in enumerate(tqdm(log_set, desc='inference')):
    train_period = period_set[i]
    period_permno_list = obtain_permno_list(
        'RET', train_period, '/data/QAE/xiu_factor_matrix_with_na'
    )
    y_totals, y_preds = infer_net.inference(log_period, period_permno_list)
    y_total = fill_matrix(mean_matrix_dict(y_totals), full_permno_list)
    y_pred = fill_matrix(mean_matrix_dict(y_preds), full_permno_list)
    y_total_dict[log_period] = y_total
    y_pred_dict[log_period] = y_pred

In [None]:
# Stack the per-window "total" frames row-wise, in dict insertion order.
r_total = pd.concat([*y_total_dict.values()], axis=0)

In [None]:
# Realized returns (NaNs kept), restricted to the dates present in r_total.
r_true = (
    pd.read_csv('/data/QAE/xiu_factor_matrix_with_na/RET.csv', index_col='date')
    .loc[r_total.index, :]
)

In [None]:
# R^2 without demeaning: 1 - sum((r - r_hat)^2) / sum(r^2). NaNs are skipped by
# the pandas sums, and the denominator is masked to cells where r_total is present.
R_total = 1 - (
    ((r_true - r_total) ** 2).sum().sum()
    / (r_true[r_total.notna()] ** 2).sum().sum()
)

In [None]:
# Display the R_total value computed for this section.
R_total

In [None]:
# Stack the per-window "pred" frames row-wise, in dict insertion order.
r_pred = pd.concat([*y_pred_dict.values()], axis=0)

In [None]:
# Same R^2 formula as R_total, applied to the "pred" output:
# 1 - sum((r - r_pred)^2) / sum(r^2) over cells where r_pred is present.
R_pred = 1 - (
    ((r_true - r_pred) ** 2).sum().sum()
    / (r_true[r_pred.notna()] ** 2).sum().sum()
)

In [None]:
# Display the R_pred value computed for this section.
R_pred

## QVAE

In [None]:
# Select CUDA device index 2 for this section; K is passed to the model constructor.
torch.cuda.set_device(2)
K = 6

# One slot per evaluation window; 0 is a placeholder until the inference loop fills it.
y_total_dict = {window: 0 for window in log_set}
y_pred_dict = {window: 0 for window in log_set}

In [None]:
# Build the QVAE inference object from the shared configuration values.
infer_net = QVAE_Factor_Inference(
    factor_matrix_path=factor_matrix_path,
    model_path='logs_real_data/QVAE/K6',
    K=K,
    f_hidden_dim=f_hidden_dim,
    bandwidth=bandwidth,
    factor_list=factor_list,
)

In [None]:
# For every evaluation window: select the permnos that pass the NaN filter on
# the matching training window, run inference, average the returned dicts and
# widen each result to the full permno column set.
for i, log_period in enumerate(tqdm(log_set, desc='inference')):
    train_period = period_set[i]
    period_permno_list = obtain_permno_list(
        'RET', train_period, '/data/QAE/xiu_factor_matrix_with_na'
    )
    y_totals, y_preds = infer_net.inference(log_period, period_permno_list)
    y_total = fill_matrix(mean_matrix_dict(y_totals), full_permno_list)
    y_pred = fill_matrix(mean_matrix_dict(y_preds), full_permno_list)
    y_total_dict[log_period] = y_total
    y_pred_dict[log_period] = y_pred

In [None]:
# Stack the per-window "total" frames row-wise, in dict insertion order.
r_total = pd.concat([*y_total_dict.values()], axis=0)

In [None]:
# Realized returns (NaNs kept), restricted to the dates present in r_total.
r_true = (
    pd.read_csv('/data/QAE/xiu_factor_matrix_with_na/RET.csv', index_col='date')
    .loc[r_total.index, :]
)

In [None]:
# R^2 without demeaning: 1 - sum((r - r_hat)^2) / sum(r^2). NaNs are skipped by
# the pandas sums, and the denominator is masked to cells where r_total is present.
R_total = 1 - (
    ((r_true - r_total) ** 2).sum().sum()
    / (r_true[r_total.notna()] ** 2).sum().sum()
)

In [None]:
# Display the R_total value computed for this section.
R_total

In [None]:
# Persist the stacked "total" frame. The target directory is created first so
# that a missing folder cannot discard the results of the inference run above.
qvae_total_path = Path('y_total/character/QVAE_total.csv')
qvae_total_path.parent.mkdir(parents=True, exist_ok=True)
r_total.to_csv(qvae_total_path)

In [None]:
# Stack the per-window "pred" frames row-wise, in dict insertion order.
r_pred = pd.concat([*y_pred_dict.values()], axis=0)

In [None]:
# Same R^2 formula as R_total, applied to the "pred" output:
# 1 - sum((r - r_pred)^2) / sum(r^2) over cells where r_pred is present.
R_pred = 1 - (
    ((r_true - r_pred) ** 2).sum().sum()
    / (r_true[r_pred.notna()] ** 2).sum().sum()
)

In [None]:
# Display the R_pred value computed for this section.
R_pred