In [234]:
import numpy as np
import pandas as pd
import pickle

In [236]:
import torch
import argparse, json
from cross_exp.exp_crossformer import Exp_crossformer

# Define model and functions

If the evaluation period changes, adjust the date filter applied to `testdf.index` in `MY_MODEL.__init__()` and, if necessary, the `fcst` time slice in `getMetrics()`.

In [245]:
class MY_MODEL():
    """
    Inference wrapper around a trained ``Exp_crossformer`` experiment.

    Construction loads the scaling statistics, the training CSV, the test
    parquet, the saved hyperparameters and the checkpoint; ``eval`` then runs
    prediction and maps the output back through the stored std/mean.
    """

    def __init__(self, testdata, start_date="2020-12-01"):
        """
        Load the trained model and all hyperparameters.

        Parameters
        ----------
        testdata : str
            Path of the parquet file handed to ``convert``.
        start_date : str, optional
            Only test rows whose index is >= this value are kept. Defaults to
            the previously hard-coded "2020-12-01".

        Side effects: writes ``datasets/concat.csv`` (train rows followed by
        the filtered test rows) and reads it back into ``self.fulldf``.
        """
        # Mean & std used to undo the scaling in eval().
        # NOTE(review): pickle.load can execute arbitrary code; only point this
        # at a statistics file you produced yourself.
        with open('submit/scale_statistic.pkl', 'rb') as stats_file:
            data = pickle.load(stats_file)

        # Stock list to predict: every column of the training CSV after the first.
        src = pd.read_csv('datasets/styAdj_small_mean.csv')
        self.traindf = src
        self.stock_pred_list = src.columns[1:]
        src = src.set_index('time')

        # Test frame restricted to the requested evaluation period.
        self.testdf = self.convert(testdata)
        self.testdf = self.testdf.loc[self.testdf.index >= start_date]
        self.testindex = self.testdf.index

        # Concatenate train and test and round-trip through CSV, because the
        # experiment is pointed at the file via args.data_path below.
        pd.concat([src, self.testdf], axis=0).to_csv('datasets/concat.csv', index=True)
        self.fulldf = pd.read_csv('datasets/concat.csv')

        print('loading parameters')
        with open('submit/args.json') as args_file:
            config = json.load(args_file)  # saved configuration arguments
        config['scale_statistic'] = data
        self.args = argparse.Namespace(**config)

        # presumably 168 is the history window the experiment expects ahead of
        # the evaluated rows - TODO confirm against the training configuration
        self.args.data_split = [168, len(src) - 168, len(self.testdf)]
        self.args.data_path = "concat.csv"

        print('loading model')
        self.my_model = Exp_crossformer(self.args)
        # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
        # host; load_state_dict copies the values into the model's own tensors.
        state = torch.load('submit/checkpoint.pth', map_location='cpu')
        self.my_model.model.load_state_dict(state)

    def convert(self, data):
        """
        Read the parquet at ``data`` and return a wide time-by-stock frame of
        the '1d_next_styAdj' column, limited to ``self.stock_pred_list``.

        Missing values are filled with 0.0, column labels are cast to ``str``
        and the index is named 'date'.
        """
        daily_data = pd.read_parquet(data)
        return_type = '1d_next_styAdj'
        by_stock = daily_data.pivot_table(index='stock_id', columns='time', values=return_type)
        wide = by_stock.T.copy().fillna(0.0)
        # String labels so they can be matched against the CSV header names.
        wide.columns = [str(col) for col in wide.columns]
        wide.index.name = 'date'
        return wide.loc[:, self.stock_pred_list]

    def eval(self):
        """
        Run prediction and return the forecasts on the original scale.

        Calls ``self.my_model.eval('pred', self.args.save_pred)``, then reads
        ``results/pred/pred.npy`` (assumed to be written by that call - TODO
        confirm), squeezes it into a DataFrame and applies ``* std + mean``
        from ``self.args.scale_statistic``.
        """
        self.my_model.eval('pred', self.args.save_pred)
        raw_pred = np.load('results/pred/pred.npy')
        scaled = pd.DataFrame(raw_pred.squeeze())
        stats = self.args.scale_statistic
        return scaled * stats['std'] + stats['mean']


In [246]:
def weighted_correlation(X,Y,weight_column):
    """
    Weighted Pearson correlation of X and Y under the weights in weight_column.

    All three arguments are combined element-wise, so they are expected to be
    numpy arrays of matching shape. The weight normalisation cancels between
    numerator and denominator, so raw weighted sums are used throughout.
    """
    total_weight = np.sum(weight_column)

    # Weighted centres of each series.
    centre_x = np.sum(X * weight_column) / total_weight
    centre_y = np.sum(Y * weight_column) / total_weight

    # Deviations from the weighted centres, reused below.
    dev_x = X - centre_x
    dev_y = Y - centre_y

    # Unnormalised weighted covariance and per-series spreads.
    cross_term = np.sum(weight_column * dev_x * dev_y)
    spread_x = np.sqrt(np.sum(weight_column * dev_x**2))
    spread_y = np.sqrt(np.sum(weight_column * dev_y**2))

    return cross_term / (spread_x * spread_y)

In [247]:
def getMetrics(preddf, testindex, response_path=None, start_time="2020-12-01 10:00:00"):
    """
    Score a wide forecast frame against the response parquet.

    Parameters
    ----------
    preddf : pandas.DataFrame
        Forecasts with one row per entry of ``testindex`` and one column per
        column of the training CSV (after its 'time' column). Not modified.
    testindex : index-like
        Time labels for the rows of ``preddf``.
    response_path : str, optional
        Parquet file with the realised returns and 'liq_weight'. When omitted,
        the notebook-level variable ``testparquet`` is used, as before.
    start_time : str, optional
        Only merged rows whose time is >= this value are evaluated. Defaults
        to the previously hard-coded "2020-12-01 10:00:00".

    Returns
    -------
    pandas.DataFrame
        Single column 'Raw forecast' with rows IC_21d, IC_10d, IC_5d, IC_1d,
        'Annual Mean (%)', 'Annual Vol(%)', 'Annual Sharpe ratio', 'Turnover'.
    """
    if response_path is None:
        # Backward-compatible fallback to the notebook global.
        response_path = testparquet

    ### Load train and test data
    train = pd.read_csv('datasets/styAdj_small_mean.csv')
    responseData = pd.read_parquet(response_path)
    print(train.shape)

    if train['time'].dtype != '<M8[ns]':
        train['time'] = pd.to_datetime(train['time'])
    train = train.set_index('time')

    # Work on a copy with a fresh index object so that neither the caller's
    # frame nor the index it passed in gets relabelled as a side effect.
    pred = preddf.copy()
    pred.columns = train.columns
    pred.index = pd.Index(testindex, name='time')

    # Convert the DataFrame from wide format to long format.
    long_pred = pred.reset_index().melt(id_vars=['time'], var_name='stock_id', value_name='forecast')
    long_pred['stock_id'] = pd.to_numeric(long_pred['stock_id'])

    # Left merge keeps every response row; rows without a forecast (and any
    # other missing value) become 0.0.
    fcst = pd.merge(responseData.reset_index(), long_pred, on=['time', 'stock_id'], how='left').fillna(0.0).set_index('time')
    fcst = fcst.loc[fcst.index >= start_time]  # restrict to the evaluated period

    y_hat = fcst['forecast'].values
    weight = fcst['liq_weight'].values   # always use liq_weight in evaluation

    # IC: weighted correlation of the forecast with each horizon's return.
    IC = [
        weighted_correlation(y_hat, fcst['%sd_next_mktAdj' % hz].values, weight)
        for hz in [21, 10, 5, 1]
    ]

    # Positions are weight * forecast, scaled so absolute positions sum to 1 per timestamp.
    portfolio = fcst[['stock_id','1d_next_mktAdj','liq_weight']].copy()
    portfolio['pos'] = weight * y_hat
    portfolio['pos'] = portfolio.groupby(portfolio.index)['pos'].transform(lambda x: x/np.sum(np.abs(x)))
    pnl = portfolio.groupby(portfolio.index).apply(lambda x: np.sum(x['pos'] * x['1d_next_mktAdj']))
    # Previous position per stock follows row order within each stock_id group;
    # assumes the response rows are in chronological order - TODO confirm.
    portfolio['prev_pos'] = portfolio.groupby(portfolio.stock_id)['pos'].shift().fillna(0.0)
    turnover = portfolio.groupby(portfolio.index).apply(lambda x: np.sum(np.abs(x['pos']-x['prev_pos'])))

    # Annualisation uses 242 periods per year.
    annual_mean = pnl.mean() * 242 * 100
    annual_volatility = pnl.std() * np.sqrt(242) * 100
    annual_sharpe_ratio = pnl.mean() / pnl.std() * np.sqrt(242)
    tr = turnover.mean()
    metrics = [annual_mean, annual_volatility, annual_sharpe_ratio, tr]

    m1 = pd.DataFrame({'Raw forecast':IC}, index=(['IC_21d', 'IC_10d', 'IC_5d','IC_1d']))
    m2 = pd.DataFrame({'Raw forecast':metrics}, index=(['Annual Mean (%)', 'Annual Vol(%)', 'Annual Sharpe ratio','Turnover']))
    return pd.concat([m1, m2])
    


# Generate Forecast and Metrics

In [248]:
# Path of the parquet file to evaluate on. It is passed to MY_MODEL in the next
# cell and is also read by getMetrics() as a notebook-level variable.
testparquet='./results/test.parquet' #change it to the unseen test parquet 

In [249]:
# Build the wrapper (loads scaling statistics, saved arguments and the checkpoint),
# then run prediction; preddf holds the forecasts after the std/mean rescaling.
m=MY_MODEL(testparquet)
preddf=m.eval()

loading parameters
loading model
Use CPU
mse:1.350561499595642, mae:0.7241932153701782


In [250]:
# Score the forecasts; as the last expression, the metrics table becomes the cell output.
getMetrics(preddf, m.testindex)

(2651, 675)


Unnamed: 0,Raw forecast
IC_21d,0.024763
IC_10d,0.021729
IC_5d,0.019049
IC_1d,0.013278
Annual Mean (%),28.308912
Annual Vol(%),4.912838
Annual Sharpe ratio,5.762232
Turnover,0.788196
