In [None]:
# Load data

import pandas as pd
import numpy as np

# load dictionaries of data
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects and can
# execute code embedded in the file; only load .npy files from a trusted source.
dict_train=np.load('dict_of_data_train_Sept2025.npy', allow_pickle=True).item()
dict_test=np.load('dict_of_data_test_Sept2025.npy', allow_pickle=True).item()

# load embedding data
news_embedding_train = pd.read_csv('embedding_train_Sept2025.csv')
news_embedding_test = pd.read_csv('embedding_test_Sept2025.csv')

# get stock and day index limits
# np.min/np.max reduce in compiled code; the builtins iterate element by element.
min_day_indx_train=np.min(dict_train['di'])
max_day_indx_train=np.max(dict_train['di'])
min_day_indx_test=np.min(dict_test['di'])
max_day_indx_test=np.max(dict_test['di'])

print('min_day_indx_train',min_day_indx_train)
print('max_day_indx_train',max_day_indx_train)
print('min_day_indx_test',min_day_indx_test)
print('max_day_indx_test',max_day_indx_test)

max_stock_indx_train=np.max(dict_train['si'])
max_stock_indx_test=np.max(dict_test['si'])
max_stock_indx=max(max_stock_indx_train,max_stock_indx_test)
# Report the train maximum as well as the test maximum (the test value used to
# be printed twice and the train value never shown).
print('max_stock_indx_train',max_stock_indx_train)
print('max_stock_indx_test',max_stock_indx_test)
print('max_stock_indx',max_stock_indx)


In [None]:
# Sample code to extract embedding vector as numpy array

emb_train=news_embedding_train['avg_cls_emb']
num_embedding_data=len(emb_train)

# Take the entry labelled 1 and turn its text form into a numeric vector:
# every '[' and ']' character is removed, then the remaining comma-separated
# numbers are parsed. (Presumably each cell holds a list literal such as
# "[0.1, 0.2, ...]" -- confirm against the CSV.)
str1=emb_train[1]
str2=str1.translate(str.maketrans('','','[]'))
embedding_item=np.fromstring(str2, sep=',')
print(embedding_item.shape)
print(embedding_item[:10])


In [None]:
# Functions to construct and simulate a long-short portfolio

def simulate_portfolio(postA,returns):
    """Simulate a position matrix against a matrix of returns.

    Parameters
    ----------
    postA : array-like, 2-D
        Positions indexed as [stock, day]. NaN entries are treated as
        "no position" everywhere (PnL, book sizes and turnover).
    returns : array-like
        Returns multiplied element-wise with ``postA`` (same layout, or
        broadcastable to it).

    Returns
    -------
    dict
        'dailypnl'  : per-day sum over stocks of position * return (NaN ignored)
        'dailytvr'  : per-day turnover = sum|today - yesterday| / sum|yesterday|;
                      0.0 for the first day and whenever yesterday's book is empty
        'longsize'  : per-day sum of positive positions
        'shortsize' : per-day sum of negative positions (a non-positive number)

    The input array is not modified.
    """
    positions=np.asarray(postA)

    stock_pnl=positions*returns
    dailypnl=np.nansum(stock_pnl,axis=0)

    longsize=np.nansum(positions*(positions>0),axis=0)
    shortsize=np.nansum(positions*(positions<0),axis=0)

    # Turnover must follow the same NaN convention as the sums above. With a
    # plain sum, one NaN position made the day's turnover NaN (NaN today) or
    # silently 0 (NaN yesterday), so NaN is mapped to a zero holding first.
    held=np.where(np.isnan(positions),0.0,positions)

    dailytvr=np.zeros(dailypnl.shape,dtype=float)
    if held.shape[1]>1:
        prev_book=np.sum(np.abs(held[:,:-1]),axis=0)
        traded=np.sum(np.abs(np.diff(held,axis=1)),axis=0)
        # Days whose previous book is empty keep the preset 0.0 turnover.
        np.divide(traded,prev_book,out=dailytvr[1:],where=prev_book>0)

    return {
        'dailypnl':dailypnl,
        'dailytvr':dailytvr,
        'longsize':longsize,
        'shortsize':shortsize,
    }



def booksize(postA,booksize):
    """Rescale each day's column of ``postA`` to the requested book size.

    ``postA`` is indexed as [stock, day]. A day whose absolute positions sum
    (ignoring NaN) to more than zero is passed to ``booksize_day`` together
    with ``booksize``; every other day is set to 0.0 for all stocks.
    Returns a new array shaped like ``postA``; the input is left untouched.
    """
    scaled=postA*0.0

    for day in range(postA.shape[1]):
        day_positions=postA[:,day]
        has_exposure=np.nansum(np.abs(day_positions))>0
        scaled[:,day]=booksize_day(day_positions,booksize) if has_exposure else 0.0

    return scaled


def booksize_day(positions,booksize):
    """Scale one day's positions so each side of the book sums to ``booksize``.

    Parameters
    ----------
    positions : array-like, 1-D
        Raw positions for one day. Positive entries form the long side,
        negative entries the short side. NaN entries are treated as no
        position and come back as 0.0.
    booksize : float
        Target gross size of each side.

    Returns
    -------
    numpy.ndarray
        Float array shaped like ``positions`` where the positive entries sum
        to ``booksize`` and the negative entries sum to ``-booksize``. A side
        with no positions is left at zero.
    """
    positions=np.asarray(positions,dtype=float)

    # np.where (rather than multiplying by a boolean mask) maps NaN to 0.0, so
    # a single missing value no longer turns the whole day into NaN.
    long_positions=np.where(positions>0,positions,0.0)
    short_positions=np.where(positions<0,positions,0.0)
    long_total=np.sum(long_positions)
    short_total=-np.sum(short_positions)

    booksized_positions=np.zeros_like(positions)
    # Only normalise a side that actually holds something: dividing an empty
    # side by its zero total gave 0/0 = NaN, which then contaminated every
    # position when the two sides were added together.
    if long_total>0:
        booksized_positions+=booksize*long_positions/long_total
    if short_total>0:
        booksized_positions+=booksize*short_positions/short_total
    return booksized_positions


In [None]:
# Construct and simulate an alpha made from news embeddings

import matplotlib.pyplot as plt
import matplotlib.pylab as pyl
from matplotlib.pyplot import figure
%matplotlib inline

### construct alpha from news embeddings
# alpha and returns are addressed as [stock index, day index], so each axis
# needs (largest index + 1) slots. Sizing by the largest index alone raises
# IndexError as soon as an entry sits exactly at that maximum. The extra
# row/column stays zero and lies outside the evaluation window below.
alpha_shape=(max_stock_indx+1,max_day_indx_test+1)
alpha=np.zeros(alpha_shape)

emb_train=news_embedding_train['avg_cls_emb']
num_emb_data=len(emb_train)
emb_train_si=news_embedding_train['si']
emb_train_di=news_embedding_train['di']

for i in range(num_emb_data):
    if np.mod(i,10000)==0:
        print('i',i)

    # strip the list brackets and parse the comma-separated numbers
    str1=emb_train[i]
    str2 = str1.replace("[", "")
    str2 = str2.replace("]", "")
    embedding_item=np.fromstring(str2, sep=',')
    mean_embedding=np.mean(embedding_item)
    si_i=emb_train_si[i]
    di_i=emb_train_di[i]

    # several rows may share a (stock, day) cell: accumulate the negated means
    alpha[si_i,di_i]=alpha[si_i,di_i]+(-mean_embedding)

returns_to_use=np.zeros(alpha_shape)
returns_to_use[dict_train['si'],dict_train['di']]=dict_train['y_data'].ravel()

### scale long and short positions to be market neutral and constant booksize each day
alpha_booksized=booksize(alpha,1e6)

sim_result=simulate_portfolio(alpha_booksized,returns_to_use)
print(sim_result.keys())

figsize = (14, 8)
fig = plt.figure(facecolor=(245/255.0, 245/255.0, 245/255.0), figsize=figsize)
# NOTE(review): this slice stops before max_day_indx_train, so the final
# training day is not part of the statistics -- confirm that is intended.
pnl_to_sim=sim_result['dailypnl'][min_day_indx_train:max_day_indx_train]
mean_pnl=np.mean(pnl_to_sim)
std_pnl=np.std(pnl_to_sim)

### calculate information ratio
ir=np.round(mean_pnl/std_pnl,3)
sr=np.round(np.sqrt(252)*mean_pnl/std_pnl,3)
print('information ratio (ir)',ir)
print('annualized Sharpe ratio',sr)
plt.plot(np.cumsum(pnl_to_sim), color='r', label='pnl')
plt.grid()
plt.legend(loc='upper left',  fontsize=12)
plt.title('Simulation of a 1m long 1m short portfolio', fontsize=18)
plt.xlabel("days",  fontsize=16)
plt.ylabel("cumulative pnl (dollars)",  fontsize=16)

longsize=np.mean(sim_result['longsize'][min_day_indx_train:max_day_indx_train])
print('longsize (m)',longsize/1e6)
shortsize=np.mean(sim_result['shortsize'][min_day_indx_train:max_day_indx_train])
print('shortsize (m)',shortsize/1e6)
tvr=np.mean(sim_result['dailytvr'][min_day_indx_train:max_day_indx_train])
print('tvr',np.round(tvr,3))

plt.show()


In [None]:
# Construct and simulate signal from volume data

import matplotlib.pyplot as plt
import matplotlib.pylab as pyl
from matplotlib.pyplot import figure
%matplotlib inline

### construct alpha from volume data
# alpha and returns are addressed as [stock index, day index], so each axis
# needs (largest index + 1) slots. Sizing by the largest index alone raises
# IndexError as soon as an entry sits exactly at that maximum. The extra
# row/column stays zero and lies outside the evaluation window below.
alpha_shape=(max_stock_indx+1,max_day_indx_test+1)
alpha=np.zeros(alpha_shape)
# signal = volume relative to its average, centred so that "average" maps to 0
volume_signal=dict_train['volume']/dict_train['average_volume']
volume_signal=volume_signal-1
alpha[dict_train['si'],dict_train['di']]=volume_signal.ravel()

returns_to_use=np.zeros(alpha_shape)
returns_to_use[dict_train['si'],dict_train['di']]=dict_train['y_data'].ravel()

### scale long and short positions to be market neutral and constant booksize each day
alpha_booksized=booksize(alpha,1e6)

sim_result=simulate_portfolio(alpha_booksized,returns_to_use)
print(sim_result.keys())

figsize = (14, 8)
fig = plt.figure(facecolor=(245/255.0, 245/255.0, 245/255.0), figsize=figsize)
# NOTE(review): this slice stops before max_day_indx_train, so the final
# training day is not part of the statistics -- confirm that is intended.
pnl_to_sim=sim_result['dailypnl'][min_day_indx_train:max_day_indx_train]
mean_pnl=np.mean(pnl_to_sim)
std_pnl=np.std(pnl_to_sim)

### calculate information ratio
ir=np.round(mean_pnl/std_pnl,3)
sr=np.round(np.sqrt(252)*mean_pnl/std_pnl,3)
print('information ratio (ir)',ir)
print('annualized Sharpe ratio',sr)
plt.plot(np.cumsum(pnl_to_sim), color='r', label='pnl')
plt.grid()
plt.legend(loc='upper left',  fontsize=12)
plt.title('Simulation of a 1m long 1m short portfolio', fontsize=18)
plt.xlabel("days",  fontsize=16)
plt.ylabel("cumulative pnl (dollars)",  fontsize=16)

longsize=np.mean(sim_result['longsize'][min_day_indx_train:max_day_indx_train])
print('longsize (m)',longsize/1e6)
shortsize=np.mean(sim_result['shortsize'][min_day_indx_train:max_day_indx_train])
print('shortsize (m)',shortsize/1e6)
tvr=np.mean(sim_result['dailytvr'][min_day_indx_train:max_day_indx_train])
print('tvr',np.round(tvr,3))

plt.show()


In [None]:
# Construct and simulate one alpha signal

import matplotlib.pyplot as plt
import matplotlib.pylab as pyl
from matplotlib.pyplot import figure
%matplotlib inline

### construct and simulate one alpha
# alpha and returns are addressed as [stock index, day index], so each axis
# needs (largest index + 1) slots. Sizing by the largest index alone raises
# IndexError as soon as an entry sits exactly at that maximum. The extra
# row/column stays zero and lies outside the evaluation window below.
alpha_shape=(max_stock_indx+1,max_day_indx_test+1)
alpha=np.zeros(alpha_shape)
# use the first feature column of x_data as the signal
alpha[dict_train['si'],dict_train['di']]=dict_train['x_data'][:,0].ravel()

returns_to_use=np.zeros(alpha_shape)
returns_to_use[dict_train['si'],dict_train['di']]=dict_train['y_data'].ravel()

### scale long and short positions to be market neutral and constant booksize each day
alpha_booksized=booksize(alpha,1e6)

sim_result=simulate_portfolio(alpha_booksized,returns_to_use)
print(sim_result.keys())

figsize = (14, 8)
fig = plt.figure(facecolor=(245/255.0, 245/255.0, 245/255.0), figsize=figsize)
# NOTE(review): this slice stops before max_day_indx_train, so the final
# training day is not part of the statistics -- confirm that is intended.
pnl_to_sim=sim_result['dailypnl'][min_day_indx_train:max_day_indx_train]
mean_pnl=np.mean(pnl_to_sim)
std_pnl=np.std(pnl_to_sim)

### calculate information ratio
ir=np.round(mean_pnl/std_pnl,3)
sr=np.round(np.sqrt(252)*mean_pnl/std_pnl,3)
print('information ratio (ir)',ir)
print('annualized Sharpe ratio',sr)
plt.plot(np.cumsum(pnl_to_sim), color='r', label='pnl')
plt.grid()
plt.legend(loc='upper left',  fontsize=12)
plt.title('Simulation of a 1m long 1m short portfolio', fontsize=18)
plt.xlabel("days",  fontsize=16)
plt.ylabel("cumulative pnl (dollars)",  fontsize=16)

longsize=np.mean(sim_result['longsize'][min_day_indx_train:max_day_indx_train])
print('longsize (m)',longsize/1e6)
shortsize=np.mean(sim_result['shortsize'][min_day_indx_train:max_day_indx_train])
print('shortsize (m)',shortsize/1e6)
tvr=np.mean(sim_result['dailytvr'][min_day_indx_train:max_day_indx_train])
print('tvr',np.round(tvr,3))

plt.show()


In [None]:
# Construct and simulate average of all alphas

import matplotlib.pyplot as plt
import matplotlib.pylab as pyl
from matplotlib.pyplot import figure
%matplotlib inline

### construct and simulate average of all alphas
# alpha and returns are addressed as [stock index, day index], so each axis
# needs (largest index + 1) slots. Sizing by the largest index alone raises
# IndexError as soon as an entry sits exactly at that maximum. The extra
# row/column stays zero and lies outside the evaluation window below.
alpha_shape=(max_stock_indx+1,max_day_indx_test+1)
alpha=np.zeros(alpha_shape)
# signal = mean across the feature columns of x_data for each row
alpha[dict_train['si'],dict_train['di']]=np.mean(dict_train['x_data'],axis=1).ravel()

returns_to_use=np.zeros(alpha_shape)
returns_to_use[dict_train['si'],dict_train['di']]=dict_train['y_data'].ravel()

### scale long and short positions to be market neutral and constant booksize each day
alpha_booksized=booksize(alpha,1e6)

sim_result=simulate_portfolio(alpha_booksized,returns_to_use)
print(sim_result.keys())

figsize = (14, 8)
fig = plt.figure(facecolor=(245/255.0, 245/255.0, 245/255.0), figsize=figsize)
# NOTE(review): this slice stops before max_day_indx_train, so the final
# training day is not part of the statistics -- confirm that is intended.
pnl_to_sim=sim_result['dailypnl'][min_day_indx_train:max_day_indx_train]
mean_pnl=np.mean(pnl_to_sim)
std_pnl=np.std(pnl_to_sim)

### calculate information ratio
ir=np.round(mean_pnl/std_pnl,3)
sr=np.round(np.sqrt(252)*mean_pnl/std_pnl,3)
print('information ratio (ir)',ir)
print('annualized Sharpe ratio',sr)
plt.plot(np.cumsum(pnl_to_sim), color='r', label='pnl')
plt.grid()
plt.legend(loc='upper left',  fontsize=12)
plt.title('Simulation of a 1m long 1m short portfolio', fontsize=18)
plt.xlabel("days",  fontsize=16)
plt.ylabel("cumulative pnl (dollars)",  fontsize=16)

longsize=np.mean(sim_result['longsize'][min_day_indx_train:max_day_indx_train])
print('longsize (m)',longsize/1e6)
shortsize=np.mean(sim_result['shortsize'][min_day_indx_train:max_day_indx_train])
print('shortsize (m)',shortsize/1e6)
tvr=np.mean(sim_result['dailytvr'][min_day_indx_train:max_day_indx_train])
print('tvr',np.round(tvr,3))

plt.show()


In [None]:
# Submission
# Build the submission frame: one row per test id, with the prediction taken
# as the mean across the feature columns of the test x_data.
sample_submission = pd.DataFrame().assign(
    id=dict_test['id'],
    target_feature=np.mean(dict_test['x_data'],axis=1).ravel(),
)
#sample_submission.to_csv('sample_submission.csv', index=False)
