In [1]:
#!pip install tqdm

In [37]:
import numpy as np
from portfoliolab.bayesian import RobustBayesianAllocation
from data_utils import read_all_candles, group_dates_df
from submission_utils import general_weights_fixer
import pandas as pd
from submission_utils import general_weights_fixer, get_submission_markowitz, test_submission

In [75]:
candles = read_all_candles("trainTimeSeries/trainTimeSeries/TrainCandles")

Getting candles data for Darwins...: 100%|██████████| 96/96 [00:03<00:00, 29.68it/s]


In [76]:
candles["ZVQ"].head()

Unnamed: 0_level_0,close,max,min,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-12-19 01:00:00,519.23,519.95,518.47,518.47
2018-12-19 02:00:00,519.63,519.93,519.07,519.28
2018-12-19 03:00:00,519.95,520.72,519.56,519.8
2018-12-19 04:00:00,520.09,520.39,519.87,519.96
2018-12-19 05:00:00,520.06,520.12,519.89,520.12


In [77]:
candles = {k: group_dates_df(v, mincol="min", opencol="open", highcol="max", closecol="close") for k,v in candles.items()}

In [78]:
candles["ZVQ"].head()

Unnamed: 0_level_0,close,low,high,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-12-19,531.91,517.72,534.25,518.47
2018-12-20,531.91,531.91,531.91,531.91
2018-12-21,,,,
2018-12-22,,,,
2018-12-23,,,,


In [79]:
def compute_returns(v):
    v = v.fillna(method="ffill") #.fillna(method="bfill")
    v["returns"] = (v["close"] - v["open"])/v["open"]
    v.drop([col for col in v.columns if col != "returns"], axis=1, inplace=True)
    return v

In [80]:
candles = {k:compute_returns(v)["returns"] for k,v in candles.items()} # compute_returns(v)

In [81]:
def filter_candles(candles, remove):
    return {k:v for k,v in candles.items() if k not in remove}

def process_data(candles, remove=None):
    if remove is not None:
        candles = filter_candles(candles, remove)
    df = pd.concat(candles.values(),keys= candles.keys(),axis=1)
    df = df.resample("D").aggregate("mean")
    df = df.fillna(method="ffill")#.fillna(method="bfill")
    return df

In [82]:
def get_returns_and_cov(df):
    cov = df.cov().values
    exp_returns = df.mean().values.reshape(-1, 1)
    return cov, exp_returns

In [83]:
df = process_data(candles)

In [84]:
df.head()

Unnamed: 0_level_0,ZVQ,NCT,YAX,OOS,GFJ,FIR,USX,FSK,TMF,TDD,...,AZG,OJG,WWT,BOT,TXR,RAT,DIG,SRI,PHI,RWJ
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-12-19,0.025922,-0.002931,,,,0.0,,0.000259,,-0.000516,...,0.006307,,0.004996,-0.028355,0.000916,-0.000712,,0.000868,0.0,
2018-12-20,0.0,0.003528,,,,0.0,,0.014443,,0.00818,...,0.006497,,0.01476,0.011485,-0.001197,0.0,,-0.000542,0.0,
2018-12-21,0.0,0.003528,,,,-0.005509,,-0.013407,,0.00818,...,-0.00038,,0.004371,0.011485,-0.001197,0.0,,-0.000542,0.0,
2018-12-22,0.0,0.003528,,,,-0.005509,,-0.013407,,0.00818,...,-0.00038,,0.004371,0.011485,-0.001197,0.0,,-0.000542,0.0,
2018-12-23,0.0,0.0,,,,0.003555,,0.004142,,0.00818,...,-0.00038,,0.000375,0.0,0.0,0.0,,-0.000542,0.0,


In [85]:
cov, exp_rets = get_returns_and_cov(df.dropna())

In [86]:
exp_rets.shape

(96, 1)

In [87]:
prior_covariance = np.diag(np.diag(cov))

In [88]:
prior_covariance.shape

(96, 96)

In [89]:
prior_mean = 0.5 * cov.dot(np.ones((96, 1))) / 96


In [90]:
bayes_allocator = RobustBayesianAllocation(discretisations=50)

In [91]:
bayes_allocator.allocate(
    sample_mean=exp_rets,
    sample_covariance=cov,
    prior_mean=prior_mean,
    prior_covariance=prior_covariance,
    relative_confidence_in_prior_mean=1e-5,
    relative_confidence_in_prior_covariance=1e-5,
    max_volatility=0.8*max(exp_rets),
    sample_size=df.shape[0]
)

In [92]:
bayes_allocator.weights

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,0.026779,0.0,0.0,0.0,0.011062,5.305763000000001e-17,0.068251,5.305763000000001e-17,0.0,0.034276,...,0.033101,8.358109999999999e-19,5.305763000000001e-17,0.0,5.305763000000001e-17,3.190088e-16,5.673338000000001e-17,1.118271e-16,0.007067,5.305763000000001e-17


In [93]:
# Get the weights
portfolio_weights = bayes_allocator.weights

In [94]:
portfolio_weights.values[0]

array([2.67786262e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.10619259e-02, 5.30576312e-17, 6.82512259e-02, 5.30576312e-17,
       0.00000000e+00, 3.42756385e-02, 0.00000000e+00, 3.82557397e-02,
       1.02375695e-16, 1.41026381e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 6.73765041e-16, 3.24675759e-02,
       0.00000000e+00, 0.00000000e+00, 1.41802767e-01, 1.23699152e-02,
       1.54973307e-16, 3.75999600e-16, 2.32427178e-16, 3.59346133e-16,
       5.12225102e-16, 0.00000000e+00, 5.30576312e-17, 0.00000000e+00,
       0.00000000e+00, 3.39308350e-02, 2.59212753e-03, 0.00000000e+00,
       1.07253589e-16, 0.00000000e+00, 6.74129848e-02, 0.00000000e+00,
       7.31653541e-17, 1.17632816e-02, 3.66513756e-17, 3.53867446e-17,
       1.93551276e-16, 3.01840323e-16, 3.03871856e-17, 0.00000000e+00,
       0.00000000e+00, 1.60120315e-16, 0.00000000e+00, 3.67963490e-02,
       7.14329738e-17, 7.37745008e-17, 5.30576312e-17, 3.83385894e-02,
      

In [59]:
weights = np.clip(0,1, weights)

In [60]:
weights = general_weights_fixer(portfolio_weights.values[0])

In [61]:
weights = weights / sum(weights)

In [63]:
ws = {asset:w for asset, w in zip([k for k in candles.keys()], weights)}

In [64]:
submission = get_submission_markowitz(ws, [k for k in candles.keys()])

In [65]:
sum(weights)

1.0

In [71]:
test_submission(submission)

In [70]:
submission = submission.div(submission.sum(axis=1), axis=0)

In [72]:
submission.to_csv("submission_2003.csv", header=True, index=True)

In [73]:
from portfoliolab.estimators import RiskEstimators, ReturnsEstimators

In [74]:
import pandas as pd
from mlfinlab.clustering import onc

In [95]:
df_corr = df.corr()

In [96]:
assets_corr_onc, clusters, silh_scores = onc.get_onc_clusters(df_corr, repeat=100)

In [99]:
clusters.keys()


dict_keys([0, 1])

In [None]:
# Import dataframe of returns for assets
asset_returns = pd.read_csv(DATA_PATH, index_col='Date', parse_dates=True)

# Calculate correlation matrix of returns
assets_corr = asset_returns.corr()

# Output of the ONC algorithm with 10 simulations for each number of clusters tested
assets_corr_onc, clusters, silh_scores = onc.get_onc_clusters(assets_corr, repeat=10)