In [7]:
!pip install pyarrow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
import numpy as np
import pandas as pd
import os
import sys
import pyarrow.parquet as pq
import re
import yfinance as yf
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz
import math
import matplotlib.pyplot as plt
from scipy.stats.mstats import winsorize

In [9]:
# Calendar month number (1-12) -> three-letter English month abbreviation.
month_dict = dict(enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    start=1,
))
# Root directory that the portfolio parquet paths in this notebook are built from.
folder_path = r'/data/workspace_files/parquet_files'

In [10]:
def get_date_str_for_hp(start_year, end_year):
    """Build '<month><year>' strings for every month in a range of years.

    The month is written without zero padding (e.g. '51996', '121996'), and
    both ``start_year`` and ``end_year`` are included. January through April
    of 1996 are always left out, so a range starting in 1996 begins at
    '51996'.

    Parameters
    ----------
    start_year : int
        First calendar year to include.
    end_year : int
        Last calendar year to include.

    Returns
    -------
    list of str
        Month strings in chronological order; empty when
        ``end_year < start_year``.
    """
    return [
        f'{mon}{year}'
        for year in range(start_year, end_year + 1)
        for mon in range(1, 13)
        # The first four months of 1996 are excluded.
        if not (year == 1996 and mon <= 4)
    ]

In [11]:
def get_monthly_returns(returns, hold_period):
    """Convert a holding-period return into its per-period geometric average.

    Evaluates ``(1 + returns) ** (1 / hold_period) - 1``.

    Parameters
    ----------
    returns : float or array-like supporting arithmetic
        Simple return(s) accumulated over the whole holding period
        (e.g. 0.21 for +21%).
    hold_period : int or float
        Number of periods the return spans (presumably months, going by the
        function name); must be non-zero.

    Returns
    -------
    Same kind as ``returns``
        The constant per-period return that compounds to ``returns`` over
        ``hold_period`` periods.

    Raises
    ------
    ZeroDivisionError
        If ``hold_period`` is 0.
    """
    # `**` is exponentiation. `^` would be bitwise XOR, which also binds more
    # loosely than the trailing `- 1` and is undefined for floats.
    return (1 + returns) ** (1 / hold_period) - 1

In [12]:
def get_long_or_short_side(portfolio, pos_type):
    """Aggregate the weighted returns of one leg of a portfolio.

    Rows whose 'pos_type' column equals ``pos_type`` are selected. Their
    'weight' values are multiplied by ``pos_type`` and then used to weight
    each of the 'ret', 'ret_clipped' and 'ret_clipped2' columns. The input
    frame is not modified.

    Parameters
    ----------
    portfolio : pandas.DataFrame
        Must provide the columns 'pos_type', 'weight', 'ret', 'ret_clipped'
        and 'ret_clipped2'.
    pos_type : number
        Value of 'pos_type' identifying the leg; also the factor applied to
        the weights (callers in this notebook pass 1 and -1).

    Returns
    -------
    tuple
        ``(total_return, total_return_clipped, total_return_clipped2)``, the
        sums of weight * return for the three return columns.
    """
    leg = portfolio.loc[portfolio['pos_type'] == pos_type]
    signed_weights = np.asarray(leg['weight']).ravel() * pos_type

    def _weighted_total(column):
        # Sum of (signed weight x return) over the selected rows.
        return (signed_weights * np.asarray(leg[column]).ravel()).sum()

    return (
        _weighted_total('ret'),
        _weighted_total('ret_clipped'),
        _weighted_total('ret_clipped2'),
    )

In [16]:
def compute_portfolio_returns(score_type, lead, date_strs, quartile = None):
    """Compute per-date long, short and long-minus-short portfolio returns.

    For every entry of ``date_strs`` the matching portfolio parquet file is
    read from
    ``{folder_path}/portfolio_lead_lag/{score_type}/{lead}/<sub_dir>/<MonYYYY>.parquet``,
    where ``<sub_dir>`` is ``Q{quartile}`` when ``quartile`` is given and
    ``LS`` otherwise. Rows without a weight are dropped, a winsorized copy of
    'ret' (1% on each tail) is added as 'ret_clipped2', and both legs are
    aggregated with ``get_long_or_short_side``.

    Parameters
    ----------
    score_type : str
        Path component naming the score (e.g. 'cosine_similarity').
    lead : int
        Path component naming the lead.
    date_strs : iterable of str
        Dates in '%m%Y' form, e.g. '51996'.
    quartile : optional
        When not None, read from the 'Q{quartile}' sub-folder instead of 'LS'.

    Returns
    -------
    pandas.DataFrame
        One row per date with columns 'date', 'return', 'return_clipped',
        'return_clipped2', 'num_holdings', and the 'long_*' / 'short_*'
        counterparts of the three return columns.
    """
    folder_path_req = f'{folder_path}/portfolio_lead_lag/{score_type}/{lead}'
    # Identity check: only an explicit None selects the long-short folder.
    sub_dir = f'Q{quartile}' if quartile is not None else 'LS'

    tot_returns = []
    for date_str in date_strs:
        date_obj = datetime.strptime(date_str, '%m%Y')
        # File names use the abbreviated month name, e.g. 'May1996'.
        score_path = f"{folder_path_req}/{sub_dir}/{date_obj.strftime('%b%Y')}.parquet"
        print(score_path)
        portfolio = pq.read_table(score_path).to_pandas().dropna(subset=['weight'])
        # Winsorize over the whole portfolio (both legs together).
        portfolio['ret_clipped2'] = winsorize(portfolio['ret'], limits=[0.01, 0.01])
        long_ret, long_ret_clipped, long_ret_clipped2 = get_long_or_short_side(portfolio, 1)
        short_ret, short_ret_clipped, short_ret_clipped2 = get_long_or_short_side(portfolio, -1)
        # NOTE(review): get_long_or_short_side already multiplies short weights
        # by -1, so `long - short` is the usual spread only if short-leg weights
        # are stored as negative numbers -- TODO confirm against the files.
        tot_returns.append({
            'date': date_obj,
            'return': long_ret - short_ret,
            'return_clipped': long_ret_clipped - short_ret_clipped,
            'return_clipped2': long_ret_clipped2 - short_ret_clipped2,
            'num_holdings': len(portfolio),
            'long_return': long_ret,
            'long_return_clipped': long_ret_clipped,
            'long_return_clipped2': long_ret_clipped2,
            'short_return': short_ret,
            'short_return_clipped': short_ret_clipped,
            'short_return_clipped2': short_ret_clipped2,
        })

    return pd.DataFrame(tot_returns)

In [17]:
# Month strings from May 1996 through December 2017 (the helper skips Jan-Apr 1996).
date_strs = get_date_str_for_hp(1996, 2017)
# For each similarity score and each lead 1..12, compute the long-short return
# series and save it as ret.parquet in the same LS folder as the monthly files.
for score_type in ['cosine_similarity', 'jaccard_similarity']:
    for lead in range(1,13):
        # NOTE(review): num_bucket is not read inside this loop; the loop runs
        # exactly once. It still leaves a global `num_bucket = 5` behind, which
        # other cells may rely on -- confirm before removing.
        for num_bucket in [5]:
            rets = compute_portfolio_returns(score_type, lead, date_strs)
            file_path_req = f'{folder_path}/portfolio_lead_lag/{score_type}/{lead}/LS/ret.parquet'
            rets.to_parquet(file_path_req)

/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/May1996.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Jun1996.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Jul1996.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Aug1996.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Sep1996.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Oct1996.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Nov1996.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Dec1996.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Jan1997.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosine_similarity/1/LS/Feb1997.parquet
/data/workspace_files/parquet_files/portfolio_lead_lag/cosin