In [None]:
import os
from pathlib import Path

# Detect Google Colab by probing for the `google.colab` package.
try:
    import google.colab  # imported only to test availability
    IN_COLAB = True
    print("Running on a Colab Notebook environment")
except ImportError:
    # Only a failed import means "not on Colab"; a bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated errors.
    IN_COLAB = False

# Detect a Kaggle session from the KAGGLE_URL_BASE environment variable
# (presumably exported by Kaggle kernels; when unset, 'localhost' is used).
_kaggle_url_base = os.environ.get('KAGGLE_URL_BASE')
print("os.environ.get('KAGGLE_URL_BASE') == ", _kaggle_url_base)
IN_KAGGLE = 'kaggle' in ('localhost' if _kaggle_url_base is None else _kaggle_url_base)
if IN_KAGGLE:
    print("Running on a Kaggle notebook environment")

print(IN_COLAB)
print(IN_KAGGLE)
if IN_COLAB:
    if(os.path.isdir('input/jpx-tokyo-stock-exchange-prediction')==False):
        from google.colab import files
        files.upload()
        ! mkdir ~/.kaggle 
        ! cp kaggle.json ~/.kaggle/ 
        ! chmod 600 ~/.kaggle/kaggle.json 
        ! kaggle competitions download -c jpx-tokyo-stock-exchange-prediction
        ! mkdir input
        ! mkdir input/jpx-tokyo-stock-exchange-prediction
        ! unzip jpx-tokyo-stock-exchange-prediction.zip -d input/jpx-tokyo-stock-exchange-prediction

In [None]:
import warnings, gc
import numpy as np 
import pandas as pd
import matplotlib.colors
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from datetime import datetime, timedelta
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error,mean_absolute_error
#from lightgbm import LGBMRegressor
from decimal import ROUND_HALF_UP, Decimal
warnings.filterwarnings("ignore")
import plotly.figure_factory as ff
from tqdm.auto import tqdm
import copy
import math
import random
import matplotlib.pyplot as plt

init_notebook_mode(connected=True)

# Shared plotly layout template and colour cycle for the figures in this notebook.
temp = dict(
    layout=go.Layout(
        font=dict(family="Franklin Gothic", size=12),
        width=800,
    )
)
colors = px.colors.qualitative.Plotly

# Only a Kaggle (non-Colab) session reads from '../input'; Colab and local
# runs both keep the data under './input'.
root = Path('../input') if (IN_KAGGLE and not IN_COLAB) else Path('input')

train_path = root / 'jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv'
stock_list_path = root / "jpx-tokyo-stock-exchange-prediction/stock_list.csv"

# 'Date' is parsed to datetime so it can be compared and used as a time axis later.
train = pd.read_csv(train_path, parse_dates=['Date'])
stock_list = pd.read_csv(stock_list_path)

first_day, last_day = train.Date.min(), train.Date.max()
print("The training data begins on {} and ends on {}.\n".format(first_day, last_day))
display(train.describe().style.format('{:,.2f}'))

In [None]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Sharpe ratio of the daily long/short spread return implied by a ranking.

    For every 'Date' group, the `portfolio_size` best-ranked rows are bought and
    the `portfolio_size` worst-ranked rows are sold, with linearly decaying
    weights. The result is mean / std of those daily spreads, using the pandas
    (sample) standard deviation.

    Args:
        df (pd.DataFrame): rows with 'Date', 'Rank' and 'Target' columns
        portfolio_size (int): # of equities to buy/sell per day
        toprank_weight_ratio (float): weight of the most extreme rank relative
            to the least extreme one inside each leg (the latter has weight 1).
    Returns:
        (float): sharpe ratio
    """
    def _daily_spread(day):
        """Weighted long-minus-short 'Target' return for one day's rows."""
        ranks = day['Rank']
        # Ranks must start at 0 and end at n-1 within the day.
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        leg_weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        normaliser = leg_weights.mean()

        best_first = day.sort_values(by='Rank')['Target'][:portfolio_size]
        worst_first = day.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size]

        long_leg = (best_first * leg_weights).sum() / normaliser
        short_leg = (worst_first * leg_weights).sum() / normaliser
        return long_leg - short_leg

    daily_spreads = df.groupby('Date').apply(_daily_spread)
    return daily_spreads.mean() / daily_spreads.std()

In [None]:


def get_label(price, code):
    """Extract the label column for a single security.

    Args:
        price (pd.DataFrame): price data with 'SecuritiesCode' and 'Target' columns
        code (int): Local Code in the universe
    Returns:
        (pd.DataFrame): the rows of `price` for `code` (original index kept),
            with columns 'SecuritiesCode' and 'label', where 'label' holds the
            'Target' values. `price` itself is not modified.
    """
    is_code = price["SecuritiesCode"] == code
    selected = price.loc[is_code, ["SecuritiesCode", "Target"]]
    return selected.rename(columns={"Target": "label"})



In [None]:
# Peek at the first rows of the price data.
train.head()

In [None]:
# Show the securities metadata table loaded from stock_list.csv.
stock_list

In [None]:
# Market-wide daily averages over all securities; Target is multiplied by 100
# so it can be shown with a '%' tick suffix.
train_date=train.Date.unique()
returns=train.groupby('Date')['Target'].mean().mul(100).rename('Average Return')
close_avg=train.groupby('Date')['Close'].mean().rename('Closing Price')
vol_avg=train.groupby('Date')['Volume'].mean().rename('Volume')

fig = make_subplots(rows=3, cols=1,
                    shared_xaxes=True)
for i, series in enumerate([returns, close_avg, vol_avg]):
    # Plot against the series' own Date index: groupby sorts by Date while
    # Date.unique() keeps file order, so the two only line up when the CSV is
    # already sorted by date.
    fig.add_trace(go.Scatter(x=series.index, y=series, mode='lines',
                             name=series.name, marker_color=colors[i]), row=i+1, col=1)
# Range-selector buttons are attached to the top subplot only.
fig.update_xaxes(rangeslider_visible=False,
                 rangeselector=dict(
                     buttons=list([
                         dict(count=6, label="6m", step="month", stepmode="backward"),
                         dict(count=1, label="1y", step="year", stepmode="backward"),
                         dict(count=2, label="2y", step="year", stepmode="backward"),
                         dict(step="all")])),
                 row=1,col=1)
fig.update_layout(template=temp,title='JPX Market Average Stock Return, Closing Price, and Shares Traded',
                  hovermode='x unified', height=700,
                  yaxis1=dict(title='Stock Return', ticksuffix='%'),
                  yaxis2_title='Closing Price', yaxis3_title='Shares Traded',
                  showlegend=False)
fig.show()

In [None]:
# Resolve the file through `root` like the other inputs; the hard-coded
# 'input/...' path is not found when the data lives under '../input'.
sample_path=root/'jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv'
sample=pd.read_csv(sample_path)
sample

For each date, assign a rank from 0 to n-1 to the stocks

In [None]:
# Keep only rows dated after 2021-02-02. The explicit copy makes `train_subset`
# an independent frame, so the 'Rank' column assigned to it later is not
# written onto a slice of `train`.
train_subset=train.loc[train['Date']>'2021-02-02'].copy()
train_subset

In [None]:
# i need a function to generate portfolios:
random_sort=train_subset.SecuritiesCode.unique()
#random_sort['Rank']=np.random.rand(len(random_sort))
random_sort=pd.DataFrame(random_sort,columns=['SecuritiesCode'])
random_sort['Rank']=np.random.rand(len(random_sort))
random_sort.sort_values("Rank")
random_sort['Rank']=(random_sort['Rank'].rank(method='dense',ascending=True).astype(int))-1
random_sort

In [None]:
# Distinct security codes present in the subset, in order of first appearance.
SecuritiesCode=train_subset.SecuritiesCode.unique()
SecuritiesCode


In [None]:
def generate_random_ranking_dict(securities):
    """Map each security to a distinct random rank in 0..len(securities)-1.

    Args:
        securities: sequence of security identifiers (used as dict keys).
    Returns:
        (dict): security -> rank; the ranks are a shuffled np.arange drawn with
            NumPy's global random state.
    """
    shuffled_ranks = np.arange(len(securities))
    np.random.shuffle(shuffled_ranks)  # in-place shuffle
    return dict(zip(securities, shuffled_ranks))

def generate_random_ranking(securities):
    """Build a random ranking of `securities` as a DataFrame.

    One uniform random number is drawn per security (NumPy global random state)
    and converted to an integer rank, the smallest draw getting rank 0.

    Args:
        securities: 1-D sequence of security identifiers.
    Returns:
        (pd.DataFrame): columns 'SecuritiesCode' (input order preserved) and
            'Rank' (a permutation of 0..n-1).
    """
    ranking = pd.DataFrame(securities, columns=['SecuritiesCode'])
    draws = pd.Series(np.random.rand(len(ranking)), index=ranking.index)
    # method='first' breaks ties by position so the result is always a full
    # permutation; 'dense' would repeat a rank if two draws were equal.
    # The original's discarded `sort_values` call and bare expression
    # statement were dropped: neither had any effect.
    ranking['Rank'] = draws.rank(method='first', ascending=True).astype(int) - 1
    return ranking

In [None]:
# Attach one random ranking to every row; the rank is looked up by security
# code, so each security keeps the same rank on all dates.
randomrank=generate_random_ranking_dict(SecuritiesCode)
train_subset['Rank']=train_subset['SecuritiesCode'].map(randomrank)
train_subset

In [None]:
# Collects the Sharpe ratio of each random portfolio; filled by the simulation loop further down.
sharpes=[]

In [None]:
def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2):
    """Weighted long-minus-short 'Target' return for one day's rows.

    Args:
        df (pd.DataFrame): one day of rows with 'Rank' and 'Target' columns;
            ranks must run from 0 to len(df) - 1.
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): weight of the most extreme rank relative
            to the least extreme one inside each leg (the latter has weight 1).
    Returns:
        (float): spread return
    """
    ranks = df['Rank']
    assert ranks.min() == 0
    assert ranks.max() == len(ranks) - 1

    # Weights decay linearly from toprank_weight_ratio down to 1.
    leg_weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    normaliser = leg_weights.mean()

    best_first = df.sort_values(by='Rank')['Target'][:portfolio_size]
    worst_first = df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size]

    long_leg = (best_first * leg_weights).sum() / normaliser
    short_leg = (worst_first * leg_weights).sum() / normaliser
    return long_leg - short_leg

In [None]:
def generate_n_portfolios(SecuritiesCode,n):
    """Return a list of `n` independent random rankings of `SecuritiesCode`.

    Each element is the dict produced by one call to
    `generate_random_ranking_dict(SecuritiesCode)`.
    """
    return [generate_random_ranking_dict(SecuritiesCode) for _ in range(n)]

In [None]:
# Draw 10,000 independent random rankings of the securities in `train_subset`.
# NOTE(review): no random seed is set in the visible cells, so each run yields different portfolios.
portfolios=generate_n_portfolios(SecuritiesCode,10000)


In [None]:
# One Series of daily spread returns per simulated portfolio.
bufs = []
for ranking in tqdm(portfolios):
    # Overwrite the 'Rank' column with this portfolio's ranking, then score every date.
    train_subset['Rank'] = train_subset['SecuritiesCode'].map(ranking)
    buf = train_subset.groupby('Date').apply(calc_spread_return_per_day)
    bufs.append(buf)

In [None]:
# Stack the per-portfolio daily spread Series into a single array (one entry per portfolio).
arr=np.array(bufs)
arr

In [None]:
# Displays matplotlib's current default marker size.
# NOTE(review): looks like a leftover inspection; no visible cell uses the value.
plt.rcParams['lines.markersize']

In [None]:
# Standard deviation along axis 0, i.e. across the entries of `bufs` (one per portfolio).
# np.std defaults to the population form (ddof=0), unlike the pandas .std() used for the Sharpe ratio.
std=np.std(arr,axis=0)

In [None]:
# Persist the simulated daily spreads to disk in NumPy's .npy format.
np.save('daily_spreads.npy', arr)

In [None]:
#i want to 

In [None]:
# Dimensions of the stacked spreads array.
arr.shape

In [None]:
n=1000

# Reset the accumulator here so re-running this cell does not keep appending
# to results left over from a previous run.
sharpes=[]
for i in tqdm(range(n)):
    # Score one fresh random ranking over the whole subset.
    randomrank=generate_random_ranking_dict(SecuritiesCode)
    train_subset['Rank']=train_subset['SecuritiesCode'].map(randomrank)
    sharpe=calc_spread_return_sharpe(train_subset)
    sharpes.append(sharpe)

In [None]:
# Distribution of the Sharpe ratios obtained from the random rankings.
fig, ax = plt.subplots()
ax.hist(sharpes, bins=30)
# Label the figure so it can be read on its own.
ax.set(title='Sharpe ratio of randomly ranked portfolios',
       xlabel='Sharpe ratio',
       ylabel='Number of portfolios')
plt.show()

References:

https://www.kaggle.com/code/smeitoma/jpx-competition-metric-definition

https://www.kaggle.com/code/smeitoma/submission-demo

https://www.kaggle.com/code/smeitoma/train-demo

https://www.kaggle.com/code/kellibelcher/jpx-stock-market-analysis-prediction-with-lgbm