# Mean Reversion multiple clusters
## Future returns are correlated with past returns

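Daily return (defined here as a log return; the code below computes the simple return $P_{i}(t_2)/P_{i}(t_1) - 1$, which is nearly identical for small daily moves)
$$R_i = \ln\frac{P_{i}(t_2)}{P_{i}(t_1)}$$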
\
Average return over the $N$ cointegrated stocks (in the code below, the stocks that share the same sector on a given date)
$$ \bar{R}= \frac{1}{N} \sum \limits _{i=1} ^{N} R_{i}$$
\
Mean Difference
$$ \tilde{R_{i}} = R_{i} - \bar{R}$$

Short the stocks with a positive mean difference; buy the stocks with a negative mean difference.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)
from decimal import ROUND_HALF_UP, Decimal
from statsmodels.api import OLS
import random
import statsmodels.api as sm


In [2]:
# Security master data; create_features reads its "SecuritiesCode" and
# "33SectorName" columns to attach a sector to every price row.
stock_list = pd.read_csv('ds/stock_list.csv')

In [3]:
# Load the training and the supplemental price files.
train_stock_prices = pd.read_csv('ds/train_files/stock_prices.csv')
supplemental_stock_prices = pd.read_csv('ds/supplemental_files/stock_prices.csv')
# Stack both files into one frame; ignore_index=True renumbers the rows instead
# of keeping each file's own index.
# NOTE(review): data_stock_prices is not referenced in the cells visible here.
data_stock_prices = pd.concat([train_stock_prices,supplemental_stock_prices],ignore_index=True)

In [4]:
# Financial statement files.
# NOTE(review): neither frame is referenced in the cells visible here - confirm
# they are still needed.
# low_memory=False has pandas parse the file in a single pass, which avoids
# mixed-dtype inference on columns at the cost of more memory.
train_financials = pd.read_csv('ds/train_files/financials.csv',low_memory=False)
supplemental_financials = pd.read_csv('ds/supplemental_files/financials.csv')

In [5]:
def calc_adjusted_close(df):
    """Add a cumulative adjustment factor and an adjusted close to a price frame.

    The function does no grouping itself; create_features applies it to the
    rows of one SecuritiesCode at a time.

    Args:
        df (pd.DataFrame): rows with "Date", "Close" and "AdjustmentFactor" columns.
    Returns:
        (pd.DataFrame): a new frame sorted by ascending "Date" with the extra
        columns "cummulative_adjustment_factor" and "adjusted_close".
    """
    def _round_half_up_to_tenth(value):
        # Go through Decimal(str(...)) so that halves round up (10.25 -> 10.3)
        # rather than following binary floating point rounding.
        return float(Decimal(str(value)).quantize(Decimal("0.1"), rounding=ROUND_HALF_UP))

    # Newest row first: the running product then collects every adjustment
    # factor from the latest date back to (and including) each row.
    newest_first = df.sort_values("Date", ascending=False)
    running_factor = newest_first["AdjustmentFactor"].cumprod()
    newest_first.loc[:, "cummulative_adjustment_factor"] = running_factor
    newest_first.loc[:, "adjusted_close"] = (running_factor * newest_first["Close"]).map(_round_half_up_to_tenth)

    # Back to chronological order so the forward fill runs from past to future.
    oldest_first = newest_first.sort_values("Date")
    # A zero adjusted close is treated as missing and replaced by the last
    # earlier non-missing value.
    zero_price = oldest_first["adjusted_close"] == 0
    oldest_first.loc[zero_price, "adjusted_close"] = np.nan
    oldest_first.loc[:, "adjusted_close"] = oldest_first.loc[:, "adjusted_close"].ffill()
    return oldest_first

In [6]:

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2, *, plot: bool = True) -> float:
    """
    Sharpe ratio of the daily long/short spread return implied by a ranking.

    Args:
        df (pd.DataFrame): predicted results with 'Date', 'Rank' and 'Target' columns.
            Within each date 'Rank' must run from 0 to (number of rows - 1).
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        plot (bool): when True (default, the previous behaviour) the daily spread
            return series is plotted with pandas' plotting backend; pass False
            to compute the ratio without any plotting side effect.
    Returns:
        (float): sharpe ratio (mean of the daily spread returns divided by their standard deviation)
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results for a single date
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        # Linearly decaying weights: toprank_weight_ratio for the first pick down to 1 for the last.
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        # Target is the rate of change
        # Long leg: the portfolio_size lowest ranks, best rank weighted most.
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        # Short leg: the portfolio_size highest ranks, worst rank weighted most.
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    # Plotting is optional so the metric can be evaluated in loops or in
    # environments without a plotting backend.
    if plot:
        buf.plot()
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio

In [26]:
def create_features(df):
    """Build the mean-reversion feature table and a per-date rank from raw prices.

    Args:
        df (pd.DataFrame): raw stock prices with at least "RowId", "Date",
            "SecuritiesCode", "Close" and "AdjustmentFactor" columns.
    Returns:
        (pd.DataFrame): one row per (Date, SecuritiesCode) with the added columns
        "previous_adjusted_close", "daily_return", "33SectorName", "Mean"
        (mean daily return of the sector on that date), "Mean_Difference"
        (daily_return - Mean) and "Rank" (0 = most negative Mean_Difference
        of the date). Rows without a Mean_Difference are dropped.
    """
    prices = df.copy()
    prices["Date"] = pd.to_datetime(prices["Date"])
    prices = prices.drop(columns=["RowId"])
    # NOTE(review): the reason for excluding this date is not stated in this file.
    prices = prices.loc[prices["Date"] != "2020-10-01"]

    # Adjusted close is computed separately for every security, then the
    # pieces are put back into one frame ordered by date and code.
    per_security = prices.groupby("SecuritiesCode").apply(calc_adjusted_close)
    prices = (
        per_security.reset_index(drop=True)
        .sort_values(["Date", "SecuritiesCode"])
        .reset_index(drop=True)
    )

    # Simple day-over-day return; the first row of each security has no
    # previous close and therefore a NaN return.
    prices["previous_adjusted_close"] = prices.groupby("SecuritiesCode")["adjusted_close"].shift(1)
    prices["daily_return"] = prices["adjusted_close"] / prices["previous_adjusted_close"] - 1

    # Attach the sector name of every security from the stock list.
    sector_lookup = stock_list[["SecuritiesCode", "33SectorName"]].set_index(["SecuritiesCode"])
    prices = prices.join(sector_lookup, on=["SecuritiesCode"])

    # Index by (Date, sector) so the grouped mean aligns back onto every row.
    sector_keys = ["Date", "33SectorName"]
    prices = prices.set_index(sector_keys)
    prices["Mean"] = prices.groupby(sector_keys)["daily_return"].mean()
    prices = prices.reset_index()

    prices["Mean_Difference"] = prices["daily_return"] - prices["Mean"]
    prices = prices.dropna(subset=["Mean_Difference"])
    # Ascending rank: the stock furthest below its sector mean gets rank 0.
    prices["Rank"] = prices.groupby("Date")["Mean_Difference"].rank(method="first") - 1
    return prices

In [27]:
# Build the feature/rank table from the supplemental price file only.
test = create_features(supplemental_stock_prices)

In [29]:
# Inspect the feature rows of a single trading day.
test[test["Date"]=='2021-12-07']

Unnamed: 0,Date,33SectorName,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,cummulative_adjustment_factor,adjusted_close,previous_adjusted_close,daily_return,Mean,Mean_Difference,Rank
2000,2021-12-07,"Fishery, Agriculture and Forestry",1301,2998.0,3065.0,2990.0,3065.0,19100,1.0,,False,0.009820,1.0,3065.0,2971.0,0.031639,0.013088,0.018551,1794.0
2001,2021-12-07,"Fishery, Agriculture and Forestry",1332,569.0,569.0,535.0,556.0,6449200,1.0,,False,-0.019964,1.0,556.0,589.0,-0.056027,0.013088,-0.069115,9.0
2002,2021-12-07,"Fishery, Agriculture and Forestry",1333,2382.0,2417.0,2371.0,2409.0,127300,1.0,,False,-0.008386,1.0,2409.0,2377.0,0.013462,0.013088,0.000374,1017.0
2003,2021-12-07,"Fishery, Agriculture and Forestry",1375,1227.0,1266.0,1227.0,1264.0,128600,1.0,,False,-0.004819,1.0,1264.0,1224.0,0.032680,0.013088,0.019592,1809.0
2004,2021-12-07,"Fishery, Agriculture and Forestry",1376,1374.0,1395.0,1366.0,1395.0,5800,1.0,,False,0.004289,1.0,1395.0,1351.0,0.032568,0.013088,0.019480,1808.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,2021-12-07,Retail Trade,9990,526.0,535.0,524.0,535.0,57800,1.0,,False,0.005556,1.0,535.0,517.0,0.034816,0.023124,0.011693,1601.0
3996,2021-12-07,Wholesale Trade,9991,795.0,806.0,792.0,805.0,48500,1.0,,False,0.000000,1.0,805.0,785.0,0.025478,0.028871,-0.003393,791.0
3997,2021-12-07,Retail Trade,9993,1640.0,1640.0,1620.0,1620.0,6600,1.0,,False,0.005491,1.0,1620.0,1627.0,-0.004302,0.023124,-0.027426,91.0
3998,2021-12-07,Retail Trade,9994,2437.0,2440.0,2423.0,2440.0,5200,1.0,,False,-0.001231,1.0,2440.0,2418.0,0.009098,0.023124,-0.014025,315.0


In [30]:
    # Scratch cell: repeats the first steps of create_features (up to and
    # including the sector join) on the supplemental prices, leaving the
    # intermediate frame in the global `df` for inspection.
    df = supplemental_stock_prices.copy()
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.drop(["RowId"],axis=1)
    df = df[df["Date"]!="2020-10-01"]
    df = df.groupby("SecuritiesCode").apply(calc_adjusted_close).reset_index(drop=True).sort_values(["Date","SecuritiesCode"]).reset_index(drop=True)
    # shift(1) within each security: the first date has no previous close,
    # so its daily_return is NaN.
    df["previous_adjusted_close"] = df.groupby("SecuritiesCode")["adjusted_close"].shift(1)
    df["daily_return"] = (df["adjusted_close"]/df["previous_adjusted_close"]) - 1
    df = df.join(stock_list[["SecuritiesCode","33SectorName"]].set_index(["SecuritiesCode"]),on=["SecuritiesCode"])

In [31]:
# Mean daily return per (Date, sector). The first date is all NaN because no
# previous close exists there to compute a return from.
df.groupby(['Date','33SectorName'])['daily_return'].mean()


Date        33SectorName                                 
2021-12-06  Air Transportation                                    NaN
            Banks                                                 NaN
            Chemicals                                             NaN
            Construction                                          NaN
            Electric Appliances                                   NaN
                                                               ...   
2022-04-28  Services                                         0.006695
            Textiles and Apparels                            0.033947
            Transportation Equipment                         0.045872
            Warehousing and Harbor Transportation Service    0.028895
            Wholesale Trade                                  0.027715
Name: daily_return, Length: 3234, dtype: float64

In [None]:
# Score the mean-reversion ranking: Sharpe ratio of the daily spread return
# with the default portfolio size; the call also plots the daily spread series.
calc_spread_return_sharpe(test)