# 1. Import

In [173]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression

from matplotlib import pyplot as plt

from pathlib import Path
from tqdm import tqdm

# 2. Load Data

In [235]:
# load daily industry returns
df_daily = pd.read_csv(str(Path().absolute()) + "/data/48_Industry_Portfolios_Daily.CSV", index_col = 0)
df_daily.index = pd.to_datetime(df_daily.index, format = "%Y%m%d")
df_daily = df_daily / 100

# handle NA values
mask = (df_daily <= -0.99)
df_daily[mask] = np.nan

In [205]:
# load Fama French Daily Factors
ff = pd.read_csv(str(Path().absolute()) + "/data/F-F_Research_Data_Factors_daily.CSV", index_col = 0)
ff.index = pd.to_datetime(ff.index, format = "%Y%m%d")
ff = ff / 100

In [236]:
# load 4 dataframes (returns, nb industries, avg firm size, Sum of BE / Sum of ME)
df = pd.read_csv(str(Path().absolute()) + "/data/48_Industry_Portfolios.CSV") 

# split these dataframes
df_list = np.split(df, df[df.isnull().all(1)].index, axis = 0) 

# clean data and convert date column to index
for i in range(len(df_list)):
    df_list[i] = pd.DataFrame(df_list[i])  
    df_list[i] = df_list[i].dropna()  
    df_list[i].loc[:, "Date"] = df_list[i].loc[:, "Date"].astype("int")  
    df_list[i] = df_list[i].set_index("Date")  
    
    # last data frame has yearly data
    if i == (len(df_list) - 1): 
        df_list[i].index = pd.to_datetime(df_list[i].index, format = "%Y")
        df_list[i].index = df_list[i].index + pd.DateOffset(months = 6)
    else:
        df_list[i].index = pd.to_datetime(df_list[i].index, format = "%Y%m")

# create a dataframe of excess returns, nb of industries and avg sizes
df = df_list[0] / 100
mask = (df <= -0.99)
df[mask] = np.nan

nb_industries = df_list[1]
nb_industries[mask] = np.nan

avg_size = df_list[2]
avg_size[mask] = np.nan

be_over_me = df_list[3]
be_over_me[mask] = np.nan

  df_list[i].loc[:, "Date"] = df_list[i].loc[:, "Date"].astype("int")
  df_list[i].loc[:, "Date"] = df_list[i].loc[:, "Date"].astype("int")
  df_list[i].loc[:, "Date"] = df_list[i].loc[:, "Date"].astype("int")
  df_list[i].loc[:, "Date"] = df_list[i].loc[:, "Date"].astype("int")


# 3. Calculate Characteristics

## 3.1 Market Cap, Book-to-Market, Momentum

In [237]:
# market cap of each industry over time
mkt_cap = nb_industries * avg_size

# book value to market value
be_over_me = be_over_me.resample("1MS").ffill()

# momentum with monthly data
momentum = df.rolling(12).mean()

## 3.2 Beta

In [190]:
def ols(subset, df, industry):
        
        df_roll = df.loc[subset.index, :]
        
        return linregress(df_roll.loc[:,"Mkt-RF"], df_roll.loc[:,industry]).slope

In [115]:
betas = pd.DataFrame(index=df_daily.index, columns=df_daily.columns)

for industry, industry_timeseries in tqdm(df_daily.items()):
    
    temp_df = pd.concat([(industry_timeseries - ff.loc[:, "RF"]), ff.loc[:, "Mkt-RF"]], axis=1)
    temp_df.columns =[industry, "Mkt-RF"]
    betas.loc[:, industry] = temp_df["Mkt-RF"].rolling(12).apply(ols, args = (temp_df, industry))


  betas.loc[:, industry] = temp_df["Mkt-RF"].rolling(12).apply(ols, args = (temp_df, industry))
1it [00:08,  8.77s/it]


KeyboardInterrupt: 

In [None]:
# resample to select end of month data and then convert the index to start of month data
betas = betas.resample("M").last().resample("MS").last()

In [None]:
indices = pd.date_range("1927-06-01", "2024-12-31", freq = "M")
betas = pd.DataFrame(index=indices, columns=df_daily.columns)

for industry, industry_timeseries in tqdm(df_daily.items()):
    for date in indices:
        
        # start and end date
        # we take last year of data (remove 1 day since it includes last day of a month we don't want)
        start = date - pd.DateOffset(months = 12) + pd.Timedelta(days = 1)
        end = date
        
        X = ff.loc[start:end,["Mkt-RF"]]
        y = industry_timeseries.loc[start:end] - ff.loc[start:end, "RF"]
        
        try:
            beta = LinearRegression(fit_intercept=True).fit(X, y).coef_[0]
        except ValueError:
            beta = np.nan
        
        betas.loc[end, industry] = beta

        
# convert the month ends to starts of months
betas = betas.resample("MS").last()

48it [00:36,  1.31it/s]


## 3.3 Idiosyncratic Risk

In [240]:
# Get unique year-month pairs
monthly_periods = df_daily.resample('MS').mean().index

risk = pd.DataFrame(index=indices, columns=df_daily.columns)

for date in tqdm(monthly_periods):
    # Filter data for the entire month
    mask = (df_daily.index.year == date.year) & (df_daily.index.month == date.month)
    ret_data = df_daily[mask]
    factors_data = ff[mask]

    # Ensure aligned data
    aligned_data = ret_data.join(factors_data, how='inner').dropna()

    for industry, industry_timeseries in ret_data.items():

            X = aligned_data.loc[:,["Mkt-RF", "SMB", "HML"]]
            y = industry_timeseries - aligned_data.loc[:, "RF"]
            
            try:
                reg = LinearRegression(fit_intercept=True).fit(X, y)
                resid = y - reg.predict(X)
            except ValueError:
                resid = np.nan
            risk.loc[date, industry] = np.std(resid)
            

100%|███████████████████████████████████████| 1182/1182 [00:40<00:00, 29.22it/s]


## 3.4 Long-Short Portfolios

In [241]:
def position(df, upper_cutoff, lower_cutoff):
    df_pos = df.rank(axis = 1)
    df_pos[df_pos <= lower_cutoff] = 1
    df_pos[df_pos >= upper_cutoff] = -1
    df_pos[(df_pos <= upper_cutoff) & (df_pos >= lower_cutoff)] = 0
    
    return df_pos

In [242]:
# long-short portfolio positions
pos_mkt_cap = position(mkt_cap, len(mkt_cap) - 5, 5)
pos_be_over_me = position(be_over_me, len(mkt_cap) - 5, 5)
pos_mom = position(momentum, len(mkt_cap) - 5, 5)
pos_betas = position(betas, len(mkt_cap) - 5, 5)
pos_risks = position(risk, len(mkt_cap) - 5, 5)

In [252]:
def equal_weight_rets(df, position, n_longs):
    
    performance = ((df * position) / n_longs).sum(axis = 1)
    return performance


def performance_measure(rets, rf):
    avg_ret = rets.mean()
    sharpe = (rets.mean() - rf)/ rets.std()
    ALPHA NEEDED
    return (avg_ret, sharpe)

In [250]:
rets = equal_weight_rets(df, pos_mkt_cap, 5)

In [256]:
performance_measure(rets, ff.loc[:, "RF"].mean())

(0.012563384094754652, 0.14497092146645713)

In [257]:
rets = equal_weight_rets(df, pos_be_over_me, 5)

In [258]:
performance_measure(rets, ff.loc[:, "RF"].mean())

(0.00920979695431472, 0.15877517034817096)