# Pairs Traiding through Unsupervised Learning

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

tqdm.pandas()

FILEPATH = "./data/historic_characteristics.csv"
FILEPATH_PARQ = "./data/historic_characteristics.parquet"
FILEPATH_MOM_PARQ = "./data/data_mom.parquet"
FILEPATH_CLEAN_PARQ = "./data/data_cleaning.parquet"
FILEPATH_PRE_PARQ = "./data/data_preprocessed.parquet"

MIN_YEAR=1980
MAX_YEAR=2021
CHUNKS = 10000

FEATURES = [
    "DATE", "absacc", "acc", "aeavol", "age", "agr", "baspread", "beta", "betasq", "bm",
    "bm_ia", "cash", "cashdebt", "cashpr", "cfp", "cfp_ia", "chatoia", "chcsho", "chempia",
    "chinv", "chmom", "chpmia", "chtx", "cinvest", "convind", "currat", "depr", "divi",
    "divo", "dolvol", "dy", "ear", "egr", "ep", "gma", "herf", "hire", "idiovol", "ill",
    "indmom", "invest", "lev", "lgr", "maxret", "mom1m", "ms", "mve_ia", "mvel1", "nincr",
    "operprof", "pchcapx_ia", "pchcurrat", "pchdepr", "pchgm_pchsale", "pchquick",
    "pchsale_pchrect", "pctacc", "permno", "pricedelay", "ps", "quick", "rd", "retvol",
    "roaq", "roeq", "roic", "rsup", "salecash", "salerec", "securedind", "sgr", "sic2",
    "sin", "sp", "std_dolvol", "std_turn", "tang", "tb", "turn", "zerotrade"
]
WINDOW = 48
MOM_FEATURES = [f"mom{i}m" for i in range(1, WINDOW + 1)]

  from .autonotebook import tqdm as notebook_tqdm


# Data Wrangling

## Parquet Dataset Creation

The dataset is large, around 3GB of company characteristics from 1985 to 2021. This dataset has been currated for the papers ["Empirical Asset Pricing via Machine Learning"](https://dachxiu.chicagobooth.edu/download/ML.pdf)(2018) and ["Autoencoder Asset Pricing Models." ](https://www.sciencedirect.com/science/article/abs/pii/S0304407620301998)(2019) by Shihao Gu, Bryan Kelly and Dacheng Xiu. The raw format is available for download from the authors [personal website](https://sites.google.com/view/agoyal145) (or reach out to me for a currated dataset). The dataset has 94 1 month Lagged Firm Characteristics (as the CRSP releases these with a month delay, from the notes in their papers). Note that this CRSP datasets don't have tickers or company names, but use a permanent indentifier instead, which if you have a bloomberg terminal or access to a research site that brokers this data, you can easily convert to the company ticker.

In [2]:
import pyarrow as pa
import pyarrow.parquet as pq
import dask
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd

CONVERT = False
if CONVERT:
    # In case its in CSV, convert it to parquet for easy processing.
    chars_df = pd.read_csv(FILEPATH)[FEATURES]
    chars_df['DATE'] = pd.to_datetime(chars_df['DATE'], format='%Y%m%d')
    chars_df = chars_df[(chars_df['DATE'].dt.year >= MIN_YEAR) & (chars_df['DATE'].dt.year <= MAX_YEAR)]
    chars_df = chars_df.sort_values("DATE")
    chars_df.to_parquet(FILEPATH_PARQ, index=False, compression="snappy")
else:
    chars_df = pd.read_parquet(FILEPATH_PRE_PARQ) # use FILEPATH_PARQ without preproc or FILEPATH_PRE_PARQ

chars_df.head(5)

Unnamed: 0,DATE,absacc,acc,aeavol,age,agr,baspread,beta,betasq,bm,...,mom39m,mom40m,mom41m,mom42m,mom43m,mom44m,mom45m,mom46m,mom47m,mom48m
0,2018-01-31,0.059018,-0.059018,1.667971,31.0,-0.064006,0.025255,0.643161,0.413656,0.275043,...,0.098912,0.090917,0.143577,0.219788,0.216057,0.146124,0.141801,0.069936,0.03304,-0.043765
1,2018-02-28,0.059018,-0.059018,1.667971,31.0,-0.064006,0.024363,0.617265,0.381017,0.275043,...,0.098912,0.090917,0.143577,0.219788,0.216057,0.146124,0.141801,0.069936,0.03304,-0.043765
2,2018-03-29,0.059018,-0.059018,1.667971,31.0,-0.064006,0.029453,0.585592,0.342918,0.275043,...,0.098912,0.090917,0.143577,0.219788,0.216057,0.146124,0.141801,0.069936,0.03304,-0.043765
3,2018-04-30,0.055709,-0.055709,1.667971,32.0,-0.097081,0.024292,0.649596,0.421975,0.298639,...,0.098912,0.090917,0.143577,0.219788,0.216057,0.146124,0.141801,0.069936,0.03304,-0.043765
4,2018-05-31,0.055709,-0.055709,0.595033,32.0,-0.097081,0.020537,0.673609,0.453749,0.298639,...,0.098912,0.090917,0.143577,0.219788,0.216057,0.146124,0.141801,0.069936,0.03304,-0.043765


## Sanitinzation and Feature Engineering

To sanitinize we drop any company with insufficiant data to fill a window, and we fill any missing characteristic with the median of that window. We then perform a rolling window to calculate the MOM factor for 2 to 64 months, the data already has a rolling 1 month momentum. To read about momentum stratgies check out the article [Momentum and Reversion Trading Signals Analysis](https://medium.com/call-for-atlas/momentum-and-reversion-the-poor-mans-trading-strategies-9b8e1e6d3496).


In [3]:
def interpolate_with_median(group):
    rolling_median = group.rolling(window=WINDOW, min_periods=1).median()
    group= group.fillna(rolling_median).bfill()

    return group


PRE_PROC=False
if PRE_PROC:
    valid_groups = chars_df.groupby('permno').filter(lambda x: len(x) >= WINDOW and x['mom1m'].isna().sum() <= 2)
    for i in tqdm(range(2, WINDOW + 1), desc="moms"):
        rolling_func = lambda x: (x + 1).rolling(window=i).apply(np.prod, raw=True) - 1
        valid_groups[f'mom{i}m'] = valid_groups.groupby('permno')['mom1m'].transform(rolling_func)

    numerical_columns = valid_groups.select_dtypes(include=['float64', 'int64']).columns

    tqdm.pandas(desc="interpolate_with_median")
    valid_groups[numerical_columns]= valid_groups.groupby('permno')[numerical_columns].progress_transform(lambda x: interpolate_with_median(x))
    valid_groups.to_parquet(FILEPATH_PRE_PARQ, index=False, compression="snappy")
    chars_df = valid_groups

chars_df.tail(5)

Unnamed: 0,DATE,absacc,acc,aeavol,age,agr,baspread,beta,betasq,bm,...,mom39m,mom40m,mom41m,mom42m,mom43m,mom44m,mom45m,mom46m,mom47m,mom48m
216715,2021-08-31,0.1208,-0.1208,-0.526523,11.0,-0.519951,0.036227,1.461826,2.136934,0.033035,...,10.070791,11.226,8.484363,8.183217,9.450315,9.534905,10.278725,10.888254,16.077829,16.549386
216716,2021-09-30,0.1208,-0.1208,-0.526523,11.0,-0.519951,0.0269,1.454301,2.114992,0.033035,...,11.234167,10.852448,12.089221,9.154009,8.831601,10.188162,10.278725,10.888254,16.077829,16.549386
216717,2021-10-29,0.1208,-0.1208,-0.106147,11.0,-0.519951,0.025149,1.436531,2.063623,0.033035,...,9.706188,11.895329,11.492982,12.796592,9.702755,9.362923,10.792796,10.888254,16.077829,16.549386
216718,2021-11-30,0.1208,-0.1208,-0.106147,11.0,-0.519951,0.031566,1.508117,2.274417,0.033035,...,16.691311,14.379756,17.524523,16.946539,18.819214,14.374825,13.886646,15.940702,16.077829,16.549386
216719,2021-12-31,0.1208,-0.1208,-0.106147,11.0,-0.519951,0.056249,1.504264,2.262812,0.033035,...,16.967672,17.179807,14.804426,18.036026,17.442083,19.366467,14.799359,14.2977,16.408472,16.549386


## Firm Characteristics

| Acronym  | Firm characteristic                                           | Acronym    | Firm characteristic                                       |
|----------|--------------------------------------------------------------|------------|----------------------------------------------------------|
| absacc   | Absolute accruals                                            | invest     | Capital expenditures and inventory                        |
| acc      | Working capital accruals                                     | IPO        | New equity issue                                          |
| aeavol   | Abnormal earnings announcement volume                        | lev        | Leverage                                                  |
| age      | # years since first Compustat coverage                       | lgr        | Growth in long-term debt                                  |
| agr      | Asset growth                                                  | maxret     | Maximum daily return                                      |
| baspread | Bid-ask spread                                               | ms         | Financial statement score                                 |
| beta     | Beta                                                         | mve        | Size                                                      |
| betasq   | Beta squared                                                 | mve ia     | Industry-adjusted size                                    |
| bm       | Book-to-market                                               | nincr      | Number of earnings increases                              |
| bm ia    | Industry-adjusted book to market                             | operprof   | Operating profitability                                   |
| cash     | Cash holdings                                                | pchcapx ia | Industry adjusted % change in capital expenditures        |
| cashdebt | Cash flow to debt                                            | pchcurrat  | % change in current ratio                                 |
| cashpr   | Cash productivity                                            | pchdepr    | % change in depreciation                                  |
| cfp      | Cash flow to price ratio                                     | pchgm      | % change in gross margin                                  |
| cfp ia   | Industry-adjusted cash flow to price ratio                   | pchsale    | % change in sales                                         |
| chatoia  | Industry-adjusted change in asset turnover                   | pchquick   | % change in quick ratio                                   |
| chcsho   | Change in shares outstanding                                 | pctacc     | Percent accruals                                          |
| chempia  | Industry-adjusted change in employees                        | pricedelay | Price delay                                               |
| chinv    | Change in inventory                                          | ps         | Financial statements score                                |
| chmom    | Change in 6-month momentum                                   | quick      | Quick ratio                                               |
| chpmia   | Industry-adjusted change in profit margin                    | rd         | R&D increase                                              |
| chtx     | Change in tax expense                                        | retvol     | Return volatility                                         |
| cinvest  | Corporate investment                                         | roaq       | Return on assets                                          |
| convind  | Convertible debt indicator                                   | roeq       | Return on equity                                          |
| currat   | Current ratio                                                | roic       | Return on invested capital                                |
| depr     | Depreciation / PP&E                                          | rsup       | Revenue surprise                                          |
| divi     | Dividend initiation                                          | sgr        | Sales growth                                              |
| divo     | Dividend omission                                            | sin        | Sin stocks                                                |
| dolvol   | Dollar trading volume                                        | SP         | Sales to price                                            |
| dy       | Dividend to price                                            | std dolvol | Volatility of liquidity (dollar trading volume)          |
| ear      | Earnings announcement return                                 | std turn   | Volatility of liquidity (share turnover)                  |
| egr      | Growth in common shareholder equity                          | sue        | Unexpected quarterly earnings                             |
| ep       | Earnings to price                                            | tang       | Debt capacity/firm tangibility                            |
| gma      | Gross profitability                                          | tb         | Tax income to book income                                 |
| herf     | Industry sales concentration                                 | turn       | Share turnover                                            |
| hire     | Employee growth rate                                         | zerotrade  | Zero trading days                                         |
| idiovol  | Idiosyncratic return volatility                              |            |                                                           |
| ill      | Illiquidity                                                  |            |                                                           |
| indmom   | Industry momentum                                            |            |                                                           |


Additionally, there are PERMNO columns to ID the company, and a **SIC code** to ID the industry from [NAICS](https://www.naics.com/sic-codes-industry-drilldown/) to compliment the industires momentum **INDMOM**.


## Dim Reduction with PCA

We perform standardization and PCA at 95% variance, to center the data's means for the clustering algorithims and reduce its dimensionality.

In [8]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

chars_pca_df = chars_df.copy()

scaler = StandardScaler()
pca = PCA(0.95)

features_df = chars_pca_df.drop(['DATE', 'permno'], axis=1).bfill()

pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
pca_df = pipeline.fit_transform(features_df)

pca_result_df = pd.DataFrame(data=pca_df, index=chars_pca_df.index)
pca_components_cols = pca_result_df.columns
pca_result_df['permno'] = chars_pca_df['permno']
pca_result_df.index = chars_pca_df['DATE']
pca_result_df[MOM_FEATURES] = chars_pca_df[MOM_FEATURES]
pca_result_df.tail(1)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,mom39m,mom40m,mom41m,mom42m,mom43m,mom44m,mom45m,mom46m,mom47m,mom48m
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-31,74.16544,-19.202222,7.136681,9.836218,-1.114852,2.553955,-4.042337,1.81147,-19.015538,6.504837,...,,,,,,,,,,


# Cluster Agglomerative

In [9]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import AgglomerativeClustering

models_dfs = []
labels = []
cluster_membership = []
for month, data in tqdm(pca_result_df.groupby(pd.Grouper(freq='M')), desc="Processing months"):
    pca_data = data[pca_components_cols]
    if len(pca_data) < 2:
        print(f"Skipping {month} due to insufficient data.")
        continue

    neigh = NearestNeighbors(n_neighbors=2)
    nbrs = neigh.fit(pca_data)
    distances, indices = nbrs.kneighbors(pca_data)

    distances = np.sort(distances, axis=0)
    distances = distances[:, 1]
    distance_alpha_thresh = np.percentile(distances, 30)
    agg_model = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_alpha_thresh, linkage='average')
    agg_model.fit(pca_data)

    cluster_df = pd.DataFrame(data['permno'].copy())
    cluster_df['Cluster'] = agg_model.labels_
    cluster_df['DATE'] = month

    cluster_membership.append(cluster_df)

    models_dfs.append({'DATE': month, 'NumClusters': agg_model.n_clusters_})

models_df = pd.DataFrame(models_dfs)
cluster_membership_df = pd.concat(cluster_membership, ignore_index=True)

cluster_membership_df

  for month, data in tqdm(pca_result_df.groupby(pd.Grouper(freq='M')), desc="Processing months"):
Processing months: 100%|██████████| 48/48 [00:50<00:00,  1.06s/it]


Unnamed: 0,permno,Cluster,DATE
0,10026,3691,2018-01-31
1,10028,2047,2018-01-31
2,10032,14,2018-01-31
3,10044,3545,2018-01-31
4,10065,3329,2018-01-31
...,...,...,...
216715,93423,1337,2021-12-31
216716,93426,0,2021-12-31
216717,93427,607,2021-12-31
216718,93434,337,2021-12-31


In [10]:
models_df

Unnamed: 0,DATE,NumClusters
0,2018-01-31,3694
1,2018-02-28,3697
2,2018-03-31,3694
3,2018-04-30,3693
4,2018-05-31,3700
5,2018-06-30,3696
6,2018-07-31,3689
7,2018-08-31,3659
8,2018-09-30,3672
9,2018-10-31,3684


# Trade Simulation

The trade will take the following steps:
1. Check that the security is in the cluster.
2. Get cross-sectional standard dev.
3. split into deciles.
4. if first > last by > 1 std - there is a statarb opportunity.
5. Select Long-Short for that month, and close securities from previous month which have reversed back to their normal distance.

In [14]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

trade_opportunities = []

for month, data in tqdm(pca_result_df.groupby(pd.Grouper(freq='M')), desc="Processing months"):
    cross_sec_std = data['mom1m'].std()

    # Create a portfolio DataFrame and assign deciles
    portfolio = data.copy()
    portfolio['Decile'] = pd.qcut(portfolio['mom1m'], 10, labels=False, duplicates='drop')

    # Iterate through each decile to identify long and short opportunities
    for decile in range(10):
        # Calculate the average 'mom1m' for the current decile
        decile_avg = portfolio[portfolio['Decile'] == decile]['mom1m'].mean()

        # Assuming a strategy where each decile is evaluated for potential long/short opportunities
        # This simplistic example assumes the opportunity is based on the decile's deviation from the cross-sectional mean
        # Adjust this logic as per your specific strategy requirements
        if decile_avg > cross_sec_std:
            longs = portfolio[portfolio['Decile'] == decile]['permno'].tolist()
            trade_opportunities.append({
                'Month': f"{month.year}-{month.month:02d}",
                'Decile': decile,
                'Action': 'Long',
                'PermNos': longs
            })
        elif decile_avg < -cross_sec_std:
            shorts = portfolio[portfolio['Decile'] == decile]['permno'].tolist()
            trade_opportunities.append({
                'Month': f"{month.year}-{month.month:02d}",
                'Decile': decile,
                'Action': 'Short',
                'PermNos': shorts
            })

trade_opportunities_df = pd.DataFrame(trade_opportunities)

print(trade_opportunities_df.tail())


  for month, data in tqdm(pca_result_df.groupby(pd.Grouper(freq='M')), desc="Processing months"):
Processing months: 100%|██████████| 48/48 [00:00<00:00, 151.42it/s]

Empty DataFrame
Columns: []
Index: []



