# Preparing the trade data

In this notebook we merge the datasets and prepare the list of aggregated trades.

In [None]:
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm

from utils import USER, ITEM

In [None]:
# Holdings data, read with pd.read_parquet
input_path = 'source_data/scrape_parsed.parquet'
# Tab-separated CIK table; the notebook uses its 'cik' and 'cikname' columns
ciknames_path = 'source_data/cikmap.tab'
# Tab-separated CUSIP table, merged into the holdings to obtain 'cusipname'
cusips_path = 'source_data/cusips.tsv'
# Destination of the aggregated trades, written as a tab-separated file
output_path = 'trades.tsv'

First we load the holdings data and convert dates to a proper date format.

In [None]:
print(f'Loading {input_path}...')
df = pd.read_parquet(input_path)
# Both date columns are stored as YYYYMMDD values; parse them into datetimes.
for date_col in ('rdate', 'fdate'):
    df[date_col] = pd.to_datetime(df[date_col], format='%Y%m%d')
df

Next we load the CIK (investor identifier) data and map each CIK to the last name listed for it (some investors changed their name while keeping the same CIK).

In [None]:
ciknames = pd.read_csv(ciknames_path, sep='\t')
# Keep one name per CIK: the name on the last row where that CIK appears in the file.
names_by_cik = ciknames.groupby('cik', as_index=False)['cikname']
ciknames_prepared = names_by_cik.apply(lambda names: names.iloc[-1])
ciknames_prepared

In [None]:
# Count the distinct values in each column of the prepared CIK-to-name mapping.
# A lower count for 'cikname' than for 'cik' means several CIKs share a name.
ciknames_prepared.nunique()

Now we join the holdings data to the CIK data and the CUSIP data in order to get investor and security names.

In [None]:
# Left join keeps every holdings row; rows without a match get a missing cikname.
# No key is given, so pandas joins on the columns the two frames have in common.
df = df.merge(ciknames_prepared, how='left')

In [None]:
cusipnames = pd.read_csv(cusips_path, sep='\t')
# Left join keeps every holdings row; securities without a match get a missing cusipname.
df = pd.merge(df, cusipnames, 'left')
# Fall back to the raw CUSIP code when no security name is known.
# Assign the result back explicitly: calling fillna(..., inplace=True) on the
# `df.cusipname` accessor is a chained in-place update, which warns on recent
# pandas and leaves `df` unchanged once Copy-on-Write is enabled.
df['cusipname'] = df['cusipname'].fillna(df['cusip'])
df

Next we compute the aggregate trades. We define an aggregate trade as a new holding: a security that is present in an investor's holdings for the current quarter but not for the previous one.

In [None]:
# Collect, for every (investor, report date) pair, the set of securities held
holdings_by_investor_date = df.groupby(['cikname', 'rdate'])
df_groups = holdings_by_investor_date['cusipname'].apply(set)

In [None]:
# Computing the new securities for each investor and date.
# For every investor, each report is compared with the one immediately before it
# in that investor's own history; the securities present now but absent then are
# recorded as [cikname, cusipname, rdate] rows. An investor's first report has
# nothing to be compared with and therefore produces no rows.
diff_data = []
# Iterate over the investors that actually exist in df_groups. Rows with a missing
# cikname are dropped by groupby, so looping over df.cikname.unique() would
# raise a KeyError on the NaN entry.
investor_names = df_groups.index.get_level_values('cikname').unique()
for cikname in tqdm(investor_names):
    g = df_groups[cikname]  # holdings sets of this investor, indexed by rdate
    assert g.index.is_monotonic_increasing and g.index.is_unique, 'dates should be ascending'
    # Walk over consecutive (previous, current) reports of the same investor
    for rdate, previous, current in zip(g.index[1:], g.iloc[:-1], g.iloc[1:]):
        for cusip in current - previous:
            diff_data.append([cikname, cusip, rdate])

In [None]:
# Turn the collected rows into a dataframe ordered by date, investor and security
df_diff = (
    pd.DataFrame(diff_data, columns=['cikname', 'cusipname', 'rdate'])
    .sort_values(['rdate', 'cikname', 'cusipname'])
    .reset_index(drop=True)
)
df_diff

Now we create the final columns and save the result as the tab-separated file 'trades.tsv'.

In [None]:
df_diff[USER] = df_diff.cikname
df_diff[ITEM] = df_diff.cusipname
df_diff['date'] = pd.to_datetime(df_diff.rdate, format='%Y%m%d')
df_diff['trade'] = 1 # Just to facilitate analytics
print(f'Saving to {output_path}...')
df_diff[['date', 'trade', USER, ITEM]].to_csv(output_path, index=False, sep='\t')
!head {output_path}