In [246]:
import pandas as pd
import os 

In [247]:
# Root of the data tree. The trailing slash matters: every path below is
# built by plain string concatenation onto this prefix.
PREFIX = "../data/"
# Raw trade disclosures (input).
TRADES_PATH = f"{PREFIX}trades/trades.csv"
# Cleaned output root; derived from PREFIX so relocating the data tree moves
# inputs and outputs together (resolves to "../data/cleaned" as before).
OUTPUT_DIR = f"{PREFIX}cleaned"

# First and last calendar year of trades to keep (both ends inclusive in the
# date filter further down).
START = 2014
END = 2023

In [248]:
# Load the raw trades table from TRADES_PATH; the cleaning cells below
# filter and reshape this frame.
trades_df = pd.read_csv(TRADES_PATH)

In [249]:
# Tag each row by whether it carries a usable ticker: a non-empty string is
# labelled "Stock"; anything else (missing / non-string / empty) is "Other".
trades_df = trades_df.assign(
    asset_type=trades_df["ticker"].apply(
        lambda ticker: "Stock" if isinstance(ticker, str) and len(ticker) > 0 else "Other"
    )
)

# Keep only the "Stock" rows. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning or risk writing to a temporary slice.
trades_df = trades_df.loc[trades_df["asset_type"] == "Stock"].copy()

In [250]:

# Representative dollar value assigned to each disclosed amount bracket.
# Keys must match the raw "amount" strings exactly (after whitespace strip).
# NOTE(review): "$15,001 - $50,000" maps to 35000 while the near-identical
# "$15,000 - $50,000" maps to 32500 — confirm which estimate is intended.
amount_mapping = {
    "$1,001 - $15,000": 8000,
    "$15,001 - $50,000": 35000,
    "$50,001 - $100,000": 75000,
    "$100,001 - $250,000": 175000,
    "$1,001 -": 1000,
    "$250,001 - $500,000": 375000,
    "$500,001 - $1,000,000": 750000,
    "$5,000,001 - $25,000,000": 15000000,
    "$1,000,001 - $5,000,000": 3000000,
    "$1,000,000 +": 1000000,
    "$1,000 - $15,000": 8000,
    "$15,000 - $50,000": 32500,
    "$50,000,000 +": 50000000,
    "$1,000,000 - $5,000,000": 3000000,
    "$25,000,001 - $50,000,000": 37500000,
}

# Derive a chamber label from the is_senator flag (truthy -> "Senate").
trades_df["chamber"] = trades_df["is_senator"].apply(
    lambda x: "Senate" if x else "House"
)

# Normalise the raw bracket text so it can be looked up in amount_mapping.
trades_df["amount"] = trades_df["amount"].astype(str).str.strip()

# Brackets with no entry in amount_mapping would silently become NaN in the
# .map() below, so surface them instead of discarding the information.
unmapped_amounts = trades_df.loc[
    ~trades_df["amount"].isin(amount_mapping.keys()), "amount"
].unique()
if len(unmapped_amounts) > 0:
    print(f"WARNING: unmapped amount values will become NaN: {list(unmapped_amounts)}")

trades_df["amount"] = trades_df["amount"].map(amount_mapping)

# Restrict to the columns the rest of the notebook uses.
trades_df = trades_df[
    [
        "transaction_date",
        "ticker",
        "asset_description",
        "type",
        "amount",
        "state",
        "ptr_link",
        "industry",
        "sector",
        "party",
        "chamber",
        "member",
    ]
]

# Independent working copy for the date clean-up that follows. The original
# used rename() with an identity mapping, which renamed nothing and only
# served to produce this copy.
new_trades_df = trades_df.copy()

In [251]:
# Show the distinct mapped amount values produced by the previous cell.
print(trades_df["amount"].unique())

# Senate rows are parsed with an explicit MM/DD/YYYY format and rewritten as
# ISO strings, so the whole column can then be parsed in one pass below.
# Values that do not match the format become missing (errors='coerce').
senate_mask = new_trades_df['chamber'] == 'Senate'
new_trades_df.loc[senate_mask, 'transaction_date'] = (
    pd.to_datetime(
        new_trades_df.loc[senate_mask, 'transaction_date'],
        format='%m/%d/%Y',
        errors='coerce'
    )
    .dt.strftime('%Y-%m-%d')
)

# Single conversion of the full column to datetime; unparseable values
# become NaT and are dropped immediately afterwards.
new_trades_df['transaction_date'] = pd.to_datetime(
    new_trades_df['transaction_date'],
    errors='coerce'
)
new_trades_df = new_trades_df.dropna(subset=['transaction_date'])

# Keep trades dated from Jan 1 of START through Dec 31 of END (inclusive).
start = pd.Timestamp(f'{START}-01-01')
end = pd.Timestamp(f'{END}-12-31')
mask = (
    (new_trades_df['transaction_date'] >= start)
    & (new_trades_df['transaction_date'] <= end)
)
filtered = new_trades_df.loc[mask].copy()

# Drop rows whose ticker is blank or the '--' placeholder. The mask is built
# from `filtered` itself so its index matches the frame being indexed, and it
# compares strings directly rather than relying on Series.replace('', None),
# whose handling of an explicit None value differs between pandas versions.
ticker_text = filtered['ticker'].fillna('').astype(str).str.strip()
has_ticker = (ticker_text != '') & (ticker_text != '--')
filtered = filtered.loc[has_ticker].copy()


[    8000    35000    75000   175000     1000   375000   750000 15000000
  3000000  1000000    32500 50000000 37500000]


In [252]:
def _load_member_votes(chamber_dir, year):
    """Read one chamber/year member-votes CSV and tidy the legislator names.

    Parameters
    ----------
    chamber_dir : str
        Sub-directory under ``votes/members`` to read from
        (``"house"`` or ``"senate"``).
    year : int
        Year whose ``<year>.csv`` file is loaded.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, with every parenthesised fragment removed from
        ``legislator_name`` and surrounding whitespace stripped.
    """
    votes = pd.read_csv(f"{PREFIX}votes/members/{chamber_dir}/{year}.csv")
    votes['legislator_name'] = (
        votes['legislator_name']
        .str.replace(r'\(.*?\)', '', regex=True)
        .str.strip()
    )
    return votes


# One frame per chamber and year. The individual names are kept because the
# next cell refers to them explicitly.
df_house_2014 = _load_member_votes("house", 2014)
df_house_2015 = _load_member_votes("house", 2015)
df_house_2016 = _load_member_votes("house", 2016)
df_house_2017 = _load_member_votes("house", 2017)
df_house_2018 = _load_member_votes("house", 2018)
df_house_2019 = _load_member_votes("house", 2019)
df_house_2020 = _load_member_votes("house", 2020)
df_house_2021 = _load_member_votes("house", 2021)
df_house_2022 = _load_member_votes("house", 2022)
df_house_2023 = _load_member_votes("house", 2023)

df_senate_2014 = _load_member_votes("senate", 2014)
df_senate_2015 = _load_member_votes("senate", 2015)
df_senate_2016 = _load_member_votes("senate", 2016)
df_senate_2017 = _load_member_votes("senate", 2017)
df_senate_2018 = _load_member_votes("senate", 2018)
df_senate_2019 = _load_member_votes("senate", 2019)
df_senate_2020 = _load_member_votes("senate", 2020)
df_senate_2021 = _load_member_votes("senate", 2021)
df_senate_2022 = _load_member_votes("senate", 2022)
df_senate_2023 = _load_member_votes("senate", 2023)


In [253]:
# Vote tables in year order, one list per chamber.
house_tables = [
    df_house_2014, df_house_2015, df_house_2016, df_house_2017, df_house_2018,
    df_house_2019, df_house_2020, df_house_2021, df_house_2022, df_house_2023,
]
senate_tables = [
    df_senate_2014, df_senate_2015, df_senate_2016, df_senate_2017, df_senate_2018,
    df_senate_2019, df_senate_2020, df_senate_2021, df_senate_2022, df_senate_2023,
]

roster_cols = ['member_id', 'year', 'chamber', 'state', 'last_name']
join_keys = ['year', 'chamber', 'state', 'last_name']

# Build one roster per (year, chamber): the distinct members of that table,
# keyed by the lower-cased last whitespace-separated token of their name.
frames = []
for year, df_h, df_s in zip(range(2014, 2024), house_tables, senate_tables):
    # House first, then Senate, for every year.
    for chamber_name, votes in (('House', df_h), ('Senate', df_s)):
        roster = votes[['member_id', 'legislator_name', 'state']].drop_duplicates()
        roster = roster.assign(
            year=year,
            chamber=chamber_name,
            last_name=roster['legislator_name'].str.split().str[-1].str.lower(),
        )
        frames.append(roster[roster_cols])

# Stack all rosters and keep only the first member seen for each join key,
# so a key never matches more than one member_id in the merge below.
members_df = pd.concat(frames, ignore_index=True).drop_duplicates(
    subset=join_keys,
    keep='first',
)

# Give each trade the same key columns: transaction year and the lower-cased
# last token of the member's name.
trades = filtered.assign(
    year=filtered['transaction_date'].dt.year,
    last_name=filtered['member'].str.split().str[-1].str.lower(),
)

# Left-join so unmatched trades are visible as missing member_id, then
# discard them.
merged = pd.merge(trades, members_df, how='left', on=join_keys)
cleaned = merged.dropna(subset=['member_id'])


In [254]:
# Write the matched trades to <OUTPUT_DIR>/<START>-<END>/stocks.csv,
# creating the year-range directory if it does not exist yet.
output_subdir = f"{OUTPUT_DIR}/{START}-{END}"
os.makedirs(output_subdir, exist_ok=True)
cleaned.to_csv(f"{output_subdir}/stocks.csv", index=False)