In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Download the raw transaction tables for both chambers from the stock-watcher S3 URLs.
# Senate: aggregate file of all transactions.
senate_transactions = pd.read_csv(
    filepath_or_buffer='https://senate-stock-watcher-data.s3-us-west-2.amazonaws.com/aggregate/all_transactions.csv'
)
# House: file of all transactions.
house_transactions = pd.read_csv(
    filepath_or_buffer='https://house-stock-watcher-data.s3-us-west-2.amazonaws.com/data/all_transactions.csv'
)

In [3]:
# Manual overrides for `transaction_date` in the House frame, keyed by row label
# (the default integer labels assigned when the CSV was read).
# NOTE(review): presumably the raw values in these rows are malformed or wrong
# upstream — confirm against the downloaded CSV.
# NOTE(review): the labels are tied to the row order of the file as downloaded;
# if the upstream file changes, these overrides may land on different rows, so
# re-check them after every data refresh.
TRANSACTION_DATE_FIXES = {
    2654: '2021-06-09',
    4509: '2022-11-23',
    4540: '2022-08-09',
    4616: '2021-08-02',
    4636: '2022-07-18',
    7909: '2022-11-16',
    9136: '2021-11-18',
    9918: '2022-11-22',
    11406: '2021-01-11',
    12839: '2021-06-22',
    12840: '2021-06-22',
    15094: '2022-11-02',
}
# Apply each override as a single-cell write, in the order listed above.
for row_label, corrected_date in TRANSACTION_DATE_FIXES.items():
    house_transactions.at[row_label, 'transaction_date'] = corrected_date

In [4]:
# --- Senate ---
# Parse the date columns. transaction_date is expected as e.g. 2023-04-18
# (format inferred by pandas); disclosure_date is expected as e.g. 05/17/2023.
senate_transactions['transaction_date'] = pd.to_datetime(senate_transactions['transaction_date'])
senate_transactions['disclosure_date'] = pd.to_datetime(senate_transactions['disclosure_date'], format='%m/%d/%Y')
# Keep only rows whose asset_type is exactly "Stock". The explicit .copy() makes
# the result an independent frame, so later column assignments cannot trigger
# SettingWithCopyWarning.
senate_transactions = senate_transactions[senate_transactions['asset_type'] == 'Stock'].copy()

# --- House ---
# Drop rows with no ticker. .copy() because columns are assigned on the
# filtered result immediately below.
house_transactions = house_transactions[house_transactions['ticker'].notna()].copy()
# Treat a missing asset_description as an empty string so the text match below
# is defined for every row.
house_transactions['asset_description'] = house_transactions['asset_description'].fillna('')
# Drop rows whose asset_description contains "option" (case-insensitive).
house_transactions = house_transactions[
    ~house_transactions['asset_description'].str.contains('option', case=False)
].copy()
# Parse the date columns. transaction_date is expected as e.g. 2021-09-27
# (format inferred by pandas); disclosure_date is expected as e.g. 10/04/2021.
house_transactions['transaction_date'] = pd.to_datetime(house_transactions['transaction_date'])
house_transactions['disclosure_date'] = pd.to_datetime(house_transactions['disclosure_date'], format='%m/%d/%Y')

In [5]:
# Give both frames a common `name` column for the office holder
# (`representative` in the House data, `senator` in the Senate data).
house_transactions = house_transactions.rename(columns={'representative': 'name'})
senate_transactions = senate_transactions.rename(columns={'senator': 'name'})

# Restrict both frames to the same columns, in the same order, so they can be stacked.
columns_to_keep = [
    'disclosure_date', 'transaction_date', 'ticker', 'type', 'amount', 'name',
    'asset_description', 'state', 'party', 'industry', 'sector',
]
house_transactions = house_transactions[columns_to_keep]
senate_transactions = senate_transactions[columns_to_keep]

# Stack House rows on top of Senate rows. ignore_index=True gives the merged
# frame a fresh 0..n-1 index; without it the two inputs' own row labels are
# carried over and the same label can appear twice, which makes any later
# label-based lookup ambiguous.
transactions = pd.concat([house_transactions, senate_transactions], axis=0, ignore_index=True)

In [6]:
# Normalise the `type` column: lowercase it, then map the sale variants onto
# canonical labels. Only whole-value matches are replaced:
#   "sale", "sale (full)"  -> "sale_full"
#   "sale (partial)"       -> "sale_partial"
transactions['type'] = transactions['type'].str.lower().replace({
    'sale': 'sale_full',
    'sale (full)': 'sale_full',
    'sale (partial)': 'sale_partial',
})
# Drop every transaction whose type contains "exchange". The column is already
# lowercase, so a case-sensitive match is enough. na=False treats a missing type
# as "not an exchange", so the boolean mask stays well-defined instead of failing
# on missing values. .copy() keeps the result an independent frame, so re-running
# this cell does not assign into a slice.
transactions = transactions[~transactions['type'].str.contains('exchange', na=False)].copy()

In [7]:
# Preview the first five rows of the merged frame as a quick sanity check.
transactions.head(5)

Unnamed: 0,disclosure_date,transaction_date,ticker,type,amount,name,asset_description,state,party,industry,sector
0,2021-10-04,2021-09-27,BP,purchase,"$1,001 - $15,000",Virginia Foxx,BP plc,NC,Republican,Integrated oil Companies,Energy
1,2021-10-04,2021-09-13,XOM,purchase,"$1,001 - $15,000",Virginia Foxx,Exxon Mobil Corporation,NC,Republican,Integrated oil Companies,Energy
2,2021-10-04,2021-09-10,ILPT,purchase,"$15,001 - $50,000",Virginia Foxx,Industrial Logistics Properties Trust - Common...,NC,Republican,Real Estate Investment Trusts,Real Estate
3,2021-10-04,2021-09-28,PM,purchase,"$15,001 - $50,000",Virginia Foxx,Phillip Morris International Inc,NC,Republican,Farming/Seeds/Milling,Consumer Non-Durables
4,2021-10-04,2021-09-17,BLK,sale_partial,"$1,001 - $15,000",Alan S. Lowenthal,BlackRock Inc,CA,Democrat,Investment Bankers/Brokers/Service,Finance


In [8]:
# Write the merged transactions to CSV without the index column.
output_path = Path('data/inputs/transactions.csv')
# Create the target directory if it is missing, so the export also works on a
# fresh checkout where data/inputs/ does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
transactions.to_csv(output_path, index=False)