In [1]:
import pandas as pd
import os

In [2]:
# Root of the data tree, relative to this notebook. The trailing slash is
# required: every path below is formed by placing text directly after it.
DATA_DIR = '../../data/'

# Read the four input tables from <DATA_DIR>inputs/.
committee_sector_mapping = pd.read_csv(f'{DATA_DIR}inputs/committee_gic_mapping.csv')
house_committees = pd.read_csv(f'{DATA_DIR}inputs/house_committees.csv')
senate_committees = pd.read_csv(f'{DATA_DIR}inputs/senators_committees.csv')
transactions = pd.read_csv(f'{DATA_DIR}inputs/transactions.csv')

In [3]:
# Stack the House and Senate rosters into a single frame. ignore_index=True
# gives every row its own label; a plain concat keeps each input's labels, so
# one label could refer to both a House row and a Senate row.
committees = pd.concat([house_committees, senate_committees], ignore_index=True)

# Reduce party to a one-letter code taken from 'Party.1'. Rows that have no
# 'Party.1' value fall back to whatever 'Party' already holds instead of being
# overwritten with NaN.
# NOTE(review): assumes the House file carries its party in 'Party' and only
# the Senate file has 'Party.1' -- confirm against the input CSVs.
party_values = committees['Party.1']
if 'Party' in committees.columns:
    party_values = party_values.fillna(committees['Party'])
committees['Party'] = party_values.str[0]

# A row belongs to the Senate when its 'senator' field is populated, and to
# the House otherwise.
committees['chamber'] = committees['senator'].notna().map({True: 'senate', False: 'house'})

# Drop the helper/contact columns that are not used after this point.
committees = committees.drop(columns=['Party.1', 'senator', 'Office Room', 'Phone'])
committees

Unnamed: 0,State,District,Party,first_name,last_name,full_name,committee_1,committee_2,committee_3,committee_4,committee_5,committee_6,committee_7,committee_8,chamber
0,Alabama,1st,,Carl,Jerry,Carl Jerry,Appropriations,Natural Resources,,,,,,,house
1,Alabama,2nd,,Moore,Barry,Moore Barry,Agriculture,Judiciary,,,,,,,house
2,Alabama,3rd,,Rogers,Mike,Rogers Mike,Armed Services,,,,,,,,house
3,Alabama,4th,,Aderholt,Robert,Aderholt Robert,Appropriations,,,,,,,,house
4,Alabama,5th,,Strong,Dale,Strong Dale,Armed Services,Homeland Security,"Science, Space, and Technology",,,,,,house
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Vermont,,D,Peter,Welch,Peter Welch,"Committee on Agriculture, Nutrition, and Forestry","Committee on Commerce, Science, and Transporta...",Committee on Rules and Administration,Committee on the Judiciary,Joint Economic Committee,,,,senate
96,Rhode Island,,D,Sheldon,Whitehouse,Sheldon Whitehouse,Commission on Security and Cooperation in Europe,Committee on Environment and Public Works,Committee on Finance,Committee on the Budget,Committee on the Judiciary,United States Senate Caucus on International N...,,,senate
97,Mississippi,,R,Roger,Wicker,Roger F. Wicker,Commission on Security and Cooperation in Europe,Committee on Armed Services,"Committee on Commerce, Science, and Transporta...",Committee on Environment and Public Works,Committee on Rules and Administration,Select Committee on Intelligence,,,senate
98,Oregon,,D,Ron,Wyden,Ron Wyden,Committee on Energy and Natural Resources,Committee on Finance,Committee on the Budget,Joint Committee on Taxation,Select Committee on Intelligence,,,,senate


In [4]:
from fuzzywuzzy import process
import re

def preprocess_name(name):
    """Normalise a person's name into a lowercase, letters-and-spaces key.

    Steps, in order:
      1. Remove the honorifics/suffixes ``Mr.``, ``Mrs.``, ``Ms.``, ``Dr.``,
         ``Jr.`` and ``Sr.`` (case-sensitive, trailing period required).
      2. Decompose accented characters (NFKD) so the base letter survives the
         next step, e.g. ``é`` -> ``e`` instead of being deleted.
      3. Drop every character that is not an ASCII letter or a space.
      4. Lowercase, trim, and collapse runs of spaces to a single space.

    Args:
        name: The raw name. Anything that is not a ``str`` (``None``, a float
            NaN coming out of a pandas column, ...) is treated as "no name".

    Returns:
        The normalised name, or ``''`` when ``name`` is not a string.
    """
    import unicodedata  # local import: only this function needs it

    if not isinstance(name, str):
        return ''

    name = re.sub(r'(Mr\.|Mrs\.|Ms\.|Dr\.|Jr\.|Sr\.)', '', name)
    # Split accented letters into base letter + combining mark; the combining
    # marks are then discarded by the ASCII filter below.
    name = unicodedata.normalize('NFKD', name)
    name = re.sub(r'[^a-zA-Z ]', '', name)
    # split()/join trims both ends and collapses the double space left behind
    # when a title is removed from the middle of a name.
    return ' '.join(name.lower().split())

def match_names(list1, list2, threshold=90):
    """Fuzzy-match each name in ``list1`` to its best counterpart in ``list2``.

    Both lists are normalised with ``preprocess_name`` and compared with
    ``process.extractOne`` using ``threshold`` as the score cutoff.

    Side effect: for every successful match, rows of the module-level
    ``committees`` frame whose ``full_name`` equals the original ``list1``
    entry have ``full_name`` replaced by the matched original ``list2`` entry.
    All replacements are computed first and applied in a single pass, so a row
    that has just been renamed cannot be picked up again by a later entry.

    Args:
        list1: Names to look up (the ``committees['full_name']`` values).
        list2: Candidate names to match against.
        threshold: Minimum score for a match to be accepted.

    Returns:
        One tuple per ``list1`` entry, in order:
        ``(normalised_name, normalised_match, score)`` on success, or
        ``(normalised_name, None, 0)`` when nothing reaches the threshold.
        Entries that are not strings, or that normalise to an empty string,
        are reported as unmatched without being scored.
    """
    def _normalise(raw):
        # Non-string entries (e.g. NaN from .unique()) cannot be cleaned;
        # treat them as an empty name so they simply never match.
        return preprocess_name(raw) if isinstance(raw, str) else ''

    preprocessed_list1 = [_normalise(name) for name in list1]
    preprocessed_list2 = [_normalise(name) for name in list2]

    # First position of every normalised candidate, so a match can be mapped
    # back to its original spelling without re-scanning the list each time.
    first_position = {}
    for position, candidate in enumerate(preprocessed_list2):
        first_position.setdefault(candidate, position)

    matches = []
    renames = {}
    for original, name in zip(list1, preprocessed_list1):
        best_match = (
            process.extractOne(name, preprocessed_list2, score_cutoff=threshold)
            if name else None
        )
        if best_match:
            matches.append((name, best_match[0], best_match[1]))
            # First occurrence wins if list1 happens to contain duplicates.
            renames.setdefault(original, list2[first_position[best_match[0]]])
        else:
            matches.append((name, None, 0))  # No match found above the threshold

    if renames:
        # Rewrite committees' full_name to the transactions spelling in one pass.
        committees['full_name'] = committees['full_name'].map(
            lambda current: renames.get(current, current)
        )
    return matches

# Example usage with your lists
committee_names = committees['full_name'].unique() # list of names in committees
transaction_names = transactions['name'].unique() # list of names in transactions
matched_names = match_names(committee_names, transaction_names)



In [None]:
# Join the committee roster to the transactions on the person's name
# (full_name on the committee side, name on the transaction side). The outer
# join keeps rows that have no partner on the other side, so matched and
# unmatched names can both be inspected in the result.

merged = committees.merge(transactions, how='outer', left_on='full_name', right_on='name')


# # list out who match and who doesn't
# print('Matched:')
# print(f"Txns: {len(merged[merged['full_name'].notnull() & merged['name'].notnull()])}")
# print(f"Names: {len(merged[merged['full_name'].notnull() & merged['name'].notnull()]['full_name'].unique())}")
# print('Unmatched:')
# print(f"Txns: {len(merged[merged['full_name'].isnull() & merged['name'].notnull()])}")
# print(f"Names: {len(merged[merged['full_name'].isnull() & merged['name'].notnull()]['name'].unique())}")
# # print out the unmatched full_names unique
# print(merged[merged['full_name'].isnull() & merged['name'].notnull()]['name'].unique())
# print('Missing:')
# print(len(merged[merged['full_name'].notnull() & merged['name'].isnull()]))


In [None]:
# Keep only rows that carry both a committee-side full_name and a
# transaction-side name, i.e. the rows where the outer join found a partner.
# The explicit copy() keeps later column assignments off `merged`.
insider_transactions = merged.dropna(subset=['full_name', 'name']).copy()

def is_insider(row):
    """Tell whether a trade's sector is mapped to one of the member's committees.

    Every ``committee_<number>`` field present on ``row`` is examined, however
    many there are. For each non-null committee, the rows of the module-level
    ``committee_sector_mapping`` whose ``'Senate Committee'`` equals that
    committee are selected, and every value in their remaining columns (all
    columns after the first) is compared with ``row['sector']``.

    Args:
        row: One merged trade record (a pandas Series or other mapping) with a
            ``'sector'`` entry and zero or more ``committee_<number>`` entries.

    Returns:
        ``True`` as soon as one committee maps to the trade's sector,
        ``False`` otherwise (including when the sector is missing).
    """
    prefix = 'committee_'
    traded_sector = row['sector']

    # Discover the committee fields from the row itself instead of assuming a
    # fixed count, so narrower or wider rosters are both handled.
    committee_labels = [
        label for label in row.keys()
        if isinstance(label, str)
        and label.startswith(prefix)
        and label[len(prefix):].isdigit()
    ]

    for label in committee_labels:
        committee = row[label]
        if pd.isna(committee):
            continue
        committee_data = committee_sector_mapping[
            committee_sector_mapping['Senate Committee'] == committee
        ]
        if committee_data.empty:
            continue
        # All mapped values for this committee, flattened to one array.
        sectors = committee_data.iloc[:, 1:].values.flatten()
        if any(traded_sector == sector for sector in sectors):
            return True
    return False

# Apply the function
insider_transactions['is_insider'] = insider_transactions.apply(is_insider, axis=1)

df = insider_transactions[['disclosure_date', 'transaction_date', 'ticker', 'type', 'amount', 'name', 'asset_description', 'state', 'party', 'industry', 'sector', 'is_insider']]

In [None]:
# Make sure the output directory exists. exist_ok=True does the check and the
# creation in one call, so there is no window between "does it exist?" and
# "create it" in which another process could create the directory first.
os.makedirs(DATA_DIR+'outputs', exist_ok=True)
# Write the flagged trades; index=False leaves the row labels out of the file.
df.to_csv(DATA_DIR+'outputs/insider_marked_trades.csv', index=False)