In [21]:
import pandas as pd
from tqdm import tqdm
from fuzzywuzzy import process
import re
import json

# Define useful functions

In [22]:
def preprocess_strings(string):
    """Normalize a name for comparison.

    The text is lower-cased, stripped of leading and trailing whitespace,
    and every internal run of whitespace is collapsed to one space.

    Parameters
    ----------
    string : str
        Raw text to normalize.

    Returns
    -------
    str
        The normalized text.
    """
    lowered_and_trimmed = string.lower().strip()
    # Collapse any remaining whitespace run (spaces, tabs, newlines) to one space
    return re.sub(r'\s+', ' ', lowered_and_trimmed)

In [23]:
def extract_acronyms(name_list):
    """Collect candidate acronyms from an iterable of organization names.

    Three heuristics are applied to every name:

    1. any text enclosed in parentheses, e.g. ``"... (seiu)"`` -> ``"seiu"``;
    2. a run of lowercase ASCII letters following a dash (optionally
       separated from it by whitespace), e.g. ``"afl-cio"`` -> ``"cio"``;
    3. any whitespace-delimited token longer than one character for which
       ``str.isupper()`` is true.

    Parameters
    ----------
    name_list : iterable of str
        Names to scan. Entries that are not strings (for example NaN coming
        from a pandas column) are skipped instead of raising.

    Returns
    -------
    list of str
        The unique candidates, sorted so that the result is identical from
        one run to the next (plain set iteration order is not stable for
        strings across interpreter sessions).
    """
    candidates = set()
    for name in name_list:
        if not isinstance(name, str):
            # Missing values cannot contain acronyms; ignore them
            continue

        # Heuristic 1: text inside parentheses
        candidates.update(re.findall(r'\(([^)]+)\)', name))

        # Heuristic 2: lowercase letters right after a dash
        candidates.update(re.findall(r'-\s*([a-z]+)', name))

        # Heuristic 3: stand-alone upper-case tokens of length > 1
        candidates.update(
            word for word in name.split() if word.isupper() and len(word) > 1
        )

    return sorted(candidates)

# Election data

In [None]:
df_elections = pd.read_csv('../data/overall_nlrb_all2024.csv')

# The election file lists up to three labor organizations per row
labor_org_columns = ["Labor Org 1 Name", "Labor Org 2 Name", "Labor Org 3 Name"]

# Pool the names from all three columns, keep only real strings (this drops
# NaN placeholders), normalize them and de-duplicate. Sorting makes the list
# -- and every file derived from it -- reproducible across kernel restarts,
# which plain set ordering is not.
list_union_orgs_elections = sorted({
    preprocess_strings(org)
    for column in labor_org_columns
    for org in df_elections[column].unique()
    if isinstance(org, str)
})

In [None]:
# Wrap the normalized organization names in a one-column frame and persist it
df_union_elections = pd.DataFrame({"union_org": list_union_orgs_elections})
df_union_elections.to_csv("../data/list_union_orgs_elections.csv")

## Categorization

In [None]:
# Discard rows whose 'union_org' value is missing (NaN)
df_cleaned = df_union_elections[df_union_elections['union_org'].notna()]

# Pull candidate acronyms out of the remaining organization names
acronyms = extract_acronyms(df_cleaned["union_org"])
len(acronyms)

197

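Export the extracted acronyms to a CSV file, then clean the list by hand. The cleaned version is saved as `list_acronyms_cleaned.csv` and loaded in the step after the export.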

In [None]:
# `acronyms` is a plain Python list, which has no `to_csv` method; wrap it in
# a DataFrame first. The column is named "acronym" because that is the column
# read back from the hand-cleaned file. Sorting keeps the exported file stable
# between runs, which makes manual cleaning and diffing easier.
pd.DataFrame({"acronym": sorted(acronyms)}).to_csv("../data/list_acronyms.csv", index=False)

In [None]:
# Load the hand-cleaned acronym list; from here on `acronyms` refers to the
# cleaned values rather than the automatically extracted candidates
df_acronyms = pd.read_csv("../data/list_acronyms_cleaned.csv")
acronyms = df_acronyms["acronym"].tolist()

In [42]:
union_orgs = list(df_cleaned['union_org'])

# Hierarchy dictionary: one (initially empty) bucket per acronym, each with its
# own list, plus a catch-all bucket for organizations matching no acronym
hierarchy = dict((acr, []) for acr in acronyms)
hierarchy.update({"Independent": []})

# Function to categorize unions
def categorize_union(org_name):
    """File ``org_name`` under the first acronym it contains.

    The module-level ``acronyms`` list is scanned in order; the first entry
    that occurs as a substring of ``org_name`` decides the bucket. If no
    acronym matches, the name is filed under ``"Independent"``. The name is
    appended to the corresponding list in the module-level ``hierarchy``
    dictionary; nothing is returned.

    Parameters
    ----------
    org_name : str
        Organization name to classify.
    """
    # NOTE(review): matching is a plain substring test, so a short acronym can
    # match inside an unrelated word. An earlier, disabled idea was to match
    # only against text in parentheses or after a dash -- confirm whether the
    # looser behavior is intended.
    bucket = next((acr for acr in acronyms if acr in org_name), "Independent")
    hierarchy[bucket].append(org_name)

# Apply categorization to every organization name
for org in union_orgs:
    # Empty names carry no information; leave them out of the hierarchy
    if len(org) == 0:
        continue
    categorize_union(org)

In [None]:
# Persist the hierarchy as JSON, indented by four spaces so it stays readable
with open("../data/hierarchy_elections.json", "w") as outfile:
    outfile.write(json.dumps(hierarchy, indent=4))