In [4]:
import pandas as pd
import json 
import re
from tqdm import tqdm
import fuzzywuzzy   
from fuzzywuzzy import process


# Hierarchy

In [None]:
# Read the cleaned elections hierarchy (a JSON object) into `hierarchy`.
with open('../data/hierarchy_elections_clean.json') as hierarchy_file:
    hierarchy = json.loads(hierarchy_file.read())

In [83]:
# Flatten every value of the hierarchy into one list of union names
# (a name listed under several keys appears once per key).
all_unions = [union for members in hierarchy.values() for union in members]

len(all_unions)


2911

AFL-CIO unions

In [None]:
# Table of AFL-CIO unions; this is taken from the AFL-CIO website.
aflcio_unions = pd.read_csv(filepath_or_buffer='../data/aflcio_unions_cleaned.csv')

Extract acronyms

In [85]:
def extract_acronym(name):
    """Return the text inside the first pair of parentheses in ``name``.

    Parameters
    ----------
    name : str
        Union name, e.g. ``"United Steelworkers (USW)"``.

    Returns
    -------
    str or None
        The parenthesised text without the parentheses, or ``None`` when
        ``name`` contains no non-empty parenthesised part or is not a string
        (for example the ``NaN`` pandas uses for an empty CSV cell).
    """
    # Missing values arrive as NaN/None when this function is applied to a
    # DataFrame column; re.search would raise TypeError on them.
    if not isinstance(name, str):
        return None
    match = re.search(r'\(([^)]+)\)', name)
    return match.group(1) if match else None

In [86]:
# Derive each union's acronym from the parenthesised part of its name.
aflcio_unions['Acronym'] = aflcio_unions['Name'].map(extract_acronym)

In [None]:
# Write the table (now including the Acronym column) back to the CSV it was read from.
aflcio_unions.to_csv(path_or_buf='../data/aflcio_unions_cleaned.csv', index=False)

In [88]:
def clean_level2(union_name):
    """Strip district/local/council markers and numbers from a union name.

    The substitutions are applied in order:

    1. every occurrence of "district", "local" or "council" is deleted,
       case-insensitively and also when it is part of a longer word;
    2. every run of digits is deleted;
    3. every run of whitespace is collapsed to a single space.

    The result is returned with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'(district|local|council)', '', re.IGNORECASE),
        (r'\d+', '', 0),
        (r'\s+', ' ', 0),
    )
    cleaned = union_name
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

def clean_level4_5(union_name):
    """Extract the district/council number and the local number from a name.

    Matching is case-insensitive. A "local" number is the first run of digits
    (plus any non-space characters glued to it, e.g. "32BJ") that follows the
    word "local", whitespace and at most 20 non-digit characters. A district
    number is the first run of digits that follows "district council",
    "district", "council" or "dc", whitespace and at most 20 non-digit
    characters.

    Returns
    -------
    tuple
        ``(district, local)``; each element is a string, or ``None`` when the
        corresponding pattern does not occur.
    """
    local_hit = re.search(r'local\s+\D{0,20}(\d+\S*)', union_name, re.IGNORECASE)
    local = None if local_hit is None else local_hit.group(1)

    # The district search runs on the name with the "local ... <number>" part
    # cut out, so the local number cannot be captured as a district number.
    name_without_local = re.sub(r'local\s+\D{0,20}\d+\S*', '', union_name, flags=re.IGNORECASE)
    district_hit = re.search(
        r'(district council|district|council|dc)\s+\D{0,20}(\d+)',
        name_without_local,
        re.IGNORECASE,
    )
    district = None if district_hit is None else district_hit.group(2)

    return district, local

def match_aflcio(union_name, score_threshold=80):
    """Map a union name to the lower-cased acronym of an AFL-CIO union.

    Two strategies are tried in order:

    1. If ``union_name`` contains a parenthesised acronym that appears in
       ``aflcio_unions["Acronym"]``, that acronym is used.
    2. Otherwise the name, with punctuation removed, is fuzzy-matched against
       ``aflcio_unions["Name"]``; the best candidate is accepted when its
       score is strictly greater than ``score_threshold``.

    Parameters
    ----------
    union_name : str
        Name of the union to look up.
    score_threshold : int, optional
        Minimum (exclusive) fuzzy-match score for a candidate to be accepted.
        Defaults to 80.

    Returns
    -------
    str
        The matched acronym in lower case, or ``"Not Found"`` when neither
        strategy succeeds or the matched row has no acronym.
    """
    acronyms = aflcio_unions["Acronym"]
    names = aflcio_unions["Name"]

    # 1) Exact hit on an acronym written in parentheses in the name itself.
    #    Comparing against NaN/None entries is simply False, so rows without
    #    an acronym never match here.
    acronym = extract_acronym(union_name)
    if acronym is not None and (acronyms == acronym).any():
        return acronym.lower()

    # 2) Fuzzy match on the full name. Punctuation is dropped from the query;
    #    missing names are dropped from the candidates because the matcher
    #    expects strings.
    name = re.sub(r'[^\w\s]', '', union_name)
    union_names = names.dropna().tolist()
    best_match = process.extractOne(name, union_names)

    if best_match and best_match[1] > score_threshold:
        # Take the first row with that name that actually has an acronym;
        # a row whose acronym is None/NaN must not reach .lower().
        for candidate in acronyms[names == best_match[0]]:
            if isinstance(candidate, str):
                return candidate.lower()
    return "Not Found"

In [76]:
# Start from an empty mapping: union name -> dict of hierarchy levels.
new_hierarchy = dict()

In [77]:
# Acronyms of the AFL-CIO unions. Entries that are not strings (None/NaN for
# names without a parenthesised acronym) are skipped: they cannot match
# anything and calling .lower() on them would raise.
aflcio_acronyms = [acr for acr in aflcio_unions['Acronym'] if isinstance(acr, str)]

for union in tqdm(all_unions):
    levels = {}
    new_hierarchy[union] = levels

    # Level 1: AFL-CIO affiliation
    # True when the union is listed under "afl-cio", is listed under a
    # hierarchy key equal to an AFL-CIO acronym, or contains a lower-cased
    # AFL-CIO acronym as a substring.
    # NOTE(review): the substring test is case-sensitive on `union` and also
    # matches inside longer words - confirm union names are lower-case and
    # that short acronyms do not produce false positives.
    in_aflcio_list = union in hierarchy["afl-cio"]
    levels["afl-cio"] = in_aflcio_list or any(
        (acr in hierarchy and union in hierarchy[acr]) or acr.lower() in union
        for acr in aflcio_acronyms
    )

    # Level 2: high-level union affiliation
    union_name_cleaned = clean_level2(union)

    # A specific parent key always wins over the generic "afl-cio" list.
    main_union = None
    for key, members in hierarchy.items():
        if key in ("Independent", "afl-cio"):
            continue
        if union in members:
            main_union = key
            break

    # Only unions actually listed under "afl-cio" are matched against the
    # AFL-CIO table. Running the fuzzy match for every union would give
    # independent unions an unrelated acronym (or "Not Found") and would make
    # the cleaned-name fallback below unreachable.
    if main_union is None and in_aflcio_list:
        main_union = match_aflcio(union_name_cleaned)

    # No parent found: the union stands for itself under its cleaned name.
    if main_union is None:
        main_union = union_name_cleaned

    levels["main_union"] = main_union

    # Level 3: sub-union affiliation (not implemented)

    # Level 4: district/council union affiliation
    district, local = clean_level4_5(union)
    levels["district"] = district

    # Level 5: local union affiliation
    levels["local"] = local

    

100%|██████████| 2911/2911 [00:16<00:00, 172.62it/s]


In [None]:
# save json file (with indent)
with open('../data/hierarchy_unions_v1.json', 'w') as out_file:
    # 4-space indentation keeps the file readable for manual inspection
    out_file.write(json.dumps(new_hierarchy, indent=4))

# Fix hierarchy

Open partially cleaned hierarchy

In [92]:
# Load the partially cleaned hierarchy (v1) written earlier in this notebook.
# The path has to be the one that file was written to ('../data/...');
# '../../data/...' pointed at a different directory.
with open('../data/hierarchy_unions_v1.json', 'r') as f:
    new_hierarchy = json.load(f)

Make sure that unions listed as independent in the old hierarchy keep their own cleaned name as `main_union`

In [93]:
# Unions listed under "Independent" in the old hierarchy get their own
# cleaned name as main_union; every other entry is left untouched.
for union, levels in new_hierarchy.items():
    if union not in hierarchy["Independent"]:
        continue
    levels["main_union"] = clean_level2(union)

Dump

In [None]:
# Write the corrected hierarchy (v2) as indented JSON.
with open('../data/hierarchy_unions_v2.json', 'w') as out_file:
    out_file.write(json.dumps(new_hierarchy, indent=4))