In [36]:
import pandas as pd
import json
from tqdm import tqdm
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import ast

import os

## Online data

### Social media data

In [15]:
# Load the social media posts table from the project data directory.
posts_data = pd.read_csv(filepath_or_buffer="../data/fb_data_with_predictions.csv")

## Offline data

In [None]:
# Load the merged offline case table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed value types.
# NOTE(review): the saved output of this cell echoes this line the way a
# warning would (presumably a DtypeWarning) - confirm it is gone after re-running.
offline_data = pd.read_csv(
    "../data/all_merged_new_complete_all2024_unions.csv",
    low_memory=False,
)
offline_data.head()

  offline_data = pd.read_csv("../../../data/all_merged_new_complete_all2024_unions.csv")


Unnamed: 0,Name,Case Number,City_x,States & Territories,Date Filed,Region Assigned,Status,Date Closed,Reason Closed,No. of Eligible Voters,...,Stipulated\n/ Consent / Directed,Cert of Rep (Win),Cert of Results (Loss),Closed Date,Votes For Labor Org3,Labor Org3,Stipulated/Consent,Case ID,Stipulated / Consent / Directed,Contract Matches
0,Starbucks Corporation,31-RC-357638,Burbank,CA,2024-12-31,"Region 31, Los Angeles, California",Closed,02/18/2025,Certific. of Representative,16.0,...,,,,,,,,,,[]
1,"Rove Pest Control, Inc.",18-RD-357625,Oakdale,MN,2024-12-31,"Region 18, Minneapolis, Minnesota",Closed,02/14/2025,Certific. of Representative,6.0,...,,,,,,,,,,[]
2,"Poudre Valley Air Services, LLC",27-RM-357587,Fort Collins,CO,2024-12-30,"Region 27, Denver, Colorado",Open,,,,...,,,,,,,,,,[]
3,Metro Caring,27-RC-357526,Denver,CO,2024-12-30,"Region 27, Denver, Colorado",Closed,01/29/2025,Certific. of Representative,22.0,...,,,,,,,,,,[]
4,"InterCon Construction, Inc.",18-RC-357553,Waunakee,WI,2024-12-30,"Region 18, Minneapolis, Minnesota",Closed,01/10/2025,Withdrawal Non-adjusted,,...,,,,,,,,,,[]


## Map between social media accounts and union names

In [17]:
# Load the union-to-account mapping and keep only unions that have a matched account.
highlevel_unions = (
    pd.read_csv("../data/mapping_fb_unions.csv")
    .dropna(subset=["account_match"])
)
highlevel_unions.head()

Unnamed: 0,union,account_match,type
0,atu,ATUInternational,aflcio
1,ibb,Boilermakers.Union,aflcio
2,unite here,unitehere,aflcio
4,ua,UnitedAssociation,aflcio
6,ironworkers,unionironworkers,aflcio


In [18]:
# Two-column lookup table: high-level union name -> social media handle.
highlevel_unions_handles = highlevel_unions.loc[:, ["union", "account_match"]].rename(
    columns={"union": "main_union", "account_match": "handle"}
)
highlevel_unions_handles.head()

Unnamed: 0,main_union,handle
0,atu,ATUInternational
1,ibb,Boilermakers.Union
2,unite here,unitehere
4,ua,UnitedAssociation
6,ironworkers,unionironworkers


Load the most recent version of the union hierarchy file, which provides the `main_union` of each union (if available).

In [None]:
# Collect every hierarchy file in the data directory as [filename, version].
list_files = []
for fname in os.listdir("../data/"):
    if fname.endswith(".json") and "hierarchy_unions" in fname:
        # The version is the integer after the last "v" in the final
        # "_"-separated token of the name (before the first ".").
        version_token = fname.split("_")[-1].split(".")[0].split("v")[-1]
        try:
            v_number = int(version_token)
        except ValueError:
            # Matching name without a numeric version suffix: cannot be ranked, skip it.
            continue
        list_files.append([fname, v_number])

if not list_files:
    raise FileNotFoundError(
        "No versioned 'hierarchy_unions' JSON file found in ../data/"
    )

# sort files by version number; the last entry is the highest version
list_files.sort(key=lambda x: x[1])

file = list_files[-1]

# JSON is read explicitly as UTF-8 so the result does not depend on the locale default.
with open("../data/" + file[0], "r", encoding="utf-8") as f:
    union_names = json.load(f)

From the hierarchy, collect every union whose `main_union` is one of the high-level unions mapped to a social media account above.

In [20]:
# Group hierarchy entries under their main union, restricted to main unions
# that appear in the handle table: {main_union: [union, ...]}.
known_main_unions = highlevel_unions_handles["main_union"].values

union_dict = {}
for union, union_info in union_names.items():
    parent = union_info["main_union"]
    if parent in known_main_unions:
        union_dict.setdefault(parent, []).append(union)

In [21]:
# Flatten the grouping into [main_union, union, handle] records.
rows = []
for main_union, members in union_dict.items():
    # The handle depends only on the main union, so look it up once per group.
    is_main_union = highlevel_unions_handles["main_union"] == main_union
    handle = highlevel_unions_handles.loc[is_main_union, "handle"].values[0]
    rows.extend([main_union, union, handle] for union in members)

In [22]:
# One row per (main union, union) pair together with the main union's handle.
unions_df = pd.DataFrame(
    data=rows,
    columns=["main_union", "union", "handle"],
)

## Offline-online map

Offline event dates

In [23]:
# Lower-case every column that holds a union name so that it can be compared
# with the `union` values used in the matching step below.
for name_column in (
    "Labor Org 1 Name",
    "Labor Org 2 Name",
    "Labor Org 3 Name",
    "Labor Org3",
    "Union To Certify",
):
    offline_data[name_column] = offline_data[name_column].str.lower()

In [26]:
# Create a dictionary for union handles to avoid repeated DataFrame lookups
union_handles = dict(zip(unions_df["union"], unions_df["handle"]))

# Filter offline data once for "RC" and "RD" cases.
# na=False treats a missing case number as "no match"; without it the mask
# would contain NaN and boolean indexing would raise.
case_numbers = offline_data["Case Number"]
rc_cases = offline_data[case_numbers.str.contains("RC", na=False)]
# RD cases are kept unfiltered (not restricted to rows marked "LOSS").
rd_cases = offline_data[case_numbers.str.contains("RD", na=False)]

# rename() returns new frames, so the column assignments below do not write
# into a slice of offline_data.
rc_cases = rc_cases.rename(columns={"City_x": "City"})
rd_cases = rd_cases.rename(columns={"City_x": "City"})

# Reduce the RC date columns to plain dates; unparseable "Closed Date" values become NaT.
rc_cases['Date Filed'] = pd.to_datetime(rc_cases['Date Filed']).dt.date
rc_cases['Election Held Date'] = pd.to_datetime(rc_cases['Election Held Date']).dt.date
rc_cases["Closed Date"] = pd.to_datetime(rc_cases["Closed Date"], format="mixed", errors="coerce").dt.date

In [31]:
# Build {main_union: {"handle": ..., "cases": [...]}} where each case records,
# for one union, the date of a won and/or lost RC election (as strings).
events_dict_complete = {}

for _, union_row in tqdm(unions_df.iterrows(), total=unions_df.shape[0]):
    union = union_row["union"]
    main_union = union_row["main_union"]
    handle = union_handles[union]

    if main_union not in events_dict_complete:
        events_dict_complete[main_union] = {
            "handle": handle,
            "cases": []
        }

    # Ways in which a case can involve this union, in priority order:
    # it is the union to certify; it is labor org 1 and no union to certify is
    # recorded; or it is named as one of the other labor organisations.
    union_to_certify_missing = rc_cases["Union To Certify"].isnull()
    match_masks = [
        rc_cases["Union To Certify"] == union,
        (rc_cases["Labor Org 1 Name"] == union) & union_to_certify_missing,
        rc_cases["Labor Org 2 Name"] == union,
        rc_cases["Labor Org 3 Name"] == union,
        rc_cases["Labor Org3"] == union,
    ]

    # The masks are not mutually exclusive, so each case is taken only from the
    # first mask that matches it; otherwise it would be emitted several times.
    already_selected = np.zeros(len(rc_cases), dtype=bool)
    matched_parts = []
    for mask in match_masks:
        newly_selected = mask.to_numpy(dtype=bool) & ~already_selected
        matched_parts.append(rc_cases[newly_selected])
        already_selected |= newly_selected
    rc_union_rows = pd.concat(matched_parts)

    for _, case in rc_union_rows.iterrows():
        case_number = case["Case Number"]

        # Use the election date when present, otherwise fall back to the closing date.
        election_date = case["Election Held Date"]
        if pd.isnull(election_date):
            election_date = case["Closed Date"]

        union_to_certify = case["Union To Certify"]
        is_certifying_union = union_to_certify == union or (
            case["Labor Org 1 Name"] == union and pd.isnull(union_to_certify)
        )

        if is_certifying_union:
            # Win/loss is read from the certification columns (two spellings
            # of each column exist in the merged table).
            won = (
                case["Cert of Rep (Win)"] == "WON"
                or case["Certification of Representative Date (Win)"] == "WON"
            )
            lost = (
                case["Cert of Results (Loss)"] == "LOSS"
                or case["Certification of Results Date (Loss)"] == "LOSS"
            )
            case_winning_election_date = election_date if won else None
            case_losing_election_date = election_date if lost else None
        else:
            # The union took part in the case but is not the one to certify:
            # recorded as a loss for this union.
            case_winning_election_date = None
            case_losing_election_date = election_date

        # "Contract Matches" is not parsed here: nothing from it is written to the output.
        events_dict_complete[main_union]["cases"].append({
            "union": union,
            "case_number": case_number,
            # convert to string to avoid json serialization error
            "case_winning_election_date": str(case_winning_election_date),
            "case_losing_election_date": str(case_losing_election_date),
        })


100%|██████████| 5518/5518 [00:47<00:00, 115.80it/s]


In [32]:
# Write the per-union events dictionary to disk as indented JSON.
with open("../data/events_dict_fb_all2024.json", "w") as out_file:
    json.dump(events_dict_complete, fp=out_file, indent=4)