In [72]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import os
import json
from datetime import timedelta
from ripser import ripser
from persim import plot_diagrams
from collections import defaultdict

def load_meeting_df(path):
    try:
        df = pd.read_csv(path)
        if "Meeting Date" not in df.columns:
            print(f"Warning: 'Meeting Date' column not found in {path}. Returning raw DataFrame.")
            return df
        df["meeting_date"] = pd.to_datetime(df["Meeting Date"], errors='coerce').dt.strftime("%Y-%m-%d")
        df.dropna(subset=["meeting_date"], inplace=True)
        df = df.drop_duplicates(subset=["Committee", "Meeting Title", "meeting_date"])\
               .reset_index(drop=True)
    except FileNotFoundError:
        print(f"Error: Meeting file not found at {path}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error processing meeting file {path}: {e}")
        return pd.DataFrame()
    return df

def load_stock_transactions_df(path, years_to_include):
    try:
        df = pd.read_csv(path)
        if "transaction_date" not in df.columns or "year" not in df.columns:
            print(f"Warning: 'transaction_date' or 'year' column not found in {path}. Returning raw DataFrame.")
            return df
            
        df["transaction_date"] = pd.to_datetime(df["transaction_date"], errors='coerce').dt.strftime("%Y-%m-%d")
        df.dropna(subset=["transaction_date"], inplace=True) 

        df['year'] = pd.to_numeric(df['year'], errors='coerce')
        df.dropna(subset=['year'], inplace=True)
        df['year'] = df['year'].astype(int)

        df = df[df["year"].isin(years_to_include)]
        if "sector" in df.columns:
            df["sector"] = df["sector"].fillna("Unspecified_Sector")
        else:
            print(f"Warning: 'sector' column not found in {path}. Sectors will not be processed for this DataFrame.")
    except FileNotFoundError:
        print(f"Error: Stock transaction file not found at {path}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error processing stock transaction file {path}: {e}")
        return pd.DataFrame()
    return df

def filter_stock_transactions_by_members(df_stocks, member_ids_set): 
    if "member_id" not in df_stocks.columns:
        print("Warning: 'member_id' column not found in stock DataFrame for filtering.")
        return pd.DataFrame()
    # Ensure member_id in df_stocks is string for consistent comparison
    df_stocks['member_id'] = df_stocks['member_id'].astype(str)
    df_new = df_stocks[df_stocks["member_id"].isin(member_ids_set)]
    return df_new

In [73]:
# --- Configuration Constants ---
DATA_PATH = '../data' # Assuming this path is correct relative to your notebook
YEARS_TO_ANALYZE = [2019, 2020, 2021, 2022] # Renamed for clarity
# DAYS_BEFORE = 7 # Moved to relevant cell (bipartite meeting day analysis)
# DAYS_AFTER = 3  # Moved to relevant cell
# TOP_K = 10      # This seems specific to a particular analysis, not global

# --- Load Stock Transactions ---

CONGRESS_PERIOD_MAP = {"116" : [2019, 2020], "117" : [2021, 2022]}

STOCK_TRANSACTIONS_DF = load_stock_transactions_df(f'{DATA_PATH}/cleaned/2014-2023/stocks.csv', YEARS_TO_ANALYZE)
if STOCK_TRANSACTIONS_DF.empty:
    print("CRITICAL: STOCK_TRANSACTIONS_DF is empty after loading. Halting execution.")
    exit()
else:
    print(STOCK_TRANSACTIONS_DF.head())

if "sector" in STOCK_TRANSACTIONS_DF.columns:
    print("Unique sectors in loaded stock data:", STOCK_TRANSACTIONS_DF["sector"].unique())
else:
    print("Warning: 'sector' column not found in STOCK_TRANSACTIONS_DF after loading.")


committee_meeting_files = {
    "House Appropriations": "house_appropriations.csv",
    "House Energy and Commerce": "house_energy_and_commerce.csv",
    "House Financial Services": "house_financial_services.csv",
    "House Oversight and Reform": "house_oversight_and_reform.csv",
    "House Ways and Means": "house_ways_and_means.csv",
    "Senate Appropriations": "senate_appropriations.csv",
    "Senate Banking, Housing, and Urban Affairs": "senate_banking.csv", # Assuming key in COMMITTEE_MEMBERSHIP_MAP matches this
    "Senate Finance": "senate_finance.csv",
    "Senate Health, Education, Labor, and Pensions": "senate_help.csv" # Assuming key in COMMITTEE_MEMBERSHIP_MAP matches this
}

committee_meeting_dfs_list = []
print("\nLoading committee meeting data...")
for committee_key_name, filename in committee_meeting_files.items():
    df_meeting = load_meeting_df(f'{DATA_PATH}/committees/meetings/2019-2022/downloaded/{filename}')
    if not df_meeting.empty:
        # Ensure the 'Committee' column in the meeting data matches the keys used elsewhere (e.g., COMMITTEE_MEMBERSHIP_MAP)
        # If 'Committee' column doesn't exist or is inconsistent, this needs to be handled.
        # For now, we assume the loaded CSV already has a 'Committee' column or it's added by load_meeting_df
        # If load_meeting_df doesn't add it, we might need to assign it here:
        if "Committee" not in df_meeting.columns and not df_meeting.empty:
             df_meeting["Committee"] = committee_key_name # Assign committee name if missing
        committee_meeting_dfs_list.append(df_meeting)
    else:
        print(f"Warning: No data loaded for {committee_key_name} meetings.")

if not committee_meeting_dfs_list:
    print("CRITICAL: No committee meeting data loaded. Some analyses may fail.")
    COMMITTEE_MEETINGS_DF = pd.DataFrame() # Create empty DF
else:
    COMMITTEE_MEETINGS_DF = pd.concat(committee_meeting_dfs_list).reset_index(drop=True)
    print(f"\nTotal committee meetings loaded: {len(COMMITTEE_MEETINGS_DF)}")
    if "Committee" in COMMITTEE_MEETINGS_DF.columns:
        print("Committees found in COMMITTEE_MEETINGS_DF:", COMMITTEE_MEETINGS_DF["Committee"].unique())


COMMITTEE_MEMBERSHIP_MAP = {
    'House Energy and Commerce': {
        '116': {
            'Darren Soto', 'O000171', 'Jan Schakowsky', 'Tim Walberg', 'H. Morgan Griffith', 'B001284', 'Michael Doyle', 'Dave Loebsack', 'B001275', 'D000624', 'M001180', 'Nanette Barragán', 'C001066', 'P000034', 'Peter G. Olson', 'Bill Johnson', 'Eliot Engel', 'M001200', 'Jerry McNerney', 'Cathy McMorris Rodgers', 'Jeff Duncan', 'Greg Walden', 'Markwayne Mullin', 'P000608', 'Ben Ray Luján', 'U000031', 'Yvette D. Clarke', 'Robin Kelly', 'B001257', 'John Shimkus', 'B001248', 'Richard Hudson', 'Diana DeGette', 'Tony Cárdenas', 'Lisa Blunt Rochester', 'Paul Tonko', 'Steve Scalise', 'Brett Guthrie', 'Bobby Rush', 'Anna Eshoo', 'F000461', 'Annie Kuster', 'K000378', 'L000576', 'S001180', 'John Sarbanes', 'Raul Ruiz', 'G000584', 'G.K. Butterfield', 'Joseph Kennedy III', 'L000566', 'W000800', 'M001163', 'C001103', 'Marc Veasey'
                }, 
        '117': {
            'Darren Soto', 'O000171', 'Jan Schakowsky', 'H. Morgan Griffith', 'Tim Walberg', 'Michael Doyle', 'B001275', 'D000624', 'S001216', 'Nanette Barragán', 'M001180', 'C001066', 'P000034', 'D000628', 'C001114', 'Bill Johnson', 'C001120', 'M001200', 'Jerry McNerney', 'Kathleen Rice', 'Lizzie Pannill Fletcher', 'Cathy McMorris Rodgers', 'Jeff Duncan', 'Markwayne Mullin', 'P000608', 'Yvette D. Clarke', 'U000031', 'Robin Kelly', 'Angie Craig', 'T000482', 'Greg Pence', 'B001257', 'Gary Palmer', 'B001248', 'Richard Hudson', 'Diana DeGette', 'Tony Cárdenas', 'Lisa Blunt Rochester', 'Paul Tonko', 'Steve Scalise', 'Brett Guthrie', 'Kelly Armstrong', 'Bobby Rush', 'Anna Eshoo', 'John Joyce', 'Annie Kuster', 'K000378', 'Debbie Lesko', 'L000576', 'S001180', 'John Sarbanes', 'Raul Ruiz', 'G.K. Butterfield', 'L000566', 'W000800', 'M001163', 'C001103', 'Marc Veasey'
                }
            }, 
    'House Financial Services': {
        '116': {
            'Denny Heck', 'Bill Huizenga', 'Michael F.Q. San Nicolas', 'Jennifer Wexton', 'Sean Casten', 'Katie Porter', 'Madeleine Dean', 'P000616', 'A000378', 'G000583', 'Emanuel Cleaver', 'P000593', 'Ben McAdams', 'John Rose', 'G000588', 'H001072', 'Barry Loudermilk', 'Gregory W. Meeks', 'W000812', 'Alfred Lawson', 'Joyce Beatty', 'Warren Davidson', 'Alexandria Ocasio-Cortez', 'Juan Vargas', 'Alexander Mooney', 'Scott Tipton', 'Tom Emmer', 'Maxine Waters', 'Bill Foster', 'Ted Budd', 'Lee Zeldin', 'Rashida Tlaib', 'David Scott', 'Tulsi Gabbard', 'Andy Barr', 'K000392', 'Bryan Steil', 'Bill Posey', 'Lance Gooden', 'Nydia Velazquez', 'Sean Duffy', 'M001156', 'Steve Stivers', 'Carolyn B. Maloney', 'Ayanna Pressley', 'H001074', 'Brad Sherman', 'T000480', 'Frank Lucas', 'Peter King', 'Blaine Luetkemeyer', 'Jim Himes', 'Alma Adams', 'Denver Lee Riggleman III', 'Roger Williams', 'C001049', 'Al Green', 'Jesus Garcia', 'Sylvia Garcia', 'L000562', 'Vicente Gonzalez Jr.'
            }, 
        '117': {
            'Bill Huizenga', 'Michael F.Q. San Nicolas', 'Sean Casten', 'Madeleine Dean', 'A000378', 'G000583', 'Emanuel Cleaver', 'P000593', 'John Rose', 'G000588', 'H001072', 'Barry Loudermilk', 'Gregory W. Meeks', 'W000812', 'Alfred Lawson', 'Joyce Beatty', 'Warren Davidson', 'Alexandria Ocasio-Cortez', 'Juan Vargas', 'Jake Auchincloss', 'Alexander Mooney', 'Tom Emmer', 'Maxine Waters', 'Bill Foster', 'Ted Budd', 'Lee Zeldin', 'Nikema Williams', 'Rashida Tlaib', 'David Scott', 'Andy Barr', 'K000392', 'Ritchie Torres', 'Bryan Steil', 'Bill Posey', 'Lance Gooden', 'Nydia Velazquez', 'M001156', 'Steve Stivers', 'Carolyn B. Maloney', 'Ayanna Pressley', 'H001074', 'Brad Sherman', 'T000480', 'Frank Lucas', 'T000479', 'Blaine Luetkemeyer', 'Jim Himes', 'Alma Adams', 'Roger Williams', 'Al Green', 'Jesus Garcia', 'Sylvia Garcia', 'L000562', 'Vicente Gonzalez Jr.'
                }
            }, 
    'Senate Banking, Housing, and Urban Affairs': {
        '116': {
            'Catherine Cortez Masto', 'Elizabeth Warren', 'Mike Rounds', 'Jon Tester', 'Martha McSally', 'S384', 'Kevin Cramer', 'Richard Shelby', 'S389', 'Bob Menendez', 'S390', 'Brian E. Schatz', 'Ben Sasse', 'S327', 'Sherrod Brown', 'S351', 'S394', 'S347', 'Mike Crapo', 'Kyrsten Sinema', 'Tom Cotton', 'S259', 'Tim Scott'
                    }, 
        '117': {
            'Catherine Cortez Masto', 'Elizabeth Warren', 'Mike Rounds', 'Jon Ossoff', 'Jon Tester', 'S384', 'Kevin Cramer', 'Richard Shelby', 'S389', 'Bob Menendez', 'S390', 'Raphael Warnock', 'S327', 'Sherrod Brown', 'S351', 'S394', 'S347', 'S407', 'Kyrsten Sinema', 'Steve Daines', 'S410', 'Mike Crapo', 'S259', 'Tim Scott'}}, 
    'Senate Health, Education, Labor, and Pensions': {
        '116': {
            'Maggie Hassan', 'S252', 'Rand Paul', 'Mike Braun', 'Elizabeth Warren', 'Johnny Isakson', 'Lisa Murkowski', 'S373', 'S229', 'Mitt Romney', 'Bernie Sanders', 'S402', 'S394', 'Bob Casey Jr.', 'Richard Burr', 'Tammy Baldwin', 'S362', 'Christopher S. Murphy', 'Tim Scott'
            }, 
        '117': {
            'Maggie Hassan', 'S252', 'Rand Paul', 'Mike Braun', 'Lisa Murkowski', 'S373', 'S412', 'S229', 'Mitt Romney', 'Bernie Sanders', 'Ben Ray Luján', 'S402', 'S408', 'S394', 'S347', 'Bob Casey Jr.', 'Richard Burr', 'Tammy Baldwin', 'S362', 'Christopher S. Murphy', 'M001198', 'Tim Scott'
                }
            }, 
    'House Appropriations': {
        '116': {
            'Martha Roby', 'Lucille Roybal-Allard', 'Mark Pocan', 'Kay Granger', 'P000523', 'Mario Diaz-Balart', 'Steve Womack', 'A000055', 'Norma Torres', 'Pete Aguilar', 'Betty McCollum', 'John Moolenaar', 'Tim Ryan', 'Mark Amodei', 'Chellie Pingree', 'Barbara Lee', 'Bonnie Watson Coleman', 'M001188', 'Sanford Bishop Jr.', 'B001286', 'Marcy Kaptur', 'S001148', 'W000797', 'Jeffrey Fortenberry', 'Matt Cartwright', 'C001055', 'Nita Lowey', 'Steven Palazzo', 'Brenda Lawrence', 'Charlie Crist', 'F000459', 'J000295', 'Ken Calvert', 'José Serrano', 'John Carter', 'V000108', 'William Hurd', 'Mike Quigley', 'Chris Stewart', 'N000189', 'Henry Cuellar', 'C001101', 'Jaime Herrera Beutler', 'Derek Kilmer', 'C001053', 'Dutch Ruppersberger', 'Andrew Harris', 'F000462', 'R000609', 'Rosa L. DeLauro', 'Ann Kirkpatrick', 'R000395'
            }, 
        '117': {
            'Lucille Roybal-Allard', 'Mark Pocan', 'Jennifer Wexton', 'Kay Granger', 'P000523', 'Mario Diaz-Balart', 'Steve Womack', 'A000055', 'Norma Torres', 'Benjamin Lee Cline', 'Pete Aguilar', 'Betty McCollum', 'John Moolenaar', 'Tim Ryan', 'G000588', 'Mark Amodei', 'Chellie Pingree', 'Barbara Lee', 'Bonnie Watson Coleman', 'Ashley Hinson', 'Guy Reschenthaler', 'Marcy Kaptur', 'B001286', 'Sanford Bishop Jr.', 'M001188', 'S001148', 'T000483', 'W000797', 'Jeffrey Fortenberry', 'Matt Cartwright', 'C001055', 'Steven Palazzo', 'Lauren Underwood', 'Adriano Espaillat', 'Brenda Lawrence', 'Charlie Crist', 'F000459', 'J000295', 'Ken Calvert', 'G000061', 'John Carter', 'Mike Quigley', 'Chris Stewart', 'N000189', 'Henry Cuellar', 'C001101', 'Jaime Herrera Beutler', 'L000590', 'Derek Kilmer', 'Josh Harder', 'C001053', 'Dutch Ruppersberger', 'Andrew Harris', 'F000462', 'R000609', 'Rosa L. DeLauro', 'Ann Kirkpatrick', 'R000395', 'David G. Valadao'}
            }, 
    'House Oversight and Reform': 
            {
        '116': {
            'F000450', 'Raja Krishnamoorthi', 'Stacey Plaskett', 'Ralph Norman', 'C001108', 'Katie Hill', 'Jimmy Gomez', 'Glenn Grothman', 'S001214', 'Alexandria Ocasio-Cortez', 'Mark Meadows', 'M001205', 'W000797', 'Jody Hice', 'Kweisi Mfume', 'Rashida Tlaib', 'K000389', 'Brenda Lawrence', 'Eleanor Holmes Norton', 'Robin Kelly', 'Jackie Speier', 'G000563', 'Jim Jordan', 'Carolyn B. Maloney', 'Ayanna Pressley', 'Paul Gosar', 'Kelly Armstrong', 'Chip Roy', 'R000616', 'M001184', 'R000606', 'Jim Cooper', 'Michael Cloud', 'John Sarbanes', 'Clay Higgins', 'C001049', 'C001078', 'Mark DeSaulnier', 'L000562', 'W000800', 'G000590'
            }, 
        '117': {
            'F000450', 'Raja Krishnamoorthi', 'Ralph Norman', 'Katie Porter', 'C001108', 'Yvette Herrell', 'Cori Bush', 'Jimmy Gomez', 'Glenn Grothman', 'Alexandria Ocasio-Cortez', 'W000797', 'Hank Johnson', 'Nancy Mace', 'Jody Hice', 'Kweisi Mfume', 'Rashida Tlaib', 'K000389', 'Fred Keller', 'Danny K. Davis', 'Eleanor Holmes Norton', 'Brenda Lawrence', 'Andrew Clyde', 'Robin Kelly', 'Jackie Speier', 'G000563', 'Jim Jordan', 'Carolyn B. Maloney', 'Ayanna Pressley', 'Jacob LaTurner', 'Scott Franklin', 'S000250', 'R000606', 'Jim Cooper', 'Michael Cloud', 'Byron Donalds', 'F000246', 'John Sarbanes', 'Clay Higgins', 'C001078', 'Mark DeSaulnier', 'L000562', 'W000800', 'Andy Biggs'}
            }, 
    'House Ways and Means': 
            {
        '116': {
            'P000096', 'E000296', 'Jason Smith', 'Dan Kildee', 'Jodey Arrington', 'Mike Thompson', 'Adrian Smith', 'Brendan Boyle', 'Jimmy Gomez', 'Ronald James Kind', 'K000376', 'H001065', 'David Schweikert', 'R000597', 'Donald Sternoff Beyer Jr.', 'Danny K. Davis', 'Stephanie Murphy', 'Vern Buchanan', 'B000574', 'S001201', 'Devin Nunes', 'Richard Neal', 'E000298', 'C001080', 'Brad Wenstrup', 'Jimmy Panetta', 'L000557', 'Linda Sánchez', 'S001190', 'H001038', 'Tom Reed', 'Steven Horsford', 'Darin LaHood', 'Terri Sewell', 'M001158', 'D000399', 'Kevin Brady', 'Drew Ferguson', 'D000617', 'Jackie Walorski', 'Gwen Moore'}, 
        '117': {
            'P000096', 'E000296', 'Jason Smith', 'Stacey Plaskett', 'Dan Kildee', 'Jodey Arrington', 'Mike Thompson', 'Adrian Smith', 'Brendan Boyle', 'Jimmy Gomez', 'Ronald James Kind', 'K000376', 'M001205', 'David Schweikert', 'R000597', 'Donald Sternoff Beyer Jr.', 'Danny K. Davis', 'H001082', 'Stephanie Murphy', 'Vern Buchanan', 'B000574', 'S001201', 'Brad Wenstrup', 'Richard Neal', 'E000298', 'C001080', 'S001199', 'Jimmy Panetta', 'L000557', 'Linda Sánchez', 'S001190', 'H001038', 'Tom Reed', 'Steven Horsford', 'Darin LaHood', 'Terri Sewell', 'D000399', 'Kevin Brady', 'Drew Ferguson', 'D000617', 'Jackie Walorski', 'Gwen Moore'
            }
            }, 
    'Senate Appropriations': {
        '116': {
            'S337', 'S252', 'S342', 'Patrick Leahy', 'Jeanne Shaheen', 'Dianne Feinstein', 'Lisa Murkowski', 'S343', 'Joe Manchin III', 'Jon Tester', 'Dick Durbin', 'Richard Shelby', 'S389', 'S229', 'S372', 'Jeff Merkley', 'Brian E. Schatz', 'S390', 'Marco Rubio', 'S347', 'S293', 'Tammy Baldwin', 'Christopher S. Murphy', 'Steve Daines', 'James Lankford', 'S174', 'Cindy Hyde-Smith', 'S344', 'S259'
            }, 
            '117': {'S337', 'S252', 'Mike Braun', 'S342', 'Patrick Leahy', 'Jeanne Shaheen', 'Dianne Feinstein', 'Lisa Murkowski', 'S343', 'Joe Manchin III', 'Jon Tester', 'Dick Durbin', 'Richard Shelby', 'S389', 'S229', 'S372', 'Jeff Merkley', 'Brian E. Schatz', 'S390', 'Marco Rubio', 'S347', 'Martin Heinrich', 'S293', 'Tammy Baldwin', 'S407', 'Christopher S. Murphy', 'S174', 'Cindy Hyde-Smith', 'S344', 'S259'}
            }, 
    'Senate Finance': {
        '116': {'Maggie Hassan', 'Catherine Cortez Masto', 'Johnny Isakson', 'Debbie Stabenow', 'S316', 'S303', 'Rob Portman', 'S373', 'Bob Menendez', 'S330', 'S287', 'S327', 'Sherrod Brown', 'S351', 'Chuck Grassley', 'S277', 'S247', 'Bob Casey Jr.', 'Todd C. Young', 'S275', 'Richard Burr', 'S308', 'Steve Daines', 'James Lankford', 'Mike Crapo', 'Tim Scott'}, '117': {'Maggie Hassan', 'Catherine Cortez Masto', 'Elizabeth Warren', 'Debbie Stabenow', 'S316', 'S303', 'Rob Portman', 'S373', 'John Barrasso', 'Bob Menendez', 'S330', 'S287', 'Ben Sasse', 'S327', 'Sherrod Brown', 'S351', 'Chuck Grassley', 'S277', 'S247', 'Bob Casey Jr.', 'Todd C. Young', 'S275', 'Richard Burr', 'S308', 'Steve Daines', 'James Lankford', 'Mike Crapo', 'Tim Scott'}}}

  transaction_date ticker                                  asset_description  \
0       2021-09-27     BP                                             BP plc   
1       2021-09-13    XOM                            Exxon Mobil Corporation   
2       2021-09-10   ILPT  Industrial Logistics Properties Trust - Common...   
3       2021-09-28     PM                   Phillip Morris International Inc   
4       2021-09-17    BLK                                      BlackRock Inc   

           type  amount state  \
0      purchase    8000    NC   
1      purchase    8000    NC   
2      purchase   35000    NC   
3      purchase   35000    NC   
4  sale_partial    8000    CA   

                                            ptr_link  \
0  https://disclosures-clerk.house.gov/public_dis...   
1  https://disclosures-clerk.house.gov/public_dis...   
2  https://disclosures-clerk.house.gov/public_dis...   
3  https://disclosures-clerk.house.gov/public_dis...   
4  https://disclosures-clerk.house.gov/p

In [74]:
# Cell 3: Sector Mapping Update

# Define the path to your mapping file
MAPPING_FILE_PATH = "./missing_fill_sectors.csv" # Use a constant

if not os.path.exists(MAPPING_FILE_PATH):
    print(f"Warning: Mapping file not found at {MAPPING_FILE_PATH}. Sector mapping will not be performed.")
else:
    try:
        new_mappings_df = pd.read_csv(MAPPING_FILE_PATH)
        if "ticker" not in new_mappings_df.columns or "sector" not in new_mappings_df.columns:
            print(f"Warning: Mapping file {MAPPING_FILE_PATH} must contain 'ticker' and 'sector' columns. Sector mapping skipped.")
        else:
            print("Unique sectors in mapping file:", new_mappings_df["sector"].unique())
            
            # Create a mapping dictionary: Ticker (uppercase) -> Sector
            # Dropna from ticker and sector to avoid issues if they have NaNs
            new_mappings_df.dropna(subset=['ticker', 'sector'], inplace=True)
            mapping_dict = dict(zip(new_mappings_df['ticker'].astype(str).str.upper(), new_mappings_df['sector'].astype(str)))

            # Ensure 'ticker' column exists and is string in STOCK_TRANSACTIONS_DF
            if 'ticker' in STOCK_TRANSACTIONS_DF.columns:
                STOCK_TRANSACTIONS_DF['ticker'] = STOCK_TRANSACTIONS_DF['ticker'].astype(str).str.upper()
                
                # Apply the mapping:
                # For each ticker, if it's in mapping_dict, use the new sector. Otherwise, keep the old one.
                STOCK_TRANSACTIONS_DF['sector'] = STOCK_TRANSACTIONS_DF['ticker'].map(mapping_dict).fillna(STOCK_TRANSACTIONS_DF['sector'])
                print("Sector mapping based on 'missing_fill_sectors.csv' applied.")
            else:
                print("Warning: 'ticker' column not found in STOCK_TRANSACTIONS_DF. Sector mapping skipped.")
    except Exception as e:
        print(f"Error during sector mapping process: {e}")

# --- Analysis of Unspecified Tickers (after potential mapping) ---
if 'ticker' in STOCK_TRANSACTIONS_DF.columns and 'sector' in STOCK_TRANSACTIONS_DF.columns:
    ticker_counts = STOCK_TRANSACTIONS_DF['ticker'].value_counts().reset_index()
    ticker_counts.columns = ['ticker', 'count']
    
    # Get the first sector associated with each ticker (should be consistent after mapping)
    # Using .groupby().first() is safer than assuming drop_duplicates will give the correct one if there were inconsistencies.
    ticker_sectors = STOCK_TRANSACTIONS_DF.groupby('ticker')['sector'].first().reset_index()
    
    ticker_info = pd.merge(ticker_counts, ticker_sectors, on='ticker', how='left')
    
    # Ensure 'sector' column in ticker_info is string before comparison
    ticker_info['sector'] = ticker_info['sector'].astype(str)
    unspecified_tickers = ticker_info[
        (ticker_info['sector'] == 'Unspecified_Sector') | (ticker_info['sector'] == 'unspecified') | (ticker_info['sector'].str.lower() == 'nan')
    ].sort_values(by='count', ascending=False)

    print(f"\nNumber of tickers still having 'Unspecified' or similar sector: {len(unspecified_tickers)}")
    if not unspecified_tickers.empty:
        print("Top 50 Unspecified tickers with counts (after attempting mapping):")
        print(unspecified_tickers[['ticker', 'count', 'sector']].head(50).to_string())
    else:
        print("No tickers remain with 'Unspecified' sector after mapping (or no tickers at all).")
else:
    print("Skipping unspecified ticker analysis as 'ticker' or 'sector' column is missing.")

Unique sectors in mapping file: ['Unspecified' 'Energy' 'Consumer Services' 'Consumer Discretionary'
 'Finance' 'Industrials' 'Basic Materials' 'Miscellaneous' 'Technology'
 'Telecommunications' 'Health Care' 'Consumer Staples' 'Utilities'
 'Real Estate' 'Transportation']
Sector mapping based on 'missing_fill_sectors.csv' applied.

Number of tickers still having 'Unspecified' or similar sector: 0
No tickers remain with 'Unspecified' sector after mapping (or no tickers at all).


In [75]:
# Cell 5: Committee-Level Bipartite Network Analysis (Member-Sector Per Committee)
# This cell will be refactored using a function.

# --- Configuration for this cell's analysis ---
BIPARTITE_INCLUDE_BUYS = False  # Set to True to include "buy" transactions
BIPARTITE_INCLUDE_SELLS = True   # Set to True to include "sell" transactions

BIPARTITE_MIN_SECTOR_NODE_SIZE = 200
BIPARTITE_MAX_SECTOR_NODE_SIZE = 2000
BIPARTITE_MEMBER_NODE_SIZE = 600
BIPARTITE_MAX_EDGE_WIDTH = 8

BIPARTITE_OUTPUT_DIR = "committee_bipartite_member_sector" # Renamed for clarity
os.makedirs(BIPARTITE_OUTPUT_DIR, exist_ok=True)

# --- Main function for single committee bipartite analysis ---
def analyze_and_plot_committee_bipartite(
    committee_name,
    committee_members_set, # Pass the actual set of members
    all_stock_transactions_df,
    output_directory
    ):
    
    print(f"\n--- Analyzing Bipartite for Committee: {committee_name} ---")

    if not committee_members_set:
        print(f"No members defined for '{committee_name}'. Skipping.")
        return {} # Return empty stats

    df_ct = filter_stock_transactions_by_members(all_stock_transactions_df, committee_members_set)
    if df_ct.empty:
        print(f"No transactions found for members of '{committee_name}'. Skipping.")
        return {"num_members": len(committee_members_set), "num_transactions": 0}

    df_ct = df_ct.copy() # Avoid SettingWithCopyWarning
    df_ct["category"] = df_ct["type"].str.lower().map(
        lambda t: "buy" if "purchase" in t
        else ("sell" if "sale" in t else ("exchange" if "exchange" in t else "other"))
    )

    mask = pd.Series(True, index=df_ct.index)
    if not BIPARTITE_INCLUDE_BUYS:  mask &= df_ct["category"] != "buy"
    if not BIPARTITE_INCLUDE_SELLS: mask &= df_ct["category"] != "sell"
    df_ct = df_ct[mask]

    if df_ct.empty:
        print(f"No transactions after type filter for '{committee_name}'. Skipping.")
        return {"num_members": len(committee_members_set), "num_transactions_after_filter": 0}

    present_member_ids = set(df_ct["member_id"].unique())
    # Use global id2name or create a local one if 'member' column exists
    if 'member' in df_ct.columns:
        member_name_map = df_ct[["member_id", "member"]].drop_duplicates().set_index("member_id")["member"].to_dict()
    else: # Fallback if 'member' column is missing
        member_name_map = {mid: mid for mid in present_member_ids} # Use ID as name
    
    # Ensure all present_member_ids have an entry in member_name_map
    for mid in present_member_ids:
        member_name_map.setdefault(mid, mid)


    sorted_members = sorted(list(present_member_ids), key=lambda mid: member_name_map.get(mid, mid)) # Use .get for safety
    
    sector_counts = df_ct["sector"].value_counts().to_dict()
    if not sector_counts:
        print(f"No sector activity for '{committee_name}'. Skipping graph.")
        return {"num_members": len(sorted_members), "total_tx_count": int(df_ct.shape[0]), "total_tx_volume": float(df_ct["amount"].sum())}


    member_sector_agg = (
        df_ct.groupby(["member_id", "sector", "category"])
        .agg(tx_count=pd.NamedAgg(column="amount", aggfunc="count"),
             tx_volume=pd.NamedAgg(column="amount", aggfunc="sum"))
        .reset_index()
    )
    max_edge_amount = member_sector_agg["tx_volume"].max() if not member_sector_agg.empty else 1.0
    if max_edge_amount == 0: max_edge_amount = 1.0 # Avoid division by zero

    unique_sectors = sorted(sector_counts.keys())
    G = nx.Graph() # Use Graph for bipartite, MultiGraph allows parallel edges if needed for buy/sell separately
    G.add_nodes_from(sorted_members, bipartite=0)
    G.add_nodes_from(unique_sectors, bipartite=1)

    for _, row in member_sector_agg.iterrows():
        m, s, cat, vol, cnt = (
            row["member_id"], row["sector"], row["category"],
            row["tx_volume"], int(row["tx_count"])
        )
        if m in sorted_members: # Ensure member is in the graph's member set
            # For a simple bipartite, aggregate buy/sell weights if not distinguished by edge type
            # If you need separate edges for buy/sell, use MultiGraph and distinct edge keys or attributes
            if G.has_edge(m,s):
                G[m][s]['tx_volume'] = G[m][s].get('tx_volume',0) + vol # Sum volumes if multiple categories link same m-s
                G[m][s]['tx_count'] = G[m][s].get('tx_count',0) + cnt
                G[m][s]['categories'] = G[m][s].get('categories', set()).union({cat})
            else:
                G.add_edge(m, s, category=cat, tx_count=cnt, tx_volume=vol, categories={cat})


    # --- Stats Calculation (Simplified) ---
    committee_stats = {
        "num_members": len(sorted_members), "num_sectors": len(unique_sectors),
        "num_edges": G.number_of_edges(), "total_tx_count": int(df_ct.shape[0]),
        "total_tx_volume": float(df_ct["amount"].sum()), "nodes": {}
    }
    # Add per-node stats if needed, similar to your original code but for brevity here.

    # --- Plotting ---
    if not G.nodes() or not G.edges():
        print(f"Graph for {committee_name} is empty or has no edges. Skipping plot.")
        return committee_stats

    node_colors_plot, node_sizes_plot, labels_plot = [], [], {}
    max_sector_count_plot = max(sector_counts.values()) if sector_counts else 1.0
    if max_sector_count_plot == 0: max_sector_count_plot = 1.0

    for mid in sorted_members:
        if mid in G: # Check if node exists in graph G
            node_colors_plot.append("#1f78b4")
            node_sizes_plot.append(BIPARTITE_MEMBER_NODE_SIZE)
            labels_plot[mid] = member_name_map.get(mid, mid)

    current_graph_member_nodes = [n for n,d in G.nodes(data=True) if d['bipartite']==0]
    current_graph_sector_nodes = [n for n,d in G.nodes(data=True) if d['bipartite']==1]


    for sec_node_idx, sec in enumerate(current_graph_sector_nodes):
        if sec in G: # Check if node exists
            node_colors_plot.insert(len(current_graph_member_nodes) + sec_node_idx, "#ff7f0e") # Insert at correct position
            cnt = sector_counts.get(sec, 0)
            size = (cnt / max_sector_count_plot) * (BIPARTITE_MAX_SECTOR_NODE_SIZE - BIPARTITE_MIN_SECTOR_NODE_SIZE) + BIPARTITE_MIN_SECTOR_NODE_SIZE
            node_sizes_plot.insert(len(current_graph_member_nodes) + sec_node_idx, size)
            labels_plot[sec] = sec
    
    # Reorder nodes for plotting to match colors/sizes
    plot_nodes_ordered = current_graph_member_nodes + current_graph_sector_nodes


    edge_list_plot, edge_colors_plot, edge_widths_plot = [], [], []
    for u, v, data in G.edges(data=True):
        vol = data.get("tx_volume",0) # Aggregate volume for simplicity
        color = "gray" # Default, or color by net buy/sell if desired
        # Example: if 'categories' stored, could color by mixed/buy/sell
        width = (abs(vol) / max_edge_amount if max_edge_amount > 0 else 0) * BIPARTITE_MAX_EDGE_WIDTH + 0.5
        edge_list_plot.append((u, v))
        edge_colors_plot.append(color)
        edge_widths_plot.append(min(width, BIPARTITE_MAX_EDGE_WIDTH)) # Cap width

    if not current_graph_member_nodes: # Check if the set for layout is empty
        print(f"No member nodes in graph for {committee_name} to use for bipartite layout. Skipping plot.")
        return committee_stats

    pos_plot = nx.bipartite_layout(G, current_graph_member_nodes) # Use nodes present in G
    
    plt.figure(figsize=(max(12, len(current_graph_member_nodes)*0.3), max(10, len(current_graph_sector_nodes)*0.3) )) # Dynamic figsize
    nx.draw_networkx_nodes(G, pos_plot, nodelist=plot_nodes_ordered, node_color=node_colors_plot, node_size=node_sizes_plot, alpha=0.9)
    nx.draw_networkx_edges(G, pos_plot, edgelist=edge_list_plot, edge_color=edge_colors_plot, width=edge_widths_plot, alpha=0.6)
    nx.draw_networkx_labels(G, pos_plot, labels=labels_plot, font_size=8)
    
    plt.axis("off")
    title = f"{committee_name} — Members ↔ Sectors\n(Buys={'Y' if BIPARTITE_INCLUDE_BUYS else 'N'} | Sells={'Y' if BIPARTITE_INCLUDE_SELLS else 'N'})"
    plt.title(title, fontsize=14)
    plt.tight_layout()
    safe_name_plot = committee_name.replace(" ", "_").replace(",", "").lower()
    plt.savefig(f"{output_directory}/{safe_name_plot}_bipartite_member_sector.png", dpi=150)
    plt.close()
    print(f"[Saved] {output_directory}/{safe_name_plot}_bipartite_member_sector.png")
    
    return committee_stats

# --- Loop through committees and generate bipartite graphs ---
all_committee_stats_json = {}
committee_sector_counts_for_bar_chart = {} # For the combined bar chart later

id2name = {}  # Global mapping for member_id to name, will be populated if not already done

# Populate id2name globally if not already done (or ensure it's passed)
if 'member' in STOCK_TRANSACTIONS_DF.columns:
     id2name_global_temp = STOCK_TRANSACTIONS_DF[["member_id", "member"]].drop_duplicates("member_id").set_index("member_id")["member"].to_dict()
     # Update global id2name
     id2name.update(id2name_global_temp)
    


for committee_name_loop, period_map_loop in COMMITTEE_MEMBERSHIP_MAP.items():
    members_116_loop = period_map_loop.get("116", set())
    members_117_loop = period_map_loop.get("117", set())
    current_committee_members_set = members_116_loop | members_117_loop
    
    stats = analyze_and_plot_committee_bipartite(
        committee_name_loop,
        current_committee_members_set,
        STOCK_TRANSACTIONS_DF, # Pass the main, globally filtered stock DF
        BIPARTITE_OUTPUT_DIR
    )
    all_committee_stats_json[committee_name_loop] = stats
    
    # For combined bar chart data (using raw counts for now)
    # This part needs to be aligned with how df_ct was processed inside analyze_and_plot_committee_bipartite
    # We'll re-filter here for simplicity for the bar chart data source
    temp_df_ct = filter_stock_transactions_by_members(STOCK_TRANSACTIONS_DF, current_committee_members_set)
    if not temp_df_ct.empty:
        temp_df_ct = temp_df_ct.copy()
        temp_df_ct["category"] = temp_df_ct["type"].str.lower().map(
            lambda t: "buy" if "purchase" in t else ("sell" if "sale" in t else "other"))
        mask_bar = pd.Series(True, index=temp_df_ct.index)
        if not BIPARTITE_INCLUDE_BUYS:  mask_bar &= temp_df_ct["category"] != "buy"
        if not BIPARTITE_INCLUDE_SELLS: mask_bar &= temp_df_ct["category"] != "sell"
        temp_df_ct = temp_df_ct[mask_bar]
        if not temp_df_ct.empty:
            committee_sector_counts_for_bar_chart[committee_name_loop] = temp_df_ct["sector"].value_counts().to_dict()


# --- Save JSON report ---
json_report_path = os.path.join(BIPARTITE_OUTPUT_DIR, "all_committees_bipartite_stats.json")
with open(json_report_path, "w") as fp:
    json.dump(all_committee_stats_json, fp, indent=2)
print(f"Saved JSON report for all committees: {json_report_path}")

# --- Combined Stacked Bar Chart (Simplified from your Cell 7) ---
all_sectors_for_bar = sorted(list(set(sec for counts in committee_sector_counts_for_bar_chart.values() for sec in counts.keys())))
if all_sectors_for_bar: # only plot if there's data
    bar_data = []
    sorted_committee_names_for_bar = sorted(committee_sector_counts_for_bar_chart.keys())
    for c_name in sorted_committee_names_for_bar:
        row = [committee_sector_counts_for_bar_chart[c_name].get(sec, 0) for sec in all_sectors_for_bar]
        bar_data.append(row)
    
    df_combined_bar = pd.DataFrame(bar_data, index=sorted_committee_names_for_bar, columns=all_sectors_for_bar)

    plt.figure(figsize=(12, 8)) # Keep original fig size
    df_combined_bar.plot(kind="bar", stacked=True, colormap="tab20", width=0.8, figsize=(12,8)) # Pass figsize here too
    plt.ylabel("Number of Transactions")
    plt.xticks(rotation=45, ha="right")
    plt.title("Sector Breakdown of Transactions by Committee (Filtered by Buys/Sells)")
    plt.legend(title="Sector", bbox_to_anchor=(1.02, 1), loc="upper left")
    plt.tight_layout()
    plt.savefig(f"{BIPARTITE_OUTPUT_DIR}/all_committees_sector_breakdown_filtered.png", dpi=150)
    plt.close()
    print(f"[Saved] {BIPARTITE_OUTPUT_DIR}/all_committees_sector_breakdown_filtered.png")
else:
    print("No data available for combined sector breakdown bar chart.")


--- Analyzing Bipartite for Committee: House Energy and Commerce ---
[Saved] committee_bipartite_member_sector/house_energy_and_commerce_bipartite_member_sector.png

--- Analyzing Bipartite for Committee: House Financial Services ---
[Saved] committee_bipartite_member_sector/house_financial_services_bipartite_member_sector.png

--- Analyzing Bipartite for Committee: Senate Banking, Housing, and Urban Affairs ---
[Saved] committee_bipartite_member_sector/senate_banking_housing_and_urban_affairs_bipartite_member_sector.png

--- Analyzing Bipartite for Committee: Senate Health, Education, Labor, and Pensions ---
[Saved] committee_bipartite_member_sector/senate_health_education_labor_and_pensions_bipartite_member_sector.png

--- Analyzing Bipartite for Committee: House Appropriations ---
[Saved] committee_bipartite_member_sector/house_appropriations_bipartite_member_sector.png

--- Analyzing Bipartite for Committee: House Oversight and Reform ---
[Saved] committee_bipartite_member_sector/

<Figure size 1200x800 with 0 Axes>

In [76]:
# Cell 6: General Congress Bipartite Network (Members not in specified committees)

# --- Configuration for this cell's analysis ---
GENERAL_INCLUDE_BUYS = False  # Consistent with your original, can be changed
GENERAL_INCLUDE_SELLS = True

GENERAL_MIN_SECTOR_NODE_SIZE = 200
GENERAL_MAX_SECTOR_NODE_SIZE = 2000
GENERAL_MEMBER_NODE_SIZE = 600
GENERAL_MAX_EDGE_WIDTH = 8

GENERAL_OUTPUT_DIR = "general_congress_member_sector_bipartite" # Specific output dir
os.makedirs(GENERAL_OUTPUT_DIR, exist_ok=True)

# --- Function to Analyze General Congress Trading ---
def analyze_general_congress_trading(
    stock_transactions_df,
    committee_membership_map,
    output_directory
    ):
    
    print("\n--- Analyzing Bipartite for General Congress (Non-Committee Members) ---")

    # 1) Derive the full set of all committee-member IDs
    all_committee_members = set()
    for period_map in committee_membership_map.values():
        for members_set in period_map.values(): # Iterate through sets of members for "116", "117"
            all_committee_members.update({str(m) for m in members_set}) # Ensure string IDs

    # 2) Filter STOCK_TRANSACTIONS_DF for "general" members
    df_general = stock_transactions_df[
        ~stock_transactions_df["member_id"].astype(str).isin(all_committee_members) # Ensure comparison with string IDs
    ].copy()

    if df_general.empty:
        print("No transactions found for 'General Congress' members. Skipping.")
        return {}

    # 3) Derive "category" and filter by buy/sell
    df_general["category"] = df_general["type"].str.lower().map(
        lambda t: "buy" if "purchase" in t
        else ("sell" if "sale" in t else ("exchange" if "exchange" in t else "other"))
    )
    mask = pd.Series(True, index=df_general.index)
    if not GENERAL_INCLUDE_BUYS:  mask &= df_general["category"] != "buy"
    if not GENERAL_INCLUDE_SELLS: mask &= df_general["category"] != "sell"
    df_general = df_general[mask].copy()

    if df_general.empty:
        print("No 'General Congress' transactions after type filter. Skipping.")
        return {}

    # 4) Sector counts for general population
    general_sector_counts = df_general["sector"].value_counts().sort_index().to_dict()
    general_sector_counts_path = os.path.join(output_directory, "general_sector_counts.json")
    with open(general_sector_counts_path, "w") as fp:
        json.dump(general_sector_counts, fp, indent=2)
    print(f"[Saved] {general_sector_counts_path}")
    
    if not general_sector_counts:
        print("No sector activity for 'General Congress'. Skipping graph and further stats.")
        return {"num_general_members": len(df_general["member_id"].unique()), "total_tx_count": len(df_general)}

    # 5) Build bipartite graph
    present_member_ids_general = set(df_general["member_id"].unique())
    
    if 'member' in df_general.columns:
        member_name_map_general = df_general[["member_id", "member"]].drop_duplicates().set_index("member_id")["member"].to_dict()
    else:
        member_name_map_general = {mid: mid for mid in present_member_ids_general}
    for mid in present_member_ids_general: # Ensure all have an entry
        member_name_map_general.setdefault(mid, mid)

    sorted_members_general = sorted(list(present_member_ids_general), key=lambda mid: member_name_map_general.get(mid, mid))
    unique_sectors_general = sorted(general_sector_counts.keys())

    member_sector_agg_general = (
        df_general.groupby(["member_id", "sector", "category"])
        .agg(tx_count=pd.NamedAgg(column="amount", aggfunc="count"),
             tx_volume=pd.NamedAgg(column="amount", aggfunc="sum"))
        .reset_index()
    )
    max_edge_amount_general = member_sector_agg_general["tx_volume"].max() if not member_sector_agg_general.empty else 1.0
    if max_edge_amount_general == 0: max_edge_amount_general = 1.0

    G_general = nx.Graph()
    G_general.add_nodes_from(sorted_members_general, bipartite=0)
    G_general.add_nodes_from(unique_sectors_general, bipartite=1)

    for _, row in member_sector_agg_general.iterrows():
        m, s, cat, vol, cnt = (
            row["member_id"], row["sector"], row["category"],
            row["tx_volume"], int(row["tx_count"])
        )
        if m in sorted_members_general: # Ensure member is in graph
             if G_general.has_edge(m,s):
                G_general[m][s]['tx_volume'] = G_general[m][s].get('tx_volume',0) + vol
                G_general[m][s]['tx_count'] = G_general[m][s].get('tx_count',0) + cnt
                G_general[m][s]['categories'] = G_general[m][s].get('categories', set()).union({cat})
             else:
                G_general.add_edge(m, s, category=cat, tx_count=cnt, tx_volume=vol, categories={cat})

    # 6) Graph-level and Node-level stats (simplified for brevity, can expand)
    general_stats = {
        "num_general_members": len(sorted_members_general), "num_sectors": len(unique_sectors_general),
        "num_edges": G_general.number_of_edges(), "total_tx_count": int(df_general.shape[0]),
        "total_tx_volume": float(df_general["amount"].sum()), "nodes": {} # Can add per-node later
    }
    general_stats_path = os.path.join(output_directory, "general_congress_graph_stats.json")
    with open(general_stats_path, "w") as fp:
        json.dump(general_stats, fp, indent=2)
    print(f"[Saved] {general_stats_path}")

    # 7) Plotting (if graph has nodes and edges)
    if not G_general.nodes() or not G_general.edges():
        print("Graph for General Congress is empty or has no edges. Skipping plot.")
        return general_stats

    node_colors_plot, node_sizes_plot, labels_plot = [], [], {}
    max_sector_count_plot = max(general_sector_counts.values()) if general_sector_counts else 1.0
    if max_sector_count_plot == 0 : max_sector_count_plot = 1.0

    # Get nodes present in the actual graph G_general for layout and plotting
    current_graph_member_nodes_gen = [n for n,d in G_general.nodes(data=True) if d['bipartite']==0]
    current_graph_sector_nodes_gen = [n for n,d in G_general.nodes(data=True) if d['bipartite']==1]
    plot_nodes_ordered_gen = current_graph_member_nodes_gen + current_graph_sector_nodes_gen


    for mid in current_graph_member_nodes_gen: # Iterate over nodes actually in G_general
        node_colors_plot.append("#636363") # Darker grey for general members
        node_sizes_plot.append(GENERAL_MEMBER_NODE_SIZE)
        labels_plot[mid] = member_name_map_general.get(mid, mid)
    
    for sec_node_idx, sec in enumerate(current_graph_sector_nodes_gen):
        node_colors_plot.insert(len(current_graph_member_nodes_gen) + sec_node_idx, "#fdae61") # Light orange for sectors
        cnt = general_sector_counts.get(sec, 0)
        size = (cnt / max_sector_count_plot) * (GENERAL_MAX_SECTOR_NODE_SIZE - GENERAL_MIN_SECTOR_NODE_SIZE) + GENERAL_MIN_SECTOR_NODE_SIZE
        node_sizes_plot.insert(len(current_graph_member_nodes_gen) + sec_node_idx, size)
        labels_plot[sec] = sec
        
    edge_list_plot, edge_colors_plot, edge_widths_plot = [], [], []
    for u, v, data in G_general.edges(data=True):
        vol = data.get("tx_volume",0)
        color = "gray" # Simplified edge color for general pop
        width = (abs(vol) / max_edge_amount_general if max_edge_amount_general > 0 else 0) * GENERAL_MAX_EDGE_WIDTH + 0.5
        edge_list_plot.append((u,v))
        edge_colors_plot.append(color)
        edge_widths_plot.append(min(width, GENERAL_MAX_EDGE_WIDTH))

    if not current_graph_member_nodes_gen:
        print("No member nodes in General Congress graph for layout. Skipping plot.")
        return general_stats

    pos_plot = nx.bipartite_layout(G_general, current_graph_member_nodes_gen)
    plt.figure(figsize=(max(12, len(current_graph_member_nodes_gen)*0.2), max(10, len(current_graph_sector_nodes_gen)*0.3) ))
    nx.draw_networkx_nodes(G_general, pos_plot, nodelist=plot_nodes_ordered_gen, node_color=node_colors_plot, node_size=node_sizes_plot, alpha=0.9)
    nx.draw_networkx_edges(G_general, pos_plot, edgelist=edge_list_plot, edge_color=edge_colors_plot, width=edge_widths_plot, alpha=0.5)
    nx.draw_networkx_labels(G_general, pos_plot, labels=labels_plot, font_size=7) # Smaller font for potentially many members
    
    plt.axis("off")
    title = f"General Congress Members ↔ Sectors\n(Buys={'Y' if GENERAL_INCLUDE_BUYS else 'N'} | Sells={'Y' if GENERAL_INCLUDE_SELLS else 'N'})"
    plt.title(title, fontsize=14)
    plt.tight_layout()
    graph_path_plot = os.path.join(output_directory, "general_congress_bipartite.png")
    plt.savefig(graph_path_plot, dpi=150)
    plt.close()
    print(f"[Saved] {graph_path_plot}")

    # 8) Bar chart for general population sector counts
    sectors_bar = list(general_sector_counts.keys())
    counts_bar = [general_sector_counts[sec] for sec in sectors_bar]
    plt.figure(figsize=(12, 6))
    plt.bar(sectors_bar, counts_bar, color="#636363")
    plt.xlabel("Sector")
    plt.ylabel("Number of Transactions (General Population)")
    plt.xticks(rotation=45, ha="right")
    plt.title("General Congress Transactions by Sector")
    plt.tight_layout()
    bar_path_plot = os.path.join(output_directory, "general_congress_sector_bar.png")
    plt.savefig(bar_path_plot, dpi=150)
    plt.close()
    print(f"[Saved] {bar_path_plot}")
    
    return general_stats

# --- Execute General Congress Analysis ---
# This assumes STOCK_TRANSACTIONS_DF and COMMITTEE_MEMBERSHIP_MAP are globally available and processed
# from previous cells (especially COMMITTEE_MEMBERSHIP_MAP having string IDs)
general_congress_summary_stats = analyze_general_congress_trading(
    STOCK_TRANSACTIONS_DF,
    COMMITTEE_MEMBERSHIP_MAP,
    GENERAL_OUTPUT_DIR
)


--- Analyzing Bipartite for General Congress (Non-Committee Members) ---
[Saved] general_congress_member_sector_bipartite/general_sector_counts.json
[Saved] general_congress_member_sector_bipartite/general_congress_graph_stats.json
[Saved] general_congress_member_sector_bipartite/general_congress_bipartite.png
[Saved] general_congress_member_sector_bipartite/general_congress_sector_bar.png


In [77]:
# Cell 7: Combined Sector Breakdown Plots (Count and Volume Proportions)

# --- Configuration for this cell ---
COMBINED_PLOT_INCLUDE_BUYS = True
COMBINED_PLOT_INCLUDE_SELLS = True
COMBINED_PLOT_OUTPUT_DIR = "combined_committee_sector_analysis"
os.makedirs(COMBINED_PLOT_OUTPUT_DIR, exist_ok=True)

# --- Data Aggregation Function ---
def aggregate_sector_data_for_committees(
    stock_df, committee_map, include_buys=True, include_sells=True
):
    stock_df["amount"] = pd.to_numeric(stock_df["amount"], errors="coerce").fillna(0.0)
    stock_df["type"] = stock_df["type"].astype(str).str.lower()

    committee_sector_counts = {}
    committee_sector_volumes = {}
    all_sectors = set()

    # Process defined committees
    processed_committee_names_list = []
    for committee_name_key, period_map_val in committee_map.items():
        if committee_name_key == "General Population": continue # Handle GP separately
        processed_committee_names_list.append(committee_name_key)

        members_116 = period_map_val.get("116", set())
        members_117 = period_map_val.get("117", set())
        current_committee_members = {str(m) for m in (members_116 | members_117)}

        if not current_committee_members:
            committee_sector_counts[committee_name_key] = {}
            committee_sector_volumes[committee_name_key] = {}
            continue

        df_c = stock_df[stock_df["member_id"].astype(str).isin(current_committee_members)].copy()
        if df_c.empty:
            committee_sector_counts[committee_name_key] = {}
            committee_sector_volumes[committee_name_key] = {}
            continue
        
        df_c["category"] = df_c["type"].map(
            lambda t: "buy" if ("purchase" in t or "buy" in t)
            else ("sell" if ("sale" in t or "sell" in t)
            else ("exchange" if "exchange" in t else "other"))
        )
        mask_c = pd.Series(True, index=df_c.index)
        if not include_buys:  mask_c &= df_c["category"] != "buy"
        if not include_sells: mask_c &= df_c["category"] != "sell"
        df_c = df_c[mask_c]

        if df_c.empty:
            committee_sector_counts[committee_name_key] = {}
            committee_sector_volumes[committee_name_key] = {}
            continue
            
        committee_sector_counts[committee_name_key] = df_c["sector"].value_counts().to_dict()
        committee_sector_volumes[committee_name_key] = df_c.groupby("sector")["amount"].sum().to_dict()
        all_sectors.update(committee_sector_counts[committee_name_key].keys())
        all_sectors.update(committee_sector_volumes[committee_name_key].keys())

    # Process General Population
    all_committee_members_ever_set = set()
    for committee_name_key, period_map_val in committee_map.items():
        if committee_name_key == "General Population": continue
        for member_s in period_map_val.values():
            all_committee_members_ever_set.update({str(m) for m in member_s})

    df_g = stock_df[~stock_df["member_id"].astype(str).isin(all_committee_members_ever_set)].copy()
    if not df_g.empty:
        df_g["category"] = df_g["type"].map(
            lambda t: "buy" if ("purchase" in t or "buy" in t)
            else ("sell" if ("sale" in t or "sell" in t)
            else ("exchange" if "exchange" in t else "other"))
        )
        mask_g = pd.Series(True, index=df_g.index)
        if not include_buys:  mask_g &= df_g["category"] != "buy"
        if not include_sells: mask_g &= df_g["category"] != "sell"
        df_g = df_g[mask_g]
        if not df_g.empty:
            committee_sector_counts["General Population"] = df_g["sector"].value_counts().to_dict()
            committee_sector_volumes["General Population"] = df_g.groupby("sector")["amount"].sum().to_dict()
            all_sectors.update(committee_sector_counts["General Population"].keys())
            all_sectors.update(committee_sector_volumes["General Population"].keys())
            if "General Population" not in processed_committee_names_list:
                 processed_committee_names_list.insert(0, "General Population") # Add GP to the list for sorting
        else:
            committee_sector_counts["General Population"] = {}
            committee_sector_volumes["General Population"] = {}
    else:
        committee_sector_counts["General Population"] = {}
        committee_sector_volumes["General Population"] = {}

    final_sorted_committees = sorted([
        c_name for c_name in processed_committee_names_list
        if committee_sector_counts.get(c_name) or committee_sector_volumes.get(c_name)
    ], key=lambda x: (x != "General Population", x)) # Sorts "General Population" first, then alphabetically


    final_all_sectors_list = sorted(list(s for s in all_sectors if pd.notna(s)))

    if not final_sorted_committees or not final_all_sectors_list:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), []

    # DataFrames for counts and volumes (absolute)
    data_c = [[committee_sector_counts.get(c, {}).get(s, 0) for s in final_all_sectors_list] for c in final_sorted_committees]
    df_abs_count = pd.DataFrame(data_c, index=final_sorted_committees, columns=final_all_sectors_list)
    data_v = [[committee_sector_volumes.get(c, {}).get(s, 0.0) for s in final_all_sectors_list] for c in final_sorted_committees]
    df_abs_vol = pd.DataFrame(data_v, index=final_sorted_committees, columns=final_all_sectors_list)

    # Normalized DataFrames (proportions)
    sum_c = df_abs_count.sum(axis=1)
    df_norm_count = df_abs_count.copy()
    for col_n in df_norm_count.columns: df_norm_count[col_n] = np.where(sum_c > 0, df_norm_count[col_n].divide(sum_c, axis=0), 0)
    df_norm_count = df_norm_count.fillna(0)

    sum_v = df_abs_vol.sum(axis=1)
    df_norm_vol = df_abs_vol.copy()
    for col_n in df_norm_vol.columns: df_norm_vol[col_n] = np.where(sum_v > 0, df_norm_vol[col_n].divide(sum_v, axis=0), 0)
    df_norm_vol = df_norm_vol.fillna(0)
    
    return df_norm_count, df_norm_vol, df_abs_count, df_abs_vol, final_all_sectors_list


# --- Execute Data Aggregation ---
(df_count_proportions, df_volume_proportions, 
 df_absolute_counts, df_absolute_volumes,
 plotted_sectors) = aggregate_sector_data_for_committees(
    STOCK_TRANSACTIONS_DF, COMMITTEE_MEMBERSHIP_MAP,
    COMBINED_PLOT_INCLUDE_BUYS, COMBINED_PLOT_INCLUDE_SELLS
)

if df_count_proportions.empty and df_volume_proportions.empty:
    print("No data available to plot for combined sector breakdowns. Exiting this cell's plotting.")
else:
    # --- Plotting Side-by-Side ---
    FIG_WIDTH_COMBINED = 10 # Adjusted for potentially more readable legend
    FIG_HEIGHT_COMBINED = 5.5 # Base height, will adjust for legend

    title_fontsize = 10
    axis_label_fontsize = 8
    tick_label_fontsize = 7
    legend_fontsize = 6.5
    legend_title_fontsize = 7.5
    
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(FIG_WIDTH_COMBINED, FIG_HEIGHT_COMBINED), sharey=True)
    
    # Define a consistent colormap
    num_plot_sectors = len(plotted_sectors)
    base_cmap_plot = plt.get_cmap("tab20")
    colors_plot = [base_cmap_plot(i % len(base_cmap_plot.colors)) for i in range(num_plot_sectors)]

    # Plot 1: By Transaction Count Proportions
    if not df_count_proportions.empty:
        df_count_proportions.plot(kind="bar", stacked=True, color=colors_plot, width=0.85, ax=axes[0], legend=False)
        axes[0].set_ylabel("Proportion of Total Transactions", fontsize=axis_label_fontsize)
        axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45, ha="right", fontsize=tick_label_fontsize)
        axes[0].set_title("By Transaction Count", fontsize=title_fontsize, pad=8)
        axes[0].tick_params(axis='y', labelsize=tick_label_fontsize)
        axes[0].yaxis.set_major_formatter(mticker.FormatStrFormatter('%.1f'))
    else:
        axes[0].text(0.5, 0.5, "No Count Data", ha='center', va='center', transform=axes[0].transAxes)
        axes[0].set_title("By Transaction Count", fontsize=title_fontsize, pad=8)
    axes[0].set_xlabel("")
    axes[0].spines['top'].set_visible(False); axes[0].spines['right'].set_visible(False)
    axes[0].grid(axis='y', linestyle=':', linewidth=0.5, alpha=0.6, color='gray'); axes[0].set_axisbelow(True)

    # Plot 2: By Transaction Volume Proportions
    if not df_volume_proportions.empty:
        df_volume_proportions.plot(kind="bar", stacked=True, color=colors_plot, width=0.85, ax=axes[1], legend=False)
        axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45, ha="right", fontsize=tick_label_fontsize)
        axes[1].set_title("By Transaction Volume", fontsize=title_fontsize, pad=8)
        axes[1].tick_params(axis='y', labelsize=tick_label_fontsize)
    else:
        axes[1].text(0.5, 0.5, "No Volume Data", ha='center', va='center', transform=axes[1].transAxes)
        axes[1].set_title("By Transaction Volume", fontsize=title_fontsize, pad=8)
    axes[1].set_xlabel("")
    axes[1].spines['top'].set_visible(False); axes[1].spines['right'].set_visible(False)
    axes[1].grid(axis='y', linestyle=':', linewidth=0.5, alpha=0.6, color='gray'); axes[1].set_axisbelow(True)

    # Shared Legend
    handles, labels = [], []
    source_ax_for_legend = axes[0] if not df_count_proportions.empty else (axes[1] if not df_volume_proportions.empty else None)
    if source_ax_for_legend:
        handles, labels = source_ax_for_legend.get_legend_handles_labels()

    if handles and labels:
        num_legend_cols_plot = min(5, (len(plotted_sectors) + 2) // 3 if len(plotted_sectors) > 0 else 1)
        if len(plotted_sectors) <= 5: num_legend_cols_plot = len(plotted_sectors) if len(plotted_sectors) > 0 else 1
        elif len(plotted_sectors) <= 10: num_legend_cols_plot = (len(plotted_sectors)+1)//2

        fig.legend(handles, labels, title="Sector", loc='lower center',
                   bbox_to_anchor=(0.5, -0.02), # Adjusted y for possible more legend rows
                   ncol=num_legend_cols_plot,
                   fontsize=legend_fontsize, title_fontsize=legend_title_fontsize, frameon=False,
                   labelspacing=0.3, columnspacing=0.8)
        
    fig.suptitle("Sector Investment Proportions: Committees vs. General Population", fontsize=title_fontsize + 2, y=0.99)
    plt.subplots_adjust(left=0.08, right=0.98, bottom=0.30, top=0.90, wspace=0.12) # Adjusted bottom and top

    side_by_side_plot_path = os.path.join(COMBINED_PLOT_OUTPUT_DIR, "sector_proportions_side_by_side.png")
    plt.savefig(side_by_side_plot_path, dpi=300)
    plt.close(fig)
    print(f"[Saved] Side-by-side plot to: {side_by_side_plot_path}")

    # Save underlying data
    df_count_proportions.to_csv(os.path.join(COMBINED_PLOT_OUTPUT_DIR, "proportions_by_count.csv"))
    df_volume_proportions.to_csv(os.path.join(COMBINED_PLOT_OUTPUT_DIR, "proportions_by_volume.csv"))
    df_absolute_counts.to_csv(os.path.join(COMBINED_PLOT_OUTPUT_DIR, "absolute_counts_by_sector.csv"))
    df_absolute_volumes.to_csv(os.path.join(COMBINED_PLOT_OUTPUT_DIR, "absolute_volumes_by_sector.csv"))
    print(f"Saved underlying data CSVs to '{COMBINED_PLOT_OUTPUT_DIR}'.")

[Saved] Side-by-side plot to: combined_committee_sector_analysis/sector_proportions_side_by_side.png
Saved underlying data CSVs to 'combined_committee_sector_analysis'.


In [78]:
# Cells 8 & 9: Network Science Analysis (COUNT-BASED)

import os
import json
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
# from sklearn.metrics.pairwise import cosine_similarity # Not used here
import community as community_louvain
from scipy.stats import chi2_contingency, entropy
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist # Used by linkage
from tqdm import tqdm
from networkx.algorithms import bipartite
import matplotlib.patches as mpatches # For custom legends

# --- Configuration for COUNT-BASED Analysis ---
COUNT_ANALYSIS_OUTPUT_DIR = "network_analysis_count_based"
os.makedirs(COUNT_ANALYSIS_OUTPUT_DIR, exist_ok=True)

# Path to the JSON file containing committee-sector transaction COUNTS
# This assumes the combined sector breakdown script (Cell 7 refactored) saves this.
# If not, this data needs to be loaded or computed first.
# For now, assuming 'df_absolute_counts' is available or can be loaded.
# If 'df_absolute_counts' is from the previous cell:
if 'df_absolute_counts' not in globals() or df_absolute_counts.empty:
    print("CRITICAL: df_absolute_counts (from Cell 7/combined breakdown) not found or empty. Cannot proceed with count-based network analysis.")
    # As a fallback, try to load it if saved previously
    counts_csv_path = os.path.join("combined_committee_sector_analysis", "absolute_counts_by_sector.csv") # Path from previous cell output
    if os.path.exists(counts_csv_path):
        print(f"Loading absolute counts from: {counts_csv_path}")
        df_counts = pd.read_csv(counts_csv_path, index_col=0)
    else:
        print(f"CRITICAL: Count data CSV not found at {counts_csv_path} either.")
        exit() # Or handle error appropriately
else:
    df_counts = df_absolute_counts.copy() # Use if available from previous cell run

# groups = list(df_counts.index) # Committees + General Population
# all_sectors = list(df_counts.columns)

# Ensure groups and all_sectors are correctly defined based on df_counts
if df_counts.empty:
    print("df_counts is empty, cannot proceed with network analysis.")
    exit()
else:
    groups = list(df_counts.index)
    all_sectors = list(df_counts.columns)


N_PERMUTATIONS_COUNT = 1000 # For null model testing

# --- 1. Build Bipartite Graph (Committees <-> Sectors, weighted by COUNT) ---
print("\n--- Building Bipartite Graph (Count-Weighted) ---")
B_count = nx.Graph()
B_count.add_nodes_from(groups, bipartite=0)
B_count.add_nodes_from(all_sectors, bipartite=1)

for committee in groups:
    for sector in all_sectors:
        count = df_counts.loc[committee, sector]
        if count > 0:
            B_count.add_edge(committee, sector, weight=int(count)) # Ensure weight is int for some algos

print(f"Bipartite graph B_count: {B_count.number_of_nodes()} nodes, {B_count.number_of_edges()} edges.")

# --- 2. Project Bipartite to Committee-Only Graph (G_overlap_count) ---
print("\n--- Projecting to Committee-Only Graph (Count Overlap) ---")
comm_nodes_count = [n for n, d in B_count.nodes(data=True) if d["bipartite"] == 0]
G_overlap_count = nx.Graph()
G_overlap_count.add_nodes_from(comm_nodes_count)
for c1, c2 in combinations(comm_nodes_count, 2):
    common_sectors = set(B_count.neighbors(c1)) & set(B_count.neighbors(c2))
    if not common_sectors: continue
    overlap_weight = sum(min(B_count[c1][s]["weight"], B_count[c2][s]["weight"]) for s in common_sectors)
    if overlap_weight > 0:
        G_overlap_count.add_edge(c1, c2, weight=overlap_weight)
print(f"G_overlap_count has {G_overlap_count.number_of_nodes()} nodes, {G_overlap_count.number_of_edges()} edges.")

# --- 3. Metrics & Community Detection on G_overlap_count ---
print("\n--- Analyzing Committee Projection Network (Count-Based) ---")
if G_overlap_count.number_of_nodes() > 0 and G_overlap_count.number_of_edges() > 0: # Check graph validity
    deg_cent_comm_c = nx.degree_centrality(G_overlap_count)
    eig_cent_comm_c = nx.eigenvector_centrality_numpy(G_overlap_count, weight="weight")
    betw_cent_comm_c = nx.betweenness_centrality(G_overlap_count, weight=None, normalized=True)
    core_numbers_comm_c = nx.core_number(G_overlap_count)
    partition_comm_c = community_louvain.best_partition(G_overlap_count, weight="weight", random_state=42) # Add random_state for reproducibility
    for node in G_overlap_count.nodes(): partition_comm_c.setdefault(node, -1)

    df_comm_metrics_c = pd.DataFrame({
        "committee": list(G_overlap_count.nodes()),
        "degree_centrality": [deg_cent_comm_c.get(c,0) for c in G_overlap_count.nodes()],
        "eigenvector_centrality": [eig_cent_comm_c.get(c,0) for c in G_overlap_count.nodes()],
        "betweenness_centrality": [betw_cent_comm_c.get(c,0) for c in G_overlap_count.nodes()],
        "core_number": [core_numbers_comm_c.get(c,0) for c in G_overlap_count.nodes()],
        "louvain_community": [partition_comm_c.get(c,-1) for c in G_overlap_count.nodes()]
    }).sort_values("eigenvector_centrality", ascending=False)
    df_comm_metrics_c.to_csv(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "committee_projection_metrics_count.csv"), index=False)
    print("Top 5 Committees by Eigenvector Centrality (Count-Based G_overlap):")
    print(df_comm_metrics_c.head(5).to_string(index=False))

    # Visualization (similar to your original, enhanced version)
    plt.figure(figsize=(10, 8)) # Adjusted size
    pos_comm_c = nx.spring_layout(G_overlap_count, seed=42, k=0.6, iterations=50)
    node_colors_c = []
    for n in G_overlap_count.nodes():
        if n == "General Population": node_colors_c.append("#7f7f7f")
        elif n.startswith("House"): node_colors_c.append("#1f77b4")
        else: node_colors_c.append("#2ca02c")
    
    max_eig_c = max(eig_cent_comm_c.values()) if eig_cent_comm_c else 1.0
    if max_eig_c == 0: max_eig_c = 1.0
    comm_sizes_c = [(eig_cent_comm_c.get(n,0) / max_eig_c) * 2500 + 200 for n in G_overlap_count.nodes()]

    edge_weights_c = [d["weight"] for u, v, d in G_overlap_count.edges(data=True)]
    max_w_c = max(edge_weights_c) if edge_weights_c else 1.0
    if max_w_c == 0: max_w_c = 1.0
    
    # Dynamic edge styling based on weight distribution
    percentiles_w_c = np.percentile(edge_weights_c, [0, 60, 80, 100]) if edge_weights_c else [0,0,0,0]
    
    for u, v, d in G_overlap_count.edges(data=True):
        weight = d["weight"]
        if weight >= percentiles_w_c[2]: # Top 20%
            lw, alpha = (weight / max_w_c) * 5 + 0.5, 0.7
        elif weight >= percentiles_w_c[1]: # Next 20%
            lw, alpha = (weight / max_w_c) * 3 + 0.3, 0.5
        else: # Bottom 60%
            lw, alpha = (weight / max_w_c) * 1.5 + 0.1, 0.3
        lw = max(0.1, lw) # ensure min width
        nx.draw_networkx_edges(G_overlap_count, pos_comm_c, edgelist=[(u, v)], width=lw, edge_color="#555555", alpha=alpha)

    nx.draw_networkx_nodes(G_overlap_count, pos_comm_c, node_size=comm_sizes_c, node_color=node_colors_c, edgecolors="black", linewidths=0.7, alpha=0.9)
    nx.draw_networkx_labels(G_overlap_count, pos_comm_c, font_size=9, font_color="black",
                            bbox=dict(facecolor="#f8f8f8", edgecolor="none", alpha=0.8, boxstyle="round,pad=0.2"))
    
    patch_h = mpatches.Patch(color="#1f77b4", label="House Committees")
    patch_s = mpatches.Patch(color="#2ca02c", label="Senate Committees")
    patch_g = mpatches.Patch(color="#7f7f7f", label="General Population")
    plt.legend(handles=[patch_h, patch_s, patch_g], loc="lower left", frameon=False, fontsize=10)
    plt.title("Committee Projection Network (Transaction Count Overlap)\n(Node color = Chamber; size = Eigenvector Centrality)", fontsize=12)
    plt.axis("off"); plt.tight_layout()
    plt.savefig(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "committee_projection_network_count.png"), dpi=300)
    plt.close()
    print(f"[Saved] {COUNT_ANALYSIS_OUTPUT_DIR}/committee_projection_network_count.png")
else:
    print("Committee projection graph (count-based) is empty or has no edges. Skipping analysis and plot.")


# --- 4. Project Bipartite to Sector-Only Graph (G_sec_overlap_count) ---
print("\n--- Projecting to Sector-Only Graph (Count Overlap) ---")
sec_nodes_count = [n for n, d in B_count.nodes(data=True) if d["bipartite"] == 1]
G_sec_overlap_count = nx.Graph()
G_sec_overlap_count.add_nodes_from(sec_nodes_count)
for s1, s2 in combinations(sec_nodes_count, 2):
    overlap_sum = 0
    for committee in groups:
        # Get counts for committee in s1 and s2 from df_counts
        count_s1 = df_counts.loc[committee, s1] if s1 in df_counts.columns else 0
        count_s2 = df_counts.loc[committee, s2] if s2 in df_counts.columns else 0
        overlap_sum += min(count_s1, count_s2)
    if overlap_sum > 0:
        G_sec_overlap_count.add_edge(s1, s2, weight=overlap_sum)
print(f"G_sec_overlap_count has {G_sec_overlap_count.number_of_nodes()} nodes, {G_sec_overlap_count.number_of_edges()} edges.")

# --- 5. Metrics & Community Detection on G_sec_overlap_count ---
# (Similar logic as for committee projection, apply to G_sec_overlap_count)
# ... This part can be refactored into a function if you do it often ...
if G_sec_overlap_count.number_of_nodes() > 0 and G_sec_overlap_count.number_of_edges() > 0:
    eig_cent_sector_c = nx.eigenvector_centrality_numpy(G_sec_overlap_count, weight="weight")
    partition_sector_c = community_louvain.best_partition(G_sec_overlap_count, weight="weight", random_state=42)
    df_sector_metrics_c = pd.DataFrame({
        "sector": list(G_sec_overlap_count.nodes()),
        "eigenvector_centrality": [eig_cent_sector_c.get(s,0) for s in G_sec_overlap_count.nodes()],
        "louvain_community": [partition_sector_c.get(s,-1) for s in G_sec_overlap_count.nodes()]
    }).sort_values("eigenvector_centrality", ascending=False)
    df_sector_metrics_c.to_csv(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "sector_projection_metrics_count.csv"), index=False)
    print("Top 5 Sectors by Eigenvector Centrality (Count-Based G_sec_overlap):")
    print(df_sector_metrics_c.head(5).to_string(index=False))

    # Visualization for Sector Projection
    plt.figure(figsize=(12, 10))
    pos_sec_c = nx.spring_layout(G_sec_overlap_count, seed=24, k=0.8, iterations=50)
    sec_colors_c = [partition_sector_c.get(s,-1) for s in G_sec_overlap_count.nodes()]
    max_eig_sec_c = max(eig_cent_sector_c.values()) if eig_cent_sector_c else 1.0
    if max_eig_sec_c == 0: max_eig_sec_c = 1.0
    sec_sizes_c  = [(eig_cent_sector_c.get(s,0) / max_eig_sec_c) * 2500 + 200 for s in G_sec_overlap_count.nodes()]
    
    edge_weights_sec_c = [d["weight"] for u,v,d in G_sec_overlap_count.edges(data=True)]
    max_w_sec_c = max(edge_weights_sec_c) if edge_weights_sec_c else 1.0
    if max_w_sec_c == 0: max_w_sec_c = 1.0
    edge_widths_sec_c = [(d["weight"] / max_w_sec_c) * 5 + 0.5 for u,v,d in G_sec_overlap_count.edges(data=True)]

    nx.draw_networkx_nodes(G_sec_overlap_count, pos_sec_c, node_size=sec_sizes_c, node_color=sec_colors_c, cmap=plt.cm.viridis, alpha=0.9) # Changed cmap
    nx.draw_networkx_edges(G_sec_overlap_count, pos_sec_c, width=edge_widths_sec_c, edge_color="#777777", alpha=0.5)
    nx.draw_networkx_labels(G_sec_overlap_count, pos_sec_c, font_size=9)
    plt.title("Sector Projection Network (Transaction Count Overlap)\n(Node color = Louvain Community, size = Eigenvector Centrality)", fontsize=12)
    plt.axis("off"); plt.tight_layout()
    plt.savefig(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "sector_projection_network_count.png"), dpi=300)
    plt.close()
    print(f"[Saved] {COUNT_ANALYSIS_OUTPUT_DIR}/sector_projection_network_count.png")
else:
    print("Sector projection graph (count-based) is empty or has no edges. Skipping analysis and plot.")


# --- 6. Statistical Validation (Modularity for G_overlap_count) ---
print("\n--- Modularity Test for Committee Projection (Count-Based) ---")
if G_overlap_count.number_of_nodes() > 0 and G_overlap_count.number_of_edges() > 0:
    real_modularity_c = community_louvain.modularity(partition_comm_c, G_overlap_count, weight="weight")
    comm_degrees_c = [B_count.degree(c) for c in comm_nodes_count] # Degrees from bipartite
    sec_degrees_c  = [B_count.degree(s) for s in sec_nodes_count]
    rand_mods_c = []
    for _ in tqdm(range(N_PERMUTATIONS_COUNT), desc="Random bipartite (count)"):
        B_r_c = bipartite.configuration_model(comm_degrees_c, sec_degrees_c, create_using=nx.Graph(), seed=np.random.randint(10000)) # Add seed
        B_r_c = nx.Graph(((u, v) for u, v in B_r_c.edges() if u != v)); B_r_c.remove_edges_from(nx.selfloop_edges(B_r_c))
        comm_nodes_rand_c = list(range(len(comm_nodes_count)))
        G_r_c = bipartite.weighted_projected_graph(B_r_c, comm_nodes_rand_c) # This projection is unweighted if B_r_c is unweighted
        if G_r_c.number_of_edges() > 0:
            part_r_c = community_louvain.best_partition(G_r_c, weight="weight", random_state=42) # Use weight even if 1
            rand_mods_c.append(community_louvain.modularity(part_r_c, G_r_c, weight="weight"))
        else:
            rand_mods_c.append(0) # Or handle as appropriate if no edges
            
    rand_mods_c = np.array(rand_mods_c)
    emp_pval_c = np.mean(rand_mods_c >= real_modularity_c) if len(rand_mods_c) > 0 else 1.0
    print(f"Real G_overlap_count modularity: {real_modularity_c:.4f}")
    print(f"Mean random modularity: {rand_mods_c.mean():.4f} ± {rand_mods_c.std():.4f}")
    print(f"Empirical p-value for modularity >= real: {emp_pval_c:.4f}")
    pd.DataFrame({"random_modularity_count": rand_mods_c}).to_csv(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "null_modularity_distribution_count.csv"), index=False)
else:
    print("Skipping modularity test for G_overlap_count as it's empty or has no edges.")


# --- 7. Hierarchical Clustering, Heatmap, KL Divergence on df_proportions (from count data) ---
# Ensure df_counts is the one loaded/calculated based on 'COMBINED_JSON' at the start of this cell block
# And df_prop is derived from it.
df_prop_count = df_counts.div(df_counts.sum(axis=1), axis=0).fillna(0)

if not df_prop_count.empty:
    print("\n--- Dendrogram, Heatmap, KL-Div (Count Proportions) ---")
    distance_matrix_c = pdist(df_prop_count.values, metric="euclidean")
    Z_c = linkage(distance_matrix_c, method="ward")
    plt.figure(figsize=(10, 6))
    dendro_data_c = dendrogram(Z_c, labels=df_prop_count.index.tolist(), leaf_rotation=90, leaf_font_size=9)
    plt.title("Dendrogram: Committees & Gen Pop by Transaction Count Proportions", fontsize=11)
    plt.ylabel("Ward Distance", fontsize=9); plt.yticks(fontsize=8); plt.xticks(fontsize=8)
    plt.tight_layout()
    plt.savefig(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "committee_dendrogram_count.png"), dpi=300)
    plt.close()
    print(f"[Saved] {COUNT_ANALYSIS_OUTPUT_DIR}/committee_dendrogram_count.png")

    # KL Divergence
    epsilon = 1e-9
    if "General Population" in df_prop_count.index:
        P_general_c = df_prop_count.loc["General Population"].values + epsilon
        kl_vals_c = {c: float(entropy(df_prop_count.loc[c].values + epsilon, P_general_c))
                     for c in df_prop_count.index if c != "General Population"}
        df_kl_c = pd.DataFrame.from_dict(kl_vals_c, orient="index", columns=["kl_divergence_count"]).sort_values("kl_divergence_count", ascending=False)
        df_kl_c.to_csv(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "committee_kl_divergence_count.csv"))
        # KL Plot omitted for brevity, can be added if needed
    else:
        print("Warning: 'General Population' not in df_prop_count for KL divergence.")

    # Clustered Heatmap
    leaf_order_c = dendro_data_c["ivl"]
    df_clustered_c = df_prop_count.loc[leaf_order_c]
    plt.figure(figsize=(11, 7)) # Adjusted size
    plt.imshow(df_clustered_c.values, aspect="auto", cmap="viridis", vmin=0, vmax=max(0.1, df_clustered_c.values.max())) # Use viridis, ensure vmax > 0
    plt.colorbar(label="Proportion of Transaction Counts")
    plt.yticks(ticks=np.arange(len(leaf_order_c)), labels=leaf_order_c, fontsize=7)
    plt.xticks(ticks=np.arange(len(all_sectors)), labels=all_sectors, rotation=90, fontsize=6)
    plt.title("Clustered Heatmap: Sector Transaction Count Proportions", fontsize=11)
    plt.tight_layout()
    plt.savefig(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "committee_sector_heatmap_clustered_count.png"), dpi=300)
    plt.close()
    print(f"[Saved] {COUNT_ANALYSIS_OUTPUT_DIR}/committee_sector_heatmap_clustered_count.png")
else:
    print("Count proportion DataFrame is empty. Skipping Dendrogram, Heatmap, KL-Div for counts.")


# --- 8. Chi-Square Tests (Absolute Counts) ---
if "General Population" in df_counts.index and not df_counts.loc["General Population"].empty:
    print("\n--- Chi-Square Test (Absolute Counts vs General Population) ---")
    general_vec_c = df_counts.loc["General Population"].values.astype(int)
    results_chi_c = []
    for c_name_chi in groups:
        if c_name_chi == "General Population": continue
        comm_vec_c = df_counts.loc[c_name_chi].values.astype(int)
        if comm_vec_c.sum() == 0 and general_vec_c.sum() == 0 : continue # Avoid all zero contingency
        contingency_c = np.vstack([comm_vec_c, general_vec_c])
        try:
            chi2_c, p_c, dof_c, _ = chi2_contingency(contingency_c)
            results_chi_c.append({"committee": c_name_chi, "chi2_statistic_count": float(chi2_c), "p_value_count": float(p_c)})
        except ValueError as e_chi: # Handle cases where chi2 cannot be computed (e.g. low expected frequencies)
            print(f"Chi2 error for {c_name_chi}: {e_chi}. Assigning p_value=1.0")
            results_chi_c.append({"committee": c_name_chi, "chi2_statistic_count": np.nan, "p_value_count": 1.0})


    if results_chi_c:
        df_chi_c = pd.DataFrame(results_chi_c).sort_values("p_value_count")
        df_chi_c.to_csv(os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "chi2_vs_general_count.csv"), index=False)
        print("Top 5 Committees by smallest χ² p-value (absolute counts):")
        print(df_chi_c.head(5).to_string(index=False))
    else:
        print("No Chi-square results to report for counts.")
else:
    print("General Population data not found or empty in df_counts. Skipping Chi-square for counts.")

print(f"\nCOUNT-BASED Network Analysis complete. Outputs in '{COUNT_ANALYSIS_OUTPUT_DIR}'")


--- Building Bipartite Graph (Count-Weighted) ---
Bipartite graph B_count: 30 nodes, 196 edges.

--- Projecting to Committee-Only Graph (Count Overlap) ---
G_overlap_count has 10 nodes, 45 edges.

--- Analyzing Committee Projection Network (Count-Based) ---
Top 5 Committees by Eigenvector Centrality (Count-Based G_overlap):
                 committee  degree_centrality  eigenvector_centrality  betweenness_centrality  core_number  louvain_community
        General Population                1.0                0.440521                     0.0            9                  0
  House Financial Services                1.0                0.402857                     0.0            9                  0
 House Energy and Commerce                1.0                0.383078                     0.0            9                  0
      House Appropriations                1.0                0.371793                     0.0            9                  0
House Oversight and Reform                1

Random bipartite (count): 100%|██████████| 1000/1000 [00:00<00:00, 1426.09it/s]


Real G_overlap_count modularity: 0.0012
Mean random modularity: 0.0000 ± 0.0005
Empirical p-value for modularity >= real: 0.0080

--- Dendrogram, Heatmap, KL-Div (Count Proportions) ---
[Saved] network_analysis_count_based/committee_dendrogram_count.png
[Saved] network_analysis_count_based/committee_sector_heatmap_clustered_count.png

--- Chi-Square Test (Absolute Counts vs General Population) ---
Top 5 Committees by smallest χ² p-value (absolute counts):
                                    committee  chi2_statistic_count  p_value_count
                   House Oversight and Reform           1761.655898   0.000000e+00
                               Senate Finance            227.940717   8.765101e-38
Senate Health, Education, Labor, and Pensions            222.933923   8.886901e-37
                     House Financial Services            221.965066   1.390626e-36
                         House Appropriations            218.414763   7.164736e-36

COUNT-BASED Network Analysis complete. Ou

In [79]:
# Cell 10: Network Science Analysis (VOLUME-BASED)

# --- Configuration for VOLUME-BASED Analysis ---
VOLUME_ANALYSIS_OUTPUT_DIR = "network_analysis_volume_based"
os.makedirs(VOLUME_ANALYSIS_OUTPUT_DIR, exist_ok=True)

# Path to the JSON file containing committee-sector transaction VOLUMES
# This assumes the combined sector breakdown script (Cell 7 refactored) saves this.
# If 'df_absolute_volumes' is available from previous cell:
if 'df_absolute_volumes' not in globals() or df_absolute_volumes.empty:
    print("CRITICAL: df_absolute_volumes (from Cell 7/combined breakdown) not found or empty. Cannot proceed with volume-based network analysis.")
    volumes_csv_path = os.path.join("combined_committee_sector_analysis", "absolute_volumes_by_sector.csv")
    if os.path.exists(volumes_csv_path):
        print(f"Loading absolute volumes from: {volumes_csv_path}")
        df_volumes = pd.read_csv(volumes_csv_path, index_col=0)
    else:
        print(f"CRITICAL: Volume data CSV not found at {volumes_csv_path} either.")
        exit()
else:
    df_volumes = df_absolute_volumes.copy()

# groups_vol = list(df_volumes.index)
# all_sectors_vol = list(df_volumes.columns)

if df_volumes.empty:
    print("df_volumes is empty, cannot proceed with volume-based network analysis.")
    exit()
else:
    groups_vol = list(df_volumes.index)
    all_sectors_vol = list(df_volumes.columns)


N_PERMUTATIONS_VOL = 1000

# --- 1. Build Bipartite Graph (Committees <-> Sectors, weighted by VOLUME) ---
print("\n--- Building Bipartite Graph (Volume-Weighted) ---")
B_vol = nx.Graph()
B_vol.add_nodes_from(groups_vol, bipartite=0)
B_vol.add_nodes_from(all_sectors_vol, bipartite=1)
for committee in groups_vol:
    for sector in all_sectors_vol:
        volume = df_volumes.loc[committee, sector]
        if volume > 0: # Only add edge if volume is positive
            B_vol.add_edge(committee, sector, weight=float(volume))
print(f"Bipartite graph B_vol: {B_vol.number_of_nodes()} nodes, {B_vol.number_of_edges()} edges.")

# --- 2. Project Bipartite to Committee-Only Graph (G_overlap_vol) ---
print("\n--- Projecting to Committee-Only Graph (Volume Overlap) ---")
comm_nodes_vol = [n for n, d in B_vol.nodes(data=True) if d["bipartite"] == 0]
G_overlap_vol = nx.Graph()
G_overlap_vol.add_nodes_from(comm_nodes_vol)
for c1, c2 in combinations(comm_nodes_vol, 2):
    common_sectors_vol = set(B_vol.neighbors(c1)) & set(B_vol.neighbors(c2))
    if not common_sectors_vol: continue
    overlap_weight_vol = sum(min(B_vol[c1][s]["weight"], B_vol[c2][s]["weight"]) for s in common_sectors_vol)
    if overlap_weight_vol > 0:
        G_overlap_vol.add_edge(c1, c2, weight=overlap_weight_vol)
print(f"G_overlap_vol has {G_overlap_vol.number_of_nodes()} nodes, {G_overlap_vol.number_of_edges()} edges.")
# Save edge list for G_overlap_vol
edge_rows_vol = [(u,v,d["weight"]) for u,v,d in G_overlap_vol.edges(data=True)]
edge_csv_path_vol = os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "committee_overlap_edges_volume.csv")
pd.DataFrame(edge_rows_vol, columns=["source", "target", "weight"]).to_csv(edge_csv_path_vol, index=False)
print(f"[Saved] {edge_csv_path_vol}")


# --- 3. Metrics & Community Detection on G_overlap_vol ---
print("\n--- Analyzing Committee Projection Network (Volume-Based) ---")
if G_overlap_vol.number_of_nodes() > 0 and G_overlap_vol.number_of_edges() > 0:
    deg_cent_comm_v = nx.degree_centrality(G_overlap_vol)
    eig_cent_comm_v = nx.eigenvector_centrality_numpy(G_overlap_vol, weight="weight")
    betw_cent_comm_v = nx.betweenness_centrality(G_overlap_vol, weight=None, normalized=True)
    core_numbers_comm_v = nx.core_number(G_overlap_vol)
    partition_comm_v = community_louvain.best_partition(G_overlap_vol, weight="weight", random_state=42)
    for node in G_overlap_vol.nodes(): partition_comm_v.setdefault(node, -1)

    df_comm_metrics_v = pd.DataFrame({
        "committee": list(G_overlap_vol.nodes()),
        "degree_centrality": [deg_cent_comm_v.get(c,0) for c in G_overlap_vol.nodes()],
        "eigenvector_centrality": [eig_cent_comm_v.get(c,0) for c in G_overlap_vol.nodes()],
        "betweenness_centrality": [betw_cent_comm_v.get(c,0) for c in G_overlap_vol.nodes()],
        "core_number": [core_numbers_comm_v.get(c,0) for c in G_overlap_vol.nodes()],
        "louvain_community": [partition_comm_v.get(c,-1) for c in G_overlap_vol.nodes()]
    }).sort_values("eigenvector_centrality", ascending=False)
    df_comm_metrics_v.to_csv(os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "committee_projection_metrics_volume.csv"), index=False)
    print("Top 5 Committees by Eigenvector Centrality (Volume-Based G_overlap):")
    print(df_comm_metrics_v.head(5).to_string(index=False))

    # Visualization (Volume-based committee projection)
    plt.figure(figsize=(10, 8))
    pos_comm_v = nx.spring_layout(G_overlap_vol, seed=42, k=0.6, iterations=50)
    node_colors_v = []
    for n in G_overlap_vol.nodes():
        if n == "General Population": node_colors_v.append("#7f7f7f")
        elif n.startswith("House"): node_colors_v.append("#1f77b4")
        else: node_colors_v.append("#2ca02c")
    max_eig_v = max(eig_cent_comm_v.values()) if eig_cent_comm_v else 1.0
    if max_eig_v == 0 : max_eig_v = 1.0
    comm_sizes_v = [(eig_cent_comm_v.get(n,0) / max_eig_v) * 2500 + 200 for n in G_overlap_vol.nodes()]
    
    edge_weights_v = [d["weight"] for u,v,d in G_overlap_vol.edges(data=True)]
    max_w_v = max(edge_weights_v) if edge_weights_v else 1.0
    if max_w_v == 0: max_w_v = 1.0
    percentiles_w_v = np.percentile(edge_weights_v, [0, 60, 80, 100]) if edge_weights_v else [0,0,0,0]

    for u, v, d in G_overlap_vol.edges(data=True):
        weight = d["weight"]
        if weight >= percentiles_w_v[2]: lw, alpha = (weight / max_w_v) * 5 + 0.5, 0.7
        elif weight >= percentiles_w_v[1]: lw, alpha = (weight / max_w_v) * 3 + 0.3, 0.5
        else: lw, alpha = (weight / max_w_v) * 1.5 + 0.1, 0.3
        lw = max(0.1, lw)
        nx.draw_networkx_edges(G_overlap_vol, pos_comm_v, edgelist=[(u,v)], width=lw, edge_color="#555555", alpha=alpha)
    
    nx.draw_networkx_nodes(G_overlap_vol, pos_comm_v, node_size=comm_sizes_v, node_color=node_colors_v, edgecolors="black", linewidths=0.7, alpha=0.9)
    nx.draw_networkx_labels(G_overlap_vol, pos_comm_v, font_size=9, font_color="black",
                            bbox=dict(facecolor="#f8f8f8", edgecolor="none", alpha=0.8, boxstyle="round,pad=0.2"))
    patch_h = mpatches.Patch(color="#1f77b4", label="House Committees"); patch_s = mpatches.Patch(color="#2ca02c", label="Senate Committees"); patch_g = mpatches.Patch(color="#7f7f7f", label="General Population")
    plt.legend(handles=[patch_h, patch_s, patch_g], loc="lower left", frameon=False, fontsize=10)
    plt.title("Committee Projection Network (Transaction Volume Overlap)\n(Node color = Chamber; size = Eigenvector Centrality)", fontsize=12)
    plt.axis("off"); plt.tight_layout()
    plt.savefig(os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "committee_projection_network_volume.png"), dpi=300)
    plt.close()
    print(f"[Saved] {VOLUME_ANALYSIS_OUTPUT_DIR}/committee_projection_network_volume.png")
else:
    print("Committee projection graph (volume-based) is empty or has no edges. Skipping.")

# --- 4. Project Bipartite to Sector-Only Graph (G_sec_overlap_vol) ---
# ... (Similar projection and analysis as for counts, but using B_vol and df_volumes) ...
# This section is analogous to the sector projection in the count-based analysis
# For brevity, I'll assume this logic would be mirrored. If you need it explicitly, let me know.
print("\n--- Projecting to Sector-Only Graph (Volume Overlap) ---")
sec_nodes_vol = [n for n, d in B_vol.nodes(data=True) if d["bipartite"] == 1]
G_sec_overlap_vol = nx.Graph()
G_sec_overlap_vol.add_nodes_from(sec_nodes_vol)
# ... (rest of G_sec_overlap_vol creation, metrics, plotting - mirror count version) ...
# This is a placeholder - you'd copy/adapt the G_sec_overlap_count logic here.
if not G_sec_overlap_vol.nodes(): # Minimal check
    print("Sector projection graph (volume-based) could not be built or is empty.")


# --- 5. Statistical Validation (Modularity for G_overlap_vol) ---
print("\n--- Modularity Test for Committee Projection (Volume-Based) ---")
if G_overlap_vol.number_of_nodes() > 0 and G_overlap_vol.number_of_edges() > 0:
    real_modularity_v = community_louvain.modularity(partition_comm_v, G_overlap_vol, weight="weight")
    comm_degrees_v = [B_vol.degree(c) for c in comm_nodes_vol]
    sec_degrees_v  = [B_vol.degree(s) for s in sec_nodes_vol]
    rand_mods_v = []
    for _ in tqdm(range(N_PERMUTATIONS_VOL), desc="Random bipartite (volume)"):
        B_r_v = bipartite.configuration_model(comm_degrees_v, sec_degrees_v, create_using=nx.Graph(), seed=np.random.randint(10000))
        B_r_v = nx.Graph(((u,v) for u,v in B_r_v.edges() if u!=v)); B_r_v.remove_edges_from(nx.selfloop_edges(B_r_v))
        comm_nodes_rand_v = list(range(len(comm_nodes_vol)))
        G_r_v = bipartite.weighted_projected_graph(B_r_v, comm_nodes_rand_v)
        if G_r_v.number_of_edges() > 0:
            part_r_v = community_louvain.best_partition(G_r_v, weight="weight", random_state=42)
            rand_mods_v.append(community_louvain.modularity(part_r_v, G_r_v, weight="weight"))
        else:
            rand_mods_v.append(0)
    rand_mods_v = np.array(rand_mods_v)
    emp_pval_v = np.mean(rand_mods_v >= real_modularity_v) if len(rand_mods_v) > 0 else 1.0
    print(f"Real G_overlap_vol modularity: {real_modularity_v:.4f}")
    print(f"Mean random modularity (volume): {rand_mods_v.mean():.4f} ± {rand_mods_v.std():.4f}")
    print(f"Empirical p-value (volume) >= real: {emp_pval_v:.4f}")
    pd.DataFrame({"random_modularity_volume": rand_mods_v}).to_csv(os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "null_modularity_distribution_volume.csv"), index=False)
else:
    print("Skipping modularity test for G_overlap_vol as it's empty or has no edges.")


# --- 6. Hierarchical Clustering, Heatmap, KL Divergence on df_vol_prop ---
df_vol_prop = df_volumes.div(df_volumes.sum(axis=1), axis=0).fillna(0) # Recalculate based on current df_volumes
if not df_vol_prop.empty:
    print("\n--- Dendrogram, Heatmap, KL-Div (Volume Proportions) ---")
    distance_matrix_v = pdist(df_vol_prop.values, metric="euclidean")
    Z_v = linkage(distance_matrix_v, method="ward")
    plt.figure(figsize=(10, 6))
    dendro_data_v = dendrogram(Z_v, labels=df_vol_prop.index.tolist(), leaf_rotation=90, leaf_font_size=9)
    plt.title("Dendrogram: Committees & Gen Pop by Transaction Volume Proportions", fontsize=11)
    plt.ylabel("Ward Distance", fontsize=9); plt.yticks(fontsize=8); plt.xticks(fontsize=8)
    plt.tight_layout()
    plt.savefig(os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "committee_dendrogram_volume.png"), dpi=300)
    plt.close()
    print(f"[Saved] {VOLUME_ANALYSIS_OUTPUT_DIR}/committee_dendrogram_volume.png")

    if "General Population" in df_vol_prop.index:
        P_general_v = df_vol_prop.loc["General Population"].values + 1e-9
        kl_vals_v = {c: float(entropy(df_vol_prop.loc[c].values + 1e-9, P_general_v))
                     for c in df_vol_prop.index if c != "General Population"}
        df_kl_v = pd.DataFrame.from_dict(kl_vals_v, orient="index", columns=["kl_divergence_volume"]).sort_values("kl_divergence_volume", ascending=False)
        df_kl_v.to_csv(os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "committee_kl_divergence_volume.csv"))
        # KL Plot for volume omitted for brevity
    else:
        print("Warning: 'General Population' not in df_vol_prop for KL divergence (volume).")

    leaf_order_v = dendro_data_v["ivl"]
    df_clustered_v = df_vol_prop.loc[leaf_order_v]
    plt.figure(figsize=(11, 7))
    plt.imshow(df_clustered_v.values, aspect="auto", cmap="viridis", vmin=0, vmax=max(0.1, df_clustered_v.values.max()))
    plt.colorbar(label="Proportion of Transaction Volume")
    plt.yticks(ticks=np.arange(len(leaf_order_v)), labels=leaf_order_v, fontsize=7)
    plt.xticks(ticks=np.arange(len(all_sectors_vol)), labels=all_sectors_vol, rotation=90, fontsize=6)
    plt.title("Clustered Heatmap: Sector Transaction Volume Proportions", fontsize=11)
    plt.tight_layout()
    plt.savefig(os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "committee_sector_heatmap_clustered_volume.png"), dpi=300)
    plt.close()
    print(f"[Saved] {VOLUME_ANALYSIS_OUTPUT_DIR}/committee_sector_heatmap_clustered_volume.png")
else:
    print("Volume proportion DataFrame is empty. Skipping Dendrogram, Heatmap, KL-Div for volumes.")

# --- 7. Chi-Square Tests (Absolute Volumes) ---
if "General Population" in df_volumes.index and not df_volumes.loc["General Population"].empty:
    print("\n--- Chi-Square Test (Absolute Volumes vs General Population) ---")
    general_vec_v = df_volumes.loc["General Population"].round().astype(int).values
    results_chi_v = []
    for c_name_chi_v in groups_vol: # Use groups_vol
        if c_name_chi_v == "General Population": continue
        comm_vec_v = df_volumes.loc[c_name_chi_v].round().astype(int).values
        if comm_vec_v.sum() == 0 and general_vec_v.sum() == 0 : continue
        contingency_v = np.vstack([comm_vec_v, general_vec_v])
        try:
            chi2_v, p_v, dof_v, _ = chi2_contingency(contingency_v)
            results_chi_v.append({"committee": c_name_chi_v, "chi2_statistic_volume": float(chi2_v), "p_value_volume": float(p_v)})
        except ValueError as e_chi_v:
            print(f"Chi2 error for {c_name_chi_v} (volume): {e_chi_v}. Assigning p_value=1.0")
            results_chi_v.append({"committee": c_name_chi_v, "chi2_statistic_volume": np.nan, "p_value_volume": 1.0})
    if results_chi_v:
        df_chi_v = pd.DataFrame(results_chi_v).sort_values("p_value_volume")
        df_chi_v.to_csv(os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "chi2_vs_general_volume.csv"), index=False)
        print("Top 5 Committees by smallest χ² p-value (absolute volumes):")
        print(df_chi_v.head(5).to_string(index=False))
    else:
        print("No Chi-square results to report for volumes.")
else:
    print("General Population data not found or empty in df_volumes. Skipping Chi-square for volumes.")


print(f"\nVOLUME-BASED Network Analysis complete. Outputs in '{VOLUME_ANALYSIS_OUTPUT_DIR}'")


--- Building Bipartite Graph (Volume-Weighted) ---
Bipartite graph B_vol: 30 nodes, 196 edges.

--- Projecting to Committee-Only Graph (Volume Overlap) ---
G_overlap_vol has 10 nodes, 45 edges.
[Saved] network_analysis_volume_based/committee_overlap_edges_volume.csv

--- Analyzing Committee Projection Network (Volume-Based) ---
Top 5 Committees by Eigenvector Centrality (Volume-Based G_overlap):
                 committee  degree_centrality  eigenvector_centrality  betweenness_centrality  core_number  louvain_community
        General Population                1.0                0.516638                     0.0            9                  0
      House Ways and Means                1.0                0.514668                     0.0            9                  0
  House Financial Services                1.0                0.510707                     0.0            9                  0
 House Energy and Commerce                1.0                0.286496                     0.0   

Random bipartite (volume): 100%|██████████| 1000/1000 [00:00<00:00, 1699.57it/s]


Real G_overlap_vol modularity: 0.0796
Mean random modularity (volume): 0.0001 ± 0.0007
Empirical p-value (volume) >= real: 0.0000

--- Dendrogram, Heatmap, KL-Div (Volume Proportions) ---
[Saved] network_analysis_volume_based/committee_dendrogram_volume.png
[Saved] network_analysis_volume_based/committee_sector_heatmap_clustered_volume.png

--- Chi-Square Test (Absolute Volumes vs General Population) ---
Top 5 Committees by smallest χ² p-value (absolute volumes):
                 committee  chi2_statistic_volume  p_value_volume
      House Appropriations           1.409538e+07             0.0
 House Energy and Commerce           2.355701e+07             0.0
  House Financial Services           1.420587e+08             0.0
House Oversight and Reform           2.999258e+08             0.0
      House Ways and Means           1.274248e+08             0.0

VOLUME-BASED Network Analysis complete. Outputs in 'network_analysis_volume_based'


In [80]:
# Cell 11: Temporal Persistence (Betti-0 Barcode)


# --- Configuration ---
TDA_WINDOW_DAYS = 7
TDA_TOP_N_MEMBERS = None # Set to an int (e.g., 200) to limit analysis, None for all
TDA_OUTPUT_DIR = "tda_temporal_weekly_activity"
os.makedirs(TDA_OUTPUT_DIR, exist_ok=True)

# --- Ensure STOCK_TRANSACTIONS_DF is available and prepped ---
if 'STOCK_TRANSACTIONS_DF' not in globals() or STOCK_TRANSACTIONS_DF.empty:
    print("CRITICAL: STOCK_TRANSACTIONS_DF not available for TDA. Skipping.")
    # exit()
else:
    df_tda = STOCK_TRANSACTIONS_DF.copy()
    # Ensure date column is datetime and named 'date_dt' for consistency
    if 'transaction_date_dt' not in df_tda.columns: # If using the original column name directly
        if 'transaction_date' in df_tda.columns:
            df_tda['date_dt'] = pd.to_datetime(df_tda['transaction_date'], errors='coerce')
        else: # If neither 'transaction_date_dt' nor 'transaction_date' exists
            print("CRITICAL: Suitable date column ('transaction_date_dt' or 'transaction_date') not found in STOCK_TRANSACTIONS_DF for TDA.")
            exit() # or raise error
    else: # If 'transaction_date_dt' exists, ensure it's datetime
        df_tda['date_dt'] = pd.to_datetime(df_tda['transaction_date_dt'], errors='coerce')

    df_tda.dropna(subset=['date_dt'], inplace=True) # Remove rows where date parsing failed
    df_tda["member_id"] = df_tda["member_id"].astype(str)

    # Filter for 'sale' or 'purchase' types
    type_mask = df_tda["type"].astype(str).str.contains("sale|purchase", case=False, na=False)
    df_tda = df_tda[type_mask]

    if df_tda.empty:
        print("No 'sale' or 'purchase' transactions found for TDA. Skipping.")
        # exit()

    # --- Build weekly windows ---
    if not df_tda.empty:
        t0_tda = df_tda["date_dt"].min().normalize()
        tN_tda = df_tda["date_dt"].max().normalize() + timedelta(days=1) # Ensure last day is included
        
        # Create window indices
        df_tda["w_idx"] = ((df_tda["date_dt"].dt.normalize() - t0_tda).dt.days // TDA_WINDOW_DAYS)
        num_windows_tda = df_tda["w_idx"].max() + 1 if not df_tda["w_idx"].empty else 0
        
        print(f"{num_windows_tda} weekly windows between {t0_tda.date()} and {tN_tda.date()}")

        # --- Build binary activity matrix ---
        if TDA_TOP_N_MEMBERS and 'amount' in df_tda.columns:
            top_ids_tda = (
                df_tda.groupby("member_id")["amount"]
                .sum().sort_values(ascending=False)
                .head(TDA_TOP_N_MEMBERS).index
            )
            df_tda = df_tda[df_tda["member_id"].isin(top_ids_tda)]
        
        member_list_tda = sorted(df_tda["member_id"].unique())
        if not member_list_tda or num_windows_tda == 0:
            print("Not enough members or windows to create activity matrix for TDA. Skipping.")
            # exit()
        else:
            member_index_tda = {m: i for i, m in enumerate(member_list_tda)}
            M_tda = len(member_list_tda)
            X_tda = np.zeros((M_tda, num_windows_tda), dtype=np.uint8)

            for mid_tda, w_idx_tda in df_tda[["member_id", "w_idx"]].drop_duplicates().itertuples(index=False):
                if mid_tda in member_index_tda and 0 <= w_idx_tda < num_windows_tda:
                     X_tda[member_index_tda[mid_tda], w_idx_tda] = 1
            print(f"Binary activity matrix X_tda.shape = {X_tda.shape}")

            # --- Compute persistent homology ---
            if X_tda.shape[0] >= 2 and X_tda.shape[1] > 0: # Need at least 2 members and some windows
                try:
                    diagrams_tda = ripser(X_tda, maxdim=0, metric="euclidean")["dgms"]
                    h0_tda = diagrams_tda[0]
                    print(f"H₀ bars found: {len(h0_tda)}")

                    # --- Plot barcode ---
                    if len(h0_tda) > 0:
                        plt.figure(figsize=(10, 4.5)) # Adjusted for better title fit
                        plot_diagrams(diagrams_tda, show=False, lifetime=True, 
                                      labels=["H₀ Intervals"], legend=True) # Added labels and legend
                        plt.title("Betti-0 Barcode — Weekly Trading Activity\n(Long bars = members trading in similar weeks)", fontsize=11)
                        plt.xlabel("Filtration ε (Euclidean on binary weekly vectors)", fontsize=9)
                        plt.ylabel("Interval Index", fontsize=9) # More generic y-label
                        plt.xticks(fontsize=8); plt.yticks(fontsize=8)
                        plt.tight_layout()
                        barcode_path_tda = os.path.join(TDA_OUTPUT_DIR, "member_trade_barcode_weekly.png")
                        plt.savefig(barcode_path_tda, dpi=200) # Increased dpi
                        plt.close()
                        print(f"[Saved] {barcode_path_tda}")

                        # --- Export H0 intervals ---
                        df_h0_intervals_tda = pd.DataFrame(h0_tda, columns=["birth", "death"])
                        # Add member_id to the H0 intervals
                        if len(member_list_tda) >= len(df_h0_intervals_tda):
                             df_h0_intervals_tda['member_id'] = member_list_tda[:len(df_h0_intervals_tda)]
                             if id2name: # Check if id2name is populated
                                df_h0_intervals_tda['member_name'] = df_h0_intervals_tda['member_id'].map(lambda x: id2name.get(x,x))
                        df_h0_intervals_tda.to_csv(os.path.join(TDA_OUTPUT_DIR, "betti0_intervals_weekly.csv"), index=False)
                        print(f"[Saved] H0 intervals to {os.path.join(TDA_OUTPUT_DIR, 'betti0_intervals_weekly.csv')}")
                    else:
                        print("No H0 intervals found to plot.")
                except Exception as e_ripser:
                    print(f"Error during Ripser computation or plotting: {e_ripser}")
            else:
                print("Not enough data points or windows for Ripser computation.")

209 weekly windows between 2019-01-03 and 2023-01-01
Binary activity matrix X_tda.shape = (186, 209)
H₀ bars found: 182
[Saved] tda_temporal_weekly_activity/member_trade_barcode_weekly.png
[Saved] H0 intervals to tda_temporal_weekly_activity/betti0_intervals_weekly.csv




In [81]:
# Cell 12: Member-Level Infomap Network (Volume-Weighted Shared Sector Activity)

try:
    from infomap import Infomap
except ImportError:
    print("CRITICAL ERROR: 'infomap' library not found. Please install it (e.g., pip install infomap). This cell will be skipped.")
    # For now, we'll let it proceed and error out if Infomap is actually used without being imported.

import matplotlib.cm as cm
import matplotlib.patches as mpatches
import math
import os
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations

# --- Assume these are defined in previous cells and are valid ---
# COUNT_ANALYSIS_OUTPUT_DIR = "your/output/directory/path"
# STOCK_TRANSACTIONS_DF = pd.DataFrame(...) # Make sure this is loaded
# COMMITTEE_MEMBERSHIP_MAP = {...} # Make sure this is loaded
# id2name = {...} # Optional global name map, make sure this is loaded if used

# --- Configuration ---
INFOMAP_OUTPUT_DIR = os.path.join(COUNT_ANALYSIS_OUTPUT_DIR, "member_infomap_volume_weighted")
os.makedirs(INFOMAP_OUTPUT_DIR, exist_ok=True)
SPARSIFY_PERCENTILE = 99

# --- Ensure prerequisites are available ---
if 'STOCK_TRANSACTIONS_DF' not in globals() or STOCK_TRANSACTIONS_DF.empty:
    print("CRITICAL: STOCK_TRANSACTIONS_DF not available for Infomap analysis. Skipping.")
elif 'COMMITTEE_MEMBERSHIP_MAP' not in globals() or not COMMITTEE_MEMBERSHIP_MAP:
    print("CRITICAL: COMMITTEE_MEMBERSHIP_MAP not available. Skipping.")
elif 'Infomap' not in globals(): # Check if Infomap class was successfully imported
    print("CRITICAL: Infomap library could not be imported. Skipping Infomap analysis.")
else:
    print("Starting Member-Level Infomap Analysis...")
    # A) member_id -> primary committee
    member_to_committee_infomap = {}
    for comm_im, period_map_im in COMMITTEE_MEMBERSHIP_MAP.items():
        for members_im_set in period_map_im.values():
            for m_im in members_im_set:
                member_to_committee_infomap.setdefault(str(m_im), comm_im) # Ensure member ID is string
    print(f"Built member_to_committee_infomap with {len(member_to_committee_infomap)} entries.")

    # B) Aggregate member-sector VOLUME + lookup pretty names
    mask_im = STOCK_TRANSACTIONS_DF["type"].str.contains("sale|purchase", case=False, na=False)
    df_tx_im = STOCK_TRANSACTIONS_DF[mask_im].copy()
    df_tx_im["member_id"] = df_tx_im["member_id"].astype(str) # Ensure member_id is string for consistency

    id2name_im_local = {}
    if 'id2name' in globals() and isinstance(id2name, dict) and id2name:
        print("Using global id2name map.")
        id2name_im_local = id2name
    elif 'member' in df_tx_im.columns:
        print("Building local id2name from transactions 'member' column...")
        id2name_im_local = df_tx_im[["member_id", "member"]].drop_duplicates("member_id").set_index("member_id")["member"].to_dict()
    else:
        print("Warning: No global id2name and no 'member' column in transactions. Using member_id as name.")
        unique_member_ids = df_tx_im["member_id"].unique()
        id2name_im_local = {mid: mid for mid in unique_member_ids}
    print(f"Using id2name_im_local with {len(id2name_im_local)} entries.")

    print("Pivoting transaction data...")
    df_tx_im['amount'] = pd.to_numeric(df_tx_im['amount'], errors='coerce').fillna(0.0)
    pivot_im = (df_tx_im.groupby(["member_id", "sector"])["amount"]
                  .sum().unstack(fill_value=0.0))
    pivot_im.replace([np.inf, -np.inf], np.nan, inplace=True)
    pivot_im.fillna(0.0, inplace=True)
    print(f"Pivot table shape: {pivot_im.shape}")

    members_in_pivot_with_committee = [idx for idx in pivot_im.index if idx in member_to_committee_infomap]
    pivot_im = pivot_im.loc[members_in_pivot_with_committee]
    members_list_im = pivot_im.index.tolist()
    print(f"Number of members with committee assignments and trades: {len(members_list_im)}")

    if not members_list_im:
        print("No members with committee assignments and trades for Infomap analysis. Skipping.")
    else:
        print("Building full member-member graph...")
        G_mem_im = nx.Graph()
        G_mem_im.add_nodes_from(members_list_im)

        for i, j in combinations(members_list_im, 2):
            series_i_vals = pivot_im.loc[i].values
            series_j_vals = pivot_im.loc[j].values
            w_im = np.minimum(series_i_vals, series_j_vals).sum()
            if np.isnan(w_im) or np.isinf(w_im):
                # This should ideally not happen due to prior cleaning
                # print(f"Warning: Calculated weight for pair ({i}, {j}) is NaN or Inf. Skipping edge.")
                continue
            if w_im > 0:
                G_mem_im.add_edge(i, j, weight=w_im)
        print(f"Full member graph (Infomap): {G_mem_im.number_of_nodes()} nodes, {G_mem_im.number_of_edges()} edges.")

        def keep_edges_above_percentile_im(G, percentile_to_keep_above):
            if not G.number_of_edges(): return G.copy()
            weights = [d["weight"] for _, _, d in G.edges(data=True)]
            if not weights: return G.copy()
            if percentile_to_keep_above == 0: threshold = -np.inf # Keep all
            elif percentile_to_keep_above == 100: threshold = np.max(weights) if weights else np.inf # Keep only max weight edges
            else: threshold = np.percentile(weights, percentile_to_keep_above)
            H = nx.Graph()
            H.add_nodes_from(G.nodes(data=True)) # Keep all nodes
            for u, v, d_edge in G.edges(data=True):
                if d_edge["weight"] >= threshold: H.add_edge(u, v, **d_edge)
            return H
        
        print(f"Sparsifying graph, keeping edges with weights >= {SPARSIFY_PERCENTILE}th percentile...")
        G_mem_im_sparse = keep_edges_above_percentile_im(G_mem_im, SPARSIFY_PERCENTILE)
        print(f"Sparse member graph (Infomap): {G_mem_im_sparse.number_of_nodes()} nodes, {G_mem_im_sparse.number_of_edges()} edges.")

        if G_mem_im_sparse.number_of_edges() > 0:
            print("Preparing data for Infomap...")
            # Add "--silent" to im_instance if you want to suppress Infomap's C++ console output
            im_instance = Infomap("--two-level") 
            
            id_map_im_str_to_int = {mem_id_str: i for i, mem_id_str in enumerate(G_mem_im_sparse.nodes())}
            id_map_im_int_to_str = {i: mem_id_str for mem_id_str, i in id_map_im_str_to_int.items()}

            for mem_id_str, int_id in id_map_im_str_to_int.items():
                im_instance.add_node(int_id, id2name_im_local.get(mem_id_str, mem_id_str))
            print(f"Added {im_instance.num_nodes} nodes to Infomap instance.")
            
            link_count = 0
            for u_str, v_str, edge_data in G_mem_im_sparse.edges(data=True):
                u_int = id_map_im_str_to_int.get(u_str)
                v_int = id_map_im_str_to_int.get(v_str)
                if u_int is not None and v_int is not None:
                    im_instance.add_link(u_int, v_int, edge_data["weight"])
                    link_count += 1
                else:
                    print(f"Warning: Edge ({u_str}, {v_str}) refers to node(s) not in id_map_im_str_to_int. Skipping.")
            print(f"Added {link_count} links to Infomap instance.")

            print("Running Infomap algorithm...")
            im_instance.run()
            print(f"Infomap run completed: Codelength L={im_instance.codelength:.3f} bits; Found {im_instance.num_top_modules} top-level modules.")

            modules_im = {}
            print("Extracting module assignments from Infomap result...")
            for node_iterator_item in im_instance.nodes: 
                current_node_int_id = node_iterator_item.id()      
                current_node_module_id = node_iterator_item.module_id  

                if current_node_int_id in id_map_im_int_to_str:
                    original_member_id_str = id_map_im_int_to_str[current_node_int_id]
                    modules_im[original_member_id_str] = current_node_module_id
                else:
                    print(f"Warning: Infomap node with integer id {current_node_int_id} not found in id_map_im_int_to_str. This is unexpected.")
            
            if not modules_im and G_mem_im_sparse.number_of_nodes() > 0:
                print("Warning: No modules were extracted, but graph had nodes. Check Infomap results and mapping logic.")
            elif modules_im:
                print(f"Successfully extracted {len(modules_im)} module assignments for {len(set(modules_im.values()))} unique modules.")

            if modules_im:
                assignment_df_im = pd.DataFrame({
                    "member_id": list(modules_im.keys()),
                    "member_name": [id2name_im_local.get(mid_str, mid_str) for mid_str in modules_im.keys()],
                    "module_id": list(modules_im.values()),
                    "committee": [member_to_committee_infomap.get(mid_str, "Unassigned") for mid_str in modules_im.keys()]
                })
                assignment_filepath = os.path.join(INFOMAP_OUTPUT_DIR, "member_infomap_modules.csv")
                assignment_df_im.to_csv(assignment_filepath, index=False)
                print(f"[Saved] Infomap module assignments to {assignment_filepath}")

                # --- Common data for both plots ---
                nodes_to_draw_full = list(G_mem_im_sparse.nodes())
                unique_comms_im = sorted(list(set(c for c in member_to_committee_infomap.values() if c is not None and c)))
                if not unique_comms_im: unique_comms_im = ["Unassigned"]
                
                num_colors_needed = len(unique_comms_im)
                try: palette_im = cm.get_cmap("tab20", num_colors_needed if num_colors_needed > 0 else 1)
                except ValueError:
                    print(f"Warning: Number of unique committees ({num_colors_needed}) exceeds tab20 palette size. Using 'viridis' colormap.")
                    palette_im = cm.get_cmap("viridis", num_colors_needed if num_colors_needed > 0 else 1)
                comm2color_im = {c: palette_im(i) for i, c in enumerate(unique_comms_im)}
                comm2color_im.setdefault("Unassigned", "silver")
                
                node_total_volumes = {mid_str: pivot_im.loc[mid_str].sum() for mid_str in G_mem_im_sparse.nodes() if mid_str in pivot_im.index}
                default_volume = np.median(list(node_total_volumes.values())) if node_total_volumes else 1.0
                if default_volume <= 0: default_volume = 1.0

                # --- G) Visualise (Original Full Graph) ---
                print("\nPreparing visualization (Full Graph)...")
                
                node_colors_full = [comm2color_im.get(member_to_committee_infomap.get(mid_str, "Unassigned"), "silver") for mid_str in nodes_to_draw_full]
                
                raw_sizes_full = [math.sqrt(max(1, node_total_volumes.get(mid_str, default_volume))) for mid_str in nodes_to_draw_full]
                min_raw_size_f, max_raw_size_f = (min(raw_sizes_full), max(raw_sizes_full)) if raw_sizes_full else (1,1)
                
                node_sizes_final_full = []
                if max_raw_size_f > min_raw_size_f : 
                    node_sizes_final_full = [50 + 950 * (rs - min_raw_size_f) / (max_raw_size_f - min_raw_size_f) for rs in raw_sizes_full]
                elif raw_sizes_full: 
                    node_sizes_final_full = [200] * len(raw_sizes_full)
                else: 
                    node_sizes_final_full = []

                print("Calculating layout (Full Graph)...")
                k_val_full = 0.5 / math.sqrt(max(1, G_mem_im_sparse.number_of_nodes())) if G_mem_im_sparse.number_of_nodes() > 0 else 0.5
                pos_full = nx.spring_layout(G_mem_im_sparse, seed=44, weight="weight", k=k_val_full, iterations=30)
                
                edge_weights_data_full = [d['weight'] for _,_,d in G_mem_im_sparse.edges(data=True)]
                max_w_full = max(edge_weights_data_full) if edge_weights_data_full else 1.0
                if max_w_full == 0: max_w_full = 1.0
                edge_widths_full = [(d["weight"] / max_w_full) * 3.0 + 0.2 for _, _, d in G_mem_im_sparse.edges(data=True)]

                print("Drawing graph (Full Graph)...")
                plt.figure(figsize=(18, 15))
                nx.draw_networkx_edges(G_mem_im_sparse, pos_full, width=edge_widths_full, alpha=0.15, edge_color="#888888")
                
                if len(node_sizes_final_full) != len(nodes_to_draw_full):
                    print(f"Warning (Full Graph): Mismatch in node_sizes_final_full ({len(node_sizes_final_full)}) and nodes_to_draw_full ({len(nodes_to_draw_full)}). Using default size.")
                    node_sizes_final_full = 100
                nx.draw_networkx_nodes(G_mem_im_sparse, pos_full, nodelist=nodes_to_draw_full, node_color=node_colors_full, 
                                       node_size=node_sizes_final_full, alpha=0.85, edgecolors='black', linewidths=0.3)

                labels_to_draw_full = {}
                if node_sizes_final_full and nodes_to_draw_full:
                    node_id_to_size_map_f = {node_id: size for node_id, size in zip(nodes_to_draw_full, node_sizes_final_full)}
                    sorted_nodes_by_size_f = sorted(node_id_to_size_map_f.keys(), key=lambda nid: node_id_to_size_map_f[nid], reverse=True)
                    num_labels_f = min(20, int(len(sorted_nodes_by_size_f) * 0.05) + 1) 
                    nodes_to_label_ids_f = sorted_nodes_by_size_f[:num_labels_f]
                    labels_to_draw_full = {nid: id2name_im_local.get(nid, nid) for nid in nodes_to_label_ids_f}
                if labels_to_draw_full:
                    nx.draw_networkx_labels(G_mem_im_sparse, pos_full, labels=labels_to_draw_full, font_size=8, font_weight='normal',
                                            bbox=dict(facecolor='white', alpha=0.4, edgecolor='none', boxstyle='round,pad=0.1'))

                plt.title("Member-Level Trading Network (Shared Sector Volume, Sparsified)\nColor: Committee | Node Size: Total Trading Volume | Communities by Infomap", pad=20, fontsize=16)
                handles_im_full = [mpatches.Patch(color=comm2color_im.get(c, "silver"), label=str(c)) for c in unique_comms_im]
                plt.legend(handles=handles_im_full, title="Committees", bbox_to_anchor=(1.02, 1), loc="upper left", frameon=False, fontsize=10)
                plt.axis("off"); plt.tight_layout(rect=[0, 0, 0.85, 1])
                
                out_png_full = os.path.join(INFOMAP_OUTPUT_DIR, "member_infomap_vol_committee_color_FULL.png") # Added _FULL
                print(f"Saving full plot to {out_png_full}...")
                plt.savefig(out_png_full, dpi=250, bbox_inches='tight'); plt.close()
                print(f"[Saved] Visualization (Full Graph): {out_png_full}")

                # --- G.2) Visualise (Zoomed-in on Central Cluster) ---
                print("\nPreparing visualization (Zoomed-in Graph)...")
                
                target_node_name = "Josh Gottheimer" 
                target_node_id = None
                for mid, name in id2name_im_local.items():
                    if name == target_node_name:
                        target_node_id = mid
                        break
                
                if target_node_id and target_node_id in G_mem_im_sparse:
                    nodes_for_zoom = {target_node_id}
        
                    for neighbor in G_mem_im_sparse.neighbors(target_node_id):
                        nodes_for_zoom.add(neighbor)

                    G_zoom = G_mem_im_sparse.subgraph(list(nodes_for_zoom)).copy()

                    if G_zoom.number_of_nodes() > 1 and G_zoom.number_of_edges() >= 0: # Allow 0 edges if it's a star around central
                        print(f"Zoomed graph has {G_zoom.number_of_nodes()} nodes and {G_zoom.number_of_edges()} edges.")

                        node_colors_zoom = [comm2color_im.get(member_to_committee_infomap.get(mid_str, "Unassigned"), "silver") 
                                            for mid_str in G_zoom.nodes()]
                        
                        raw_sizes_zoom = [math.sqrt(max(1, node_total_volumes.get(mid_str, default_volume))) 
                                          for mid_str in G_zoom.nodes()]
                        min_raw_size_z, max_raw_size_z = (min(raw_sizes_zoom), max(raw_sizes_zoom)) if raw_sizes_zoom else (1,1)
                        
                        node_sizes_final_zoom = []
                        # Make nodes larger in zoomed plot
                        min_zoom_size, max_zoom_size = 200, 2500 
                        if max_raw_size_z > min_raw_size_z :
                            node_sizes_final_zoom = [min_zoom_size + (max_zoom_size - min_zoom_size) * (rs - min_raw_size_z) / (max_raw_size_z - min_raw_size_z) for rs in raw_sizes_zoom]
                        elif raw_sizes_zoom:
                            node_sizes_final_zoom = [ (min_zoom_size + max_zoom_size) / 2 ] * len(raw_sizes_zoom) # Mid-range size
                        else:
                            node_sizes_final_zoom = []

                        print("Calculating layout (Zoomed Graph)...")
                        k_val_zoom = 0.9 / math.sqrt(max(1, G_zoom.number_of_nodes())) if G_zoom.number_of_nodes() > 0 else 0.9
                        pos_zoom = nx.spring_layout(G_zoom, seed=43, weight="weight", k=k_val_zoom, iterations=100) # Different seed, more iterations

                        edge_weights_data_zoom = [d['weight'] for _,_,d in G_zoom.edges(data=True)]
                        max_w_zoom = max(edge_weights_data_zoom) if edge_weights_data_zoom else 1.0
                        if max_w_zoom == 0: max_w_zoom = 1.0
                        edge_widths_zoom = [(d["weight"] / max_w_zoom) * 4.0 + 0.5 for _, _, d in G_zoom.edges(data=True)]

                        print("Drawing graph (Zoomed Graph)...")
                        plt.figure(figsize=(14, 12)) 
                        nx.draw_networkx_edges(G_zoom, pos_zoom, width=edge_widths_zoom, alpha=0.25, edge_color="#666666")
                        
                        nx.draw_networkx_nodes(G_zoom, pos_zoom, nodelist=list(G_zoom.nodes()), node_color=node_colors_zoom, 
                                               node_size=node_sizes_final_zoom, alpha=0.9, edgecolors='black', linewidths=0.4)
                        
                        labels_zoom = {nid: id2name_im_local.get(nid, nid) for nid in G_zoom.nodes()}
                        nx.draw_networkx_labels(G_zoom, pos_zoom, labels=labels_zoom, font_size=10, font_weight='bold', # Larger font
                                                bbox=dict(facecolor='white', alpha=0.6, edgecolor='none', boxstyle='round,pad=0.2'))

                        plt.title(f"Zoomed: Cluster around {target_node_name}\n(Member-Level Trading Network)", pad=15, fontsize=16)
                        plt.axis("off")
                        plt.tight_layout()
                        
                        out_png_zoom = os.path.join(INFOMAP_OUTPUT_DIR, "member_infomap_vol_ZOOMED_cluster.png")
                        print(f"Saving zoomed plot to {out_png_zoom}...")
                        plt.savefig(out_png_zoom, dpi=250, bbox_inches='tight'); plt.close()
                        print(f"[Saved] Visualization (Zoomed Graph): {out_png_zoom}")
                    else:
                        print(f"Zoomed subgraph for '{target_node_name}' is too small or has no edges. Skipping zoomed plot.")
                else:
                    print(f"Target node '{target_node_name}' not found or not in graph. Skipping zoomed plot.")
            else:
                print("Infomap modules could not be determined or no modules found. Skipping assignments saving and plot.")
        else:
            print("Sparsified graph for Infomap has no edges. Skipping Infomap run and visualization.")
    print("Member-Level Infomap Analysis finished.")

Starting Member-Level Infomap Analysis...
Built member_to_committee_infomap with 355 entries.
Using global id2name map.
Using id2name_im_local with 190 entries.
Pivoting transaction data...
Pivot table shape: (186, 20)
Number of members with committee assignments and trades: 112
Building full member-member graph...
Full member graph (Infomap): 112 nodes, 5022 edges.
Sparsifying graph, keeping edges with weights >= 99th percentile...
Sparse member graph (Infomap): 112 nodes, 51 edges.
Preparing data for Infomap...
Added 112 nodes to Infomap instance.
Added 51 links to Infomap instance.
Running Infomap algorithm...
Infomap run completed: Codelength L=3.494 bits; Found 97 top-level modules.
Extracting module assignments from Infomap result...
Successfully extracted 112 module assignments for 97 unique modules.
  Infomap v2.8.0 starts at 2025-06-05 03:02:04
  -> Input network: 
  -> No file output!
  -> Configuration: two-level
  -> Ordinary network input, using the Map Equation for first 

  try: palette_im = cm.get_cmap("tab20", num_colors_needed if num_colors_needed > 0 else 1)


Saving full plot to network_analysis_count_based/member_infomap_volume_weighted/member_infomap_vol_committee_color_FULL.png...
[Saved] Visualization (Full Graph): network_analysis_count_based/member_infomap_volume_weighted/member_infomap_vol_committee_color_FULL.png

Preparing visualization (Zoomed-in Graph)...
Zoomed graph has 11 nodes and 37 edges.
Calculating layout (Zoomed Graph)...
Drawing graph (Zoomed Graph)...
Saving zoomed plot to network_analysis_count_based/member_infomap_volume_weighted/member_infomap_vol_ZOOMED_cluster.png...
[Saved] Visualization (Zoomed Graph): network_analysis_count_based/member_infomap_volume_weighted/member_infomap_vol_ZOOMED_cluster.png
Member-Level Infomap Analysis finished.


In [82]:
# Cell 13: Crosstab of Committee vs. Infomap Module

if 'assignment_df_im' in globals() and not assignment_df_im.empty:
    print("\n--- Committee vs. Infomap Module Crosstab ---")
    # Ensure 'committee' and 'module_id' columns exist
    if 'committee' in assignment_df_im.columns and 'module_id' in assignment_df_im.columns:
        ct_im = pd.crosstab(assignment_df_im['committee'], assignment_df_im['module_id'])
        
        # Calculate row-wise proportion of the largest module for each committee
        # Avoid division by zero if a committee row sum is 0 (though unlikely if members were assigned)
        row_sums_im = ct_im.sum(axis=1)
        # ct_im['row_prop'] = ct_im.max(axis=1).divide(row_sums_im, axis=0).fillna(0) # Old way
        
        # A more robust way to get the proportion of the dominant module
        dominant_module_prop = []
        for index, row in ct_im.iterrows():
            if row.sum() > 0:
                dominant_module_prop.append(row.max() / row.sum())
            else:
                dominant_module_prop.append(0)
        ct_im['dominant_module_proportion'] = dominant_module_prop

        ct_im_sorted = ct_im.sort_values('dominant_module_proportion', ascending=False)
        print(ct_im_sorted)
        
        # Save to CSV
        ct_im_sorted.to_csv(os.path.join(INFOMAP_OUTPUT_DIR, "committee_vs_infomap_module_crosstab.csv"))
        print(f"[Saved] Crosstab to {INFOMAP_OUTPUT_DIR}/committee_vs_infomap_module_crosstab.csv")
    else:
        print("Warning: 'committee' or 'module_id' not found in assignment_df_im. Skipping crosstab.")
else:
    print("Skipping Committee vs. Infomap Module Crosstab: 'assignment_df_im' not available or empty.")


--- Committee vs. Infomap Module Crosstab ---
module_id                                      1  2  3  4  5  6  7  8  9  10  \
committee                                                                      
Senate Finance                                 2  0  0  0  0  0  0  0  0   0   
Senate Health, Education, Labor, and Pensions  2  0  0  0  0  0  0  0  0   0   
House Ways and Means                           3  0  0  0  0  0  0  0  0   0   
House Oversight and Reform                     1  2  0  0  0  0  0  0  0   0   
House Financial Services                       1  2  0  0  0  0  0  0  1   0   
Senate Appropriations                          0  0  0  0  0  0  0  0  0   0   
Senate Banking, Housing, and Urban Affairs     1  0  0  0  0  0  0  0  0   0   
House Energy and Commerce                      2  0  0  1  1  1  1  0  0   0   
House Appropriations                           1  0  1  0  0  0  0  1  0   1   

module_id                                      ...  89  90  91  92  93  

In [83]:
# Cell 14: Attribute Assortativity

if 'G_mem_im_sparse' in globals() and G_mem_im_sparse.number_of_nodes() > 0:
    print("\n--- Committee Assortativity ---")
    # Ensure 'member_to_committee_infomap' is used, as it's scoped for this Infomap block
    comm_attr_im = {m: member_to_committee_infomap.get(m, 'Unassigned_Comm') for m in G_mem_im_sparse.nodes()}
    nx.set_node_attributes(G_mem_im_sparse, comm_attr_im, 'committee_attr') # Use a unique attribute name
    
    if G_mem_im_sparse.number_of_edges() > 0:
        try:
            r_assortativity = nx.attribute_assortativity_coefficient(G_mem_im_sparse, 'committee_attr')
            print(f"Committee assortativity r = {r_assortativity:.3f}")
            
            # Save to a file
            with open(os.path.join(INFOMAP_OUTPUT_DIR, "committee_assortativity.txt"), "w") as f:
                f.write(f"Committee assortativity r = {r_assortativity:.3f}\n")
            print(f"[Saved] Assortativity to {INFOMAP_OUTPUT_DIR}/committee_assortativity.txt")

        except Exception as e:
            print(f"Could not calculate assortativity: {e}")
    else:
        print("Graph has no edges, cannot calculate assortativity.")
else:
    print("Skipping Assortativity: Infomap graph 'G_mem_im_sparse' not available or empty.")


--- Committee Assortativity ---
Committee assortativity r = -0.109
[Saved] Assortativity to network_analysis_count_based/member_infomap_volume_weighted/committee_assortativity.txt


In [84]:
# Cell 15: Top Bridges

if 'G_mem_im_sparse' in globals() and G_mem_im_sparse.number_of_edges() > 0 : # Needs edges
    print("\n--- Top Bridges (Edge Betweenness Centrality) ---")
    try:
        # Note: Edge betweenness can be slow on larger graphs.
        # If G_mem_im_sparse is very large, consider sampling or a faster approximation.
        bridges_im = nx.edge_betweenness_centrality(G_mem_im_sparse, weight='weight', normalized=True)
        
        if bridges_im: # Check if bridges dictionary is not empty
            top_bridges_im = sorted(bridges_im.items(), key=lambda x: -x[1])[:15] # Top 15
            print(f"Top {len(top_bridges_im)} Bridges (by betweenness centrality):")
            
            bridge_data_to_save = []
            for (u_br, v_br), bc_br in top_bridges_im:
                comm_u = member_to_committee_infomap.get(u_br, "N/A")
                comm_v = member_to_committee_infomap.get(v_br, "N/A")
                print(f"{id2name_im_local.get(u_br, u_br)} ({u_br}) ↔ {id2name_im_local.get(v_br, v_br)} ({v_br}): {bc_br:.4f} (Committees: {comm_u} | {comm_v})")
                bridge_data_to_save.append({
                    "member1_id": u_br, "member1_name": id2name_im_local.get(u_br, u_br), "member1_committee": comm_u,
                    "member2_id": v_br, "member2_name": id2name_im_local.get(v_br, v_br), "member2_committee": comm_v,
                    "betweenness": bc_br
                })
            
            pd.DataFrame(bridge_data_to_save).to_csv(os.path.join(INFOMAP_OUTPUT_DIR, "top_bridges.csv"), index=False)
            print(f"[Saved] Top bridges to {INFOMAP_OUTPUT_DIR}/top_bridges.csv")
        else:
            print("No bridge edges found (edge_betweenness_centrality returned empty).")

    except Exception as e:
        print(f"Could not calculate edge betweenness centrality: {e}")
else:
    print("Skipping Top Bridges: Infomap graph 'G_mem_im_sparse' not available or has no edges.")


--- Top Bridges (Edge Betweenness Centrality) ---
Top 15 Bridges (by betweenness centrality):
Trey Hollingsworth (H001074) ↔ Kevin Hern (H001082): 0.0032 (Committees: House Financial Services | House Ways and Means)
Kevin Hern (H001082) ↔ Kenny Marchant (M001158): 0.0026 (Committees: House Ways and Means | House Ways and Means)
Kevin Hern (H001082) ↔ John W. Hickenlooper (S408): 0.0023 (Committees: House Ways and Means | Senate Health, Education, Labor, and Pensions)
Greg Gianforte (G000584) ↔ Van Taylor (T000479): 0.0021 (Committees: House Energy and Commerce | House Financial Services)
Kevin Hern (H001082) ↔ Susie Lee (L000590): 0.0019 (Committees: House Ways and Means | House Appropriations)
Mark E. Green (G000590) ↔ Trey Hollingsworth (H001074): 0.0018 (Committees: House Oversight and Reform | House Financial Services)
Kevin Hern (H001082) ↔ Thomas R. Carper (S277): 0.0018 (Committees: House Ways and Means | Senate Finance)
Josh Gottheimer (G000583) ↔ John W. Hickenlooper (S408): 

In [85]:
# Cell 16: Committee-Level Infomap (using Committee Overlap Edges)

# This uses G_overlap_vol (committee-committee graph based on shared sector VOLUME)
# Ensure G_overlap_vol is defined and populated from Cell 10 (Volume-based analysis)

INFOMAP_COMMITTEE_LEVEL_OUTPUT_DIR = os.path.join(VOLUME_ANALYSIS_OUTPUT_DIR, "committee_level_infomap")
os.makedirs(INFOMAP_COMMITTEE_LEVEL_OUTPUT_DIR, exist_ok=True)


if 'G_overlap_vol' in globals() and G_overlap_vol.number_of_nodes() > 0 and G_overlap_vol.number_of_edges() > 0:
    print("\n--- Infomap on Committee-Committee Graph (Volume Overlap) ---")
    try:
        im_comm_level = Infomap("--two-level")
        id_map_comm_level = {} # committee_name -> integer_id
        next_id_comm_level = 1

        for node_cl in G_overlap_vol.nodes():
            id_map_comm_level[node_cl] = next_id_comm_level
            im_comm_level.add_node(next_id_comm_level, node_cl) # Use committee name as Infomap node name
            next_id_comm_level += 1
        
        for u_cl, v_cl, d_cl in G_overlap_vol.edges(data=True):
            weight_cl = d_cl.get("weight", 1.0) # Default to 1.0 if weight is missing
            if weight_cl > 0 : # Infomap usually expects positive weights
                 im_comm_level.add_link(id_map_comm_level[u_cl], id_map_comm_level[v_cl], weight_cl)

        im_comm_level.run()
        print(f"Committee-Level Infomap: L={im_comm_level.codelength:.3f} bits; modules={im_comm_level.num_top_modules}")

        inv_id_map_cl = {v: k for k, v in id_map_comm_level.items()}
        modules_cl = {}
        for node_iterator_item_cl in im_comm_level.nodes:
            node_int_id_cl = node_iterator_item_cl.id()      # Call id() as a method
            node_module_id_cl = node_iterator_item_cl.module_id # Access module_id as a property
            if node_int_id_cl in inv_id_map_cl:
                committee_name = inv_id_map_cl[node_int_id_cl]
                modules_cl[committee_name] = node_module_id_cl
            else:
                print(f"Warning (Committee-Level): Infomap node with int_id {node_int_id_cl} not in inv_id_map_cl.")

        # Save and Print Modules
        comm_module_data = []
        print("\nCommittee-Level Infomap Modules (Volume Overlap):")
        for mod_id_cl in sorted(set(modules_cl.values())):
            members_in_mod_cl = [comm_name for comm_name, m_id in modules_cl.items() if m_id == mod_id_cl]
            print(f"  Module {mod_id_cl} ({len(members_in_mod_cl)} committees): {', '.join(sorted(members_in_mod_cl))}")
            for comm_name in sorted(members_in_mod_cl):
                comm_module_data.append({"committee": comm_name, "infomap_module_id": mod_id_cl})
        
        pd.DataFrame(comm_module_data).to_csv(os.path.join(INFOMAP_COMMITTEE_LEVEL_OUTPUT_DIR, "committee_infomap_modules_volume.csv"), index=False)
        print(f"[Saved] Committee Infomap modules to {INFOMAP_COMMITTEE_LEVEL_OUTPUT_DIR}/committee_infomap_modules_volume.csv")


        # Visualization (Color nodes by their Infomap module)
        module_ids_cl_list = sorted(list(set(modules_cl.values())))
        palette_cl = cm.get_cmap("tab20", len(module_ids_cl_list) if len(module_ids_cl_list) > 0 else 1)
        node_colors_cl = [palette_cl(module_ids_cl_list.index(modules_cl.get(n,-1))) for n in G_overlap_vol.nodes()]
        
        # Node sizes can be based on original eigenvector centrality from G_overlap_vol if available
        node_sizes_cl = [eig_cent_comm_v.get(n, 0.01) * 8000 + 200 if 'eig_cent_comm_v' in globals() else 1000 for n in G_overlap_vol.nodes()]


        pos_cl = nx.spring_layout(G_overlap_vol, seed=42, weight="weight", k=0.7, iterations=50)
        edge_weights_cl_data = [d.get('weight',1.0) for _,_,d in G_overlap_vol.edges(data=True)]
        max_w_cl = max(edge_weights_cl_data) if edge_weights_cl_data else 1.0
        if max_w_cl == 0: max_w_cl = 1.0
        edge_widths_cl = [(d.get("weight",1.0) / max_w_cl) * 5 + 0.5 for _, _, d in G_overlap_vol.edges(data=True)]

        plt.figure(figsize=(13, 11))
        nx.draw_networkx_edges(G_overlap_vol, pos_cl, width=edge_widths_cl, alpha=0.3, edge_color="#888888")
        nx.draw_networkx_nodes(G_overlap_vol, pos_cl, node_color=node_colors_cl, node_size=node_sizes_cl, alpha=0.9, edgecolors='black', linewidths=0.5)
        nx.draw_networkx_labels(G_overlap_vol, pos_cl, font_size=8, font_weight='normal')
        plt.title("Committee Network (Shared Sector Volume) - Infomap Communities", fontsize=14, pad=15)
        
        # Create legend for modules
        handles_cl = [mpatches.Patch(color=palette_cl(module_ids_cl_list.index(mod_id)), label=f"Module {mod_id}") for mod_id in module_ids_cl_list]
        if handles_cl:
            plt.legend(handles=handles_cl, title="Infomap Modules", bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=9, title_fontsize=10)
        
        plt.axis("off"); plt.tight_layout(rect=[0,0,0.85,1])
        plt.savefig(os.path.join(INFOMAP_COMMITTEE_LEVEL_OUTPUT_DIR, "committee_projection_infomap_volume.png"), dpi=200)
        plt.close()
        print(f"[Saved] {INFOMAP_COMMITTEE_LEVEL_OUTPUT_DIR}/committee_projection_infomap_volume.png")

    except NameError as e: # Catch if Infomap is not imported
        print(f"Skipping Committee-Level Infomap due to missing library or error: {e}")
    except Exception as e:
        print(f"Error during Committee-Level Infomap: {e}")
else:
    print("Skipping Committee-Level Infomap: 'G_overlap_vol' (committee-committee graph from volume) not available or empty.")


--- Infomap on Committee-Committee Graph (Volume Overlap) ---
Committee-Level Infomap: L=3.121 bits; modules=1

Committee-Level Infomap Modules (Volume Overlap):
  Module 1 (10 committees): General Population, House Appropriations, House Energy and Commerce, House Financial Services, House Oversight and Reform, House Ways and Means, Senate Appropriations, Senate Banking, Housing, and Urban Affairs, Senate Finance, Senate Health, Education, Labor, and Pensions
  Infomap v2.8.0 starts at 2025-06-05 03:02:05
  -> Input network: 
  -> No file output!
  -> Configuration: two-level
  -> Ordinary network input, using the Map Equation for first order network flows
Calculating global network flow using flow model 'undirected'... 
  -> Using undirected links.
  => Sum node flow: 1, sum link flow: 1
Build internal network with 10 nodes and 45 links...
  -> One-level codelength: 3.12089394

Trial 1/1 starting at 2025-06-05 03:02:05
Two-level compression: -1.8e-13% 
Partitioned to codelength 0 + 3

[Saved] network_analysis_volume_based/committee_level_infomap/committee_projection_infomap_volume.png


  palette_cl = cm.get_cmap("tab20", len(module_ids_cl_list) if len(module_ids_cl_list) > 0 else 1)


In [95]:
import pandas as pd
import networkx as nx
# from networkx.algorithms import bipartite # Not strictly needed if calling nx.bipartite directly
import matplotlib.pyplot as plt
import numpy as np
import os
from datetime import datetime, timedelta
from collections import defaultdict

# --- User Adjustable Parameters ---
MEETING_DAYS_BEFORE = 12
MEETING_DAYS_AFTER = 3
MEETING_TOP_K_TO_PLOT = 10
MEETING_EDGE_WEIGHTING = 'volume_net' # 'count', 'volume', or 'volume_net'
MEETING_MIN_TX_FOR_EDGE = 1 # Min transactions if weighting by count (for count)
                            # For volume/volume_net, a small positive float might be better e.g. 0.001
TOP_N_CONTRIBUTING_MEMBERS_TO_SHOW = 3 # How many top members to show per sector link

MEETING_BIPARTITE_OUTPUT_DIR = "meeting_day_sector_bipartite_analysis_v3_detailed" # New output dir
os.makedirs(MEETING_BIPARTITE_OUTPUT_DIR, exist_ok=True)

# --- ASSUME THESE ARE ALREADY DEFINED AND POPULATED ---
# STOCK_TRANSACTIONS_DF (with 'member_id', 'member' (full name), 'transaction_date_dt', 'sector', 'amount', 'type')
# COMMITTEE_MEETINGS_DF (with 'Committee', 'Meeting Title', 'meeting_date_dt')
# COMMITTEE_MEMBERSHIP_MAP
# CONGRESS_PERIOD_MAP
# id2name (dictionary mapping member_id to full name)

if 'id2name' not in globals(): # Ensure id2name exists for prettier output
    print("Warning: id2name map not found. Member names might be missing in detailed output.")
    id2name = {}

print(f"STOCK_TRANSACTIONS_DF shape: {STOCK_TRANSACTIONS_DF.shape}, COMMITTEE_MEETINGS_DF shape: {COMMITTEE_MEETINGS_DF.shape}, len(COMMITTEE_MEMBERSHIP_MAP): {len(COMMITTEE_MEMBERSHIP_MAP)}, len(CONGRESS_PERIOD_MAP): {len(CONGRESS_PERIOD_MAP)}")

if 'STOCK_TRANSACTIONS_DF' not in globals() or 'COMMITTEE_MEETINGS_DF' not in globals() or \
   'COMMITTEE_MEMBERSHIP_MAP' not in globals() or 'CONGRESS_PERIOD_MAP' not in globals():
    print("CRITICAL: Required DataFrames/Mappings not found. Skipping Meeting Day Analysis.")
    exit() # Or handle gracefully
else:
    print(f"\n--- Starting Meeting Day Bipartite Network Analysis ({MEETING_EDGE_WEIGHTING}-weighted) ---")
    print(f"Window: {MEETING_DAYS_BEFORE} days before, {MEETING_DAYS_AFTER} days after meeting.")

    # --- Preprocessing ---
    # (Assuming preprocessing from your script is done: datetime conversions, string types, fillna)
    # For robustness, let's ensure key columns again
    for df_check, name, date_col_orig, date_col_dt in [
        (STOCK_TRANSACTIONS_DF, "Stocks", "transaction_date", "transaction_date_dt"),
        (COMMITTEE_MEETINGS_DF, "Meetings", "Meeting Date", "meeting_date_dt")]:
        if date_col_dt not in df_check.columns or not pd.api.types.is_datetime64_any_dtype(df_check[date_col_dt]):
            df_check[date_col_dt] = pd.to_datetime(df_check[date_col_orig], errors='coerce')
        if df_check[date_col_dt].dt.tz is not None:
            df_check[date_col_dt] = df_check[date_col_dt].dt.tz_localize(None)
        df_check.dropna(subset=[date_col_dt], inplace=True)

    STOCK_TRANSACTIONS_DF["member_id"] = STOCK_TRANSACTIONS_DF["member_id"].astype(str)
    STOCK_TRANSACTIONS_DF["sector"] = STOCK_TRANSACTIONS_DF["sector"].fillna("Unspecified_Sector").astype(str)
    STOCK_TRANSACTIONS_DF["amount"] = pd.to_numeric(STOCK_TRANSACTIONS_DF["amount"], errors='coerce').fillna(0.0)
    STOCK_TRANSACTIONS_DF["type"] = STOCK_TRANSACTIONS_DF["type"].astype(str).str.lower()

    COMMITTEE_MEETINGS_DF.dropna(subset=['Committee', 'Meeting Title'], inplace=True)
    COMMITTEE_MEETINGS_DF['Meeting Title'] = COMMITTEE_MEETINGS_DF['Meeting Title'].astype(str)
    COMMITTEE_MEETINGS_DF['Committee'] = COMMITTEE_MEETINGS_DF['Committee'].astype(str)
    if 'meeting_date_str_original' not in COMMITTEE_MEETINGS_DF.columns:
        COMMITTEE_MEETINGS_DF['meeting_date_str_original'] = COMMITTEE_MEETINGS_DF['meeting_date_dt'].dt.strftime('%Y-%m-%d')

    for comm_md_key, periods_md_val in COMMITTEE_MEMBERSHIP_MAP.items():
        for period_md_key, members_md_val_set in periods_md_val.items():
            COMMITTEE_MEMBERSHIP_MAP[comm_md_key][period_md_key] = {str(m) for m in members_md_val_set}

    # --- Aggregate Trades around Meeting Days ---
    year_to_congress_period_md_map = {yr: p for p, yrs in CONGRESS_PERIOD_MAP.items() for yr in yrs}
    meeting_day_sector_links_detailed = [] # To store more detailed info including members

    unique_meeting_events_df = COMMITTEE_MEETINGS_DF.groupby(
        ['Committee', 'meeting_date_str_original', 'meeting_date_dt']
    )['Meeting Title'].apply(lambda x: sorted(list(set(x)))).reset_index() # Ensure unique, sorted titles
    
    print(f"Processing {len(unique_meeting_events_df)} unique committee-meeting-day events...")

    for _, event_row_data in tqdm(unique_meeting_events_df.iterrows(), total=len(unique_meeting_events_df), desc="Aggregating trades around meetings"):
        committee_name_md_val = event_row_data['Committee']
        meeting_date_obj_md_val = event_row_data['meeting_date_dt']
        meeting_date_str_md_val = event_row_data['meeting_date_str_original']
        meeting_titles_md_list = event_row_data['Meeting Title']

        meeting_year_md_val = meeting_date_obj_md_val.year
        congress_p_md_val = year_to_congress_period_md_map.get(meeting_year_md_val)
        if not congress_p_md_val: continue
        
        comm_members_md_set = COMMITTEE_MEMBERSHIP_MAP.get(committee_name_md_val, {}).get(congress_p_md_val)
        if not comm_members_md_set: continue

        start_window_dt = meeting_date_obj_md_val - timedelta(days=MEETING_DAYS_BEFORE)
        end_window_dt = meeting_date_obj_md_val + timedelta(days=MEETING_DAYS_AFTER)

        period_tx_df_slice = STOCK_TRANSACTIONS_DF[
            (STOCK_TRANSACTIONS_DF['member_id'].isin(comm_members_md_set)) &
            (STOCK_TRANSACTIONS_DF['transaction_date_dt'] >= start_window_dt) &
            (STOCK_TRANSACTIONS_DF['transaction_date_dt'] <= end_window_dt)
        ]
        
        is_buy_md_series = period_tx_df_slice['type'].str.contains('purchase|buy', case=False, na=False)
        is_sell_md_series = period_tx_df_slice['type'].str.contains('sale|sell', case=False, na=False)
        relevant_tx_md_df = period_tx_df_slice[is_buy_md_series | is_sell_md_series].copy()

        if not relevant_tx_md_df.empty:
            # Calculate signed amount for net volume calculation
            relevant_tx_md_df.loc[:, 'signed_amount'] = np.where(
                is_buy_md_series[relevant_tx_md_df.index], # Align index for where condition
                relevant_tx_md_df['amount'], 
                -relevant_tx_md_df['amount']
            )
            
            # Group by sector to get overall link weight AND contributing members
            for sector_name, sector_group_df in relevant_tx_md_df.groupby('sector'):
                link_weight = 0
                if MEETING_EDGE_WEIGHTING == 'count':
                    link_weight = len(sector_group_df)
                elif MEETING_EDGE_WEIGHTING == 'volume':
                    link_weight = sector_group_df['amount'].sum()
                elif MEETING_EDGE_WEIGHTING == 'volume_net':
                    link_weight = sector_group_df['signed_amount'].sum()
                
                min_thresh_val = MEETING_MIN_TX_FOR_EDGE if MEETING_EDGE_WEIGHTING == 'count' else 0.001
                if abs(link_weight) >= min_thresh_val:
                    # Identify top contributing members to this sector's net volume/count
                    member_contributions = sector_group_df.groupby('member_id')['signed_amount' if MEETING_EDGE_WEIGHTING == 'volume_net' else 'amount'].sum()
                    if MEETING_EDGE_WEIGHTING == 'count':
                        member_contributions = sector_group_df['member_id'].value_counts()
                    
                    # Sort members by absolute contribution for 'volume_net', or by count/volume otherwise
                    if MEETING_EDGE_WEIGHTING == 'volume_net':
                        sorted_members_contrib = member_contributions.abs().sort_values(ascending=False)
                    else:
                        sorted_members_contrib = member_contributions.sort_values(ascending=False)
                    
                    top_contributors_list = []
                    for mem_id, contrib_val in sorted_members_contrib.head(TOP_N_CONTRIBUTING_MEMBERS_TO_SHOW).items():
                        mem_name = id2name.get(mem_id, mem_id)
                        # For volume_net, show actual signed contribution
                        actual_contrib = member_contributions.get(mem_id, contrib_val) if MEETING_EDGE_WEIGHTING == 'volume_net' else contrib_val
                        top_contributors_list.append(f"{mem_name} ({actual_contrib:,.0f})")


                    meeting_day_id_str_md = f"{committee_name_md_val}|{meeting_date_str_md_val}"
                    meeting_day_sector_links_detailed.append({
                        'meeting_day_id': meeting_day_id_str_md,
                        'committee': committee_name_md_val,
                        'date': meeting_date_str_md_val,
                        'sector': sector_name,
                        'weight': link_weight,
                        'plot_weight': abs(link_weight),
                        'titles': meeting_titles_md_list,
                        'top_contributors': ", ".join(top_contributors_list) if top_contributors_list else "N/A"
                    })
    
    if not meeting_day_sector_links_detailed:
        print("No relevant trades found linking meeting days to sectors after detailed aggregation.")
    else:
        df_meeting_links_detailed = pd.DataFrame(meeting_day_sector_links_detailed)
        print(f"Generated {len(df_meeting_links_detailed)} detailed committee-meeting-day <-> sector links.")
        
        # Save detailed links to CSV
        detailed_links_csv_path = os.path.join(MEETING_BIPARTITE_OUTPUT_DIR, f"meeting_day_sector_links_detailed_{MEETING_EDGE_WEIGHTING}.csv")
        df_meeting_links_detailed.to_csv(detailed_links_csv_path, index=False)
        print(f"[Saved] Detailed links with contributors to: {detailed_links_csv_path}")


        # --- Build Bipartite Graph (same as before) ---
        B_meetings_viz = nx.Graph()
        all_meeting_day_ids_viz = df_meeting_links_detailed['meeting_day_id'].unique()
        all_linked_sectors_viz = df_meeting_links_detailed['sector'].unique()
        
        B_meetings_viz.add_nodes_from(all_meeting_day_ids_viz, bipartite=0, type='meeting_day_event')
        B_meetings_viz.add_nodes_from(all_linked_sectors_viz, bipartite=1, type='sector')

        # Store titles and top contributors as node attributes for meeting_day_id nodes
        meeting_day_attributes = df_meeting_links_detailed.groupby('meeting_day_id').agg(
            titles=('titles', 'first'), # Assuming titles are same for a given meeting_day_id
            # Note: Top contributors are per sector link, not per meeting day.
            # We will access this from df_meeting_links_detailed during printout.
        ).to_dict('index')
        nx.set_node_attributes(B_meetings_viz, meeting_day_attributes)

        for _, link_row_data in df_meeting_links_detailed.iterrows():
            B_meetings_viz.add_edge(link_row_data['meeting_day_id'], link_row_data['sector'], 
                                weight=link_row_data['weight'], plot_weight=link_row_data['plot_weight'])
        print(f"Meeting Day Bipartite Graph (for viz): {B_meetings_viz.number_of_nodes()} nodes, {B_meetings_viz.number_of_edges()} edges.")
        
        # --- Plotting (Focus on Top K Meeting Days) ---
        graph_to_plot_final_md = B_meetings_viz
        meeting_day_nodes_for_plot_final = list(all_meeting_day_ids_viz)

        if MEETING_TOP_K_TO_PLOT > 0 and len(all_meeting_day_ids_viz) > MEETING_TOP_K_TO_PLOT:
            md_abs_weights_plot = defaultdict(float)
            for u, v, data_edge in B_meetings_viz.edges(data=True): # Iterate over graph edges
                node_u_type = B_meetings_viz.nodes[u].get('type')
                node_v_type = B_meetings_viz.nodes[v].get('type')
                plot_w = data_edge.get('plot_weight', 0)

                if node_u_type == 'meeting_day_event': md_abs_weights_plot[u] += plot_w
                elif node_v_type == 'meeting_day_event': md_abs_weights_plot[v] += plot_w # Should not happen if u is always meeting_day in construction
            
            if md_abs_weights_plot:
                top_md_nodes_plot = sorted(md_abs_weights_plot, key=md_abs_weights_plot.get, reverse=True)[:MEETING_TOP_K_TO_PLOT]
                nodes_to_include_plot = set(top_md_nodes_plot)
                for md_node_p in top_md_nodes_plot:
                    nodes_to_include_plot.update(B_meetings_viz.neighbors(md_node_p))
                graph_to_plot_final_md = B_meetings_viz.subgraph(nodes_to_include_plot).copy()
                meeting_day_nodes_for_plot_final = [n for n in top_md_nodes_plot if n in graph_to_plot_final_md]
                print(f"Plotting filtered graph: {graph_to_plot_final_md.number_of_nodes()} nodes, {graph_to_plot_final_md.number_of_edges()} edges.")
            # ... (else part for no weights or full graph plotting as before) ...

        if graph_to_plot_final_md.number_of_edges() > 0:
            # ... (Plotting logic - largely same, ensure to use graph_to_plot_final_md) ...
            # (Make sure node attributes like 'type' are correctly carried to the subgraph if filtering)
            # For example, when getting bipartite set for layout:
            layout_meeting_day_nodes_final = [n for n in graph_to_plot_final_md.nodes() if graph_to_plot_final_md.nodes[n].get('type') == 'meeting_day_event']
            if not layout_meeting_day_nodes_final : # Fallback if subgraph doesn't retain type or no meeting days
                layout_meeting_day_nodes_final = [n for n in graph_to_plot_final_md.nodes() if B_meetings_viz.nodes[n].get('type') == 'meeting_day_event']

            if not layout_meeting_day_nodes_final: print("No 'meeting_day_event' nodes for layout in plot graph. Skipping plot.")
            else:
                # (Rest of your plotting code from previous version, using graph_to_plot_final_md and layout_meeting_day_nodes_final)
                # ... PLOTTING CODE ...
                plt.figure(figsize=(max(18, len(layout_meeting_day_nodes_final)*0.6), 14)) # Adjusted figsize
                pos_md_plot = nx.bipartite_layout(graph_to_plot_final_md, layout_meeting_day_nodes_final, align='vertical', scale=3, aspect_ratio=0.35)
                node_colors_plot_list = ['#a6cee3' if graph_to_plot_final_md.nodes[n]['type'] == 'meeting_day_event' else '#fdbf6f' for n in graph_to_plot_final_md.nodes()]
                node_degrees_plot_dict = dict(graph_to_plot_final_md.degree())
                node_sizes_plot_list = [node_degrees_plot_dict.get(n,1) * 100 + 200 for n in graph_to_plot_final_md.nodes()] # Smaller base size
                edge_plot_weights_md = [d.get('plot_weight',1) for _,_,d in graph_to_plot_final_md.edges(data=True)]
                max_pw_md_val = max(edge_plot_weights_md) if edge_plot_weights_md else 1.0; max_pw_md_val = 1.0 if max_pw_md_val == 0 else max_pw_md_val
                edge_widths_md_plot = [0.4 + 4.5 * (d.get('plot_weight',1) / max_pw_md_val) for _,_,d in graph_to_plot_final_md.edges(data=True)]
                edge_colors_md_plot = ['#33a02c' if d.get('weight',0) > 0 else ('#e31a1c' if d.get('weight',0) < 0 else 'lightgray') for _,_,d in graph_to_plot_final_md.edges(data=True)] if MEETING_EDGE_WEIGHTING == 'volume_net' else ['dimgray'] * graph_to_plot_final_md.number_of_edges()
                nx.draw_networkx_nodes(graph_to_plot_final_md, pos_md_plot, node_color=node_colors_plot_list, node_size=node_sizes_plot_list, alpha=0.9, edgecolors='grey', linewidths=0.5)
                nx.draw_networkx_edges(graph_to_plot_final_md, pos_md_plot, width=edge_widths_md_plot, alpha=0.5, edge_color=edge_colors_md_plot)
                labels_md_plot = {n: (f"{n.split('|')[0][:25]}..\\n{n.split('|')[1]}" if graph_to_plot_final_md.nodes[n]['type'] == 'meeting_day_event' else n) for n in graph_to_plot_final_md.nodes()}
                nx.draw_networkx_labels(graph_to_plot_final_md, pos_md_plot, labels=labels_md_plot, font_size=7, font_weight='normal')
                plot_title_str_md = f"Bipartite Network: Committee Meeting Days & Sector Trades ({MEETING_EDGE_WEIGHTING}-weighted)\\nWindow: {MEETING_DAYS_BEFORE} days before, {MEETING_DAYS_AFTER} days after meeting day"
                if MEETING_EDGE_WEIGHTING == 'volume_net': plot_title_str_md += " (Green=Net Buys, Red=Net Sells)"
                plt.title(plot_title_str_md, fontsize=12); plt.axis('off'); plt.tight_layout(pad=0.5)
                plot_filename_final = os.path.join(MEETING_BIPARTITE_OUTPUT_DIR, f"meeting_days_sectors_bipartite_final_{MEETING_EDGE_WEIGHTING}.png")
                plt.savefig(plot_filename_final, dpi=200); plt.close()
                print(f"Saved final bipartite meeting day plot: {plot_filename_final}")

                # --- Enhanced Printout for Top K Meeting Days ---
                print("\n--- Detailed Analysis of Top Plotted Committee-Meeting-Days ---")
                for md_id_print in meeting_day_nodes_for_plot_final: # Use the actual list of plotted meeting day nodes
                    committee_name_print, date_print = md_id_print.split('|')
                    print(f"\n  - Committee-Day: {committee_name_print} on {date_print} (Degree in plot: {graph_to_plot_final_md.degree(md_id_print)})")
                    
                    # Get meeting titles from node attribute if stored, otherwise from map
                    titles_print = graph_to_plot_final_md.nodes[md_id_print].get('meeting_titles', meeting_day_attributes.get(md_id_print, {}).get('titles', ["N/A"]))
                    print(f"    Associated Meeting Titles:")
                    for i_title, title_p in enumerate(titles_print): print(f"      {i_title+1}. {title_p[:120]}") # Truncate long titles

                    print(f"    Trades in Sectors (around this meeting):")
                    # Iterate through the links in df_meeting_links_detailed for this specific meeting_day_id
                    links_for_this_day = df_meeting_links_detailed[df_meeting_links_detailed['meeting_day_id'] == md_id_print].sort_values(by='plot_weight', ascending=False)
                    for _, link_detail_row in links_for_this_day.iterrows():
                        sector_p = link_detail_row['sector']
                        weight_p = link_detail_row['weight']
                        plot_weight_p = link_detail_row['plot_weight']
                        contributors_p = link_detail_row['top_contributors']
                        
                        weight_str_p = f"{weight_p:,.0f}" if MEETING_EDGE_WEIGHTING != 'count' else str(weight_p)
                        print(f"      * Sector: {sector_p.ljust(25)} | {MEETING_EDGE_WEIGHTING.capitalize()}: {weight_str_p.rjust(10)} | Top Contributors: {contributors_p}")
        else:
            print("Graph for meeting day analysis (potentially filtered) has no edges to plot.")
print("\n--- Meeting Day Bipartite Detailed Analysis Script Complete ---")

STOCK_TRANSACTIONS_DF shape: (17043, 16), COMMITTEE_MEETINGS_DF shape: (2763, 9), len(COMMITTEE_MEMBERSHIP_MAP): 9, len(CONGRESS_PERIOD_MAP): 2

--- Starting Meeting Day Bipartite Network Analysis (volume_net-weighted) ---
Window: 12 days before, 3 days after meeting.
Processing 2431 unique committee-meeting-day events...


Aggregating trades around meetings: 100%|██████████| 2431/2431 [00:03<00:00, 767.69it/s] 


Generated 8401 detailed committee-meeting-day <-> sector links.
[Saved] Detailed links with contributors to: meeting_day_sector_bipartite_analysis_v3_detailed/meeting_day_sector_links_detailed_volume_net.csv
Meeting Day Bipartite Graph (for viz): 1065 nodes, 5379 edges.
Plotting filtered graph: 26 nodes, 77 edges.
Saved final bipartite meeting day plot: meeting_day_sector_bipartite_analysis_v3_detailed/meeting_days_sectors_bipartite_final_volume_net.png

--- Detailed Analysis of Top Plotted Committee-Meeting-Days ---

  - Committee-Day: House Ways and Means on 2022-02-15 (Degree in plot: 9)
    Associated Meeting Titles:
      1. Examining the Economic Impact of Federal Infrastructure Investment
    Trades in Sectors (around this meeting):
      * Sector: Technology                | Volume_net: 15,051,000 | Top Contributors: Suzan K. DelBene (15,000,000), Earl Blumenauer (35,000), Ron Estes (8,000)
      * Sector: Public Utilities          | Volume_net:     32,000 | Top Contributors: E

In [None]:
```python
import pandas as pd
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import os

# --- ASSUMPTIONS ---
# STOCK_TRANSACTIONS_DF: Loaded with 'member_id', 'ticker', 'transaction_date_dt' (datetime)
# id2name: Dictionary mapping member_id to member's full name.
# member_to_committee: Dictionary mapping member_id to committee name (not used here for coloring).

member_to_committee = COMMITTEE_MEMBERSHIP_MAP  # Still available, but we'll color by PageRank instead

# --- Ensure required DataFrames/mappings are available ---
if 'STOCK_TRANSACTIONS_DF' not in globals() or \
   'id2name' not in globals() or \
   'member_to_committee' not in globals():
    print("Error: STOCK_TRANSACTIONS_DF, id2name, or member_to_committee is not defined.")

OUTPUT_DIR_LF = "leader_follower_analysis"
os.makedirs(OUTPUT_DIR_LF, exist_ok=True)

# --- Parameters ---
MAX_LAG_DAYS = 7
MIN_LF_COUNT_FOR_EDGE = 10
TOP_N_PAIRS_TO_PRINT = 20
TOP_N_CENTRAL_NODES_TO_PRINT = 10

# --- Preprocessing ---
if not pd.api.types.is_datetime64_any_dtype(STOCK_TRANSACTIONS_DF['transaction_date_dt']):
    STOCK_TRANSACTIONS_DF['transaction_date_dt'] = pd.to_datetime(
        STOCK_TRANSACTIONS_DF['transaction_date_dt'], errors='coerce'
    )
    STOCK_TRANSACTIONS_DF.dropna(subset=['transaction_date_dt'], inplace=True)

STOCK_TRANSACTIONS_DF['member_id'] = STOCK_TRANSACTIONS_DF['member_id'].astype(str)
STOCK_TRANSACTIONS_DF['ticker'] = STOCK_TRANSACTIONS_DF['ticker'].astype(str).str.upper()

if 'type' in STOCK_TRANSACTIONS_DF.columns:
    df_trades_lf = STOCK_TRANSACTIONS_DF[
        STOCK_TRANSACTIONS_DF['type'].astype(str).str.contains('sale|purchase', case=False, na=False)
    ].copy()
else:
    print("Warning: 'type' column not found. Using all transactions for leader-follower analysis.")
    df_trades_lf = STOCK_TRANSACTIONS_DF.copy()

df_trades_lf.sort_values(by=['ticker', 'transaction_date_dt'], inplace=True)

leader_follower_pairs_raw = defaultdict(int)

for ticker_symbol, ticker_group_df in df_trades_lf.groupby('ticker'):
    if len(ticker_group_df) < 2:
        continue

    trades_list = ticker_group_df.to_dict('records')
    for i in range(len(trades_list)):
        trade_A = trades_list[i]
        member_A_id = trade_A['member_id']
        date_A = trade_A['transaction_date_dt']

        for j in range(i + 1, len(trades_list)):
            trade_B = trades_list[j]
            member_B_id = trade_B['member_id']
            date_B = trade_B['transaction_date_dt']

            if member_A_id == member_B_id:
                continue

            time_lag = (date_B - date_A).days
            if 0 < time_lag <= MAX_LAG_DAYS:
                leader_follower_pairs_raw[(member_A_id, member_B_id)] += 1
            elif time_lag > MAX_LAG_DAYS:
                break

if not leader_follower_pairs_raw:
    print("No leader-follower patterns found with the specified lag.")
else:
    df_leader_follower_with_ids = pd.DataFrame([
        {
            "leader_id": k[0],
            "follower_id": k[1],
            "leader_name": id2name.get(k[0], k[0]),
            "follower_name": id2name.get(k[1], k[1]),
            "count": v
        }
        for k, v in leader_follower_pairs_raw.items()
    ]).sort_values(by="count", ascending=False)

    print(f"\nTop {TOP_N_PAIRS_TO_PRINT} Leader-Follower Pairs (lag ≤ {MAX_LAG_DAYS} days):")
    print(
        df_leader_follower_with_ids[['leader_name', 'follower_name', 'count']]
        .head(TOP_N_PAIRS_TO_PRINT)
        .to_string()
    )
    df_leader_follower_with_ids.to_csv(
        os.path.join(OUTPUT_DIR_LF, "leader_follower_all_pairs.csv"),
        index=False
    )
    print(f"[Saved] All leader-follower pairs to {os.path.join(OUTPUT_DIR_LF, 'leader_follower_all_pairs.csv')}")

    # --- Build Directed Graph ---
    G_lf = nx.DiGraph()
    all_members_in_lf_pairs = set()

    for _, row in df_leader_follower_with_ids.iterrows():
        if row['count'] >= MIN_LF_COUNT_FOR_EDGE:
            leader_id = row['leader_id']
            follower_id = row['follower_id']
            G_lf.add_edge(leader_id, follower_id, weight=row['count'])
            all_members_in_lf_pairs.add(leader_id)
            all_members_in_lf_pairs.add(follower_id)

    for mid in all_members_in_lf_pairs:
        if not G_lf.has_node(mid):
            G_lf.add_node(mid)

    if G_lf.number_of_nodes() == 0:
        print("Leader-follower graph is empty (no edges met the threshold).")
    else:
        print(f"\nLeader-Follower Graph: {G_lf.number_of_nodes()} members, {G_lf.number_of_edges()} edges.")

        # --- Centrality Analysis ---
        out_degree_centrality = {n: G_lf.out_degree(n, weight='weight') for n in G_lf.nodes()}
        df_out_degree = pd.DataFrame.from_dict(
            out_degree_centrality, orient='index', columns=['weighted_out_degree']
        )
        df_out_degree['member_name'] = df_out_degree.index.map(lambda x: id2name.get(x, x))
        df_out_degree.sort_values(by='weighted_out_degree', ascending=False, inplace=True)
        print(f"\nTop {TOP_N_CENTRAL_NODES_TO_PRINT} Leaders by Weighted Out-Degree:")
        print(df_out_degree[['member_name', 'weighted_out_degree']].head(TOP_N_CENTRAL_NODES_TO_PRINT).to_string())
        df_out_degree.to_csv(
            os.path.join(OUTPUT_DIR_LF, "leader_follower_out_degree_centrality.csv"),
            index=True
        )

        in_degree_centrality = {n: G_lf.in_degree(n, weight='weight') for n in G_lf.nodes()}
        df_in_degree = pd.DataFrame.from_dict(
            in_degree_centrality, orient='index', columns=['weighted_in_degree']
        )
        df_in_degree['member_name'] = df_in_degree.index.map(lambda x: id2name.get(x, x))
        df_in_degree.sort_values(by='weighted_in_degree', ascending=False, inplace=True)
        print(f"\nTop {TOP_N_CENTRAL_NODES_TO_PRINT} Followers by Weighted In-Degree:")
        print(df_in_degree[['member_name', 'weighted_in_degree']].head(TOP_N_CENTRAL_NODES_TO_PRINT).to_string())
        df_in_degree.to_csv(
            os.path.join(OUTPUT_DIR_LF, "leader_follower_in_degree_centrality.csv"),
            index=True
        )

        # Compute PageRank once for coloring
        try:
            pagerank_dict = nx.pagerank(G_lf, weight='weight')
        except Exception as e:
            print(f"Could not calculate PageRank: {e}")
            pagerank_dict = {n: 0.0 for n in G_lf.nodes()}

        # --- Visualization of the Leader-Follower Graph (Colored by PageRank) ---
        plt.figure(figsize=(18, 15))
        pos_lf = nx.spring_layout(G_lf, k=0.5, iterations=50, seed=42)

        # Normalize PageRank values for colormap
        pr_values = pd.Series(pagerank_dict)
        if not pr_values.empty:
            pr_min, pr_max = pr_values.min(), pr_values.max()
            norm = mcolors.Normalize(vmin=pr_min, vmax=pr_max)
            cmap = cm.viridis
            node_colors_lf = [cmap(norm(pagerank_dict[n])) for n in G_lf.nodes()]
        else:
            node_colors_lf = ["silver" for _ in G_lf.nodes()]

        # Node sizes by out-degree (normalized between [50, 1000])
        out_degrees_for_size = pd.Series({n: G_lf.out_degree(n, weight='weight') for n in G_lf.nodes()})
        if not out_degrees_for_size.empty and out_degrees_for_size.max() > 0:
            normalized = (
                out_degrees_for_size.values - out_degrees_for_size.min()
            ) / (out_degrees_for_size.max() - out_degrees_for_size.min())
            node_sizes_lf = 50 + (normalized * 950)
        else:
            node_sizes_lf = [200] * G_lf.number_of_nodes()

        # Edge widths by weight (normalized between [0.2, 3.2])
        edge_weights_lf = [d['weight'] for _, _, d in G_lf.edges(data=True)]
        if edge_weights_lf:
            max_edge_w_lf = max(edge_weights_lf) or 1.0
            edge_widths_lf_plot = [(w / max_edge_w_lf) * 3.0 + 0.2 for w in edge_weights_lf]
        else:
            edge_widths_lf_plot = [0.2 for _ in G_lf.edges()]

        nx.draw_networkx_edges(
            G_lf,
            pos_lf,
            width=edge_widths_lf_plot,
            alpha=0.15,
            edge_color="#888888"
        )
        nx.draw_networkx_nodes(
            G_lf,
            pos_lf,
            nodelist=list(G_lf.nodes()),
            node_color=node_colors_lf,
            node_size=node_sizes_lf,
            alpha=0.85,
            edgecolors='black',
            linewidths=0.3
        )

        # Labels for top 5 highest PageRank values
        labels_lf = {}
        if pagerank_dict:
            # Sort members by PageRank descending and take top 5
            sorted_pr = sorted(pagerank_dict.items(), key=lambda kv: kv[1], reverse=True)
            top5_nodes = [n for n, pr in sorted_pr[:5]]
            labels_lf = {n: id2name.get(n, n) for n in top5_nodes}

        if labels_lf:
            nx.draw_networkx_labels(
                G_lf,
                pos_lf,
                labels=labels_lf,
                font_size=10,
                font_weight='bold',
                bbox=dict(facecolor='white', alpha=0.4, edgecolor='none', boxstyle='round,pad=0.1')
            )

        plt.title(
            f"Leader-Follower Network (Min Edge Count: {MIN_LF_COUNT_FOR_EDGE}, Lag: {MAX_LAG_DAYS} days)\n"
            "Node Color: PageRank Centrality | Node Size: Weighted Out-Degree",
            pad=20,
            fontsize=16
        )

        # Create colorbar for PageRank (provide ax= to avoid ValueError)
        if not pr_values.empty:
            sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
            sm.set_array([])
            ax = plt.gca()
            cbar = plt.colorbar(sm, ax=ax, fraction=0.046, pad=0.04)
            cbar.set_label("PageRank Value", rotation=270, labelpad=15)

        plt.axis("off")
        plt.tight_layout(rect=[0, 0, 0.85, 1])

        lf_graph_path = os.path.join(
            OUTPUT_DIR_LF,
            f"leader_follower_network_min{MIN_LF_COUNT_FOR_EDGE}_lag{MAX_LAG_DAYS}_pagerank_top5labels.png"
        )
        plt.savefig(lf_graph_path, dpi=250, bbox_inches='tight')
        plt.close()
        print(f"[Saved] Leader-follower network plot (PageRank-colored, top 5 labeled) to {lf_graph_path}")
```



Top 20 Leader-Follower Pairs (lag ≤ 3 days):
           leader_name         follower_name  count
143    Josh Gottheimer        Greg Gianforte     30
236    Josh Gottheimer      Gilbert Cisneros     26
133      Dean Phillips     Michael T. McCaul     22
26         Pat Roberts       Josh Gottheimer     21
234  David B. McKinley       Josh Gottheimer     17
474    Josh Gottheimer        Kenny Marchant     16
573   Tommy Tuberville          Cynthia Axne     15
453        John Curtis       Josh Gottheimer     14
185       David Perdue           Steve Cohen     14
452       Bill Cassidy       Josh Gottheimer     13
287      Kurt Schrader       Josh Gottheimer     13
502  Robert J. Wittman       Josh Gottheimer     12
41     Susan W. Brooks        Kelly Loeffler     12
157       David Perdue             Ron Wyden     11
490    Josh Gottheimer  Shelley Moore Capito     11
30         Pat Roberts     David B. McKinley     10
46        David Perdue    K. Michael Conaway     10
491    Josh Gotthe

In [88]:
import pandas as pd
from collections import defaultdict, Counter
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm # For colormaps
import matplotlib.patches as mpatches # For legends
import os
import numpy as np
import community as community_louvain # For Louvain: pip install python-louvain

# --- ASSUMPTIONS ---
# STOCK_TRANSACTIONS_DF: Loaded
# id2name: Dictionary
# COMMITTEE_MEMBERSHIP_MAP: Your main map (used to create member_to_committee)

# --- Create member_to_committee from COMMITTEE_MEMBERSHIP_MAP ---
if 'COMMITTEE_MEMBERSHIP_MAP' not in globals():
    print("CRITICAL: COMMITTEE_MEMBERSHIP_MAP not defined.")
    exit()
member_to_committee = {} 
for comm, period_map in COMMITTEE_MEMBERSHIP_MAP.items():
    for members_set in period_map.values():
        for m_id in members_set:
            member_to_committee.setdefault(str(m_id), comm)

# --- Ensure required DataFrames/mappings are available ---
if 'STOCK_TRANSACTIONS_DF' not in globals() or \
   'id2name' not in globals() or \
   'member_to_committee' not in globals():
    print("Error: STOCK_TRANSACTIONS_DF, id2name, or member_to_committee is not defined.")
    exit() 

OUTPUT_DIR_LF = "leader_follower_analysis_louvain_spaced" # New output dir
os.makedirs(OUTPUT_DIR_LF, exist_ok=True)

# --- Parameters ---
MAX_LAG_DAYS = 7
MIN_LF_COUNT_FOR_EDGE = 5 # Adjusted as per your last run for more edges
TOP_N_PAIRS_TO_PRINT = 20
TOP_N_CENTRAL_NODES_TO_PRINT = 10
LABEL_PERCENTILE_THRESHOLD = 90 

# --- Preprocessing --- (Same as your provided code)
if not pd.api.types.is_datetime64_any_dtype(STOCK_TRANSACTIONS_DF['transaction_date_dt']):
    STOCK_TRANSACTIONS_DF['transaction_date_dt'] = pd.to_datetime(STOCK_TRANSACTIONS_DF['transaction_date_dt'], errors='coerce')
    STOCK_TRANSACTIONS_DF.dropna(subset=['transaction_date_dt'], inplace=True)
STOCK_TRANSACTIONS_DF['member_id'] = STOCK_TRANSACTIONS_DF['member_id'].astype(str)
STOCK_TRANSACTIONS_DF['ticker'] = STOCK_TRANSACTIONS_DF['ticker'].astype(str).str.upper()
if 'type' in STOCK_TRANSACTIONS_DF.columns:
    df_trades_lf = STOCK_TRANSACTIONS_DF[
        STOCK_TRANSACTIONS_DF['type'].astype(str).str.contains('sale|purchase', case=False, na=False)
    ].copy()
else:
    df_trades_lf = STOCK_TRANSACTIONS_DF.copy()
df_trades_lf.sort_values(by=['ticker', 'transaction_date_dt'], inplace=True)
leader_follower_pairs_raw = defaultdict(int)
for ticker_symbol, ticker_group_df in df_trades_lf.groupby('ticker'):
    if len(ticker_group_df) < 2: continue
    trades_list = ticker_group_df.to_dict('records') 
    for i in range(len(trades_list)):
        trade_A = trades_list[i]; member_A_id = trade_A['member_id']; date_A = trade_A['transaction_date_dt']
        for j in range(i + 1, len(trades_list)):
            trade_B = trades_list[j]; member_B_id = trade_B['member_id']; date_B = trade_B['transaction_date_dt']
            if member_A_id == member_B_id: continue
            time_lag = (date_B - date_A).days
            if 0 < time_lag <= MAX_LAG_DAYS: leader_follower_pairs_raw[(member_A_id, member_B_id)] += 1
            elif time_lag > MAX_LAG_DAYS: break 

if not leader_follower_pairs_raw:
    print("No leader-follower patterns found with the specified lag.")
else:
    df_leader_follower_with_ids = pd.DataFrame([
        {"leader_id": k[0], "follower_id": k[1], 
         "leader_name": id2name.get(k[0], k[0]), "follower_name": id2name.get(k[1], k[1]),
         "count": v}
        for k, v in leader_follower_pairs_raw.items()
    ]).sort_values(by="count", ascending=False)
    
    print(f"\nTop {TOP_N_PAIRS_TO_PRINT} Leader-Follower Pairs (A's trade precedes B's trade in same ticker within {MAX_LAG_DAYS} days):")
    print(df_leader_follower_with_ids[['leader_name', 'follower_name', 'count']].head(TOP_N_PAIRS_TO_PRINT).to_string())
    # (Save df_leader_follower_with_ids)
    df_leader_follower_with_ids.to_csv(os.path.join(OUTPUT_DIR_LF, "leader_follower_all_pairs.csv"), index=False)
    print(f"[Saved] All leader-follower pairs to {os.path.join(OUTPUT_DIR_LF, 'leader_follower_all_pairs.csv')}")


    G_lf = nx.DiGraph()
    all_members_in_graph_nodes = set()
    for _, row in df_leader_follower_with_ids.iterrows():
        if row['count'] >= MIN_LF_COUNT_FOR_EDGE:
            G_lf.add_edge(row['leader_id'], row['follower_id'], weight=row['count'])
            all_members_in_graph_nodes.add(row['leader_id']); all_members_in_graph_nodes.add(row['follower_id'])
    for mid in all_members_in_graph_nodes:
        if not G_lf.has_node(mid): G_lf.add_node(mid)

    if G_lf.number_of_nodes() == 0:
        print("Leader-follower graph is empty (no edges met the MIN_LF_COUNT_FOR_EDGE threshold).")
    else:
        print(f"\nLeader-Follower Graph: {G_lf.number_of_nodes()} members, {G_lf.number_of_edges()} directed links (min count: {MIN_LF_COUNT_FOR_EDGE}).")

        # --- Centrality Analysis ---
        out_degree_centrality_weighted = {n: G_lf.out_degree(n, weight='weight') for n in G_lf.nodes()}
        # (Save centrality DFs - same as before)
        # ...

        # --- Louvain Community Detection ---
        partition_lf = {} 
        if G_lf.number_of_edges() > 0:
            try:
                G_lf_undirected = G_lf.to_undirected() 
                partition_lf = community_louvain.best_partition(G_lf_undirected, weight='weight', random_state=42)
                nx.set_node_attributes(G_lf, partition_lf, 'louvain_community_lf')
                
                # --- Print Community Details ---
                print(f"\n--- Louvain Community Details (Leader-Follower Network) ---")
                communities_summary = defaultdict(list)
                for member_id, comm_id in partition_lf.items():
                    communities_summary[comm_id].append(member_id)
                
                df_louvain_list = []
                for comm_id, members_in_comm in sorted(communities_summary.items()):
                    print(f"\nCommunity ID: {comm_id} (Size: {len(members_in_comm)})")
                    member_details_list = []
                    for member_id in sorted(members_in_comm, key=lambda m: id2name.get(m,m)): # Sort members by name within community
                        member_name = id2name.get(member_id, member_id)
                        committee = member_to_committee.get(member_id, "Unspecified")
                        print(f"  - {member_name} ({member_id}) - Committee: {committee}")
                        member_details_list.append(f"{member_name} [{committee}]")
                        df_louvain_list.append({
                            'member_id': member_id, 
                            'member_name': member_name, 
                            'committee': committee, 
                            'louvain_community_lf': comm_id
                        })
                
                if df_louvain_list:
                    df_louvain_output = pd.DataFrame(df_louvain_list).sort_values(by=['louvain_community_lf', 'member_name'])
                    df_louvain_output.to_csv(os.path.join(OUTPUT_DIR_LF, "leader_follower_louvain_communities_detailed.csv"), index=False)
                    print(f"\n[Saved] Detailed Louvain communities to {os.path.join(OUTPUT_DIR_LF, 'leader_follower_louvain_communities_detailed.csv')}")

            except Exception as e_louvain:
                print(f"Could not perform Louvain community detection: {e_louvain}")
                partition_lf = {n: 0 for n in G_lf.nodes()} 
        else:
            print("Graph has no edges, skipping Louvain community detection.")
            partition_lf = {n: 0 for n in G_lf.nodes()}


        # --- Standard Matplotlib Visualization (Nodes colored by Louvain Community) ---
        fig, ax = plt.subplots(figsize=(24, 20)) # Increased figure size for more space

        # Increase k for more spread, and more iterations
        k_val = 1.0 / np.sqrt(G_lf.number_of_nodes()) if G_lf.number_of_nodes() > 0 else 1.0
        k_val = max(k_val, 0.3) # Ensure k is not too small for very dense small graphs
        pos_lf = nx.spring_layout(G_lf, k=k_val * 1.5, # Multiplier for k to increase spacing
                                  iterations=100, # Increased iterations
                                  seed=42, weight='weight')

        node_ids_list_viz_louvain = list(G_lf.nodes())
        
        # Node colors by Louvain community
        num_communities_viz = len(set(partition_lf.values()))
        louvain_cmap_name = "tab20" if num_communities_viz > 10 else "tab10"
        if num_communities_viz <=1 : louvain_cmap_name = "Set1"
        if num_communities_viz > 20: louvain_cmap_name = "turbo" # Or other perceptually uniform like 'viridis'
        
        louvain_palette = cm.get_cmap(louvain_cmap_name, max(2, num_communities_viz))
        node_colors_louvain = [louvain_palette(partition_lf.get(n, 0) % louvain_palette.N) for n in node_ids_list_viz_louvain]

        # Node sizes by weighted out-degree
        out_degrees_values_louvain = np.array([out_degree_centrality_weighted.get(n, 0) for n in node_ids_list_viz_louvain])
        if len(out_degrees_values_louvain) > 0 and out_degrees_values_louvain.max() > 0:
            scaled_degrees_viz = np.log1p(out_degrees_values_louvain)
            node_sizes_louvain = 200 + (scaled_degrees_viz / (scaled_degrees_viz.max() if scaled_degrees_viz.max() > 0 else 1)) * 2500 
        else:
            node_sizes_louvain = [300] * len(node_ids_list_viz_louvain)

        # Identify Nodes to Label (Influencers)
        labels_to_draw_louvain = {}
        nodes_to_label_ids_louvain = set()
        if len(out_degrees_values_louvain) > 0 :
            degree_threshold_labeling_louvain = np.percentile(out_degrees_values_louvain, LABEL_PERCENTILE_THRESHOLD)
            for i, node_id_val_louvain in enumerate(node_ids_list_viz_louvain):
                if out_degrees_values_louvain[i] >= degree_threshold_labeling_louvain and out_degrees_values_louvain[i] > 0.01: 
                    labels_to_draw_louvain[node_id_val_louvain] = id2name.get(node_id_val_louvain, node_id_val_louvain)
                    nodes_to_label_ids_louvain.add(node_id_val_louvain)
        
        # Node border for influencers (red), others standard
        node_edge_colors_louvain = ['red' if n in nodes_to_label_ids_louvain else 'dimgrey' for n in node_ids_list_viz_louvain]
        node_linewidths_louvain = [2.0 if n in nodes_to_label_ids_louvain else 0.6 for n in node_ids_list_viz_louvain] # Thicker border for influencers

        # Edge properties
        edge_weights_louvain_list = [G_lf[u][v]['weight'] for u,v in G_lf.edges()]
        edge_widths_louvain_plot = [0.3] * G_lf.number_of_edges() # Thinner default for less clutter
        edge_alpha_louvain = 0.25 
        if edge_weights_louvain_list:
            max_edge_w_louvain = float(max(edge_weights_louvain_list)) if edge_weights_louvain_list else 1.0
            if max_edge_w_louvain == 0: max_edge_w_louvain = 1.0
            edge_widths_louvain_plot = [0.3 + (w / max_edge_w_louvain)**0.7 * 3.5 for w in edge_weights_louvain_list]

        # --- Drawing ---
        nx.draw_networkx_nodes(G_lf, pos_lf, ax=ax, nodelist=node_ids_list_viz_louvain, 
                               node_color=node_colors_louvain, 
                               node_size=node_sizes_louvain, alpha=0.9, 
                               edgecolors=node_edge_colors_louvain, 
                               linewidths=node_linewidths_louvain) 
        
        nx.draw_networkx_edges(G_lf, pos_lf, ax=ax, width=edge_widths_louvain_plot, 
                               alpha=edge_alpha_louvain, edge_color="darkgrey", arrows=True, # Slightly darker arrows
                               arrowstyle='-|>', arrowsize=9, # Smaller arrows
                               connectionstyle='arc3,rad=0.08', # Less curve
                               node_size=node_sizes_louvain)
        
        # Draw labels with background
        label_font_size_louvain = 7.0 # Even smaller for more labels if needed
        for node_viz_louvain, (x_viz_l, y_viz_l) in pos_lf.items():
            if node_viz_louvain in labels_to_draw_louvain:
                ax.text(x_viz_l, y_viz_l, labels_to_draw_louvain[node_viz_louvain], 
                        fontsize=label_font_size_louvain, ha='center', va='center', # Centered on node
                        fontweight='bold', # Make labeled names bold
                        color='black', # Ensure label text is black for contrast on white bg
                        bbox=dict(boxstyle='round,pad=0.25', fc='white', alpha=0.8, ec='red', lw=0.7)) # White bg, red border for influencer labels
        
        ax.set_title(f"Leader-Follower Network & Louvain Communities\n(Min. {MIN_LF_COUNT_FOR_EDGE} links/ticker | Lag ≤ {MAX_LAG_DAYS} days | Node Color: Louvain Community)", 
                     fontsize=15, pad=15, weight='semibold')
        
        # Create legend
        legend_handles_louvain = []
        if labels_to_draw_louvain: 
            legend_handles_louvain.append(mpatches.Patch(edgecolor='red', facecolor='white', lw=1.5,
                                                  label=f"Top {100-LABEL_PERCENTILE_THRESHOLD}% Influencers (Red Border & Label BG)"))
        
        # Add a note about community colors if not making a full legend for each color
        unique_comm_ids_in_plot = sorted(list(set(partition_lf.get(n,0) for n in node_ids_list_viz_louvain)))
        if len(unique_comm_ids_in_plot) > 1 : 
            # Generic community legend entry if too many, or create one for each if few
            if len(unique_comm_ids_in_plot) <= 10: # Make legend for up to 10 distinct communities
                 for comm_id_leg in unique_comm_ids_in_plot:
                     legend_handles_louvain.append(mpatches.Patch(color=louvain_palette(comm_id_leg % louvain_palette.N), label=f"Community {comm_id_leg}"))
            else:
                 legend_handles_louvain.append(mpatches.Patch(color='grey', label=f"Louvain Communities ({len(unique_comm_ids_in_plot)} groups)"))


        if legend_handles_louvain:
            ax.legend(handles=legend_handles_louvain, 
                      title="Legend", 
                      bbox_to_anchor=(1.01, 0.98), loc="upper left", fontsize=8.5, title_fontsize=9.5,
                      frameon=True, facecolor='#f7f7f7', framealpha=0.9, edgecolor='darkgrey')
        
        ax.set_axis_off() 
        fig.tight_layout(rect=[0, 0, 0.87, 0.96]) # Adjust rect for title and legend
        
        lf_graph_path_louvain = os.path.join(OUTPUT_DIR_LF, f"leader_follower_louvain_min{MIN_LF_COUNT_FOR_EDGE}_lag{MAX_LAG_DAYS}_spaced.png")
        plt.savefig(lf_graph_path_louvain, dpi=200) 
        plt.close(fig)
        print(f"[Saved] Spaced leader-follower network with Louvain communities to {lf_graph_path_louvain}")


Top 20 Leader-Follower Pairs (A's trade precedes B's trade in same ticker within 7 days):
                leader_name      follower_name  count
293         Josh Gottheimer   Gilbert Cisneros     86
82           Kelly Loeffler       David Perdue     80
72            Dean Phillips     Kelly Loeffler     46
286               Ro Khanna       Peter Meijer     43
310         Josh Gottheimer     Greg Gianforte     41
100           Kurt Schrader    Josh Gottheimer     36
296           Dean Phillips  Michael T. McCaul     36
49              Pat Roberts    Josh Gottheimer     28
420         Josh Gottheimer   Tommy Tuberville     26
583            David Perdue     Kelly Loeffler     25
41          Josh Gottheimer        Pat Roberts     24
65              Zoe Lofgren        John Curtis     22
273         Josh Gottheimer       Nancy Pelosi     22
891         Josh Gottheimer     Mikie Sherrill     20
521           Kurt Schrader          Ro Khanna     19
343         Josh Gottheimer     Kelly Loeffle

  louvain_palette = cm.get_cmap(louvain_cmap_name, max(2, num_communities_viz))


[Saved] Spaced leader-follower network with Louvain communities to leader_follower_analysis_louvain_spaced/leader_follower_louvain_min5_lag7_spaced.png
