# Analysing email network

## Running the simulation N times, 500 steps per run
### NOTE: DO NOT RUN THE NOTEBOOK IN THE MAIN DIRECTORY. MAKE A COPY AND POINT IT TO THE APPROPRIATE FILES.
The following code runs the simulation `N_RUNS` times and collects each agent's centrality and violations.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from model_new_1 import ProblemSolvingModel

# --- Experiment configuration ---
N_RUNS = 50          # number of independent simulation runs
STEPS_PER_RUN = 500  # model steps executed in every run

# --- Accumulators filled across runs ---
# agent_sums[agent_id] is a list with that agent's true_violations, one entry per run
agent_sums = {}
# agent_centralities[agent_id] is the normalised centrality recorded the first
# time the agent is seen; later runs do not overwrite it
agent_centralities = {}

print(f"Starting {N_RUNS} runs...")

for run_idx in range(N_RUNS):
    # Build a fresh model; the seed changes with the run index (42, 43, ...)
    seed = 42 + run_idx
    model = ProblemSolvingModel(
        K=50,
        alpha=2,
        obs_prob=0.01,
        clause_interval=10,
        R=2000,
        setup_source="dataset",
        file_path="data/EmailManufacturing-copy.xml",
        seed=seed,
    )

    # Advance the simulation for the configured number of steps
    for _ in range(STEPS_PER_RUN):
        model.step()

    # 1. Network of this run and its agents, indexed by unique_id
    G = model.network
    agents_by_id = {agent.unique_id: agent for agent in model.schedule.agents}

    # 2. Centrality of every node (recomputed on each run)
    deg_dict = dict(G.in_degree())
    # Alternative measure: uncomment the next line to use eigenvector
    # centrality instead (this cell would then also need `import networkx as nx`).
#    deg_dict = nx.eigenvector_centrality(G)
    # Normalise by the largest value, but never divide by less than 1.0
    max_deg = max(max(deg_dict.values(), default=1.0), 1.0)

    # 3. Record this run's metrics for every agent
    for agent_id, agent in agents_by_id.items():
        run_violations = agent.true_violations
        normalised = deg_dict.get(agent_id, 0) / max_deg
        # Both dictionaries gain a key at the same moment, so setdefault
        # stores the centrality only on the agent's first appearance.
        agent_centralities.setdefault(agent_id, normalised)
        agent_sums.setdefault(agent_id, []).append(run_violations)

    print(f"Run {run_idx+1}/{N_RUNS} complete.")

# --- Post-processing: one (centrality, mean violations) point per agent ---

ids = sorted(agent_sums.keys())
x_vals = [agent_centralities[agent_id] for agent_id in ids]   # centrality
y_vals = [np.mean(agent_sums[agent_id]) for agent_id in ids]  # mean violations over the runs

# --- Plotting ---

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x_vals, y_vals, s=15, alpha=0.7, label='Agent Avg')

# Optional linear trend line (left disabled):
# if len(x_vals) > 1:
#     z = np.polyfit(x_vals, y_vals, 1)
#     p = np.poly1d(z)
#     plt.plot(x_vals, p(x_vals), "k-", linewidth=2, label=f'Trend (slope={z[0]:.2f})')

ax.set_xlabel("Normalized In-Degree Centrality")
ax.set_ylabel(f"Average Violations (over {N_RUNS} runs)")
ax.set_title("Agent Performance: Centrality vs. Avg Violations")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()


## Plot centrality vs. violations

In [None]:
# Scatter of per-agent centrality (x_vals) against mean violations (y_vals),
# both produced by the simulation cell, drawn with a larger font.
plt.rcParams.update({'font.size': 16})
plt.figure(figsize=(8, 6))
plt.scatter(x_vals, y_vals, s=15, alpha=0.7, label='Agent Avg')
#############################################################################
# Add regression line , uncomment the following section for regression line #
#############################################################################
#if len(x_vals) > 1:
#    z = np.polyfit(x_vals, y_vals, 1)
#    p = np.poly1d(z)
#    plt.plot(x_vals, p(x_vals), "k-", linewidth=2, label=f'Trend (slope={z[0]:.2f})')

# The simulation cell computes normalised in-degree centrality unless its
# eigenvector line is uncommented, so the axis is labelled to match that
# default. Change this label when switching the centrality measure.
plt.xlabel("Normalized In-Degree Centrality")
plt.ylabel(f"Average Violations (over {N_RUNS} runs)")
plt.title("Agent Performance: Centrality vs. Avg Violations")
plt.legend()
#plt.xscale("log")
#plt.yscale("log")
# NOTE(review): the lower limit of 10 hides any agent averaging fewer than
# 10 violations - confirm this is intended.
# `default=0` keeps the cell from crashing when there is no data to plot.
plt.ylim(10, max(y_vals, default=0) + 10)
plt.grid(True, alpha=0.3)
#plt.savefig("output/violations-centr-email.png")
plt.show()

## Saving data to csv

In [None]:
import pandas as pd
import numpy as np

# 1. Assemble one summary record per agent.
# `agent_sums`, `agent_centralities` and `N_RUNS` are produced by the simulation cell.
ids = sorted(agent_sums.keys())


def _summary_row(agent_id):
    """Return the export record (a dict) describing one agent's violations."""
    per_run = agent_sums[agent_id]
    record = {
        "Agent_ID": agent_id,
        "Centrality": agent_centralities[agent_id],
        "Average_Violations": np.mean(per_run),
        "Std_Dev_Violations": np.std(per_run),  # spread across runs (np.std defaults)
        "Min_Violations": np.min(per_run),
        "Max_Violations": np.max(per_run),
        "Num_Runs": N_RUNS,
    }
    # OPTIONAL: to write every single run's value into the file, add
    #     for i, val in enumerate(per_run):
    #         record[f"Run_{i+1}_Violations"] = val
    return record


export_rows = [_summary_row(agent_id) for agent_id in ids]

# 2. Turn the records into a table
df = pd.DataFrame(export_rows)

# 3. Write the table to disk
filename = "agent_performance_summary.csv"
#filename = "agent_performance_summary_eigenvector.csv"

df.to_csv(filename, index=False)

print(f"✅ Successfully saved data for {len(ids)} agents to '{filename}'")
print(df.head())  # quick visual check of the first rows


## Matching nodes between graph and datasets

In [None]:
import pandas as pd
import networkx as nx
import datetime

# 1. Read the communication log that carries the real identities.
# The file is ';'-separated; per the original note it is expected to hold
# the columns Sender;Recipient;EventDate.
csv_file = "data/communication.csv"  # Replace with your actual file name
df = pd.read_csv(csv_file, sep=';')

# Turn 'EventDate' into an integer timestamp so it can be compared with the
# 'time' attribute of the XML edges: parse to datetime, view as int64 and
# integer-divide by 10**9 (intended to yield Unix seconds).
event_times = pd.to_datetime(df['EventDate'])
df['Timestamp'] = event_times.astype('int64') // 10**9


print("CSV Data loaded:")
print(df.head())

# 2. Read the GraphML/XML network and print its first few edges
graph_file = "data/EmailManufacturing-copy.xml"  # Replace with your actual file name
G = nx.read_graphml(graph_file)
for edge_no, (u, v, data) in enumerate(G.edges(data=True)):
    print(u,"\t",v,"\t",data["time"])
    # Stop once the edge with index 6 has been printed
    if edge_no > 5:
        break

In [None]:
# Flatten the graph's edges into a table with one row per edge:
# 'is' = source node, 'js' = target node, 'time' = the edge's "time" attribute.
js = []
print(len(G.edges()))

edge_records = list(G.edges(data=True))
xml_data = {
    "is": [source for source, _, _ in edge_records],
    "js": [target for _, target, _ in edge_records],
    "time": [attrs["time"] for _, _, attrs in edge_records],
}
# Separate list holding the same edge times
times = list(xml_data["time"])

df_graph = pd.DataFrame(xml_data)
df_graph

In [None]:
# Order the graph edges by time, then by sender
df_graph1 = df_graph.sort_values(by=["time", "is"])

# Graph side: the distinct senders seen at each timestamp
g_senders = (
    df_graph1.groupby('time')['is']
    .unique()
    .reset_index()
    .rename(columns={'is': 'graph_sender_list'})
)

# CSV side: the distinct senders seen at each timestamp
c_senders = (
    df.groupby('Timestamp')['Sender']
    .unique()
    .reset_index()
    .rename(columns={'Timestamp': 'time', 'Sender': 'csv_sender_list'})
)

# Keep only the timestamps that occur in both sources
merged = pd.merge(g_senders, c_senders, on='time', how='inner')
merged

In [None]:
# Graph node -> real sender ID, derived from timestamps present in both sources
node_map = {}

timestamp_rows = zip(
    merged['time'], merged['graph_sender_list'], merged['csv_sender_list']
)
for time_key, g_list, c_list in timestamp_rows:
    # Case 1: exactly one sender on each side at this timestamp -> direct match
    if len(g_list) == 1 and len(c_list) == 1:
        node_map[g_list[0]] = c_list[0]
        continue

    # Nothing to disambiguate unless the graph shows several senders
    if len(g_list) <= 1:
        continue

    # Case 2: several graph senders share the timestamp. Tell them apart by
    # how many messages each one sent at that exact time.
    g_counts = df_graph.loc[df_graph['time'] == time_key, 'is'].value_counts()
    c_counts = df.loc[df['Timestamp'] == time_key, 'Sender'].value_counts()

    for g_node in g_list:
        # CSV senders with the same message count as this graph node
        same_count = c_counts[c_counts == g_counts[g_node]].index.tolist()
        # Accept only an unambiguous candidate
        if len(same_count) == 1:
            node_map[g_node] = same_count[0]

# 3. Report how many graph nodes were mapped
print(f"Mapped {len(node_map)} unique sender nodes.") #167 nodes

In [None]:
# Lay the mapping out as a two-column table (graph node, real ID)
mapping_df = pd.DataFrame({
    'Graph_Node': list(node_map.keys()),
    'Real_ID': list(node_map.values()),
})
print(mapping_df.head())

# Persist the sender mapping
mapping_df.to_csv("node_identity_mapping.csv", index=False)

# 4. Verify Mapping
# Copy of the edge table with an extra column holding the mapped sender
# (NaN where the sender is not in node_map)
df_graph_mapped = df_graph.assign(Real_Sender=df_graph['is'].map(node_map))

print("\nVerification (First 5 rows):")
print(df_graph_mapped.head())

## Mapped 154 unique sender nodes, but nodes that only receive are still unmapped

In [None]:
import pandas as pd

# 1. Inputs from the previous cells
# df_graph columns: ['is', 'js', 'time']
# df columns used here: ['Sender', 'Recipient', 'Timestamp']
# node_map: graph node -> real ID found so far


def _receiver_fingerprints(events, receiver_col, sender_col, time_col, wanted):
    """Map each receiver in `wanted` to the sorted tuple of (sender, time) pairs it received."""
    selected = events[events[receiver_col].isin(wanted)]
    return {
        receiver: tuple(sorted(zip(rows[sender_col], rows[time_col])))
        for receiver, rows in selected.groupby(receiver_col)
    }


# Attach the already-known real sender ID to every graph edge
df_graph['mapped_sender'] = df_graph['is'].map(node_map)

# 2. Work out which graph nodes still lack a mapping
all_graph_nodes = set(df_graph['is']) | set(df_graph['js'])
mapped_nodes = set(node_map.keys())
unmapped_nodes = all_graph_nodes - mapped_nodes

print(f"Total Nodes: {len(all_graph_nodes)}")
print(f"Already Mapped: {len(mapped_nodes)}")
print(f"Remaining Unmapped: {len(unmapped_nodes)}")

# 3. "Receiver fingerprints": for each node, the (sender, timestamp) events it received.

# Graph side, expressed with the mapped (real) sender IDs
graph_fingerprints = _receiver_fingerprints(
    df_graph, 'js', 'mapped_sender', 'time', unmapped_nodes
)

# CSV side, restricted to real IDs that no graph node maps to yet
mapped_real_ids = set(node_map.values())
all_real_ids = set(df['Sender']) | set(df['Recipient'])
unmapped_real_ids = all_real_ids - mapped_real_ids

csv_fingerprints = _receiver_fingerprints(
    df, 'Recipient', 'Sender', 'Timestamp', unmapped_real_ids
)

# 4. Pair every graph node with the first CSV node whose fingerprint is identical
# (a linear scan per node; the original note says only about 13 nodes remain)
new_mappings = 0
for g_node, g_fp in graph_fingerprints.items():
    match = next(
        (c_node for c_node, c_fp in csv_fingerprints.items() if g_fp == c_fp),
        None,
    )
    if match is not None:
        node_map[g_node] = match
        new_mappings += 1

print(f"\nSuccessfully mapped {new_mappings} pure receiver nodes.")
print(f"Total Mapped: {len(node_map)}")

# 5. Save the complete mapping (senders plus matched receivers)
full_mapping_df = pd.DataFrame(list(node_map.items()), columns=['Graph_Node', 'Real_ID'])
full_mapping_df.to_csv("full_node_mapping.csv", index=False)
print("Full mapping saved to 'full_node_mapping.csv'")


## Finding node mapping of xml_ids to Agent_ID in ABM

In [None]:
# Path of the GraphML export. It is kept in a variable because the message
# below reports it (the original referenced an undefined `filepath`).
filepath = "data/EmailManufacturing-copy.xml"
loaded_graph = nx.read_graphml(filepath)
print(f"Loaded graph from {filepath}")
print(f"  Nodes: {loaded_graph.number_of_nodes()}")
print(f"  Edges: {loaded_graph.number_of_edges()}")
print(f"  Type: {'Directed' if loaded_graph.is_directed() else 'Undirected'}")

# Map every XML node ID to its position in the graph's node order.
# NOTE(review): presumably the ABM assigns Agent_IDs in this same order -
# confirm against the model's dataset loader.
original_node_ids = list(loaded_graph.nodes())
node_mapping = {orig_id: i for i, orig_id in enumerate(original_node_ids)}

### Validating mapping results

In [None]:
# Validate node_map by translating the graph's edges into real IDs and
# comparing them with the events of the CSV log (`df`).
print("--- Starting Validation ---")

# 1. Translate the Graph DataFrame
df_translated = df_graph.copy()
df_translated['Sender'] = df_translated['is'].map(node_map)
df_translated['Recipient'] = df_translated['js'].map(node_map)
df_translated['Timestamp'] = df_translated['time']

# Drop edges whose endpoints are not in the map (should be none if fully mapped)
unmapped_edges = df_translated[df_translated['Sender'].isna() | df_translated['Recipient'].isna()]
if len(unmapped_edges) > 0:
    print(f"⚠️ WARNING: {len(unmapped_edges)} edges could not be translated (nodes missing from map).")
    df_translated = df_translated.dropna(subset=['Sender', 'Recipient'])

# Ensure types match for comparison (int vs int)
df_translated['Sender'] = df_translated['Sender'].astype(int)
df_translated['Recipient'] = df_translated['Recipient'].astype(int)

# --- Check 1: Set Comparison (Exact Match of Events) ---
# Each event is a (Sender, Recipient, Time) tuple
graph_events = set(zip(df_translated['Sender'], df_translated['Recipient'], df_translated['Timestamp']))
csv_events = set(zip(df['Sender'], df['Recipient'], df['Timestamp']))

common = graph_events.intersection(csv_events)
missing_in_csv = graph_events - csv_events
extra_in_csv = csv_events - graph_events

print(f"\n1. Event Match Validation")
print(f"   Total Graph Events: {len(graph_events)}")
print(f"   Total CSV Events:   {len(csv_events)}")
print(f"   ✅ Exact Matches:    {len(common)}")
print(f"   ❌ Missing in CSV:   {len(missing_in_csv)}")
print(f"   ❌ Extra in CSV:     {len(extra_in_csv)}")

# Guard against an empty graph-event set (e.g. nothing could be translated),
# which would otherwise raise ZeroDivisionError.
if graph_events:
    match_rate = len(common) / len(graph_events) * 100
    print(f"   -> Match Accuracy: {match_rate:.2f}%")
else:
    print("   -> Match Accuracy: n/a (no graph events to compare)")

# --- Check 2: Node Activity Consistency ---
# Compare the number of messages sent per node
def get_activity(df, role_col):
    """Return how often each value of `role_col` occurs in `df`, sorted by value.

    The `df` parameter is the frame passed by the caller; inside this function
    it shadows the notebook-level `df`.
    """
    return df[role_col].value_counts().sort_index()

# Senders
g_counts = get_activity(df_translated, 'Sender')
c_counts = get_activity(df, 'Sender')
# Building the frame from both Series aligns their indexes; a node missing on
# one side becomes NaN and is then counted as 0.
diff_s = pd.DataFrame({'Graph_Count': g_counts, 'CSV_Count': c_counts}).fillna(0)
diff_s['Diff'] = diff_s['Graph_Count'] - diff_s['CSV_Count']

print(f"\n2. Node Activity Consistency")
mismatched_senders = diff_s[diff_s['Diff'] != 0]
if len(mismatched_senders) == 0:
    print("   ✅ PERFECT: All nodes send the exact same number of messages.")
else:
    print(f"   ⚠️ MISMATCH: {len(mismatched_senders)} nodes have different sent counts.")
    print(mismatched_senders.head())

# --- Check 3: Spot Check Specific Discrepancies ---
if len(missing_in_csv) > 0:
    print("\n3. Sample Discrepancy (Graph event not found in CSV):")
    sample = next(iter(missing_in_csv))
    print(f"   Graph says: {sample}")
    # Look for CSV events between the same pair regardless of time.
    # (The original indexed an undefined name `csv` here, raising NameError.)
    near_matches = df[
        (df['Sender'] == sample[0]) &
        (df['Recipient'] == sample[1])
    ]
    if not near_matches.empty:
        print("   Closest CSV match found:")
        print(near_matches)
    else:
        print("   No similar event found in CSV.")

print("\n--- Validation Complete ---")


# Finding hierarchies

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from io import StringIO

# --- 1. Load Hierarchy Data ---
csv_data = """ID;ReportsToID
1;152
2;132
3;152
4;technical email account - not used by employees
5;36
6;163
7;86
8;85
9;85
10;technical email account - not used by employees
11;148
12;104
13;36
14;163
15;31
16;39
17;70
18;163
19;136
20;27
21;technical email account - not used by employees
22;104
23;technical email account - not used by employees
24;technical email account - not used by employees
25;104
26;technical email account - not used by employees
27;86
28;39
29;76
30;77
31;143
32;141
33;76
34;121
35;90
36;86
37;118
38;141
39;136
40;70
41;90
42;70
43;123
44;141
45;77
46;technical email account - not used by employees
47;27
48;154
49;39
50;152
51;former employee account
52;118
53;85
54;141
55;141
56;45
57;27
58;141
59;47
60;156
61;104
62;141
63;104
64;154
65;27
66;70
67;85
68;70
69;86
70;86
71;163
72;27
73;136
74;90
75;former employee account
76;69
77;76
78;104
79;148
80;148
81;39
82;121
83;36
84;159
85;86
86;86
87;former employee account
88;154
89;70
90;69
91;29
92;33
93;former employee account
94;29
95;39
96;36
97;104
98;104
99;136
100;36
101;121
102;36
103;85
104;86
105;104
106;36
107;159
108;29
109;154
110;163
111;former employee account
112;136
113;104
114;104
115;123
116;39
117;143
118;104
119;90
120;85
121;86
122;124
123;136
124;143
125;141
126;47
127;90
128;85
129;90
130;70
131;159
132;152
133;141
134;124
135;36
136;69
137;69
138;148
139;former employee account
140;39
141;136
142;143
143;69
144;19
145;76
146;141
147;33
148;86
149;19
150;162
151;104
152;69
153;69
154;90
155;36
156;86
157;39
158;129
159;136
160;148
161;148
162;69
163;86
164;137
165;29
166;159
167;141
"""
# Parse the inline ';'-separated hierarchy table
df_hier = pd.read_csv(StringIO(csv_data), sep=';')

# Keep only rows whose 'ReportsToID' is numeric; this drops the rows holding
# text such as "technical email account ..." or "former employee account"
has_numeric_manager = pd.to_numeric(df_hier['ReportsToID'], errors='coerce').notnull()
df_clean = df_hier[has_numeric_manager].copy()
df_clean['ReportsToID'] = df_clean['ReportsToID'].astype(int)

H = nx.DiGraph()

# Register every employee and manager as a node before adding any edge
all_nodes = set(df_clean['ID']) | set(df_clean['ReportsToID'])
H.add_nodes_from(all_nodes)

# One edge per reporting line, directed Manager -> Employee
for employee, manager in df_clean[['ID', 'ReportsToID']].to_numpy():
    if employee == manager:
        continue  # a row that reports to itself adds no edge
    H.add_edge(manager, employee)

# Level = number of reporting steps away from node 86 (treated as the CEO)
try:
    levels = nx.single_source_shortest_path_length(H, 86)
except nx.NodeNotFound:
    print("❌ Error: CEO node (86) not found in hierarchy graph.")
    levels = {}

# Table of (real employee ID, hierarchy level)
level_df = pd.DataFrame({
    'Real_ID': list(levels.keys()),
    'Hierarchy_Level': list(levels.values()),
})
level_df['Real_ID'] = level_df['Real_ID'].astype(int)

print(f"✅ Hierarchy Levels computed for {len(level_df)} employees.")


# --- 2. Load Simulation Results & Apply Mapping ---
# Any failure in this step is reported and leaves df_sim_mapped empty, which
# makes the analysis step skip itself.
try:
    df_sim = pd.read_csv("agent_performance_summary.csv")
    df_map = pd.read_csv("full_node_mapping.csv")

    # Look at one key from each file to see how the IDs are written
    map_key_example = df_map['Graph_Node'].iloc[0]  # likely 'n0'
    sim_key_example = df_sim['Agent_ID'].iloc[0]    # likely 0 (int)

    print(f"Mapping Key Type: {type(map_key_example)} (Example: {map_key_example})")
    print(f"Sim Key Type:     {type(sim_key_example)} (Example: {sim_key_example})")

    # Give the simulation table a 'Graph_Node' column in the mapping file's format
    uses_n_prefix = isinstance(map_key_example, str) and map_key_example.startswith('n')
    if not uses_n_prefix:
        # No 'n' prefix: only make the dtypes agree
        print("-> Converting Simulation IDs to match Mapping type...")
        df_sim['Graph_Node'] = df_sim['Agent_ID'].astype(df_map['Graph_Node'].dtype)
    else:
        # Mapping keys look like 'n0', 'n1', ...: prefix the agent ID with 'n'
        print("-> Converting Simulation IDs to 'nX' format...")
        df_sim['Graph_Node'] = 'n' + df_sim['Agent_ID'].astype(str)

    # Show both key dtypes before joining
    print(f"Sim 'Graph_Node' dtype: {df_sim['Graph_Node'].dtype}")
    print(f"Map 'Graph_Node' dtype: {df_map['Graph_Node'].dtype}")

    # Join: each simulated agent gains its real ID
    df_sim_mapped = pd.merge(df_sim, df_map, on='Graph_Node', how='inner')

    print(f"✅ Successfully mapped {len(df_sim_mapped)} agents.")
    print(df_sim_mapped[['Graph_Node', 'Real_ID', 'Average_Violations']].head())

except Exception as e:
    print(f"❌ Error during mapping: {e}")
    df_sim_mapped = pd.DataFrame()

# --- 3. Merge with Hierarchy Data ---
# Joins the mapped simulation results with the hierarchy levels, prints
# per-level statistics, draws a boxplot and saves the merged table.
if not df_sim_mapped.empty:
    # Merge on Real_ID (e.g., 86, 17, 36...)
    merged_df = pd.merge(df_sim_mapped, level_df, on='Real_ID', how='inner')

    if merged_df.empty:
        print("⚠️ Merge result empty! IDs did not match between Mapping and Hierarchy.")
    else:
        # --- 4. Analysis ---
        print("\n--- Performance by Hierarchy Level ---")
        level_stats = merged_df.groupby('Hierarchy_Level')['Average_Violations'].describe()[['count', 'mean', 'min', 'max']]
        print(level_stats)

        print("\n--- Top 5 Best Performing Agents (Least Violations) ---")
        top_agents = merged_df.sort_values('Average_Violations').head(5)
        print(top_agents[['Real_ID', 'Hierarchy_Level', 'Average_Violations']])

        # --- 5. Visualization ---
        # DataFrame.boxplot(by=...) opens its own figure unless it is given an
        # axes, which left the original plt.figure(...) empty and ignored its
        # size. Create the axes explicitly and hand it over.
        fig, ax = plt.subplots(figsize=(10, 6))
        merged_df.boxplot(column='Average_Violations', by='Hierarchy_Level', grid=False, patch_artist=True, ax=ax)
        ax.set_title('Violations Distribution by Hierarchy Level')
        fig.suptitle('')  # remove the automatic "Boxplot grouped by ..." title
        ax.set_xlabel('Hierarchy Level (0 = CEO)')
        ax.set_ylabel('Average Violations')
        plt.show()

        # Save final report
        merged_df.to_csv("hierarchy_performance_analysis_mapped.csv", index=False)
        print("\n✅ Analysis saved to 'hierarchy_performance_analysis_mapped.csv'")

else:
    print("Skipping analysis due to missing data.")


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from io import StringIO

# --- 1. Load & Build Hierarchy Graph ---
# `csv_data` is the inline hierarchy table defined in the earlier hierarchy cell.
# OR load from file:
# df_hier = pd.read_csv("reports_to.csv", sep=';')
df_hier = pd.read_csv(StringIO(csv_data), sep=';')

# Keep only rows whose 'ReportsToID' is numeric
df_clean = df_hier[pd.to_numeric(df_hier['ReportsToID'], errors='coerce').notnull()].copy()
df_clean['ReportsToID'] = df_clean['ReportsToID'].astype(int)

# Build Graph
H = nx.DiGraph()
# Add all nodes explicitly first so the CEO (86) exists even without edges
all_nodes = set(df_clean['ID']).union(set(df_clean['ReportsToID']))
H.add_nodes_from(all_nodes)

for _, row in df_clean.iterrows():
    if row['ID'] != row['ReportsToID']:
        H.add_edge(row['ReportsToID'], row['ID'])  # Edge: Manager -> Employee

# Compute Levels (Distance from CEO #86)
try:
    levels = nx.single_source_shortest_path_length(H, source=86)
except nx.NodeNotFound as e:
    print(f"Error calculating levels: {e}")
    levels = {}

# The ID column keeps its historical name 'Agent_ID' because a later cell reads
# it under that name, but its values are hierarchy (real employee) IDs.
# Typed columns keep the table mergeable even when no level could be computed.
level_df = pd.DataFrame({
    'Agent_ID': pd.Series(list(levels.keys()), dtype='int64'),
    'Hierarchy_Level': pd.Series(list(levels.values()), dtype='int64'),
})

# --- 2. Load Simulation Centrality Data ---
try:
    # Load the summary file from the previous step
    df_sim = pd.read_csv("agent_performance_summary.csv")
except FileNotFoundError:
    print("⚠️ 'agent_performance_summary.csv' not found. Generating dummy data for demo.")
    # Dummy data generator for testing
    import numpy as np
    ids = list(levels.keys())
    df_sim = pd.DataFrame({
        'Agent_ID': ids,
        'Centrality': np.random.rand(len(ids))  # Random centrality
    })
    # Dummy rows are generated directly from hierarchy IDs
    df_sim['Real_ID'] = df_sim['Agent_ID']
else:
    # Simulation Agent_IDs and hierarchy IDs are different ID spaces. Translate
    # each agent to its real employee ID through the saved node mapping
    # (graph node "n<Agent_ID>" -> Real_ID) before joining with the hierarchy.
    try:
        df_map = pd.read_csv("full_node_mapping.csv")
    except FileNotFoundError:
        print("⚠️ 'full_node_mapping.csv' not found. Simulation agents cannot be matched to employees.")
        df_sim['Real_ID'] = float('nan')
    else:
        real_id_by_node = dict(zip(df_map['Graph_Node'].astype(str), df_map['Real_ID']))
        df_sim['Real_ID'] = ('n' + df_sim['Agent_ID'].astype(str)).map(real_id_by_node)

# --- 3. Merge Data ---
# Inner merge on the real employee ID: only agents that have a mapping and a
# hierarchy level are analysed.
df_sim_known = df_sim.dropna(subset=['Real_ID']).astype({'Real_ID': 'int64'})
merged_df = pd.merge(
    df_sim_known,
    level_df.rename(columns={'Agent_ID': 'Real_ID'}),
    on='Real_ID',
    how='inner',
)

if merged_df.empty:
    print("⚠️ Warning: Merge resulted in empty data. Check that the mapped 'Real_ID's match 'ID's in hierarchy.")
else:
    # --- 4. Statistical Analysis ---
    print("\n--- Degree Centrality by Hierarchy Level ---")
    # Count, mean, spread and maximum of centrality per level
    stats = merged_df.groupby('Hierarchy_Level')['Centrality'].describe()[['count', 'mean', 'std', 'max']]
    print(stats)

    # Find the "Informal Leaders" (High centrality but low rank/high level number)
    print("\n--- Top 5 Most Central Agents ---")
    top_central = merged_df.sort_values('Centrality', ascending=False).head(5)
    print(top_central[['Agent_ID', 'Real_ID', 'Hierarchy_Level', 'Centrality']])

    # --- 5. Visualization ---
    # DataFrame.boxplot(by=...) opens its own figure unless it is given an
    # axes, so create the axes explicitly and pass it in.
    fig, ax = plt.subplots(figsize=(10, 6))
    merged_df.boxplot(column='Centrality', by='Hierarchy_Level', grid=False, patch_artist=True, ax=ax)

    ax.set_title('Communication Centrality vs. Corporate Hierarchy')
    fig.suptitle('')  # Removes default pandas subtitle
    ax.set_xlabel('Hierarchy Level (0 = CEO, Higher = Lower Rank)')
    ax.set_ylabel('Degree Centrality (Simulation)')

    plt.show()

    # Save results
    merged_df.to_csv("hierarchy_centrality_analysis.csv", index=False)
    print("\n✅ Analysis saved to 'hierarchy_centrality_analysis.csv'")


# Visualising Company Network

In [None]:
# Draw the hierarchy graph H with node labels, positioned by Graphviz "twopi"
plt.figure(figsize=(16, 16))
layout_args = '-Granksep=0.25 -Gnodesep=3.0'
pos = nx.nx_agraph.graphviz_layout(H, prog="twopi", args=layout_args)
nx.draw(H, pos=pos, with_labels=True, font_size=16)

In [None]:

# Top-to-bottom ("dot") drawing of the hierarchy graph H with node size
# growing with node degree.
plt.figure(figsize=(16, 16))
d = dict(nx.degree(H))
# Ensure minimum node size so small nodes don't vanish
node_sizes = [max(300, v * 200) for v in d.values()]
pos = nx.nx_agraph.graphviz_layout(H, prog="dot", args='-Grankdir=TB -Granksep=2.0 -Gnodesep=1.0')
#pos=nx.nx_agraph.graphviz_layout(H,prog="dot",args='-Granksep=0.25 -Gnodesep=3.0')
# Pass the floored sizes computed above. The original computed them but then
# drew with an unfloored `v * 150`, so the minimum size never took effect.
nx.draw(H, with_labels=True, pos=pos, font_size=16, node_size=node_sizes)



In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Layered drawing of the hierarchy graph H (built in the hierarchy cells)
plt.figure(figsize=(12, 12))

# 1. Node positions
# Graphviz is asked for a "twopi" layout; if the Graphviz binding cannot be
# imported, fall back to networkx's Kamada-Kawai layout.
try:
    pos = nx.nx_agraph.graphviz_layout(H, prog="twopi", args='-Grankdir=TB -Granksep=2.0 -Gnodesep=1.0')
except ImportError:
    pos = nx.kamada_kawai_layout(H)

# 2. Node sizes grow with degree, with a floor of 300 so small nodes stay visible
degree_by_node = dict(nx.degree(H))
node_sizes = [max(300, degree * 200) for degree in degree_by_node.values()]

# 3. Draw the picture in three layers

# Layer 1: light-grey, slightly curved, arrowed edges
nx.draw_networkx_edges(
    H,
    pos,
    edge_color='#AAAAAA',
    width=1.0,
    arrows=True,
    arrowsize=15,
    connectionstyle="arc3,rad=0.1",
)

# Layer 2: nodes in a single colour with a white outline
node_collection = nx.draw_networkx_nodes(
    H, pos, node_size=node_sizes, node_color='#40a6d1', alpha=0.9
)
node_collection.set_edgecolor('white')

# Layer 3: labels, one per node, on a translucent white box.
# The label dictionary includes every node; filter it here to hide some.
node_labels = {node: str(node) for node in H.nodes()}
label_box = dict(facecolor='white', edgecolor='none', alpha=0.7, boxstyle='round,pad=0.2')
nx.draw_networkx_labels(H, pos, node_labels, font_size=10, font_weight='bold',
                        bbox=label_box)

plt.title("Company Hierarchy", fontsize=20)
plt.axis('off')  # hide the axes frame
plt.tight_layout()
plt.show()


# Compiling Results

In [None]:
# Reload the per-agent summary table from disk
summary_path = "agent_performance_summary.csv"
df_sim = pd.read_csv(summary_path)


## Adding xml_id

In [None]:
# Reload the summary and add the XML node ID of each agent: "n" + Agent_ID
df_sim = pd.read_csv("agent_performance_summary.csv")
xml_ids = [f"n{agent_id}" for agent_id in df_sim["Agent_ID"]]
df_sim = df_sim.assign(xml_id=pd.Series(xml_ids).values)

## Adding Real_id mapped

In [None]:
# Add the real employee ID of every agent, looked up through its XML node ID
# in `full_mapping_df` (columns 'Graph_Node' and 'Real_ID').
# One dictionary built up front replaces a scan of the whole mapping table per
# agent; if a node were listed more than once, its first row is used.
unique_nodes = full_mapping_df.drop_duplicates(subset="Graph_Node", keep="first")
real_id_by_node = dict(zip(unique_nodes["Graph_Node"], unique_nodes["Real_ID"]))
real_ids = df_sim["xml_id"].map(real_id_by_node)

# An agent without a mapping used to abort the cell with an IndexError;
# it now gets NaN and is reported instead.
unmapped_count = int(real_ids.isna().sum())
if unmapped_count:
    print(f"⚠️ WARNING: {unmapped_count} agents have no entry in the node mapping; their Real_id is NaN.")

df_sim = df_sim.assign(Real_id=real_ids.values)


## Adding Hierarchy levels

In [None]:
# Hierarchy level of every simulated agent, or -1 when none is known.
# The levels in `level_df` are keyed by real employee ID, so the lookup must go
# through df_sim's 'Real_id' column. The simulation's Agent_ID is a different
# ID space and was matching agents to the wrong employees.
# `level_df` names its ID column 'Real_ID' or 'Agent_ID' depending on which
# hierarchy cell ran last; both hold employee IDs.
id_column = "Real_ID" if "Real_ID" in level_df.columns else "Agent_ID"
level_by_real_id = dict(zip(level_df[id_column], level_df["Hierarchy_Level"]))

levels = []
for real_id in df_sim["Real_id"]:
    # Agents with no mapping (NaN) or outside the hierarchy get -1
    level = level_by_real_id.get(real_id)
    levels.append(-1 if level is None else int(level))
levels


In [None]:
# Attach the hierarchy levels computed above as the 'level' column
df_sim = df_sim.assign(level=pd.Series(levels).values)

# The plotting cells below read `sorted_sim`, which no cell defined, so they
# failed with a NameError. Provide it here.
# NOTE(review): ordering by 'level' is an assumption - confirm the sort key.
sorted_sim = df_sim.sort_values("level")

# Plotting publication plots

In [None]:

# Per-level centrality statistics and boxplot for `sorted_sim`.
# One name is used for both the printed heading and the axis label (they
# disagreed before: "Eigenvector" vs "Degree"). The summary CSV loaded above
# holds degree centrality unless the eigenvector variant was exported instead.
centrality_name = "Degree"

print(f"\n--- {centrality_name} Centrality by Hierarchy Level ---")
# Count, mean, spread and maximum of centrality per level
stats = sorted_sim.groupby('level')['Centrality'].describe()[['count', 'mean', 'std', 'max']]
print(stats)

# Find the "Informal Leaders" (High centrality but low rank/high level number)
print("\n--- Top 5 Most Central Agents ---")
top_central = sorted_sim.sort_values('Centrality', ascending=False).head(5)
print(top_central[['Agent_ID', 'level', 'Centrality']])

# --- 5. Visualization ---
# DataFrame.boxplot(by=...) opens its own figure unless it is given an axes,
# which left the original plt.figure(...) empty and ignored its size.
fig, ax = plt.subplots(figsize=(10, 6))
sorted_sim.boxplot(column='Centrality', by='level', grid=False, patch_artist=False, ax=ax)

ax.set_title('Centrality vs. Corporate Hierarchy')
fig.suptitle('')  # Removes default pandas subtitle
ax.set_xlabel('Hierarchy Level (0 = CEO)')
ax.set_ylabel(f'{centrality_name} Centrality (Simulation)')

plt.show()

In [None]:
# --- 5. Visualization ---
# Boxplot of average violations per hierarchy level for `sorted_sim`.
# DataFrame.boxplot(by=...) opens its own figure unless it is given an axes,
# which left the original plt.figure(...) empty and ignored its size.
fig, ax = plt.subplots(figsize=(10, 6))

sorted_sim.boxplot(column='Average_Violations', by='level', grid=False, patch_artist=False, ax=ax)

ax.set_title('Violations vs. Corporate Hierarchy')
fig.suptitle('')  # Removes default pandas subtitle
ax.set_xlabel('Hierarchy Level (0 = CEO)')
ax.set_ylabel('Average Violations (Simulation)')

plt.show()

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np

# Publication scatter: centrality vs. average violations, with one colour and
# marker per hierarchy level of `sorted_sim`.
plt.rcParams.update({'font.size': 16})
plt.figure(figsize=(10, 7))

# Levels present in the data, in ascending order
unique_levels = sorted(sorted_sim["level"].unique())

# Ten-colour qualitative palette, cycled by level index
cmap = plt.get_cmap("tab10")

# Markers cycled by level index
# 'o': circle, 's': square, '^': triangle_up, 'D': diamond, 'v': triangle_down, 'X': x, 'P': plus
markers = ['o', 's', '^', 'D', 'v', 'X', 'P', '*', 'h', '<']

for i, level in enumerate(unique_levels):
    # Rows belonging to this level
    subset = sorted_sim[sorted_sim["level"] == level]

    # Legend text
    label_text = f"Level {level}"
    if level == 0:
        label_text = "Level 0 (CEO)"
    if level == -1:
        label_text = "Level -1 (No Report)"

    # The CEO level gets a large star; every other level cycles the marker list
    if level == 0:
        current_marker = '*'
        current_size = 200
    else:
        current_marker = markers[i % len(markers)]
        current_size = 60

    plt.scatter(
        subset.Centrality,
        subset.Average_Violations,
        s=current_size,
        alpha=0.5,
        label=label_text,
        color=cmap(i % 10),
        marker=current_marker
    )

# The simulation cell computes normalised in-degree centrality unless its
# eigenvector line is uncommented, so the axis is labelled to match that
# default. Change this label when switching the centrality measure.
plt.xlabel("Normalized In-Degree Centrality")
plt.ylabel(f"Average Violations (over {N_RUNS} runs)")
plt.title("Agent Performance: Centrality vs. Avg Violations")

# Place legend
plt.legend(title="Hierarchy Level", bbox_to_anchor=(1.05, 1), loc='best')

plt.grid(True, alpha=0.3)
plt.tight_layout()

# savefig does not create missing directories; make sure the target exists
output_path = "output/violations-centr-email.png"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path)
plt.show()


## Correlation between centralities

In [None]:
# Load the two analysis tables (the file names suggest a degree and an
# eigenvector variant) and correlate their 'Centrality' columns.
# Rows are paired by position.
df_sim_deg = pd.read_csv("Emails_Analysis.csv")
df_sim_eigen = pd.read_csv("Emails_Analysis_eigenvector.csv")
deg_centrality = df_sim_deg["Centrality"]
eigen_centrality = df_sim_eigen["Centrality"]
np.corrcoef(deg_centrality, eigen_centrality)

In [None]:
# Scatter of one table's 'Centrality' against the other's (rows paired by position)
plt.scatter(x=df_sim_deg["Centrality"], y=df_sim_eigen["Centrality"])