In [12]:
import json
import pandas as pd
import os
import re

# === CONFIGURATION ===
# Folders containing the per-game JSON result files, one folder per setup.
HOMOG_FOLDER = "./nomiclaw/nomiclaw_homogeneous"
HET_G_FOLDER = "./nomiclaw/nomiclaw_heterogeneous"

# Agent id -> model name used in the heterogeneous setup (one model per agent).
HET_MODEL_MAP = {
    "agent_1": "phi4",
    "agent_2": "gemma3",
    "agent_3": "llama3",
    "agent_4": "qwen3",
    "agent_5": "phi4-reasoning",
    "agent_6": "phi4-mini-reasoning",
    "agent_7": "granite3.3",
    "agent_8": "gemma2",
    "agent_9": "deepseek-r1",
    "agent_10": "llama2",
}

# Filename patterns.
# Heterogeneous: group 1 = run number, group 2 = vignette number.
het_pattern = re.compile(r'run(\d+)_vignette(\d+)_results\.json')
# Homogeneous: group 1 = model name, group 2 = vignette number.
homo_pattern = re.compile(r'homogeneous_(.+?)_vignette(\d+)_results\.json')

def parse_rule(rule_text, reasoning_text):
    """Return the proposal fields as a ``(rule, reasoning)`` pair.

    Currently a pass-through placeholder: both arguments are handed back
    unchanged, i.e. the stored rule is treated as already parsed and the
    raw reasoning text is used as the fallback.
    """
    proposal_pair = (rule_text, reasoning_text)
    return proposal_pair

def parse_vote(vote_id, reasoning_text):
    """Return the voting fields as a ``(vote, reasoning)`` pair.

    Currently a pass-through placeholder: both arguments are handed back
    unchanged, i.e. the stored vote is treated as already parsed and the
    raw reasoning text is used as the fallback.
    """
    vote_pair = (vote_id, reasoning_text)
    return vote_pair

def export(folder, setup, model_map=None, out_csv=None, filename_pattern=None):
    """Flatten every game JSON in ``folder`` into one row per agent per round and write a CSV.

    Parameters
    ----------
    folder : str
        Directory containing the ``*.json`` game result files.
    setup : str
        ``"heterogeneous"`` or anything else (treated as homogeneous).
        Determines how the filename is interpreted and how models are
        assigned to agents.
    model_map : dict or None
        Agent id -> model name. Only used for the heterogeneous setup;
        agents missing from the map (or all agents when the map is None)
        keep their agent id as the model label.
    out_csv : str or None
        Destination passed to ``DataFrame.to_csv``.
    filename_pattern : compiled regex or None
        Pattern used to extract metadata from each filename. It must expose
        the same groups as the defaults: (run, vignette) for heterogeneous,
        (model, vignette) for homogeneous. When None, ``het_pattern`` or
        ``homo_pattern`` is chosen from ``setup``.

    JSON files whose name does not match the pattern are skipped with a
    message instead of crashing the whole export.
    """
    is_heterogeneous = setup == "heterogeneous"

    # Honour the caller-supplied pattern; fall back to the module defaults.
    if filename_pattern is not None:
        pattern = filename_pattern
    elif is_heterogeneous:
        pattern = het_pattern
    else:
        pattern = homo_pattern

    records = []
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".json"):
            continue

        # extract metadata
        m = pattern.search(fname)
        if m is None:
            # Stray JSON (e.g. a summary file) -- not a game result.
            print(f"Skipping {fname}: name does not match {pattern.pattern!r}")
            continue

        if is_heterogeneous:
            run_id = int(m.group(1))
            vignette_id = int(m.group(2))
            model_name = None
            agent_models = model_map or {}
        else:  # homogeneous
            run_id = None
            vignette_id = int(m.group(2))
            # model name comes from the filename and applies to every agent
            model_name = m.group(1)
            agent_models = {}

        # Context manager closes the handle even if the JSON is malformed.
        with open(os.path.join(folder, fname), encoding="utf-8") as fh:
            game = json.load(fh)

        for rnd in game:
            round_num = rnd["round_number"]
            winner = rnd.get("winner_id", None)
            votes = [v["to"] for v in rnd["voting_network"]]
            # dtype=object keeps an empty vote list from triggering dtype warnings
            vote_counts = pd.Series(votes, dtype=object).value_counts().to_dict()

            for aid, ad in rnd["agents"].items():
                if is_heterogeneous:
                    model = agent_models.get(aid, aid)
                else:
                    model = model_name
                # parse or fallback
                rule, pr = parse_rule(ad.get("proposed_rule", ""), ad.get("proposal_reasoning", ""))
                v_target, vr = parse_vote(ad.get("vote", ""), ad.get("voting_reasoning", ""))

                records.append({
                    "setup": setup,
                    "run_id": run_id,
                    "vignette_id": vignette_id,
                    "round": round_num,
                    "agent_id": aid,
                    "model": model,
                    "winner_id": winner,
                    "votes_received": vote_counts.get(aid, 0),
                    # proposal fields
                    "parsed_rule": rule,
                    "parsed_reasoning": pr,
                    "raw_proposal_reasoning": ad.get("proposal_reasoning", ""),
                    # vote fields
                    "parsed_vote": v_target,
                    "parsed_voting_reasoning": vr,
                    "raw_voting_reasoning": ad.get("voting_reasoning", ""),
                    # scores
                    "cumulative_score": ad.get("cumulative_score", None),
                    "score_change": ad.get("score_change", None),
                })

    df = pd.DataFrame(records)
    df.to_csv(out_csv, index=False)
    print(f"✅ Exported {len(df)} rows to {out_csv}")

# Run exports: one CSV per experimental setup.
export(
    HOMOG_FOLDER,
    "homogeneous",
    model_map=None,
    out_csv="qualitative_homogeneous_full.csv",
    filename_pattern=homo_pattern,
)

export(
    HET_G_FOLDER,
    "heterogeneous",
    model_map=HET_MODEL_MAP,
    out_csv="qualitative_heterogeneous_full.csv",
    filename_pattern=het_pattern,
)


✅ Exported 1000 rows to qualitative_homogeneous_full.csv
✅ Exported 1200 rows to qualitative_heterogeneous_full.csv


In [13]:
import pandas as pd

# Load qualitative CSVs
het_df = pd.read_csv('./qualitative_heterogeneous_full.csv')
homo_df = pd.read_csv('./qualitative_homogeneous_full.csv')

# Verify row counts
het_rows = len(het_df)
homo_rows = len(homo_df)

# Expected rows
expected_het = 6 * 4 * 5 * 10  # runs * vignettes * rounds * agents
expected_homo = 10 * 4 * 5 * 5  # models * vignettes * rounds * agents

# Entries per model.
# rename_axis + reset_index(name=...) yields columns ['model', 'count'] on both
# pandas 1.x and 2.x; the previous rename() produced two 'count' columns on 2.x.
het_counts = het_df['model'].value_counts().rename_axis('model').reset_index(name='count')
homo_counts = homo_df['model'].value_counts().rename_axis('model').reset_index(name='count')

# Parse failure summary: counts missing (NaN) reasoning cells. Note that
# read_csv turns empty strings into NaN, so blank reasoning counts as a failure.
het_parse_fail = het_df['parsed_reasoning'].isna().sum() + het_df['parsed_voting_reasoning'].isna().sum()
homo_parse_fail = homo_df['parsed_reasoning'].isna().sum() + homo_df['parsed_voting_reasoning'].isna().sum()

# Display (title on its own line so the table header stays aligned)
print("Qualitative Extraction Summary", pd.DataFrame({
    'Dataset': ['Heterogeneous', 'Homogeneous'],
    'Rows Loaded': [het_rows, homo_rows],
    'Expected Rows': [expected_het, expected_homo],
    'Total Parse Failures': [het_parse_fail, homo_parse_fail]
}), sep="\n")

print("Entries per Model (Heterogeneous)", het_counts, sep="\n")
print("Entries per Model (Homogeneous)", homo_counts, sep="\n")


Qualitative Extraction Summary          Dataset  Rows Loaded  Expected Rows  Total Parse Failures
0  Heterogeneous         1200           1200                     0
1    Homogeneous         1000           1000                     0
Entries per Model (Heterogeneous)                  count  count
0                 phi4    120
1               gemma3    120
2               llama3    120
3                qwen3    120
4       phi4-reasoning    120
5  phi4-mini-reasoning    120
6           granite3.3    120
7               gemma2    120
8          deepseek-r1    120
9               llama2    120
Entries per Model (Homogeneous)                  count  count
0          deepseek-r1    100
1               gemma2    100
2               gemma3    100
3           granite3.3    100
4               llama2    100
5               llama3    100
6  phi4-mini-reasoning    100
7       phi4-reasoning    100
8                 phi4    100
9                qwen3    100


In [17]:
import re

import pandas as pd


def _mentions_id(identifier, text):
    """Return True when ``identifier`` appears in ``text`` as a whole token.

    Plain substring matching would count ``agent_1`` as mentioned whenever the
    text contains ``agent_10``; the lookarounds require that the match is not
    embedded in a longer word/number. Non-string or empty identifiers (e.g. NaN
    from a blank CSV cell) never match.
    """
    if not isinstance(identifier, str) or not identifier:
        return False
    token = rf'(?<!\w){re.escape(identifier)}(?!\w)'
    return re.search(token, str(text)) is not None


# 1. Load the qualitative exports
het = pd.read_csv('qualitative_heterogeneous_full.csv')
homo = pd.read_csv('qualitative_homogeneous_full.csv')

# 2. Tag the setup
het['setup']  = 'heterogeneous'
homo['setup'] = 'homogeneous'

# 3. Combine
df = pd.concat([het, homo], ignore_index=True)

# 4. Reconstruct mentions columns
#    - parsed_vote holds the agent_id they voted for (or blank)
#    - raw_voting_reasoning contains the full text
df['mentions_vote_target'] = [
    _mentions_id(target, text)
    for target, text in zip(df['parsed_vote'], df['raw_voting_reasoning'])
]
df['mentions_winner'] = [
    _mentions_id(winner, text)
    for winner, text in zip(df['winner_id'], df['raw_voting_reasoning'])
]

# 5. Aggregate per setup and model
#    total_votes uses 'count', i.e. only rows with non-missing voting reasoning
#    enter the denominator.
grouped = df.groupby(['setup','model']).agg(
    total_votes         = ('raw_voting_reasoning','count'),
    mentioned_vote_target = ('mentions_vote_target','sum'),
    mentioned_winner      = ('mentions_winner','sum')
).reset_index()

# 6. Compute percentages
grouped['Explicitly Supported Peer (%)']    = (grouped['mentioned_vote_target'] / grouped['total_votes'] * 100).round(1)
grouped['Acknowledged Final Winner (%)']   = (grouped['mentioned_winner']    / grouped['total_votes'] * 100).round(1)

# 7. Select and sort for display
result = grouped[[
    'setup','model',
    'Explicitly Supported Peer (%)',
    'Acknowledged Final Winner (%)'
]].sort_values(['setup','Explicitly Supported Peer (%)'], ascending=[True, False])

print(result.to_markdown(index=False))


| setup         | model               |   Explicitly Supported Peer (%) |   Acknowledged Final Winner (%) |
|:--------------|:--------------------|--------------------------------:|--------------------------------:|
| heterogeneous | phi4-reasoning      |                            54.2 |                            43.3 |
| heterogeneous | llama3              |                            52.5 |                            11.7 |
| heterogeneous | granite3.3          |                            29.2 |                            15   |
| heterogeneous | llama2              |                            27.5 |                            19.2 |
| heterogeneous | phi4-mini-reasoning |                            27.5 |                            14.2 |
| heterogeneous | phi4                |                            24.2 |                            10   |
| heterogeneous | deepseek-r1         |                            19.2 |                             9.2 |
| heterogeneous | gemma2    

The influence‐alignment table shows how often each model explicitly cites another agent’s proposal when voting (“Explicitly Supported Peer”) and how often it mentions the eventual round winner in its justification (“Acknowledged Final Winner”), separately for heterogeneous and homogeneous populations.

1. Peer Support Drops in Mixed Populations:

    In the homogeneous condition, llama3 most frequently cites a specific peer (76 %), followed by phi4-reasoning (48 %) and granite3.3 (30 %). This suggests that when surrounded by identical agents, models are more willing to explicitly anchor their vote to another’s rule.

    In the heterogeneous setting, explicit peer‐support rates fall for most models. phi4-reasoning (54.2 %) and llama3 (52.5 %) still lead, but llama3 drops sharply from 76 % to 52.5 %, whereas phi4-reasoning is the exception, rising slightly from 48 % to 54.2 %. Most models show declines of 10–30 percentage points. This indicates that model diversity generally dampens explicit coalition signaling.

2. Winner Acknowledgment Is Selective:

    In the homogeneous condition, most models mention the final winner in 16–27 % of justifications (peaking at 27 % for llama3 and 26 % for phi4-reasoning); low-citing models such as qwen3 (4 %) fall well below that range.

    In the heterogeneous condition, this rate drops by roughly half for most models: llama3 falls from 27 % to 11.7 %, and qwen3 from 4 % to 1.7 %. phi4-reasoning is a notable anomaly, rising from 26 % to 43.3 %. Overall, mixed groups are less likely to explicitly endorse or recognize the winner’s proposal.

3. Model‐Specific Patterns:

    phi4-reasoning bucks the overall trend: its explicit peer support rises from 48 % to 54.2 % and its winner acknowledgment from 26 % to 43.3 %. This suggests phi4-reasoning may adaptively highlight allies even in diverse company.

    llama3 remains highly social in both settings but still shows reduced winner‐mention (27 % → 11.7 %).

    deepseek-r1 and gemma2 fall from roughly 29 % to 19 % in peer support and from roughly 16 % to 9 % in winner acknowledgment, reflecting a moderate but consistent tendency to recognize others.

    Underperformers like gemma3 and qwen3 rarely cite peers or winners (< 11 % in all cases), indicating a self‐focused or inattentive voting rationale.

Overall Insight:
Mixing LLM architectures weakens explicit coalition‐building signals and reduces overt recognition of successful proposals, except for certain models (notably phi4-reasoning) that become even more pronounced in their social reasoning. This aligns with our quantitative findings of lower reciprocity and higher volatility in heterogeneous settings.