In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import random

### Step 1: Load CDI data

In [None]:
# Read the CDI table, then make sure child IDs are strings (they are used as
# merge keys further down the notebook).
_cdi_raw = pd.read_csv('/Users/abbyhultquist/Documents/First Year Project/long_categorization_6.csv')
CDI = _cdi_raw.assign(child_id=_cdi_raw['child_id'].astype(str))

# The first 21 columns are treated as metadata; every column after that is
# treated as a word column.
_first_word_col = 21
metadata_cols = list(CDI.columns[:_first_word_col])
word_cols     = list(CDI.columns[_first_word_col:])

print(f"{len(word_cols)} word columns, {CDI['child_id'].nunique()} children")

680 word columns, 122 children


### Step 2: Load similarity matrix and build known-word lists

In [3]:
# Word-by-word similarity table; the column pandas names "Unnamed: 0"
# (a header-less column in the CSV) supplies the row labels.
similarity_matrix = (
    pd.read_csv("/Users/abbyhultquist/Documents/First Year Project/similarity_mat/nouns.csv")
    .set_index("Unnamed: 0")
)
# Only labels present on both axes can be looked up as a (row, column) pair.
sim_vocab = set(similarity_matrix.index) & set(similarity_matrix.columns)
THRESHOLD = 0.5  # similarity cut-off passed to build_semantic_graph

# Which CDI word columns are covered by the similarity matrix does not depend on
# the row, so filter once here instead of re-testing membership for every row.
vocab_word_cols = [w for w in word_cols if w in sim_vocab]

# Boolean (row x word) mask of cells equal to 1, computed in one vectorised pass
# rather than one scalar comparison per cell inside iterrows().
known_mask = CDI[vocab_word_cols].eq(1).to_numpy()

# For each child x session, record which sim_vocab words they know
known_words_df = pd.DataFrame({
    "child_id":    CDI["child_id"].astype(str).to_numpy(),
    "session_num": CDI["session_num"].to_numpy(),
    "known_words": [
        [w for w, is_known in zip(vocab_word_cols, row_mask) if is_known]
        for row_mask in known_mask
    ],
})
known_words_df["num_known"] = known_words_df["known_words"].str.len()

# child_id -> Talker_Type lookup. De-duplication is over the (child_id, Talker_Type)
# pair, so a child recorded with more than one Talker_Type value keeps several rows.
talker_lookup = CDI[["child_id", "Talker_Type"]].drop_duplicates().assign(child_id=lambda d: d["child_id"].astype(str))

print(known_words_df.head())

  child_id  session_num                                        known_words  \
0     4139            1  [bear, bird, cat, dog, duck, frog, pig, sheep,...   
1     4139            2  [bear, bird, cat, dog, duck, frog, owl, pig, s...   
2     4139            3  [bear, bird, cat, cow, dog, duck, frog, owl, p...   
3     4139            4  [bear, bird, butterfly, cat, cow, dog, duck, e...   
4     4139            5  [bear, bee, bird, bunny, butterfly, cat, cow, ...   

   num_known  
0         60  
1         89  
2        112  
3        122  
4        149  


### Step 3: Build observed semantic graphs

In [None]:
def build_semantic_graph(words, sim_matrix, threshold):
    """Build an undirected graph linking words that are similar enough.

    Every word becomes a node. Two words are joined by an edge when their entry
    in ``sim_matrix`` is greater than or equal to ``threshold`` (the cut-off is
    inclusive); the similarity is stored as the edge's ``weight``.

    Parameters
    ----------
    words : iterable of labels
        Labels to use as nodes. Each must be present in both the index and the
        columns of ``sim_matrix`` whenever two or more words are given.
    sim_matrix : pandas.DataFrame
        Table of pairwise similarities, looked up as ``sim_matrix.loc[wi, wj]``
        with ``wi`` appearing before ``wj`` in ``words``.
    threshold : float
        Minimum similarity (inclusive) for an edge.

    Returns
    -------
    networkx.Graph
        Graph with one node per word and weighted edges for qualifying pairs.

    Raises
    ------
    KeyError
        If a needed label is missing from ``sim_matrix``.
    """
    words = list(words)
    G = nx.Graph()
    G.add_nodes_from(words)
    if len(words) < 2:
        # No pairs to compare.
        return G

    # One block lookup instead of one scalar .loc call per pair.
    sims = sim_matrix.loc[words, words].to_numpy()
    # Strict upper triangle == every unordered pair (i < j), in row-major order.
    rows, cols = np.triu_indices(len(words), k=1)
    pair_sims = sims[rows, cols]
    keep = pair_sims >= threshold  # NaN compares False, so missing values add no edge

    for i, j, sim in zip(rows[keep], cols[keep], pair_sims[keep]):
        G.add_edge(words[i], words[j], weight=float(sim))
    return G

# One semantic graph per (child_id, session_num) pair; rows whose known-word
# list is empty get no graph at all.
graphs = {
    (cid, sess): build_semantic_graph(vocab, similarity_matrix, THRESHOLD)
    for cid, sess, vocab in zip(
        known_words_df["child_id"],
        known_words_df["session_num"],
        known_words_df["known_words"],
    )
    if len(vocab) > 0
}

print(f"Built {len(graphs)} graphs ({known_words_df['child_id'].nunique()} children × multiple sessions)")

Built 1162 graphs (122 children × multiple sessions)


In [15]:
# Per-session tally of how many graphs exist and how many distinct children
# they belong to. Sessions are taken from known_words_df, so a session with no
# graph still gets a row (with zeros).
summary_rows = []
for sess_num in sorted(known_words_df["session_num"].unique()):
    children_here = [cid for cid, s in graphs if s == sess_num]
    summary_rows.append({
        "session_num": sess_num,
        "num_graphs": len(children_here),
        "num_children": len(set(children_here)),
    })
session_summary = pd.DataFrame(summary_rows)

print("=== Graphs by Session ===")
print(session_summary)
print(f"\nTotal graphs built: {len(graphs)}")
print(f"Total children: {len({cid for cid, _ in graphs})}")


=== Graphs by Session ===
    session_num  num_graphs  num_children
0             1         118           118
1             2         110           110
2             3         111           111
3             4         108           108
4             5         105           105
5             6          93            93
6             7          89            89
7             8          93            93
8             9          84            84
9            10          83            83
10           11          82            82
11           12          85            85
12           13           1             1

Total graphs built: 1162
Total children: 121


### Step 4: Compute graph metrics

In [16]:
def graph_metrics(child_id, session, G):
    """Summarise one graph as a flat record.

    Returns a dict with the identifying ``child_id`` / ``session_num`` plus:
    node count, average degree (``2 * edges / nodes``), average clustering
    coefficient, and the average shortest-path length of the largest connected
    component.

    Fallbacks: average degree and clustering are 0 for a graph without nodes;
    the geodesic value is 0 when the graph has no edges or its largest component
    has a single node.
    NOTE(review): those zeros are averaged together with real distances
    downstream -- confirm that is intended rather than treating them as missing.
    """
    node_count = G.number_of_nodes()
    edge_count = G.number_of_edges()

    if node_count > 0:
        mean_degree = 2 * edge_count / node_count
        mean_clustering = nx.average_clustering(G)
    else:
        mean_degree = 0
        mean_clustering = 0

    # Path length is only measured inside the largest connected component,
    # and only when that component has more than one node.
    mean_path_length = 0
    if edge_count > 0:
        biggest_nodes = max(nx.connected_components(G), key=len)
        component = G.subgraph(biggest_nodes).copy()
        if component.number_of_nodes() > 1:
            mean_path_length = nx.average_shortest_path_length(component)

    record = {}
    record["child_id"] = child_id
    record["session_num"] = session
    record["num_nodes"] = node_count
    record["avg_degree"] = mean_degree
    record["avg_clustering"] = mean_clustering
    record["avg_geodesic_distance"] = mean_path_length
    return record

# One metrics record per (child, session) graph.
obs_metrics = pd.DataFrame([graph_metrics(cid, sess, G) for (cid, sess), G in graphs.items()])

# Attach each child's Talker_Type. validate="many_to_one" makes pandas raise a
# MergeError if talker_lookup ever holds more than one row for a child_id;
# without it such a child's graphs would be silently duplicated by the left
# merge and counted several times in every group mean below.
obs_metrics_full = obs_metrics.merge(
    talker_lookup, on="child_id", how="left", validate="many_to_one"
)

print(f"Computed metrics for {len(obs_metrics)} graphs")
print(obs_metrics_full.head())
print("\nMetrics by Talker_Type:")
# groupby drops rows whose Talker_Type is missing (pandas default dropna=True).
print(obs_metrics_full.groupby("Talker_Type").mean(numeric_only=True).round(3))

Computed metrics for 1162 graphs
  child_id  session_num  num_nodes  avg_degree  avg_clustering  \
0     4139            1         60   21.700000        0.761027   
1     4139            2         89   32.898876        0.779075   
2     4139            3        112   39.267857        0.780101   
3     4139            4        122   42.950820        0.779165   
4     4139            5        149   53.302013        0.782518   

   avg_geodesic_distance Talker_Type  
0               2.395091         NaN  
1               2.232635         NaN  
2               2.272683         NaN  
3               2.232760         NaN  
4               2.210230         NaN  

Metrics by Talker_Type:
             session_num  num_nodes  avg_degree  avg_clustering  \
Talker_Type                                                       
Faller             6.318     95.545      30.154           0.760   
LB                 6.409    117.695      40.109           0.702   
PLT                6.575     47.938      14

### Step 5a: Erdős–Rényi random graphs (preserve n and m)

In [18]:
N_SAMPLES = 1   # more samples = more stable baseline (was 10)
ER_SEED = 42    # fixed seed so the ER baseline is the same on every run

def er_random_metrics(child_id, session, G, n_samples=N_SAMPLES, seed=None):
    """Average ``graph_metrics`` over random graphs matching G's size.

    Each sample is ``nx.gnm_random_graph(n, m)`` with ``n`` and ``m`` taken from
    ``G``'s node and edge counts.

    Parameters
    ----------
    child_id, session
        Identifiers copied into the returned record.
    G : networkx.Graph
        Observed graph whose node/edge counts are matched.
    n_samples : int
        Number of random graphs to draw and average over.
    seed : None, int or random.Random
        ``None`` keeps the previous behaviour (global random state). An int
        seeds a private generator used for all samples of this call. A
        ``random.Random`` instance is used as-is, so one generator can be shared
        across many calls to give a reproducible yet non-repeating stream.

    Returns
    -------
    dict
        Mean of every numeric metric across samples, plus ``child_id`` and
        ``session_num`` set to the given identifiers.
    """
    n, m = G.number_of_nodes(), G.number_of_edges()
    # Turn an int into ONE generator up front; handing the same int to every
    # gnm_random_graph call would produce n_samples identical graphs.
    rng = random.Random(seed) if isinstance(seed, int) else seed
    rows = []
    for _ in range(n_samples):
        R = nx.gnm_random_graph(n, m, seed=rng)
        rows.append(graph_metrics(child_id, session, R))
    # numeric_only drops the string child_id; session_num is averaged and then
    # overwritten with the real identifier below.
    result_dict = pd.DataFrame(rows).mean(numeric_only=True).to_dict()
    result_dict.update({"child_id": child_id, "session_num": session})
    return result_dict

# A single seeded generator shared by all graphs: deterministic across runs
# without reusing the same random stream for every child/session.
er_rng = random.Random(ER_SEED)
er_metrics = pd.DataFrame([
    er_random_metrics(cid, sess, G, seed=er_rng)
    for (cid, sess), G in graphs.items()
])
# Prefix metric columns so they do not collide with the observed ones on merge.
er_metrics = er_metrics.rename(columns={c: f"er_{c}" for c in er_metrics.columns if c not in ["child_id", "session_num"]})
er_metrics["child_id"] = er_metrics["child_id"].astype(str)

### Step 5b: Semantic random graphs (preserve n, random words from sim_vocab)

In [19]:
SEM_SEED = 43   # fixed seed so the semantic-random baseline is the same on every run

def semantic_random_metrics(child_id, session, G, sim_matrix, vocab, threshold, n_samples=N_SAMPLES, seed=None):
    """Average ``graph_metrics`` over graphs built from randomly chosen words.

    Each sample draws as many words from ``vocab`` as ``G`` has nodes (capped at
    the vocabulary size), connects them with ``build_semantic_graph`` using
    ``sim_matrix`` and ``threshold``, and measures the result.

    Parameters
    ----------
    child_id, session
        Identifiers copied into the returned record.
    G : networkx.Graph
        Observed graph; only its node count is used.
    sim_matrix : pandas.DataFrame
        Pairwise similarity table passed to ``build_semantic_graph``.
    vocab : iterable of labels
        Pool of words to sample from.
    threshold : float
        Edge cut-off passed to ``build_semantic_graph``.
    n_samples : int
        Number of random word sets to draw and average over.
    seed : None, int or random.Random
        ``None`` keeps the previous behaviour (module-level ``random`` state).
        An int seeds a private generator for this call. A ``random.Random``
        instance is used as-is, so one generator can be shared across calls.

    Returns
    -------
    dict
        Mean of every numeric metric across samples, plus ``child_id`` and
        ``session_num`` set to the given identifiers.
    """
    n = G.number_of_nodes()
    # A set of strings iterates in a process-dependent order, so list(vocab)
    # would make even a seeded sample differ between kernel restarts.
    vocab_list = sorted(vocab, key=str)
    sample_size = min(n, len(vocab_list))

    if seed is None:
        rng = random                 # module-level generator, as before
    elif isinstance(seed, random.Random):
        rng = seed
    else:
        rng = random.Random(seed)

    rows = []
    for _ in range(n_samples):
        words = rng.sample(vocab_list, sample_size)
        R = build_semantic_graph(words, sim_matrix, threshold)
        rows.append(graph_metrics(child_id, session, R))
    # numeric_only drops the string child_id; session_num is averaged and then
    # overwritten with the real identifier below.
    result_dict = pd.DataFrame(rows).mean(numeric_only=True).to_dict()
    result_dict.update({"child_id": child_id, "session_num": session})
    return result_dict

# A single seeded generator shared by all graphs: deterministic across runs
# without drawing the same words for every child/session of equal size.
sem_rng = random.Random(SEM_SEED)
sem_metrics = pd.DataFrame([
    semantic_random_metrics(cid, sess, G, similarity_matrix, sim_vocab, THRESHOLD, seed=sem_rng)
    for (cid, sess), G in graphs.items()
])
# Prefix metric columns so they do not collide with the observed ones on merge.
sem_metrics = sem_metrics.rename(columns={c: f"sem_{c}" for c in sem_metrics.columns if c not in ["child_id", "session_num"]})
sem_metrics["child_id"] = sem_metrics["child_id"].astype(str)

### Step 6: Merge and compare

In [None]:
# Left-join both baselines onto the observed metrics, one row per child x session.
merge_keys = ["child_id", "session_num"]
comparison = obs_metrics_full.merge(er_metrics, on=merge_keys, how="left")
comparison = comparison.merge(sem_metrics, on=merge_keys, how="left")

# Observed minus each baseline
for metric in ["avg_clustering", "avg_geodesic_distance"]:
    for baseline in ("er", "sem"):
        comparison[f"{metric}_vs_{baseline}"] = comparison[metric] - comparison[f"{baseline}_{metric}"]

# Named aggregations: a row count, then the group mean of every metric for the
# observed graphs and for each baseline (observed / er / sem, in that order).
agg_spec = {"n": ("child_id", "count")}
for short_name, column in [
    ("num_nodes",  "num_nodes"),              # Nodes
    ("degree",     "avg_degree"),             # Degree
    ("clustering", "avg_clustering"),         # Clustering
    ("geodesic",   "avg_geodesic_distance"),  # Geodesic
]:
    agg_spec[f"obs_{short_name}"] = (column, "mean")
    agg_spec[f"er_{short_name}"] = (f"er_{column}", "mean")
    agg_spec[f"sem_{short_name}"] = (f"sem_{column}", "mean")

by_talker = comparison.groupby("Talker_Type").agg(**agg_spec).round(4)

print(f"Comparison shape: {comparison.shape}")
print(by_talker)

Comparison shape: (1162, 19)
               n  obs_num_nodes  er_num_nodes  sem_num_nodes  obs_degree  \
Talker_Type                                                                
Faller        22        95.5455       95.5455        95.5455     30.1544   
LB           279       117.6953      117.6953       117.6953     40.1088   
PLT           80        47.9375       47.9375        47.9375     14.7805   
TT           626       201.8291      201.8291       201.8291     69.3403   

             er_degree  sem_degree  obs_clustering  er_clustering  \
Talker_Type                                                         
Faller         30.1544     33.5890          0.7597         0.3125   
LB             40.1088     41.6634          0.7019         0.3077   
PLT            14.7805     16.7822          0.6048         0.2515   
TT             69.3403     71.4373          0.7686         0.3371   

             sem_clustering  obs_geodesic  er_geodesic  sem_geodesic  
Talker_Type                 

In [21]:
# Group-level gaps: (already rounded) observed mean minus baseline mean, first
# against the ER baseline and then against the semantic-random baseline.
diff_columns = {"n": by_talker["n"]}
for baseline in ("er", "sem"):
    for label, source in [
        ("nodes", "num_nodes"),
        ("degree", "degree"),
        ("clustering", "clustering"),
        ("geodesic", "geodesic"),
    ]:
        diff_columns[f"{label}_vs_{baseline}"] = (
            by_talker[f"obs_{source}"] - by_talker[f"{baseline}_{source}"]
        )
diff_by_talker = pd.DataFrame(diff_columns).round(4)

print("=== Observed minus baseline (positive = observed higher) ===")
print(diff_by_talker)

=== Observed minus baseline (positive = observed higher) ===
               n  nodes_vs_er  degree_vs_er  clustering_vs_er  geodesic_vs_er  \
Talker_Type                                                                     
Faller        22          0.0           0.0            0.4472          0.6915   
LB           279          0.0           0.0            0.3942          0.2154   
PLT           80          0.0           0.0            0.3533          0.0666   
TT           626          0.0           0.0            0.4315          0.4470   

             nodes_vs_sem  degree_vs_sem  clustering_vs_sem  geodesic_vs_sem  
Talker_Type                                                                   
Faller                0.0        -3.4346            -0.0007           0.6152  
LB                    0.0        -1.5546             0.0053           0.1040  
PLT                   0.0        -2.0017             0.0206           0.2479  
TT                    0.0        -2.0970            -0.00

### Step 7: Export

In [None]:
# Write the merged per-graph comparison table to the working directory,
# leaving out the DataFrame index.
output_csv = "graph_metrics_comparison.csv"
comparison.to_csv(path_or_buf=output_csv, index=False)
print("Saved graph_metrics_comparison.csv")

Saved graph_metrics_comparison.csv
