In [1]:
#packages
import pandas as pd
import numpy as np


In [2]:
#steps to complete the project
# similarity matrix and CDI
#step 1: clean CDI data x
    #define PLT, LB, TT, Faller
    # once a word is learned, it stays learned x
    # binary known / unknown words x
    #written in Long_Cleaning_Categorization.ipynb
#step 2:
#   build a network for each child at each time point using 
    # (a) similarity matrix and 
    # (b) phono matrix
# step 3: built random networks for each child at each time point
    # preseverve size and how to connect to determine if network structure 
    # arises from english language or child-specific learning patterns 
# step 4: measure network properties at each time point for each child
    # size
    # clustering coefficient
    # average path length
    # mean degree 
    # redundancy
    # average geodesic distance
# step 5: statistical analyses
    # compare network properties across PLT and Late Bloomer groups
        # determine differences controlling for vocabulary size
    # compare to random networks
# step 6: visualizations


#### STEP 1: Clean CDI data

In [3]:
#step 1: upload clean CDI data
CDI= pd.read_csv('/Users/abbyhultquist/Documents/First Year Project/long_categorization_6.csv')
# normalize child_id type for consistent joins
CDI['child_id'] = CDI['child_id'].astype(str)

#creating a list of ALL words 
word_cols = CDI.columns[21:].tolist()
metadata_cols = CDI.columns[:21].tolist()
print("total # words considered:", len(word_cols))
print("metadata columns start and end:", metadata_cols[0],",", metadata_cols[-1])
print("word columns start and end:", word_cols[0],",", word_cols[-1])

total # words considered: 680
metadata columns start and end: child_id , Talker_Type
word columns start and end: baa baa , then


#### STEP 2: Known word networks

In [4]:
# semantic similarity matrix
similarity_matrix = pd.read_csv("/Users/abbyhultquist/Documents/First Year Project/similarity_mat/nouns.csv")

# Set index for similarity matrix
similarity_matrix = similarity_matrix.set_index('Unnamed: 0')

#thresholding similarity matrix to create binary connections
threshold = 0.5  

# Create a list of known words for each child at each session
known_words_list = []
for idx, row in CDI.iterrows():
    child = row['child_id']
    session = row['session_num']
    known = [col for col in word_cols if row[col] == 1]
    known_words_list.append({'child_id': child, 'session_num': session, 'known_words': known, 'num_known': len(known)})

# Convert to DataFrame
known_words_df = pd.DataFrame(known_words_list)

# Display the first few rows
known_words_df.head(100)

Unnamed: 0,child_id,session_num,known_words,num_known
0,4139,1,"[baa baa, moo, yum yum, bear, bird, cat, dog, ...",94
1,4139,2,"[baa baa, grr, meow, moo, ouch, uh oh, vroom, ...",164
2,4139,3,"[baa baa, choo choo, grr, meow, moo, ouch, uh ...",204
3,4139,4,"[baa baa, choo choo, grr, meow, moo, ouch, qua...",230
4,4139,5,"[baa baa, choo choo, grr, meow, moo, ouch, qua...",268
...,...,...,...,...
95,4250,3,"[quack quack, woof woof, dog, goose, kitty, tr...",16
96,4250,4,"[quack quack, woof woof, dog, goose, kitty, tr...",21
97,4250,5,"[choo choo, quack quack, uh oh, woof woof, bee...",37
98,4250,6,"[choo choo, grr, meow, moo, quack quack, uh oh...",57


In [17]:
CDI[["child_id", "Talker_Type"]] 

Unnamed: 0,child_id,Talker_Type
0,4139,TT
1,4139,TT
2,4139,TT
3,4139,TT
4,4139,TT
...,...,...
1162,6054,
1163,6076,
1164,6248,
1165,6334,


In [None]:
# Count rows, sum word columns, and average words known by Talker_Type (session 1 only)
session1 = CDI[CDI['session_num'] == 1]

rows_by_talker = session1.groupby('Talker_Type').size().to_frame('n_rows')
words_by_talker = session1.groupby('Talker_Type')[word_cols].sum().sum(axis=1).to_frame('total_words_known')
avg_words_known = (
    session1[word_cols]
    .sum(axis=1)
    .groupby(session1['Talker_Type'])
    .mean()
    .to_frame('avg_words_known')
)

print("average words known by Talker_Type in session 1:", avg_words_known) #this should match # of nodes


average words known by Talker_Type in session 1:              avg_words_known
Talker_Type                 
Faller             31.000000
LB                 19.526316
PLT                11.882353
TT                117.016393


In [9]:
# Build a graph for every child in session 1

import networkx as nx

session = 1
graphs = {}
missing = []

for child_id in known_words_df['child_id'].unique():
    sel = known_words_df[(known_words_df['child_id'] == child_id) & (known_words_df['session_num'] == session)]
    if sel.empty:
        missing.append(child_id)
        continue
    known_words = sel['known_words'].iloc[0]
    G = nx.Graph()
    G.add_nodes_from(known_words)
    for i, word_i in enumerate(known_words):
        for j, word_j in enumerate(known_words[i+1:], start=i+1):
            if word_i in similarity_matrix.index and word_j in similarity_matrix.columns:
                sim = similarity_matrix.loc[word_i, word_j]
                if sim > threshold:
                    G.add_edge(word_i, word_j, weight=sim)
    graphs[child_id] = G

print(f"Built {len(graphs)} graphs for session {session}.")
if missing:
    print(f"Missing session {session} for {len(missing)} children.")

Built 121 graphs for session 1.
Missing session 1 for 1 children.


In [36]:
# Build a per-child metrics table for session 1 graphs and aggregate by Talker_Type
graph_metrics = pd.DataFrame(
    [
        {
            'child_id': cid,
            'num_nodes': G.number_of_nodes(),
            'num_edges': G.number_of_edges(),
            'avg_degree': (2 * G.number_of_edges() / G.number_of_nodes()) if G.number_of_nodes() > 0 else 0,
            'avg_clustering': nx.average_clustering(G) if G.number_of_nodes() > 0 else 0,
            'avg_geodesic_distance': (
                nx.average_shortest_path_length(G)
                if G.number_of_nodes() > 1 and nx.is_connected(G)
                else (
                    nx.average_shortest_path_length(G.subgraph(max(nx.connected_components(G), key=len)))
                    if G.number_of_nodes() > 1 and G.number_of_edges() > 0
                    else 0
                )
            ),
            'redundancy': (
                (G.number_of_edges() - (G.number_of_nodes() - 1)) / (G.number_of_nodes() - 1)
                if G.number_of_nodes() > 1
                else 0
            ),
        }
        for cid, G in graphs.items()
    ]
)

talker_lookup = (
    CDI[['child_id', 'Talker_Type']]
    .drop_duplicates()
    .assign(child_id=lambda df: df['child_id'].astype(str))
 )

graph_metrics_with_talker = graph_metrics.merge(talker_lookup, on='child_id', how='left')

metrics_by_talker = (
    graph_metrics_with_talker
    .groupby('Talker_Type', dropna=False)
    .agg(
        n_children=('child_id', 'count'),
        avg_nodes=('num_nodes', 'mean'),
        avg_degree=('avg_degree', 'mean'),
        avg_clustering=('avg_clustering', 'mean'),
        avg_geodesic_distance=('avg_geodesic_distance', 'mean'),
        avg_redundancy=('redundancy', 'mean'),
    )
    .sort_index()
 )

metrics_by_talker

Unnamed: 0_level_0,n_children,avg_nodes,avg_degree,avg_clustering,avg_geodesic_distance,avg_redundancy
Talker_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Faller,1,31.0,1.677419,0.316129,1.357143,-0.133333
LB,19,19.526316,1.12771,0.249325,1.262107,-0.404105
PLT,17,11.882353,0.690603,0.14322,0.861991,-0.562022
TT,61,117.016393,12.173107,0.410511,1.95411,5.135871
,23,108.304348,9.513231,0.336644,1.725232,3.837097


In [35]:
graph_metrics_with_talker.head()

Unnamed: 0,child_id,num_nodes,num_edges,avg_degree,avg_clustering,Talker_Type
0,4139,94,651,13.851064,0.485762,TT
1,4155,60,136,4.533333,0.376946,TT
2,4162,127,1153,18.15748,0.497102,TT
3,4186,117,782,13.367521,0.470636,TT
4,4189,48,99,4.125,0.394439,PLT
