In [1]:
import random
import sys
import tkinter as tkinter

from tkinter import filedialog as tkfiledialog

ENTRY_WIDTH = 20  # width of text entry field

In [2]:
import sys, importlib
print("sys.executable:", sys.executable)
print("sys.version:", sys.version)
print("pandas available:", importlib.util.find_spec("pandas") is not None)
# safe install using the kernel's interpreter
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas"])
%pip install pandas

sys.executable: /usr/local/bin/python3
sys.version: 3.14.2 (v3.14.2:df793163d58, Dec  5 2025, 12:18:06) [Clang 16.0.0 (clang-1600.0.26.6)]
pandas available: True
Note: you may need to restart the kernel to use updated packages.


In [3]:
import sys, importlib
print("sys.executable:", sys.executable)
print("sys.version:", sys.version)
print("pandas available:", importlib.util.find_spec("pandas") is not None)

sys.executable: /usr/local/bin/python3
sys.version: 3.14.2 (v3.14.2:df793163d58, Dec  5 2025, 12:18:06) [Clang 16.0.0 (clang-1600.0.26.6)]
pandas available: True


In [4]:
SIM_THRESHOLD = 0.8
import pandas as pd
sim = pd.read_csv("/Users/abbyhultquist/Documents/First Year Project/nouns.csv", index_col=0)


# builds a list of edges where similarity >= 0.9, in the variable `edges`
edges = []
for w1 in sim.index:
    for w2 in sim.columns:
        if w1 != w2 and sim.loc[w1, w2] >= SIM_THRESHOLD:
            edges.append((w1, w2))

In [5]:
# load (no index_col) or reset index if already loaded with index_col=0
cdi = pd.read_csv("/Users/abbyhultquist/Documents/First Year Project/CDI_cleaned.csv")
# or if already read with index_col=0: cdi = cdi.reset_index()

# normalize and coerce key columns
cdi.columns = cdi.columns.str.strip()
cdi['child_id'] = pd.to_numeric(cdi['child_id'], errors='coerce')
cdi['session_num'] = pd.to_numeric(cdi['session_num'], errors='coerce')

child_id = 4139
session_num = 1

mask = (cdi['child_id'] == int(child_id)) & (cdi['session_num'] == int(session_num))
print("matches:", mask.sum(), "matching indices:", cdi.index[mask].tolist())

rows = cdi.loc[mask]
if rows.empty:
    raise ValueError(f"No rows match child_id={child_id} session_num={session_num}")
row = rows.iloc[0]
print(row.to_dict())

row = cdi[
    (cdi.child_id == child_id) &
    (cdi.session_num == session_num)
].iloc[0]

# CDI word columns start after metadata
metadata_cols = [
    "child_id","study id","study","gender","age","birthday",
    "session_date","session_num","total num sessions",
    "words_spoken","items excluded","percentile",
    "extra categories","revision","languages",
    "num languages","cdi type","hard of hearing","deleted", "child_column_id", 
    "first_session_num", "late_talker","last_session_num",	"late_bloomer",	"plt",	"class",	
    "typical_talker",	"group"
]

word_cols = [c for c in cdi.columns if c not in metadata_cols]

known_words = {w for w in word_cols if row[w] == 1}

matches: 1 matching indices: [0]
{'child_id': 4139, 'study id': 'LTP102', 'study': 'Long-1', 'gender': 'F', 'age': 16.9953977646285, 'birthday': '2008/10/17', 'session_date': '2010/03/18', 'session_num': 1, 'total num sessions': 12, 'words_spoken': 91.0, 'items excluded': 0, 'percentile': 63.6363636363636, 'extra categories': 2, 'revision': 0, 'languages': 'english', 'num languages': 1, 'cdi type': 'fullenglishmcdi', 'hard of hearing': 0, 'deleted': 0, 'vocab_size': 91.0, 'Talker_Type': 'TT', 'baa baa': 1, 'choo choo': 0, 'cockadoodledoo': 0, 'grr': 0, 'meow': 0, 'moo': 1, 'ouch': 0, 'quack quack': 0, 'uh oh': 0, 'vroom': 0, 'woof woof': 0, 'yum yum': 1, 'alligator': 0, 'animal': 0, 'ant': 0, 'bear': 1, 'bee': 0, 'bird': 1, 'bug': 0, 'bunny': 0, 'butterfly': 0, 'cat': 1, 'chicken(animal)': 0, 'cow': 0, 'deer': 0, 'dog': 1, 'donkey': 0, 'duck': 1, 'elephant': 0, 'fish(animal)': 1, 'frog': 1, 'giraffe': 0, 'goose': 0, 'hen': 0, 'horse': 0, 'kitty': 0, 'lamb': 0, 'lion': 0, 'monkey': 0, '

In [6]:
# count known words
num = len(known_words)
print("known_words count:", num)

num_tot = len(word_cols)
print("total words count:", num_tot)


known_words count: 94
total words count: 691


In [7]:
import itertools

# build list of edges where similarity >= SIM_THRESHOLD, in the variable `child_edges`
child_edges = []
for w1, w2 in itertools.combinations(sorted(known_words), 2):
    try:
        if sim.loc[w1, w2] >= SIM_THRESHOLD:    # similarity threshold
            child_edges.append((w1, w2)) # add to list if similarity >= threshold
    except KeyError:
        # skip if either word not found in similarity matrix
        continue

print("child_edges count:", len(child_edges))
print("sample edges:", child_edges[:10])


child_edges count: 3
sample edges: [('banana', 'bread'), ('bread', 'cookie'), ('hat', 'shirt')]


In [8]:
all_words_df = pd.DataFrame(edges, columns=["word1", "word2"])
print("all_words edges:", len(all_words_df))
print(all_words_df.head())

# write to file (NO header, NO index)
all_words_path = "/Users/abbyhultquist/Documents/First Year Project/all_words.txt"
all_words_df.to_csv(all_words_path, index=False, header=False)

print("Saved all_words.txt to:", all_words_path)


all_words edges: 188
   word1       word2
0  ankle      street
1   aunt  babysitter
2   aunt        lady
3   aunt         man
4   aunt       woman
Saved all_words.txt to: /Users/abbyhultquist/Documents/First Year Project/all_words.txt


In [9]:
child_vocab_df = pd.DataFrame(child_edges, columns=["word1", "word2"])

print("child_vocab edges:", len(child_vocab_df))
print(child_vocab_df.head())

child_vocab_path = f"/Users/abbyhultquist/Documents/First Year Project/child_vocab_{child_id}_s{session_num}.txt"

child_vocab_df.to_csv(child_vocab_path, index=False, header=False)

print("Saved child vocabulary file to:", child_vocab_path)


child_vocab edges: 3
    word1   word2
0  banana   bread
1   bread  cookie
2     hat   shirt
Saved child vocabulary file to: /Users/abbyhultquist/Documents/First Year Project/child_vocab_4139_s1.txt


In [10]:
# how many unique words does the child network contain?
child_words_in_edges = set(child_vocab_df["word1"]).union(child_vocab_df["word2"])
print("unique words in child network:", len(child_words_in_edges))

# words known but isolated (no semantic edges)
isolated_words = known_words - child_words_in_edges
print("isolated known words (no edges):", len(isolated_words))


unique words in child network: 5
isolated known words (no edges): 89


In [11]:
import pandas as pd
import numpy as np
from scipy import stats


#Cleaning CDI Data
CDI_raw = pd.read_csv("/Users/abbyhultquist/Documents/First Year Project/CDI_raw.csv")
CDI = CDI_raw.copy()

print("Initial CDI shape:", CDI.shape)
CDI = CDI.sort_values(['child_id', 'session_num'])



#creating a list of ALL words 
word_cols = CDI.columns[19:].tolist()
metadata_cols = CDI.columns[:19].tolist()

print("total # words considered:", len(word_cols))

# Calculate initial vocab_size
CDI['vocab_size'] = CDI[word_cols].sum(axis=1)
print("Initial vocab_size calculated")




# Remove children with less than 4 sessions
session_counts = CDI.groupby('child_id')['session_num'].count()
valid_children = session_counts[session_counts >= 4].index
CDI = CDI[CDI['child_id'].isin(valid_children)]

print(f"After removing children with <4 sessions: {CDI.shape}")
print(f"Removed {len(session_counts) - len(valid_children)} children with < 4 sessions")


# Find and remove outliers
outliers_sessions = set(CDI[CDI['session_num'] > 12]['child_id'])
outliers_vocab = set(CDI[CDI['words_spoken'] > len(word_cols)]['child_id'])
z_scores = stats.zscore(CDI['words_spoken'])
outliers_zscore = set(CDI[np.abs(z_scores) > 3]['child_id'])

all_outliers = outliers_sessions | outliers_vocab | outliers_zscore

print(f"Removed {len(all_outliers)} additional outlier children:")
print(f"  - {len(outliers_sessions)} for session_num > 12")
print(f"  - {len(outliers_vocab)} for vocab_size > # words in CDI")
print(f"  - {len(outliers_zscore)} for zscore > 3 on vocab_size")

CDI = CDI[~CDI['child_id'].isin(all_outliers)].copy()

print(f"After removing outliers: {CDI.shape}")






#Once a word is learned, it stays learned
grouped = CDI.groupby('child_id')
for child_id, group in grouped:
    group = group.sort_values('session_num')
    for word in word_cols:
        # Find first session where word is known
        known_sessions = group[group[word] == 1]['session_num']
        if not known_sessions.empty:
            first_known = known_sessions.min()
            # Set word as known from first_known onwards
            CDI.loc[(CDI['child_id'] == child_id) & (CDI['session_num'] >= first_known), word] = 1
# Update words_spoken to reflect the cumulative knowledge
CDI['words_spoken'] = CDI[word_cols].sum(axis=1)

# Also update vocab_size
CDI['vocab_size'] = CDI['words_spoken']

print()
print("After ensuring cumulative knowledge and recalculating vocab_size")
print("Sample vocab_size stats:", CDI['vocab_size'].describe())
print("Updated words_spoken to match cumulative vocab_size")




#defining late talkers (0 = TT, 1 = LT)
#defining PLT (0 = Late bloomer, 1 = PLT, NA means they are a TT)
CDI["Late_Talker"] = 0
CDI["Persistent_Late_Talker"] = np.nan #Only LT get set to 0 or 1

grouped = CDI.groupby('child_id')

for child_id, group in grouped:
    first_session = group['session_num'].min()
    last_session = group['session_num'].max()
    
    # Check first session for this child
    first_session_data = group[group['session_num'] == first_session]
    
    if len(first_session_data) > 0:
        if first_session_data['percentile'].values[0] < 20:
            CDI.loc[CDI['child_id'] == child_id, 'Late_Talker'] = 1
        else:
            CDI.loc[CDI['child_id'] == child_id, 'Late_Talker'] = 0
    
    # Check last session for this child
    last_session_data = group[group['session_num'] == last_session]
    
    if len(last_session_data) > 0:
        late_talker_status = CDI.loc[CDI['child_id'] == child_id, 'Late_Talker'].values[0]
        
        if late_talker_status == 1:  # Was a late talker
            if last_session_data['percentile'].values[0] < 20:  # FIXED: Still below 20%
                CDI.loc[CDI['child_id'] == child_id, 'Persistent_Late_Talker'] = 1
            else:  # Improved (late bloomer)
                CDI.loc[CDI['child_id'] == child_id, 'Persistent_Late_Talker'] = 0
        else:  # Not a late talker
            CDI.loc[CDI['child_id'] == child_id, 'Persistent_Late_Talker'] = np.nan  # FIXED: Use np.nan

# Verify 
print("Late Talkers:", CDI[CDI['Late_Talker'] == 1]['child_id'].nunique())
print("Persistent Late Talkers:", CDI[CDI['Persistent_Late_Talker'] == 1]['child_id'].nunique())
print("Late Bloomers:", CDI[(CDI['Late_Talker'] == 1) & (CDI['Persistent_Late_Talker'] == 0)]['child_id'].nunique())

print("Final CDI shape:", CDI.shape)

# Reorder columns: metadata, added columns, word columns
added_cols = ['vocab_size', 'Late_Talker', 'Persistent_Late_Talker']
new_order = metadata_cols + added_cols + word_cols
CDI = CDI[new_order]

CDI.to_csv('CDI_cleaned.csv', index=False)

Initial CDI shape: (1167, 699)
total # words considered: 680
Initial vocab_size calculated
After removing children with <4 sessions: (1154, 700)
Removed 11 children with < 4 sessions
Removed 1 additional outlier children:
  - 1 for session_num > 12
  - 0 for vocab_size > # words in CDI
  - 0 for zscore > 3 on vocab_size
After removing outliers: (1141, 700)

After ensuring cumulative knowledge and recalculating vocab_size
Sample vocab_size stats: count    1141.000000
mean      318.234005
std       221.384675
min         0.000000
25%       105.000000
50%       305.000000
75%       523.000000
max       682.000000
Name: vocab_size, dtype: float64
Updated words_spoken to match cumulative vocab_size
Late Talkers: 41
Persistent Late Talkers: 9
Late Bloomers: 32
Final CDI shape: (1141, 702)


In [12]:
import threading
import tkinter as tkinter
from tkinter import filedialog as tkfiledialog

ENTRY_WIDTH = 20


def browse_for_entry(entry_box, dialog_type, filetype):
    if dialog_type == "open":
        name = tkfiledialog.askopenfilename(filetypes=[filetype])
    else:
        name = tkfiledialog.asksaveasfilename(filetypes=[filetype])
    entry_box.delete(0, tkinter.END)
    entry_box.insert(0, name)


def show_finished(status_label, run_button):
    run_button.configure(text="Run Again", state=tkinter.NORMAL)
    status_label.configure(text="Done.")


def run_dummy(status_label, run_button):
    # placeholder for now
    status_label.configure(text="Pretend model ran âœ”")
    show_finished(status_label, run_button)


if __name__ == "__main__":
    master = tkinter.Tk()
    master.title("Growth Model Runner")

    tkinter.Label(master, text="Child vocabulary:").grid(row=0, column=0, sticky=tkinter.E)
    tkinter.Label(master, text="All words:").grid(row=1, column=0, sticky=tkinter.E)
    tkinter.Label(master, text="Output summary:").grid(row=2, column=0, sticky=tkinter.E)

    child_vocab_entry = tkinter.Entry(master, width=40)
    child_vocab_entry.grid(row=0, column=1)

    all_words_entry = tkinter.Entry(master, width=40)
    all_words_entry.grid(row=1, column=1)

    output_entry = tkinter.Entry(master, width=40)
    output_entry.grid(row=2, column=1)

    tkinter.Button(
        master, text="Browse",
        command=lambda: browse_for_entry(child_vocab_entry, "open", ("Text files", ".txt"))
    ).grid(row=0, column=2)

    tkinter.Button(
        master, text="Browse",
        command=lambda: browse_for_entry(all_words_entry, "open", ("Text files", ".txt"))
    ).grid(row=1, column=2)

    tkinter.Button(
        master, text="Browse",
        command=lambda: browse_for_entry(output_entry, "saveas", ("CSV files", ".csv"))
    ).grid(row=2, column=2)

    status_label = tkinter.Label(master, text="Ready", width=ENTRY_WIDTH)
    status_label.grid(row=4, column=0, columnspan=3)

    run_button = tkinter.Button(
        master, text="Run Growth",
        command=lambda: run_dummy(status_label, run_button)
    )
    run_button.grid(row=3, column=0, columnspan=3, sticky=tkinter.E+tkinter.W)

    master.mainloop()
