# Imports

In [59]:
%%capture
import pandas as pd 
import re
from pathlib import Path
from datasets import load_dataset
from importlib import reload

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Pick the best available accelerator instead of assuming Apple-silicon MPS,
# so the notebook still runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# pd.set_option('display.max_rows', 100)


In [14]:
# Collect every transcript file, ordered by path so indexing is stable across runs.
# NOTE(review): `dir` shadows the builtin of the same name; kept because other cells may use it.
dir = Path('../data') / 'transcripts'
doc_paths = list(dir.glob('*.txt'))
doc_paths.sort()

# Eval
"Common-sense" checks using OpenRouter.

In [21]:
# Create a string representing a single conversation (the first transcript).
if not doc_paths:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError('doc_paths is empty: no transcript .txt files were found')

doc = doc_paths[0]

with doc.open('r', encoding = 'utf-8') as f:
    # Drop blank lines and collapse runs of whitespace inside each remaining line.
    lines = [" ".join(line.split()) for line in f if line.strip()]

# The whole transcript as one newline-separated string; this is the
# `conversation` value interpolated into the eval prompt.
conversation = "\n".join(lines)

In [5]:
# TODO: revisit the prompting strategy; responses vary a lot between runs.
# Instruction block sent ahead of the transcript text.
task = (
    'This transcript contains speaker labels. Some labels refer to the same real person.\n'
    '\n'
    'Tasks:\n'
    '1) How many unique real participants are there?\n'
    '2) Which speaker labels are duplicates of the same person?\n'
    '3) For each duplicate, say which label it should be merged into.\n'
    '4) Give one short sentence describing each real participant.'
)


In [5]:
# Single-turn chat: the task instructions and the transcript text go in one user message.
# Expects `task` and `conversation` (the transcript text) to be defined by earlier cells.
# NOTE(review): `send_openrouter_request` is not imported in the visible imports cell;
# presumably a project helper. Confirm it is loaded before running this cell.
messages = [
    dict(role = 'user', content = f'{task}: {conversation}'),
]

response, reasoning, refusal, provider = send_openrouter_request(
    messages,
    model = 'allenai/olmo-3.1-32b-instruct',
)
print(response)

Here’s a detailed answer to your four tasks based on the provided transcript:

---

### 1) How many unique real participants are there?

There are **three unique real participants** in the transcript:

- **Jasmine** (the interviewer/reporter from NBC News)
- **Speaker 4 / Lynn** (the CEO and co-founder of Fireworks, also referred to as "Lynn" by Jasmine at the end)
- **Speaker 1** (whose identity is not explicitly named, but appears to be someone else in the conversation, possibly a colleague of Jasmine or part of her team)

However, upon close reading, "Speaker 1" is likely the same as Jasmine, based on the context and conversational flow. Jasmine is the one initiating and leading the interview, and "Speaker 1" seems to be her speaking at several points (e.g., "Speaker 1 12:33:04 ... we represent fireworks..."). The transcript labels may be inconsistent, but the content suggests only two main people are speaking before Lynn joins: Jasmine and possibly another person (Speaker 2 and Spe

# Cleaning
Creating a speaker:token mapping


In [51]:
# structure message dictionary
time = re.compile(r'\d{2}:\d{2}:\d{2}')


def parse_turns(lines, timestamp = time):
    """Group transcript lines into speaker turns.

    A line containing an HH:MM:SS timestamp is treated as a turn header; the
    lines that follow it, up to the next header, are that turn's text.

    Args:
        lines: iterable of non-empty, whitespace-normalised transcript lines.
        timestamp: compiled regex that identifies header lines and is also
            removed from the header to obtain the speaker label.

    Returns:
        A list of ``{'speaker': str, 'text': str}`` dicts in transcript order.
        Lines before the first header are ignored, and a header with no text
        lines under it produces no turn.
    """
    groups = []  # (header line, [text lines]) in order of appearance
    for ln in lines:
        if timestamp.search(ln):
            groups.append((ln, []))
        elif groups:
            groups[-1][1].append(ln)

    # Single emission path for every turn, including the final one.
    return [
        {'speaker': timestamp.sub('', header).strip(), 'text': ' '.join(body)}
        for header, body in groups
        if body
    ]


turns = parse_turns(lines)

In [52]:
# Flatten the turns into one row per whitespace-separated word, tagged with its speaker.
words = [
    {'speaker': turn['speaker'], 'word': token}
    for turn in turns
    for token in turn['text'].split()
]

df = pd.DataFrame(words)

# Show the last 100 rows.
df.tail(100)

Unnamed: 0,speaker,word
4085,Lin,across
4086,Lin,industry
4087,Lin,to
4088,Lin,standardize
4089,Lin,more.
...,...,...
4180,Jasmine,so
4181,Jasmine,"much,"
4182,Jasmine,and
4183,Jasmine,really


# Activations
**Important:** make sure you have enough memory to load the model, especially if running locally.

In [56]:
# One-time download of the model weights into the local Hugging Face cache.
# Left commented out so re-running the notebook does not re-fetch; uncomment on first use.
# from huggingface_hub import snapshot_download
# mid = 'allenai/Olmo-3-7B-Instruct'
# snapshot_download(
#             repo_id = mid
            
#         )

Fetching 15 files: 100%|██████████| 15/15 [17:47<00:00, 71.17s/it]


'/Users/jasminecui/.cache/huggingface/hub/models--allenai--Olmo-3-7B-Instruct/snapshots/096bb5469fe34348bc88d851a69edb3bf6f40df4'

In [None]:
# Load the tokenizer and the causal LM from the local cache only (no network access),
# with bfloat16 weights, then move the model onto `device`.
model_name = 'allenai/Olmo-3-7B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only = True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype = torch.bfloat16,
    local_files_only = True,
)
model = model.to(device)


Loading weights: 100%|██████████| 355/355 [00:00<00:00, 2907.19it/s, Materializing param=model.norm.weight]                                


In [70]:
# Confirm the tokenizer is the fast implementation and ships with a chat template.
has_template = tokenizer.chat_template is not None
print(f"is_fast: {tokenizer.is_fast}")
print(f"has_chat_template: {has_template}")

is_fast: True
has_chat_template: True


In [73]:
# Sanity check: run one short greedy generation through the chat template.
messages = [{"role": "user", "content": "Tell me a fun fact about fish"}]

# return_dict=True makes the return type explicit: a BatchEncoding holding both
# input_ids and attention_mask. Without it, some transformers versions return a
# bare tensor and the enc["input_ids"] lookup below fails.
enc = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)

input_ids = enc["input_ids"].to(device)
attention_mask = enc["attention_mask"].to(device)

with torch.no_grad():
    out = model.generate(
        input_ids=input_ids,
        # Passed explicitly because pad_token_id == eos_token_id below, so the
        # mask cannot be inferred reliably from the ids.
        attention_mask=attention_mask,
        max_new_tokens=50,
        do_sample=False,  # greedy decoding
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decodes the full sequence (prompt + completion) with special tokens stripped.
print(tokenizer.decode(out[0], skip_special_tokens=True))


system
You are a helpful function-calling AI assistant. You do not currently have access to any functions. <functions></functions>
user
Tell me a fun fact about fish
assistant
Sure! Here’s a fun fact: **Some species of fish can regrow their fins and even parts of their tails if they lose them during an encounter with a predator or while swimming through rough water.** This ability is called **fin regeneration**,


In [66]:
# Display the module hierarchy of the loaded model (layer names and shapes).
model

Olmo3ForCausalLM(
  (model): Olmo3Model(
    (embed_tokens): Embedding(100278, 4096, padding_idx=100277)
    (layers): ModuleList(
      (0-31): 32 x Olmo3DecoderLayer(
        (self_attn): Olmo3Attention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (q_norm): Olmo3RMSNorm((4096,), eps=1e-06)
          (k_norm): Olmo3RMSNorm((4096,), eps=1e-06)
        )
        (mlp): Olmo3MLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (post_attention_layernorm): Olmo3RMSNorm((4096,), eps=1e