# Assistive Keyboard — 7B (Colab A100, Drive‑persistent)

- base: `Qwen/Qwen2.5-7B-Instruct`
- Drive-backed code/data/adapters/results
- per-user LoRA + lexicon + RAG; eval KSS/latency


In [None]:
# gpu sanity
import sys, torch, platform
print('py:', sys.version.split()[0], '| cuda:', torch.cuda.is_available(), '| plat:', platform.platform())
if torch.cuda.is_available():
    !nvidia-smi

In [None]:
# Mount Google Drive and create the project's directory tree on it.
import os
from pathlib import Path

from google.colab import drive

drive.mount('/content/drive')

PROJ = Path('/content/drive/MyDrive/assistive_keyboard_7B')
PROJ.mkdir(parents=True, exist_ok=True)

# One sub-directory per artefact type; the names are reused by later cells.
CODE = PROJ/'code'
DATA = PROJ/'data'
SPLITS = PROJ/'splits'
USERS = PROJ/'users'
LEX = PROJ/'lexicons'
RAGD = PROJ/'rag'
ADAPT = PROJ/'adapters'
RUNS = PROJ/'runs'
CACHE = PROJ/'hf_cache'

(DATA/'processed').mkdir(parents=True, exist_ok=True)
for _d in (CODE, SPLITS, USERS, LEX, RAGD, ADAPT, RUNS, CACHE):
    _d.mkdir(exist_ok=True)

# Point both Hugging Face cache variables at the Drive-backed cache directory.
for _var in ('HF_HOME', 'TRANSFORMERS_CACHE'):
    os.environ[_var] = str(CACHE)
print('root:', PROJ)

In [None]:
# config (demo knobs)
# Split sizes: forwarded to make_splits as --max_test_authors / --*_tokens.
MAX_TEST_AUTHORS = 20
ADAPT_TOKENS     = 2000
VAL_TOKENS       = 800
TEST_TOKENS      = 2000
# LoRA hyper-parameters: forwarded to train_lora as --steps/--rank/--alpha/--dropout.
LORA_STEPS       = 600
LORA_RANK        = 8
LORA_ALPHA       = 16
LORA_DROPOUT     = 0.05
BASE_MODEL       = 'Qwen/Qwen2.5-7B-Instruct'
USE_BF16_INSTEAD_OF_4BIT = False  # flip if 4bit acts up; A100-80G can do bf16 easily
SEED = 42

# Seed every RNG this notebook touches with the same value.
import os
import random

import numpy as np
import torch

os.environ['PYTHONHASHSEED'] = str(SEED)
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

In [None]:
%%bash
# deps
# NOTE: the %%bash cell magic has to be the very first line of the cell; with a
# comment above it IPython treats the cell as Python and the shell lines fail.
set -e
pip -q install --upgrade pip
pip -q install numpy pandas tqdm pyyaml regex scikit-learn ujson
pip -q install transformers accelerate datasets sentence-transformers
pip -q install faiss-cpu peft bitsandbytes bert-score mauve-text
# Report the torch build and whether it can see a CUDA device.
python - <<'PY'
import torch; print('torch', torch.__version__, 'cuda?', torch.cuda.is_available())
PY

In [None]:
# project code → Drive
# Create the package skeleton under CODE; the module sources are written below.
from pathlib import Path
import textwrap

for sub in ('src/utils', 'src/data', 'src/splits', 'src/lexicon',
            'src/rag', 'src/lora', 'src/infer', 'src/eval'):
    CODE.joinpath(sub).mkdir(parents=True, exist_ok=True)

# Empty package markers for the top-level package and src.utils.
for _init in ('src/__init__.py', 'src/utils/__init__.py'):
    (CODE/_init).write_text('')
# src/utils/io.py: small text / JSONL read-write helpers.
# The literal MUST be raw: the module source contains '\n' escapes, and in a
# non-raw literal they become real line breaks inside a quoted string, which
# makes the written io.py a SyntaxError.
(CODE/'src/utils/io.py').write_text(textwrap.dedent(r'''
from pathlib import Path
import json, ujson

def read_lines(p):
    """Return the lines of UTF-8 text file *p*, without line terminators."""
    return Path(p).read_text(encoding='utf-8').splitlines()

def write_lines(p, lines):
    """Write *lines* to *p* joined by newlines, creating parent directories."""
    Path(p).parent.mkdir(parents=True, exist_ok=True)
    Path(p).write_text('\n'.join(lines), encoding='utf-8')

def read_jsonl(p):
    """Parse every non-blank line of *p* as JSON and return the list of objects."""
    out=[]
    with open(p,'r',encoding='utf-8') as f:
        for line in f:
            line=line.strip()
            if line:
                out.append(json.loads(line))
    return out

def write_jsonl(p, rows):
    """Write *rows* to *p* as one JSON document per line, creating parent directories."""
    Path(p).parent.mkdir(parents=True, exist_ok=True)
    with open(p,'w',encoding='utf-8') as f:
        for r in rows:
            f.write(ujson.dumps(r, ensure_ascii=False)+'\n')
'''), encoding='utf-8')
(CODE/'src/data/__init__.py').write_text('')
# src/data/clean.py: e-mail clean-up (quoted replies, signature block,
# whitespace) and a rough token counter.
(CODE/'src/data/clean.py').write_text(textwrap.dedent(r'''
import re
# Quoted-reply lines: any line that starts with one or more '>'.
QUOTE_RE = re.compile(r'(?m)^(>+).*?$')
# Signature block: a line holding only '--' (plus trailing blanks) and
# everything after it.  The previous lazy '.*?$' under MULTILINE stopped at the
# end of the first signature line, and the unanchored '--' also matched the tail
# of lines such as '-----Original Message-----'.
SIG_RE = re.compile(r'(?ms)^--[ \t]*$.*\Z')
def clean_text(s:str)->str:
    """Return *s* without quoted lines and signature, with whitespace collapsed."""
    s=s.replace('\r\n','\n')
    s=QUOTE_RE.sub('',s)
    s=SIG_RE.sub('',s)
    s=re.sub(r'[ \t]+',' ',s)      # runs of blanks/tabs -> one space
    s=re.sub(r'\n{3,}','\n\n',s)   # at most one empty line in a row
    return s.strip()
def approx_token_count(s:str)->int:
    """Approximate token count: word runs plus common punctuation marks."""
    return len(re.findall(r"\w+|[.,!?;:]", s))
'''))
# src/data/enron_loader.py: build_authors_jsonl(maildir_root, out_jsonl, min_doc_tokens)
# walks every file under each top-level directory of the maildir, cleans it with
# clean_text, keeps documents with at least min_doc_tokens approximate tokens and
# writes {'author_id', 'doc_id', 'text'} rows as JSONL. Unreadable files are skipped.
# NOTE(review): author_id is the top-level directory name and *every* file below it
# is attributed to that author; if those directories also hold received mail, that
# text is counted as the user's own writing — confirm this is intended.
(CODE/'src/data/enron_loader.py').write_text(textwrap.dedent(r'''
from pathlib import Path
from .clean import clean_text, approx_token_count
from src.utils.io import write_jsonl
def build_authors_jsonl(maildir_root: str, out_jsonl: str, min_doc_tokens: int = 20):
    rows=[]; maildir=Path(maildir_root)
    for user_dir in maildir.iterdir():
        if not user_dir.is_dir(): continue
        author_id=user_dir.name
        for p in user_dir.rglob('*'):
            if not p.is_file(): continue
            try: txt=p.read_text(errors='ignore')
            except Exception: continue
            txt=clean_text(txt)
            if approx_token_count(txt)>=min_doc_tokens:
                rows.append({'author_id':author_id,'doc_id':str(p.relative_to(maildir)),'text':txt})
    write_jsonl(out_jsonl, rows); print(f'wrote {len(rows)} docs → {out_jsonl}')
'''))
(CODE/'src/splits/__init__.py').write_text('')
# src/splits/make_splits.py: author-disjoint train/dev/test id lists plus
# per-test-author adapt/val/test text slices under <out_dir>/../users/<author>/.
# Fix: --test_tokens was parsed but never used, so test.txt received every
# remaining document of the author; the test slice is now capped as well.
(CODE/'src/splits/make_splits.py').write_text(textwrap.dedent(r'''
import argparse, random
from collections import defaultdict
from pathlib import Path
from src.utils.io import read_jsonl, write_lines
from src.data.clean import approx_token_count

def slice_user_docs(texts, adapt_tokens, val_tokens, test_tokens):
    """Assign whole documents, in order, to adapt, then val, then test.

    A slice keeps receiving documents until its token budget is reached, so each
    slice may overshoot its budget by at most one document.  Documents left over
    once the test budget is reached are not used.
    Returns the three lists (adapt, val, test).
    """
    adapt=[]; val=[]; test=[]
    used=0        # tokens placed in adapt+val
    test_used=0   # tokens placed in test
    for t in texts:
        tc=approx_token_count(t)
        if used<adapt_tokens:
            adapt.append(t); used+=tc
        elif used<adapt_tokens+val_tokens:
            val.append(t); used+=tc
        elif test_used<test_tokens:
            test.append(t); test_used+=tc
        else:
            break
    return adapt, val, test

def main():
    """CLI entry point: write the author id lists and the per-user text slices."""
    ap=argparse.ArgumentParser()
    ap.add_argument('--authors_jsonl', required=True)
    ap.add_argument('--out_dir', required=True)
    ap.add_argument('--min_tokens', type=int, default=4000)
    ap.add_argument('--adapt_tokens', type=int, default=2000)
    ap.add_argument('--val_tokens', type=int, default=800)
    ap.add_argument('--test_tokens', type=int, default=2000)
    ap.add_argument('--max_test_authors', type=int, default=4)
    ap.add_argument('--seed', type=int, default=42)
    args=ap.parse_args(); random.seed(args.seed)
    out_dir=Path(args.out_dir); out_dir.mkdir(parents=True, exist_ok=True)
    by_author=defaultdict(list)
    for r in read_jsonl(args.authors_jsonl): by_author[r['author_id']].append(r['text'])
    # keep only authors with enough total text
    kept={}
    for a,docs in by_author.items():
        tot=sum(approx_token_count(t) for t in docs)
        if tot>=args.min_tokens: kept[a]=docs
    # sort before shuffling so the split depends only on the seed
    authors=sorted(kept.keys()); random.shuffle(authors)
    n=len(authors); n_train=int(0.70*n); n_dev=int(0.15*n)
    train_ids=authors[:n_train]; dev_ids=authors[n_train:n_train+n_dev]; test_ids=authors[n_train+n_dev:]
    test_ids=test_ids[:args.max_test_authors]
    write_lines(out_dir/'authors_train.txt', train_ids)
    write_lines(out_dir/'authors_dev.txt', dev_ids)
    write_lines(out_dir/'authors_test.txt', test_ids)
    users_dir=out_dir.parent/'users'; users_dir.mkdir(exist_ok=True)
    for a in test_ids:
        texts=kept[a][:]; random.shuffle(texts)
        adapt,val,test=slice_user_docs(texts, args.adapt_tokens, args.val_tokens, args.test_tokens)
        udir=users_dir/a; udir.mkdir(parents=True, exist_ok=True)
        (udir/'adapt.txt').write_text('\n\n'.join(adapt), encoding='utf-8')
        (udir/'val.txt').write_text('\n\n'.join(val), encoding='utf-8')
        (udir/'test.txt').write_text('\n\n'.join(test), encoding='utf-8')
    print(f'train/dev/test: {len(train_ids)}/{len(dev_ids)}/{len(test_ids)} | users/* ready')
if __name__=='__main__': main()
'''))
(CODE/'src/lexicon/__init__.py').write_text('')
# src/lexicon/build_lexicon.py: for every user directory that has an adapt.txt,
# fit a TF-IDF vectorizer (uni- and bi-grams, custom regex tokenizer) over the
# lines of that file, sum the scores per term and write the top --max_items
# terms to <out_dir>/<user>.lexicon.json as {'entries': [{'token', 'score'}, ...]}.
(CODE/'src/lexicon/build_lexicon.py').write_text(textwrap.dedent(r'''
import argparse, json, re, numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
def tok(s):
    return re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\sA-Za-z0-9]", s)
def build_lex(text, k=4000):
    v=TfidfVectorizer(tokenizer=tok, lowercase=True, ngram_range=(1,2), min_df=2, max_df=0.9, use_idf=True, smooth_idf=True, norm=None)
    X=v.fit_transform(text.splitlines()); vocab=v.get_feature_names_out()
    scores=np.asarray(X.sum(axis=0)).ravel(); idx=scores.argsort()[::-1]
    top=[(vocab[i], float(scores[i])) for i in idx[:k]]
    return {'entries':[{'token':t,'score':s} for t,s in top]}
def main():
    ap=argparse.ArgumentParser(); ap.add_argument('--users_dir', required=True); ap.add_argument('--out_dir', required=True); ap.add_argument('--max_items', type=int, default=4000); args=ap.parse_args()
    out=Path(args.out_dir); out.mkdir(parents=True, exist_ok=True)
    for u in Path(args.users_dir).iterdir():
        if not u.is_dir(): continue
        p=u/'adapt.txt'
        if not p.exists(): continue
        text=p.read_text(encoding='utf-8'); lex=build_lex(text, args.max_items)
        (out/f'{u.name}.lexicon.json').write_text(json.dumps(lex, ensure_ascii=False, indent=2), encoding='utf-8')
        print('lex:', u.name)
if __name__=='__main__': main()
'''))
(CODE/'src/rag/__init__.py').write_text('')
# src/rag/build_rag.py: for every user directory that has an adapt.txt, cut the
# text into overlapping character windows (--chunk_chars / --overlap), embed them
# with a SentenceTransformer, L2-normalise and store them in a FAISS inner-product
# index. Writes <out_dir>/<user>.faiss and <out_dir>/<user>.chunks.json (the chunk
# texts, in index order).
(CODE/'src/rag/build_rag.py').write_text(textwrap.dedent(r'''
import argparse, json
from pathlib import Path
from sentence_transformers import SentenceTransformer
import faiss, numpy as np
def chunks(s, m=300, ov=50):
    s=s.strip(); out=[]; i=0
    while i<len(s): out.append(s[i:i+m]); i+=max(1, m-ov)
    return out
def main():
    ap=argparse.ArgumentParser(); ap.add_argument('--users_dir', required=True); ap.add_argument('--out_dir', required=True)
    ap.add_argument('--model_name', default='sentence-transformers/all-MiniLM-L6-v2'); ap.add_argument('--chunk_chars', type=int, default=300); ap.add_argument('--overlap', type=int, default=50)
    args=ap.parse_args(); emb=SentenceTransformer(args.model_name)
    out=Path(args.out_dir); out.mkdir(parents=True, exist_ok=True)
    for u in Path(args.users_dir).iterdir():
        if not u.is_dir(): continue
        p=u/'adapt.txt'
        if not p.exists(): continue
        cs=chunks(p.read_text(encoding='utf-8'), args.chunk_chars, args.overlap)
        if not cs: continue
        X=emb.encode(cs, batch_size=64, convert_to_numpy=True, show_progress_bar=False).astype(np.float32)
        faiss.normalize_L2(X); idx=faiss.IndexFlatIP(X.shape[1]); idx.add(X)
        faiss.write_index(idx, str(out/f'{u.name}.faiss'))
        (out/f'{u.name}.chunks.json').write_text(json.dumps(cs, ensure_ascii=False), encoding='utf-8')
        print('rag:', u.name, len(cs))
if __name__=='__main__': main()
'''))
(CODE/'src/lora/__init__.py').write_text('')
# src/lora/train_lora.py (4-bit variant): one LoRA adapter per user directory.
# Fixes relative to the first draft:
#   * mk_ds kept dropping the final block of every text (range stopped bs early);
#   * the same PEFT-wrapped model was trained for all users, so each adapter
#     started from (and was saved with) the previous users' weights — LoRA is now
#     attached per user and removed again with unload();
#   * pad_token_id could be None, and padded positions are now excluded from the loss;
#   * `eval_strategy` is used, matching the other trainer written by this notebook.
(CODE/'src/lora/train_lora.py').write_text(textwrap.dedent(r'''
import argparse, re
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

def read_txt(p): return Path(p).read_text(encoding='utf-8')

def mk_ds(txt: str, tok, bs=256):
    """Tokenise *txt* and cut it into consecutive blocks of at most *bs* ids."""
    ids=tok(txt, return_tensors=None, truncation=False)['input_ids']
    # walk the whole sequence so the last (possibly short) block is kept
    blocks=[ids[i:i+bs] for i in range(0, len(ids), bs)]
    # a one-token block has no next-token target; fall back to the head slice
    blocks=[b for b in blocks if len(b)>1] or [ids[:bs]]
    return Dataset.from_dict({'input_ids': blocks})

def last_ck(d: Path):
    """Return the checkpoint-* directory under *d* with the highest step, or None."""
    def step(p):
        m=re.search(r'checkpoint-(\d+)', p.name); return int(m.group(1)) if m else -1
    c=[p for p in d.glob('checkpoint-*') if p.is_dir()]
    return max(c, key=step) if c else None

def main():
    """CLI entry point: train and save <adapters_dir>/<user>/lora_adapter for each user."""
    ap=argparse.ArgumentParser()
    ap.add_argument('--users_dir', required=True)
    ap.add_argument('--adapters_dir', required=True)
    ap.add_argument('--base_model', default='Qwen/Qwen2.5-7B-Instruct')
    ap.add_argument('--rank', type=int, default=8)
    ap.add_argument('--alpha', type=int, default=16)
    ap.add_argument('--dropout', type=float, default=0.05)
    ap.add_argument('--lr', type=float, default=2e-4)
    ap.add_argument('--steps', type=int, default=300)
    ap.add_argument('--block_size', type=int, default=256)
    args=ap.parse_args()
    tok=AutoTokenizer.from_pretrained(args.base_model, use_fast=True)
    if tok.pad_token_id is None and tok.eos_token_id is not None:
        tok.pad_token=tok.eos_token
    base=AutoModelForCausalLM.from_pretrained(args.base_model, device_map='auto', load_in_4bit=True, torch_dtype=torch.float16)
    base=prepare_model_for_kbit_training(base)
    lora_cfg=LoraConfig(r=args.rank, lora_alpha=args.alpha, lora_dropout=args.dropout, bias='none', task_type='CAUSAL_LM')

    def collate(batch):
        # right-pad to the longest block; padded positions get label -100 so
        # they do not contribute to the loss
        feats=[b['input_ids'] for b in (batch if isinstance(batch,list) else [batch])]
        ml=max(len(f) for f in feats)
        pad=tok.pad_token_id if tok.pad_token_id is not None else 0
        ids=torch.tensor([f+[pad]*(ml-len(f)) for f in feats], dtype=torch.long)
        mask=torch.tensor([[1]*len(f)+[0]*(ml-len(f)) for f in feats], dtype=torch.long)
        return {'input_ids': ids, 'attention_mask': mask, 'labels': ids.masked_fill(mask==0, -100)}

    for u in Path(args.users_dir).iterdir():
        if not u.is_dir(): continue
        a=u/'adapt.txt'; v=u/'val.txt'
        if not a.exists() or not v.exists(): continue
        out=Path(args.adapters_dir)/u.name; out.mkdir(parents=True, exist_ok=True)
        tr=mk_ds(read_txt(a), tok, args.block_size); dv=mk_ds(read_txt(v), tok, args.block_size)
        targs=TrainingArguments(output_dir=str(out), per_device_train_batch_size=1, per_device_eval_batch_size=1, gradient_accumulation_steps=8,
            logging_steps=10, learning_rate=args.lr, max_steps=args.steps, eval_strategy='steps', eval_steps=100, save_strategy='steps', save_steps=100, save_total_limit=3, report_to='none')
        # fresh adapter weights for this user only
        model=get_peft_model(base, lora_cfg)
        ck=last_ck(out)
        Trainer(model=model, args=targs, train_dataset=tr, eval_dataset=dv, data_collator=collate).train(resume_from_checkpoint=str(ck) if ck else None)
        model.save_pretrained(str(out/'lora_adapter'))
        print('adapter:', out/'lora_adapter')
        # strip the LoRA layers again so the next user starts from the plain base
        base=model.unload()
if __name__=='__main__': main()
'''))
(CODE/'src/infer/__init__.py').write_text('')
# src/infer/suggest.py: the suggesters used by the typing simulation.
# Fixes relative to the first draft:
#   * NGram.__init__ skipped the last n-gram of the text (off-by-one);
#   * NGram.suggest fell back to a plain dict, which has no most_common(), so any
#     unseen context raised AttributeError;
#   * LLM._prompt rendered the empty memory list as the text "[]";
#   * prompts longer than max_ctx were truncated on the right, cutting off the
#     text nearest the cursor and the "Continue:" cue — truncation is now on the left;
#   * Bias builds its index/value lists once and matches the scores dtype.
(CODE/'src/infer/suggest.py').write_text(textwrap.dedent(r'''
from collections import defaultdict, Counter
import re, json
from pathlib import Path
import numpy as np, torch
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor
from peft import PeftModel
from sentence_transformers import SentenceTransformer

def tok_basic(s):
    """Split text into word, number and single-symbol tokens."""
    return re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\sA-Za-z0-9]", s)

class NGram:
    """Count-based next-token suggester built from one text (lower-cased)."""
    def __init__(self, text: str, n: int = 3):
        toks=tok_basic(text.lower()); self.n=n; self.ng=defaultdict(Counter)
        # +1 so the final n-gram of the text is counted too
        for i in range(len(toks)-n+1): self.ng[tuple(toks[i:i+n-1])][toks[i+n-1]]+=1
    def suggest(self, ctx: str, k: int = 3):
        """Return up to *k* continuations seen after the last n-1 tokens of *ctx*."""
        toks=tok_basic(ctx.lower()); key=tuple(toks[-(self.n-1):]) if len(toks)>=self.n-1 else tuple(toks)
        # .get (not indexing) so lookups never grow the defaultdict
        cand=self.ng.get(key) or Counter()
        return [w for w,_ in cand.most_common(k)]

class Bias(LogitsProcessor):
    """Add a fixed bonus to the logits of selected token ids."""
    def __init__(self, mp):
        self.mp=mp or {}
        self.ids=list(self.mp.keys()); self.vals=list(self.mp.values())
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        if self.ids:
            scores[:, self.ids]+=torch.tensor(self.vals, device=scores.device, dtype=scores.dtype)
        return scores

class Lex:
    """Map lexicon entries that encode to exactly one token id to a bonus of *cap*."""
    def __init__(self, tok, lex_json, cap=2.5):
        self.tok=tok; self.mp={}
        try: entries=json.loads(lex_json)['entries']
        except Exception: entries=[]   # unreadable lexicon -> no bias
        for e in entries[:2000]:
            ids=self.tok(e['token'], add_special_tokens=False)['input_ids']
            if len(ids)==1: self.mp[ids[0]]=cap
    def proc(self): return Bias(self.mp)

class RAG:
    """Nearest-chunk lookup over a FAISS index and its JSON list of chunk texts."""
    def __init__(self, f, c, em='sentence-transformers/all-MiniLM-L6-v2'):
        self.idx=faiss.read_index(str(f)); self.ch=json.loads(Path(c).read_text(encoding='utf-8'))
        self.emb=SentenceTransformer(em)
    def top(self, text, k=4):
        q=self.emb.encode([text], convert_to_numpy=True).astype(np.float32); faiss.normalize_L2(q); D,I=self.idx.search(q,k)
        # negative ids are skipped (no neighbour available for that slot)
        return [self.ch[i] for i in I[0] if i>=0]

class LLM:
    """Causal-LM suggester with optional LoRA adapter, RAG memory and lexicon bias."""
    def __init__(self, base: str, adapter_dir: str=None, rag=None, lex=None, max_ctx=512, bf16=False):
        self.tok=AutoTokenizer.from_pretrained(base, use_fast=True)
        # when the prompt exceeds max_ctx, drop its beginning, not the text at the cursor
        self.tok.truncation_side='left'
        if bf16:
            self.m=AutoModelForCausalLM.from_pretrained(base, device_map='auto', torch_dtype=torch.bfloat16)
        else:
            self.m=AutoModelForCausalLM.from_pretrained(base, device_map='auto', load_in_4bit=True)
        if adapter_dir and Path(adapter_dir).exists(): self.m=PeftModel.from_pretrained(self.m, adapter_dir)
        self.rag=rag; self.lex=lex; self.max_ctx=max_ctx; self.m.eval()
    def _prompt(self, tail, mem):
        """Build the prompt; the Memory section is omitted when *mem* is empty."""
        mem_txt=''
        if mem:
            bullets='\n'.join(f'- {c[:200]}' for c in mem)
            mem_txt=f"Memory\n{bullets}\n\n"
        return f"Continue in the user's style.\n{mem_txt}Draft:\n{tail}\n\nContinue:"
    def suggest(self, ctx: str, k: int = 3):
        """Return up to *k* distinct short continuations (first run of <=8 non-space chars)."""
        tail=ctx[-1000:]; mem=self.rag.top(tail,4) if self.rag else []
        prompt=self._prompt(tail, mem)
        ids=self.tok(prompt, return_tensors='pt', truncation=True, max_length=self.max_ctx).to(self.m.device)
        procs=[self.lex.proc()] if self.lex else None
        with torch.no_grad():
            out=self.m.generate(**ids, max_new_tokens=6, do_sample=False, num_beams=max(1,k), num_return_sequences=k,
                                 logits_processor=procs, pad_token_id=self.tok.eos_token_id)
        # decode only the newly generated part of each beam
        texts=self.tok.batch_decode(out[:, ids['input_ids'].shape[1]:], skip_special_tokens=True)
        res=[]
        for t in texts:
            t=t.strip(); m=re.match(r"^\S{1,8}", t); s=m.group(0) if m else t[:8]
            if s and s not in res: res.append(s)
        return res[:k]
'''))
(CODE/'src/eval/__init__.py').write_text('')
# src/eval/typing_sim.py: replay each user's test.txt through a suggester and
# report keystroke savings (KSS) per user as CSV.
# Fix: the old accounting used the number of loop iterations as "plain" keys
# and subtracted the saved characters from a total that never contained them,
# so KSS could exceed 1 (or keys_with go negative).  Now
#   keys_plain = len(doc)
#   keys_with  = hand-typed characters + one key per accepted suggestion.
# Empty suggestions are ignored and rows are written with writerows().
(CODE/'src/eval/typing_sim.py').write_text(textwrap.dedent(r'''
import argparse, time, csv
from pathlib import Path
from src.infer.suggest import NGram, LLM, Lex, RAG

def load_text(p): return Path(p).read_text(encoding='utf-8')

def sim(doc: str, sugg, k=3, max_chunk=8):
    """Simulate typing *doc* with suggester *sugg*.

    After every hand-typed character the suggester is asked for up to *k*
    candidates; the first one (cut to *max_chunk* chars) that matches the
    upcoming text case-insensitively is accepted with a single key press.
    Returns keys_plain, keys_with, kss = 1 - keys_with/keys_plain, accepts
    and the wall time in milliseconds.
    """
    kp=len(doc); kw=0; acc=0; i=0; t0=time.time()
    while i<len(doc):
        kw+=1; i+=1                      # one character typed by hand
        remain=doc[i:]
        if not remain: break             # nothing left to predict
        low=remain.lower()
        for s in (sugg.suggest(doc[:i], k=k) or []):
            s=s[:max_chunk]
            if s and low.startswith(s.lower()):
                kw+=1; i+=len(s); acc+=1 # one key accepts len(s) characters
                break
    ms=(time.time()-t0)*1000.0; kss=1.0-(kw/max(kp,1))
    return dict(keys_plain=kp, keys_with=kw, kss=kss, accepts=acc, time_ms=ms)

def main():
    """CLI entry point: evaluate one mode over all users and write --results_csv."""
    ap=argparse.ArgumentParser()
    ap.add_argument('--users_dir', required=True)
    ap.add_argument('--mode', choices=['ngram','llm_base','llm_lex','llm_full'], default='ngram')
    ap.add_argument('--base_model', default='Qwen/Qwen2.5-7B-Instruct')
    ap.add_argument('--adapters_dir', default='adapters')
    ap.add_argument('--lexicons_dir', default='lexicons')
    ap.add_argument('--rag_dir', default='rag')
    ap.add_argument('--results_csv', required=True)
    ap.add_argument('--k', type=int, default=3)
    ap.add_argument('--bf16', action='store_true')
    args=ap.parse_args(); out=[]
    users=Path(args.users_dir)
    for u in users.iterdir():
        if not u.is_dir(): continue
        a=u/'adapt.txt'; t=u/'test.txt'
        if not a.exists() or not t.exists(): continue
        if args.mode=='ngram':
            sg=NGram(load_text(a))
        else:
            from transformers import AutoTokenizer
            lex=None
            if args.mode in ('llm_lex','llm_full'):
                lp=Path(args.lexicons_dir)/f'{u.name}.lexicon.json'
                if lp.exists(): lex=Lex(AutoTokenizer.from_pretrained(args.base_model, use_fast=True), lp.read_text(encoding='utf-8'), cap=2.5)
            # NOTE(review): RAG is attached in every llm_* mode (llm_base included)
            # whenever the index files exist — confirm this is the intended ablation.
            rag=None
            fp=Path(args.rag_dir)/f'{u.name}.faiss'; cp=Path(args.rag_dir)/f'{u.name}.chunks.json'
            if fp.exists() and cp.exists(): rag=RAG(fp, cp)
            ad=str(Path(args.adapters_dir)/u.name/'lora_adapter') if args.mode=='llm_full' else None
            sg=LLM(args.base_model, adapter_dir=ad, rag=rag, lex=lex, bf16=args.bf16)
        res=sim(load_text(t), sg, k=args.k)
        out.append({'user': u.name, 'mode': args.mode, **res}); print(u.name, args.mode, f"KSS={res['kss']:.3f}")
    Path(args.results_csv).parent.mkdir(parents=True, exist_ok=True)
    with open(args.results_csv,'w',newline='',encoding='utf-8') as f:
        w=csv.DictWriter(f, fieldnames=list(out[0].keys()) if out else ['user','mode','kss'])
        w.writeheader(); w.writerows(out)
    print('csv →', args.results_csv)
if __name__=='__main__': main()
'''))
print('code rooted at', CODE)

### data → authors.jsonl (rewrites io.py, then builds from the already-extracted Enron maildir)

In [None]:
%%bash
set -euo pipefail

# (Re)write src/utils/io.py with a known-good copy. The escapes are doubled
# because the module source sits inside a non-raw Python string.
python - <<'PY'
from pathlib import Path
p = Path('/content/drive/MyDrive/assistive_keyboard_7B/code/src/utils/io.py')
p.parent.mkdir(parents=True, exist_ok=True)
p.write_text(
"""from pathlib import Path
import json, ujson

def read_lines(p):
    return Path(p).read_text(encoding='utf-8').splitlines()

def write_lines(p, lines):
    Path(p).parent.mkdir(parents=True, exist_ok=True)
    Path(p).write_text('\\n'.join(lines), encoding='utf-8')

def read_jsonl(p):
    out=[]
    with open(p,'r',encoding='utf-8') as f:
        for line in f:
            line=line.strip()
            if line:
                out.append(json.loads(line))
    return out

def write_jsonl(p, rows):
    Path(p).parent.mkdir(parents=True, exist_ok=True)
    with open(p,'w',encoding='utf-8') as f:
        for r in rows:
            f.write(ujson.dumps(r, ensure_ascii=False)+'\\n')
"""
, encoding='utf-8')
print("fixed:", p)
PY

# Build authors.jsonl from the extracted 20110402 maildir.
python - <<'PY'
# `sys` is imported from the stdlib directly; `from pathlib import Path, sys`
# only worked where pathlib happened to expose its own `sys` import.
import sys
from pathlib import Path
sys.path.append('/content/drive/MyDrive/assistive_keyboard_7B/code')
from src.data.enron_loader import build_authors_jsonl
maildir = '/content/enron_mail_20110402/maildir'
# This cell does not download anything: fail with a clear message if the
# archive has not been extracted yet.
if not Path(maildir).is_dir():
    sys.exit(f'maildir not found: {maildir} (extract the Enron archive there first)')
out = '/content/drive/MyDrive/assistive_keyboard_7B/data/processed/authors.jsonl'
Path(out).parent.mkdir(parents=True, exist_ok=True)
build_authors_jsonl(maildir, out, min_doc_tokens=20)
print('authors.jsonl ✓ ->', out)
PY

### splits (author‑disjoint) + per‑user slices (use config)

In [None]:
# Run make_splits in-process: fill sys.argv from the config cell, then call main().
import sys
from pathlib import Path

root = Path('/content/drive/MyDrive/assistive_keyboard_7B')
sys.path.append(str(root/'code'))

# Flag -> value, in the order they are placed on the command line.
_split_flags = {
    '--authors_jsonl':    root/'data/processed/authors.jsonl',
    '--out_dir':          root/'splits',
    '--min_tokens':       ADAPT_TOKENS+VAL_TOKENS+TEST_TOKENS,
    '--adapt_tokens':     ADAPT_TOKENS,
    '--val_tokens':       VAL_TOKENS,
    '--test_tokens':      TEST_TOKENS,
    '--max_test_authors': MAX_TEST_AUTHORS,
    '--seed':             SEED,
}
sys.argv = ['splits']
for _flag, _value in _split_flags.items():
    sys.argv += [_flag, str(_value)]

from src.splits.make_splits import main as run
run()

### prefetch base model (warms HF cache)

In [None]:
# Load the tokenizer and the base model once so their files land in the HF cache.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

# bf16 weights when the config flag is set, 4-bit loading otherwise.
if USE_BF16_INSTEAD_OF_4BIT:
    _load_kwargs = {'torch_dtype': torch.bfloat16}
else:
    _load_kwargs = {'load_in_4bit': True}
m = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', **_load_kwargs)
del m  # only the cached files are needed from this cell
print('prefetch ok')

### per‑user assets: lexicon + FAISS RAG

In [None]:
# Build the per-user lexicons and FAISS indexes by calling both CLIs in-process.
import sys

root = Path('/content/drive/MyDrive/assistive_keyboard_7B')
sys.path.append(str(root/'code'))
_users_arg = str(root/'users')

# 1) lexicons
sys.argv = [
    'lex',
    '--users_dir', _users_arg,
    '--out_dir', str(root/'lexicons'),
    '--max_items', '4000',
]
from src.lexicon.build_lexicon import main as lex
lex()

# 2) retrieval indexes
sys.argv = [
    'rag',
    '--users_dir', _users_arg,
    '--out_dir', str(root/'rag'),
    '--model_name', 'sentence-transformers/all-MiniLM-L6-v2',
    '--chunk_chars', '300',
    '--overlap', '50',
]
from src.rag.build_rag import main as rag
rag()

### LoRA per user (resumable ckpts; bf16 toggle)

In [None]:
# Restore the HF Auto classes, force a fresh import of our trainer module, then
# run LoRA training over every directory in users/.
# The order of the steps matters: reload transformers first, purge the cached
# trainer module second, import the trainer last.

import os, sys, importlib
import transformers  # currently mutated (NOTE(review): by what is not visible in this notebook — confirm)

# 1) full refresh of transformers so AutoModelForCausalLM has its real .from_pretrained again
transformers = importlib.reload(transformers)
import transformers.models.auto.modeling_auto as modeling_auto
importlib.reload(modeling_auto)
import transformers.modeling_utils as modeling_utils
importlib.reload(modeling_utils)
from transformers import AutoModelForCausalLM  # fresh class now

# 2) export the dtype mode as TRAIN_DTYPE ("bf16" when USE_BF16_INSTEAD_OF_4BIT is set, else "4bit")
# NOTE(review): neither train_lora.py variant written by this notebook reads
# TRAIN_DTYPE, so as it stands this toggle has no effect on training.
use_bf16 = globals().get('USE_BF16_INSTEAD_OF_4BIT', False)
os.environ["TRAIN_DTYPE"] = "bf16" if use_bf16 else "4bit"
print("TRAIN_DTYPE:", os.environ["TRAIN_DTYPE"])

# 3) purge old trainer import so it re-imports transformers AFTER our reload
for k in list(sys.modules):
    if k.startswith('src.lora.train_lora'):
        del sys.modules[k]

# 4) run training with your existing knobs (config-cell values, with the same
#    defaults as fallbacks when the config cell has not been run)
from pathlib import Path
root = Path('/content/drive/MyDrive/assistive_keyboard_7B')
from src.lora.train_lora import main as train

# The trainer parses sys.argv itself, so the CLI is emulated here.
sys.argv = [
  'train',
  '--users_dir',    str(root/'users'),
  '--adapters_dir', str(root/'adapters'),
  '--base_model',   globals().get('BASE_MODEL', 'Qwen/Qwen2.5-7B-Instruct'),
  '--rank',         str(globals().get('LORA_RANK', 8)),
  '--alpha',        str(globals().get('LORA_ALPHA', 16)),
  '--dropout',      str(globals().get('LORA_DROPOUT', 0.05)),
  '--lr',           '2e-4',
  '--steps',        str(globals().get('LORA_STEPS', 600)),
  '--block_size',   '256'
]
train()

In [None]:
# build users_active = finished adapters + top-N unfinished (by adapt tokens)
# Changes vs. the first draft: the "finished" checkpoint name follows LORA_STEPS
# instead of a hard-coded 600; users without adapt.txt and finished adapters
# without a users/ directory are skipped instead of crashing / producing
# dangling links; only OSError triggers the copy fallback.
from pathlib import Path
import os, shutil
from transformers import AutoTokenizer

ROOT = Path("/content/drive/MyDrive/assistive_keyboard_7B")
BASE = globals().get("BASE_MODEL","Qwen/Qwen2.5-7B-Instruct")
N = 20  # cohort size
_steps = int(globals().get("LORA_STEPS", 600))
_users_root = ROOT/"users"

# A user counts as finished when a saved adapter or the final checkpoint exists
# on Drive and the user's text directory is still available to link to.
finished=[]
AD=ROOT/"adapters"
if AD.exists():
    for p in sorted(AD.iterdir()):
        if not p.is_dir(): continue
        has_adapter = (p/"lora_adapter").exists() or (p/f"checkpoint-{_steps}").exists()
        if has_adapter and (_users_root/p.name).is_dir():
            finished.append(p.name)
_finished_set = set(finished)

# Rank the remaining users by the tokenised length of their adaptation text.
tok = AutoTokenizer.from_pretrained(BASE, use_fast=True)
cands=[]
for d in sorted(p for p in _users_root.iterdir() if p.is_dir()):
    if d.name in _finished_set: continue
    adapt_file = d/"adapt.txt"
    if not adapt_file.exists(): continue
    n = len(tok(adapt_file.read_text(encoding="utf-8"), add_special_tokens=False)["input_ids"])
    cands.append((n, d.name))
cands.sort(reverse=True)

sel = finished + [name for _,name in cands[:max(0, N-len(finished))]]
print(f"cohort size: {len(sel)}  (finished={len(finished)})")

UA = ROOT/"users_active"; UA.mkdir(exist_ok=True)
for name in sel:
    src = _users_root/name
    dst = UA/name
    if dst.exists() or dst.is_symlink(): continue
    try:
        os.symlink(src, dst)
    except OSError:
        # the file system may not support symlinks; fall back to a real copy
        shutil.copytree(src, dst)
print("users_active ready:", len(list(UA.iterdir())))

In [None]:
# Clean overwrite of train_lora.py (PEFT LoRA, bf16, single-GPU, skip-finished, safe resume)
# Fixes relative to the previous version of this cell:
#   * mk_blocks dropped every block after the first ones (range stopped one
#     block early), so part of each user's text was never trained on;
#   * get_peft_model was applied again and again to the same base model, whose
#     layers had already been replaced by the previous user's LoRA layers — the
#     adapter is now removed with unload() after each user;
#   * padded positions were part of the loss and the attention mask was derived
#     from the pad id (which can equal a real token id); both now come from the
#     true block lengths;
#   * the "finished" checkpoint name follows --steps instead of a fixed 600;
#   * "adapter: <drive path>" is only printed when the copy to Drive succeeded.
from pathlib import Path
import textwrap, sys, importlib, os, shutil, re

ROOT = Path("/content/drive/MyDrive/assistive_keyboard_7B")
TRAINER = ROOT/"code"/"src/lora/train_lora.py"

TRAINER.write_text(textwrap.dedent(r"""
import argparse, os, re, shutil
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch

def read_txt(p):
    return Path(p).read_text(encoding='utf-8')

def mk_blocks(ids, bs=256):
    '''Cut token ids into consecutive blocks of at most bs ids (tail included).'''
    if not ids:
        return [[0]*bs]
    blocks = [ids[i:i+bs] for i in range(0, len(ids), bs)]
    # a one-token tail has no next-token target; drop it unless it is all there is
    return [b for b in blocks if len(b) > 1] or blocks[:1]

def mk_ds(txt, tok, bs=256):
    '''Tokenise txt (no special tokens) and wrap its blocks in a Dataset.'''
    ids = tok(txt, add_special_tokens=False)["input_ids"]
    return Dataset.from_dict({"input_ids": mk_blocks(ids, bs)})

def step_from_ckpt_dir(p: Path):
    '''Step number encoded in a checkpoint-<n> directory name, or -1.'''
    m = re.search(r"checkpoint-(\d+)$", p.name)
    return int(m.group(1)) if m else -1

def latest_ckpt(d: Path):
    '''Highest-step checkpoint-* directory under d, or None.'''
    if not d.exists(): return None
    cks = [p for p in d.glob("checkpoint-*") if p.is_dir()]
    return max(cks, key=step_from_ckpt_dir) if cks else None

def main():
    '''CLI entry point: train one LoRA adapter per user and copy it to --adapters_dir.'''
    ap = argparse.ArgumentParser()
    ap.add_argument('--users_dir', required=True)
    ap.add_argument('--adapters_dir', required=True)
    ap.add_argument('--base_model', default='Qwen/Qwen2.5-7B-Instruct')
    ap.add_argument('--rank', type=int, default=8)
    ap.add_argument('--alpha', type=int, default=16)
    ap.add_argument('--dropout', type=float, default=0.05)
    ap.add_argument('--lr', type=float, default=2e-4)
    ap.add_argument('--steps', type=int, default=600)
    ap.add_argument('--block_size', type=int, default=256)
    args = ap.parse_args()

    tok = AutoTokenizer.from_pretrained(args.base_model, use_fast=True)
    if tok.pad_token_id is None and tok.eos_token_id is not None:
        tok.pad_token = tok.eos_token
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else 0

    # base model on a single GPU (device 0) in bf16
    base = AutoModelForCausalLM.from_pretrained(
        args.base_model, device_map={'':0}, dtype=torch.bfloat16
    )
    base.config.use_cache = False  # don't warn during training

    lora_cfg = LoraConfig(
        r=args.rank, lora_alpha=args.alpha, lora_dropout=args.dropout,
        bias='none', task_type='CAUSAL_LM'
    )
    final_ckpt = f'checkpoint-{args.steps}'

    def collate(batch):
        # right-pad to the longest block; mask and labels follow the real
        # lengths, so padding is neither attended to nor scored
        feats = [b['input_ids'] for b in (batch if isinstance(batch, list) else [batch])]
        ml = max(len(f) for f in feats)
        ids = torch.tensor([f + [pad_id]*(ml-len(f)) for f in feats], dtype=torch.long)
        attn = torch.tensor([[1]*len(f) + [0]*(ml-len(f)) for f in feats], dtype=torch.long)
        return {'input_ids': ids, 'attention_mask': attn, 'labels': ids.masked_fill(attn == 0, -100)}

    users = sorted(p for p in Path(args.users_dir).iterdir() if p.is_dir())
    for u in users:
        a = u/'adapt.txt'; v = u/'val.txt'
        if not a.exists() or not v.exists():
            continue

        out_drive = Path(args.adapters_dir)/u.name
        out_drive.mkdir(parents=True, exist_ok=True)

        # skip authors already finished
        if (out_drive/'lora_adapter').exists() or (out_drive/final_ckpt).exists():
            print('skip finished:', u.name)
            continue

        # checkpoints go to local disk; only the final adapter is copied to Drive
        out_tmp = Path('/content/adapters_tmp')/u.name
        out_tmp.mkdir(parents=True, exist_ok=True)

        # resume: prefer tmp ckpt; fallback to old drive ckpt; only partial runs are resumed
        ck = latest_ckpt(out_tmp) or latest_ckpt(out_drive)
        resume = str(ck) if ck and step_from_ckpt_dir(ck) < args.steps else None

        # wrap LoRA freshly per author (base is restored at the end of the loop)
        model = get_peft_model(base, lora_cfg)

        tr = mk_ds(read_txt(a), tok, args.block_size)
        dv = mk_ds(read_txt(v), tok, args.block_size)

        targs = TrainingArguments(
            output_dir=str(out_tmp),
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            gradient_accumulation_steps=2,
            learning_rate=args.lr,
            max_steps=args.steps,
            eval_strategy='steps',
            eval_steps=args.steps,    # eval once at the end
            save_strategy='steps',
            save_steps=args.steps,    # save once at the end
            save_total_limit=1,
            logging_steps=50,
            report_to=[],
            bf16=True,
            dataloader_num_workers=2,
            dataloader_pin_memory=True,
        )

        trainer = Trainer(model=model, args=targs, train_dataset=tr, eval_dataset=dv, data_collator=collate)
        trainer.train(resume_from_checkpoint=resume)

        # save adapter to /content then copy to Drive (best effort)
        model.save_pretrained(str(out_tmp/'lora_adapter'))
        try:
            shutil.rmtree(str(out_drive/'lora_adapter'), ignore_errors=True)
            shutil.copytree(str(out_tmp/'lora_adapter'), str(out_drive/'lora_adapter'))
        except OSError as e:
            print('copy adapter failed:', e)
            print('adapter left at:', out_tmp/'lora_adapter')
        else:
            print('adapter:', out_drive/'lora_adapter')

        # remove this author's LoRA layers so the next author starts from the plain base
        base = model.unload()
        del trainer, model
        torch.cuda.empty_cache()

if __name__ == '__main__':
    main()
"""), encoding="utf-8")

# fresh import & run: drop any cached copy of the module so the new file is used
for k in list(sys.modules):
    if k.startswith('src.lora.train_lora'):
        del sys.modules[k]
importlib.invalidate_caches()
from src.lora.train_lora import main as train

# train on the users_active cohort when it exists, otherwise on all users
users_dir = ROOT/('users_active' if (ROOT/'users_active').exists() else 'users')
sys.argv = [
  'train',
  '--users_dir',    str(users_dir),
  '--adapters_dir', str(ROOT/'adapters'),
  '--base_model',   globals().get('BASE_MODEL','Qwen/Qwen2.5-7B-Instruct'),
  '--rank',         str(globals().get('LORA_RANK',8)),
  '--alpha',        str(globals().get('LORA_ALPHA',16)),
  '--dropout',      str(globals().get('LORA_DROPOUT',0.05)),
  '--lr',           '2e-4',
  '--steps',        str(globals().get('LORA_STEPS',600)),
  '--block_size',   '256'
]
train()

In [None]:
%%bash
set -e
# Diagnostic scan: list project roots, trained adapters and user splits found
# on the Drive mount and on local disk. Each pipeline ends in `|| true` so a
# missing directory never aborts the cell under `set -e`.
echo "== scan for project roots =="
find /content/drive -type d -name assistive_keyboard_7B 2>/dev/null | sed 's/^/DRIVE: /' || true
find /content      -maxdepth 2 -type d -name assistive_keyboard_7B 2>/dev/null | sed 's/^/LOCAL: /' || true

# The LOCAL scans below have no depth limit, so /content/drive is pruned:
# otherwise Drive hits would be reported again under the LOCAL label and the
# Drive mount would be crawled a second time.
echo -e "\n== scan for lora_adapter dirs (first 20) =="
find /content/drive -type d -name lora_adapter 2>/dev/null | head -n 20 | sed 's/^/DRIVE: /' || true
find /content -path /content/drive -prune -o -type d -name lora_adapter -print 2>/dev/null | head -n 20 | sed 's/^/LOCAL: /' || true

echo -e "\n== scan for users splits (folders with adapt.txt) (first 20) =="
find /content/drive -type f -name adapt.txt 2>/dev/null | head -n 20 | sed 's/^/DRIVE: /' || true
find /content -path /content/drive -prune -o -type f -name adapt.txt -print 2>/dev/null | head -n 20 | sed 's/^/LOCAL: /' || true

### eval (ngram, llm_base, llm_lex, llm_full) → CSVs

In [None]:
# eval adapters-only cohort → writes CSVs under runs/ and prints summary
from pathlib import Path
import os, json, time
import pandas as pd
import torch

# libs
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from peft import PeftModel
except Exception as e:
    # Chain the underlying failure so the traceback shows which import broke and why.
    raise RuntimeError("Missing deps (transformers/peft). In this Colab, pip install them once.") from e

# Project layout on Drive.
ROOT = Path("/content/drive/MyDrive/assistive_keyboard_7B")
USERS, ADAP, UA, LEX, RUNS = ROOT/'users', ROOT/'adapters', ROOT/'users_active', ROOT/'lexicons', ROOT/'runs'
RUNS.mkdir(parents=True, exist_ok=True)

BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE      = torch.bfloat16 if DEVICE == "cuda" else torch.float32
MAX_TOK    = 4096  # cap eval length so we don't blow up on giant test.txt

# build users_active from authors that have an adapter
names = [p.name for p in ADAP.iterdir()
         if (p/'lora_adapter'/'adapter_model.safetensors').exists() and (USERS/p.name).exists()]
import shutil
# Rebuild the cohort folder from scratch so authors from a previous run do not linger.
shutil.rmtree(UA, ignore_errors=True); UA.mkdir(parents=True, exist_ok=True)
for a in names:
    src, dst = USERS/a, UA/a
    try:
        os.symlink(src, dst)
    except (OSError, NotImplementedError):
        # Linking was refused (e.g. the filesystem does not support symlinks):
        # fall back to a real copy. Other errors are left to surface.
        shutil.copytree(src, dst)
print(f"users_active (adapters-only): {len(names)} authors")

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the pad token when the tokenizer defines no pad token.
if tok.pad_token_id is None and tok.eos_token_id is not None:
    tok.pad_token = tok.eos_token

def ids_text(text, max_tok=MAX_TOK):
    """Tokenize `text` with the shared tokenizer and return its input ids.

    No special tokens are added; truncation is enabled with
    ``max_length=max_tok``.
    """
    encoded = tok(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=max_tok,
    )
    return encoded["input_ids"]

def ids_file(fp: Path, max_tok=MAX_TOK):
    """Read `fp` as UTF-8 (undecodable bytes dropped) and tokenize it via ids_text."""
    return ids_text(fp.read_text(encoding="utf-8", errors="ignore"), max_tok=max_tok)

def lex_prefix(author):
    """Build the tokenized "Style hints" prefix for `author` from their lexicon.

    Reads ``LEX/<author>.lexicon.json`` and joins its first 10 top-level keys
    into a ``Style hints: a, b, ...`` line followed by a blank line, tokenized
    without special tokens.

    Returns:
        A list of token ids, or ``[]`` when the lexicon file is missing,
        unreadable, not valid JSON, not a JSON object, or has no keys.
    """
    f = LEX/f"{author}.lexicon.json"
    if not f.exists():
        return []
    try:
        obj = json.loads(f.read_text(encoding="utf-8") or "{}")
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError; a
        # broken lexicon simply means "no hints" rather than a failed run.
        return []
    if not isinstance(obj, dict):
        # A list/scalar payload has no keys to use as hints.
        return []
    keys = list(obj)[:10]
    if not keys:
        return []
    hint_line = "Style hints: " + ", ".join(keys) + "\n\n"
    return tok(hint_line, add_special_tokens=False)["input_ids"]

@torch.inference_mode()
def eval_llm(text_ids, model, prefix=None):
    """Top-1 next-token accuracy of `model` on `text_ids`.

    Args:
        text_ids: token ids of the text to score.
        model: callable invoked as ``model(input_ids=...)`` whose result
            exposes ``.logits``.
        prefix: optional token ids prepended as conditioning context. They
            are fed to the model but never counted in the score.

    Returns:
        Fraction (0..1) of scored text tokens whose argmax prediction matches
        the actual token, or 0.0 when there is nothing to score.
    """
    prefix = list(prefix or [])
    # The cap applies to prefix + text together, as before.
    ids = (prefix + list(text_ids))[:MAX_TOK]
    # Token i is predicted from tokens < i, so token 0 can never be scored.
    # Prefix tokens are context only: scoring them would mix "how well does
    # the model predict the hint line" into the per-author accuracy.
    first = max(len(prefix), 1)
    if len(ids) <= first:
        return 0.0
    X = torch.tensor([ids], dtype=torch.long, device=DEVICE)
    logits = model(input_ids=X).logits
    # logits[:, j] predicts token j+1, hence the shift by one.
    pred = logits[:, first-1:-1].argmax(dim=-1)
    gold = X[:, first:]
    return (pred == gold).float().mean().item()

def eval_ngram(adapt_ids, test_ids):
    """Score a bigram next-token baseline fitted on `adapt_ids` against `test_ids`.

    For each token in the first MAX_TOK test tokens (except the first), the
    prediction is the most frequent successor of the previous token seen in
    `adapt_ids`; unseen previous tokens fall back to the most frequent
    non-initial adapt token (0 when `adapt_ids` has fewer than two tokens).

    Returns the fraction of correct predictions (0 when nothing was scored).
    """
    from collections import defaultdict, Counter

    successors = defaultdict(Counter)
    for cur, following in zip(adapt_ids, adapt_ids[1:]):
        successors[cur][following] += 1

    if len(adapt_ids) > 1:
        fallback = Counter(adapt_ids[1:]).most_common(1)[0][0]
    else:
        fallback = 0

    # Resolve each context's best successor once instead of per test token.
    best_next = {cur: counts.most_common(1)[0][0] for cur, counts in successors.items()}

    window = test_ids[:MAX_TOK]
    hits = 0
    total = 0
    for prev, actual in zip(window, window[1:]):
        hits += int(best_next.get(prev, fallback) == actual)
        total += 1
    return hits / max(1, total)

def load_base():
    """Load BASE_MODEL as a causal LM in DTYPE and return it in eval mode.

    On CUDA the whole model is mapped to device 0; otherwise ``device_map``
    is None.
    """
    placement = {'': 0} if DEVICE == 'cuda' else None
    base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map=placement, dtype=DTYPE)
    base.eval()
    return base
def load_with_adapter(author):
    """Load a fresh copy of BASE_MODEL wrapped with `author`'s LoRA adapter.

    The adapter is read from ``ADAP/<author>/lora_adapter``; the wrapped
    model is returned in eval mode.
    """
    placement = {'': 0} if DEVICE == 'cuda' else None
    backbone = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map=placement, dtype=DTYPE)
    adapter_dir = ADAP/author/'lora_adapter'
    wrapped = PeftModel.from_pretrained(backbone, str(adapter_dir))
    wrapped.eval()
    return wrapped

BASE = load_base()
rows = {tag: [] for tag in ('ngram', 'llm_base', 'llm_lex', 'llm_full')}
authors = sorted(p.name for p in UA.iterdir() if p.is_dir())


def _scored_row(user, scorer, *scorer_args):
    # Run one scorer and package its accuracy and wall time as a leaderboard row.
    started = time.time()
    acc = scorer(*scorer_args)
    return {'user': user, 'kss': acc*100, 'accepts': acc, 'time_ms': (time.time()-started)*1000}


t0_all = time.time()
for a in authors:
    user_dir = UA/a
    adapt_ids = ids_file(user_dir/'adapt.txt', max_tok=MAX_TOK)
    test_ids  = ids_file(user_dir/'test.txt',  max_tok=MAX_TOK)
    pref = lex_prefix(a)  # lexicon hint tokens, shared by llm_lex and llm_full

    # ngram baseline fitted on the adapt split
    rows['ngram'].append(_scored_row(a, eval_ngram, adapt_ids, test_ids))

    # base model, no prefix
    rows['llm_base'].append(_scored_row(a, eval_llm, test_ids, BASE, None))

    # base model + lexicon prefix
    rows['llm_lex'].append(_scored_row(a, eval_llm, test_ids, BASE, pref))

    # per-author LoRA adapter + lexicon prefix; model loading is not timed
    M = load_with_adapter(a)
    rows['llm_full'].append(_scored_row(a, eval_llm, test_ids, M, pref))
    del M
    torch.cuda.empty_cache()

# write CSVs: one leaderboard file per model tag
for tag in rows:
    pd.DataFrame(rows[tag]).to_csv(RUNS/f'leaderboard_{tag}.csv', index=False)

print(f"done → {RUNS}   authors: {len(authors)}   time={int(time.time()-t0_all)}s")

# summary: one wide table with a kss/acc/time column triple per model
def _leaderboard(tag):
    # Load one leaderboard CSV and suffix its metric columns with the model tag.
    short = tag if tag != "ngram" else "ngram"
    renamed = {"kss": f"kss_{short}", "accepts": f"acc_{short}", "time_ms": f"time_{short}"}
    frame = pd.read_csv(RUNS/f'leaderboard_{tag}.csv')
    return frame[["user", "kss", "accepts", "time_ms"]].rename(columns=renamed)


tbl = _leaderboard("ngram")
for tag in ["llm_base", "llm_lex", "llm_full"]:
    tbl = tbl.merge(_leaderboard(tag), on="user", how="inner")

print("\n== per-model means ==")
metric_means = tbl.drop(columns=["user"]).mean(numeric_only=True)
print(metric_means.round(3))

print("\n== winner counts ==")
kss_cols = ['kss_ngram', 'kss_llm_base', 'kss_llm_lex', 'kss_llm_full']
best_per_user = tbl.set_index("user")[kss_cols].idxmax(axis=1)
print(best_per_user.value_counts())

### results (means + per‑author spread + outliers)

In [None]:
import pandas as pd, os
# Report per-model means, the llm_full per-author spread, and low-KSS outliers.
root='/content/drive/MyDrive/assistive_keyboard_7B/runs'
dfs=[]
for tag in ('ngram', 'llm_base', 'llm_lex', 'llm_full'):
    p = os.path.join(root, f'leaderboard_{tag}.csv')
    try:
        frame = pd.read_csv(p)
    except Exception as e:
        # best effort: report the unreadable leaderboard and carry on
        print('missing', p, e)
    else:
        dfs.append(frame.assign(model=tag))
res = pd.concat(dfs, ignore_index=True)

print('== means by model ==')
by_model = res.groupby('model')[['kss','time_ms','accepts']].mean()
print(by_model.round(3))

full = res.loc[res['model'] == 'llm_full', ['user','kss']]
print('\n== per-author KSS (llm_full) describe ==')
print(full.describe().round(3))

# Flag authors more than 1.5 median-absolute-deviations below the median.
med = full['kss'].median()
mad = (full['kss'] - med).abs().median()
low_cut = med - 1.5*mad
bad = full.loc[full['kss'] < low_cut, 'user'].tolist()
print('\noutliers (low KSS):', bad)

### quick live check (single author suggestion)

In [None]:
from pathlib import Path
import sys
# Quick live check: build a suggester for one author and print suggestions.
root=Path('/content/drive/MyDrive/assistive_keyboard_7B')
code_dir = str(root/'code')
if code_dir not in sys.path:  # avoid stacking duplicates when the cell is re-run
    sys.path.append(code_dir)
from src.infer.suggest import LLM, Lex, RAG
from transformers import AutoTokenizer

# Sorted so the same author is picked on every run (iterdir order is arbitrary).
authors = sorted(p.name for p in (root/'users').iterdir() if p.is_dir())
if not authors:
    # Without this the cell carried on with author=None and failed later
    # with an opaque TypeError while building the adapter path.
    raise RuntimeError(f"no author folders under {root/'users'}; build the user splits first")
# Prefer an author that has an adapter folder; fall back to the first author.
with_adapter = [a for a in authors if (root/'adapters'/a/'lora_adapter').exists()]
author = (with_adapter or authors)[0]
print('author:', author)

# Optional lexicon and RAG index, used only when their files exist.
lex=None; lpath=root/'lexicons'/f'{author}.lexicon.json'
if lpath.exists():
    lex=Lex(AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True), lpath.read_text(encoding='utf-8'), cap=2.5)
rag=None; f=root/'rag'/f'{author}.faiss'; c=root/'rag'/f'{author}.chunks.json'
if f.exists() and c.exists():
    rag=RAG(f, c)
adapter=str(root/'adapters'/author/'lora_adapter')
sg=LLM(BASE_MODEL, adapter_dir=adapter, rag=rag, lex=lex, bf16=USE_BF16_INSTEAD_OF_4BIT)
ctx='Hi team, following up on the budget approval for Q4. If we can align by Friday,'
print('ctx:', ctx)
print('suggestions:', sg.suggest(ctx, k=3))

In [None]:
# === setup ===
import os, json, math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Project folders; plots are written under runs/plots.
ROOT = Path("/content/drive/MyDrive/assistive_keyboard_7B")
RUNS  = ROOT/'runs'
USERS = ROOT/'users'
PLOT  = RUNS/'plots'
PLOT.mkdir(parents=True, exist_ok=True)

# === load leaderboards ===
# One CSV per model tag, stacked into a single long table with a `model` column.
paths = {tag: RUNS/f'leaderboard_{tag}.csv' for tag in ('ngram', 'llm_base', 'llm_lex', 'llm_full')}
dfs = [pd.read_csv(csv_path).assign(model=tag) for tag, csv_path in paths.items()]
res = pd.concat(dfs, ignore_index=True)

# ==== Plot 1: Per-model mean KSS ====
# Bar chart of the mean KSS per model, in a fixed model order.
means = res.groupby('model')['kss'].mean().reindex(['ngram','llm_base','llm_lex','llm_full'])
# Report the real cohort size in the title instead of a hard-coded 20.
n_authors = res['user'].nunique()
plt.figure(figsize=(6,4))
means.plot(kind='bar')
plt.ylabel('KSS (top-1 next-token %)')
plt.title(f'Mean KSS by Model ({n_authors} authors)')
plt.tight_layout()
plt.savefig(PLOT/'kss_means_by_model.png', dpi=200)
plt.close()

# ==== Plot 2: Per-author KSS distribution (box) ====
# DataFrame.boxplot(by=...) opens its own figure unless an Axes is passed, so
# a bare plt.figure() here would be left empty and unclosed with its figsize
# ignored. Create the Axes explicitly and hand it over.
fig, ax = plt.subplots(figsize=(7,4))
res.boxplot(column='kss', by='model', grid=False, ax=ax)
fig.suptitle('')  # drop the automatic "Boxplot grouped by ..." super-title
ax.set_title('Per-Author KSS Distribution')
ax.set_ylabel('KSS (%)')
fig.tight_layout()
fig.savefig(PLOT/'kss_box_by_model.png', dpi=200)
plt.close(fig)

# ==== Plot 3: Winner counts ====
# For each author take the model with the highest KSS, then count wins per model.
kss_by_user = res.pivot_table(index='user', columns='model', values='kss')
winners = kss_by_user.idxmax(axis=1)
w = winners.value_counts().reindex(['ngram','llm_base','llm_lex','llm_full']).fillna(0)

fig, ax = plt.subplots(figsize=(6,4))
w.plot(kind='bar', ax=ax)
ax.set_ylabel('# Authors won')
ax.set_title('Model Winner Counts (KSS best per author)')
fig.tight_layout()
fig.savefig(PLOT/'winner_counts.png', dpi=200)
plt.close(fig)

# ==== Plot 4: Dumbbell (base vs LoRA) per author ====
# One row per author, sorted by base KSS, with a segment joining the two scores.
pivot = res.pivot_table(index='user', columns='model', values='kss')
dd = pivot[['llm_base','llm_full']].dropna().sort_values('llm_base')
y = np.arange(len(dd))

fig, ax = plt.subplots(figsize=(7,10))
ax.hlines(y, dd['llm_full'], dd['llm_base'], lw=1)
ax.plot(dd['llm_full'], y, 'o', label='LoRA', markersize=3)
ax.plot(dd['llm_base'], y, 'o', label='Base', markersize=3)
ax.set_yticks(y)
ax.set_yticklabels(dd.index)
ax.set_xlabel('KSS (%)')
ax.set_title('Per-Author: Base vs LoRA')
ax.legend()
fig.tight_layout()
fig.savefig(PLOT/'dumbbell_base_vs_lora.png', dpi=200)
plt.close(fig)

# ==== Plot 5: Email length histograms (sample 9 authors) ====
# Histogram of non-blank line lengths in adapt.txt for the first 9 authors
# (alphabetical), laid out on a 3x3 grid.
import random
authors = sorted(p.name for p in USERS.iterdir() if p.is_dir())
sample = authors[:9]
fig, grid = plt.subplots(3,3, figsize=(10,8))
axes = grid.ravel()
for ax, a in zip(axes, sample):
    txt = (USERS/a/'adapt.txt').read_text(encoding='utf-8', errors='ignore')
    # keep at most the first 5000 non-blank lines (cap for speed)
    lens = [len(line) for line in txt.splitlines() if line.strip()][:5000]
    ax.hist(lens, bins=40)
    ax.set_title(a)
    ax.set_xlabel('line length (chars)')
    ax.set_ylabel('count')
fig.tight_layout()
fig.savefig(PLOT/'length_hist_sample9.png', dpi=200)
plt.close(fig)

print("Saved plots to:", PLOT)

In [None]:
from collections import Counter
def top_bigrams(text, k=20):
    """Return the `k` most frequent adjacent word pairs in `text`.

    Words are runs of ASCII letters and apostrophes taken from the lowercased
    text. The result is a list of ``("w1 w2", count)`` tuples, most frequent
    first.
    """
    import re
    words = re.findall(r"[a-zA-Z']+", text.lower())
    counts = Counter(f"{left} {right}" for left, right in zip(words, words[1:]))
    return counts.most_common(k)

# Horizontal bar chart of the 20 most frequent bigrams in one author's adapt split.
a = res['user'].unique()[0]  # pick first author
txt = (USERS/a/'adapt.txt').read_text(encoding='utf-8', errors='ignore')
pairs = top_bigrams(txt, k=20)
if pairs:
    labels, values = zip(*pairs)
    plt.figure(figsize=(7,5))
    # reversed so the most frequent bigram ends up at the top of the chart
    plt.barh(labels[::-1], values[::-1])
    plt.title(f"Top bigrams — {a} (adapt)")
    plt.tight_layout()
    plt.savefig(PLOT/f'top_bigrams_{a}.png', dpi=200)
    plt.close()
    print("Saved:", PLOT/f'top_bigrams_{a}.png')
else:
    # An adapt.txt with fewer than two words yields no pairs; unpacking
    # zip(*[]) would raise ValueError, so skip the chart instead.
    print("No bigrams found for", a, "- skipping plot")

In [None]:
# Requires: pip install umap-learn sentence-transformers
# 2-D UMAP projection of sentence embeddings from up to 10 authors' adapt
# splits, coloured by author.
from sentence_transformers import SentenceTransformer
import umap
model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder=str(ROOT/'hf_cache'))
rows = []
for a in authors[:10]:  # first 10 authors for speed
    lines = (USERS/a/'adapt.txt').read_text(encoding='utf-8', errors='ignore').splitlines()
    lines = [l for l in lines if len(l.split())>=5][:200]  # 200 samples/author
    rows.extend([(a, l) for l in lines])
labels = [r[0] for r in rows]
sents  = [r[1] for r in rows]
if not sents:
    # Fail here with a clear message rather than deep inside encode()/UMAP.
    raise RuntimeError("no adapt.txt lines with >=5 words found for the selected authors")
emb = model.encode(sents, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
proj = umap.UMAP(n_components=2, random_state=42).fit_transform(emb)
plt.figure(figsize=(8,6))
# Authors that contributed at least one sentence; drives both legend and title.
shown = sorted(set(labels))
for a in shown:
    m = [i for i,l in enumerate(labels) if l==a]
    plt.scatter(proj[m,0], proj[m,1], s=6, label=a, alpha=0.6)
plt.legend(markerscale=3, bbox_to_anchor=(1.02,1), loc='upper left')
# Title reports the number of authors actually plotted instead of a fixed 10.
plt.title(f'UMAP of sentence embeddings (adapt, {len(shown)} authors)')
plt.tight_layout()
plt.savefig(PLOT/'umap_authors.png', dpi=200)
plt.close()