In [None]:
# STEP 0 — Sanity check (30 seconds)
# Import the two core dependencies up front: if either one is not installed,
# the import raises ImportError here and the confirmation below never prints.
import sklearn
import pandas
print('ML environment ready')

## STEP 2 — Get your essays into the notebook

This loads all `.md` files in the workspace root into a list `docs` where each entry is a dict with `text`, `path` and `paragraphs` (split on blank lines). If you prefer to use `.txt` files instead, drop them into the workspace and this will pick them up as well.

In [None]:
from pathlib import Path
import re

ROOT = Path('.')


def split_paragraphs(text):
    """Split ``text`` into paragraphs separated by one or more blank lines.

    A "blank" line may still contain whitespace (including ``\\r`` from
    Windows line endings). Every paragraph is stripped and empty fragments
    are dropped, so leading/trailing blank lines produce no entries.

    Args:
        text: Full contents of one essay.

    Returns:
        list[str]: The non-empty, stripped paragraphs in document order.
    """
    return [pb.strip() for pb in re.split(r'\n\s*\n', text) if pb.strip()]


# Essays may be Markdown or plain text; both are loaded the same way. The list
# keeps the name `md_files` even though it can also contain `.txt` paths.
# Sorting makes the document order (and therefore the task order) reproducible.
md_files = sorted(ROOT.glob('*.md')) + sorted(ROOT.glob('*.txt'))
docs = []
for p in md_files:
    text = p.read_text(encoding='utf-8')
    paragraphs = split_paragraphs(text)
    docs.append({'path': str(p), 'text': text, 'paragraphs': paragraphs})

len(docs), md_files[:5]

## STEP 3 — Create Label Studio tasks (one candidate per inter-sentence boundary)

We create a JSONL file, `labelstudio_tasks.jsonl`, where each task's `data` has `text_before`, `text_after`, `full_paragraph`, `doc`, `paragraph_index`, and `boundary_index`. Import it into Label Studio using the *Import -> Tasks* menu (JSONL).
The label schema (`labeling_config.xml`) shows both sides of the boundary and a 2-choice task: `split` / `no_split`.

In [None]:
import json
import nltk
from nltk import tokenize
# Older NLTK releases load the sentence tokenizer from the `punkt` resource,
# while NLTK 3.9+ looks for `punkt_tab` instead. Fetch both so that
# `sent_tokenize` works regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

def sentence_split(paragraph):
    """Return the sentences of ``paragraph`` as stripped, non-empty strings.

    Sentence boundaries come from NLTK's ``sent_tokenize``; fragments that are
    empty after stripping are discarded.
    """
    trimmed = (candidate.strip() for candidate in tokenize.sent_tokenize(paragraph))
    return [sentence for sentence in trimmed if sentence]

# Build one Label Studio task for every boundary between two adjacent
# sentences of a paragraph.
tasks = []
for doc in docs:
    for pi, para in enumerate(doc['paragraphs']):
        sents = sentence_split(para)
        # A paragraph with fewer than two sentences has no internal boundary.
        if len(sents) < 2:
            continue
        for bi in range(len(sents) - 1):
            cut = bi + 1  # index of the first sentence after the boundary
            task_data = {
                'text_before': ' '.join(sents[:cut]),
                'text_after': ' '.join(sents[cut:]),
                'full_paragraph': para,
                'doc': doc['path'],
                'paragraph_index': pi,
                'boundary_index': bi,
            }
            tasks.append({'data': task_data})

out_path = ROOT / 'labelstudio_tasks.jsonl'
# JSON Lines: one JSON-encoded task per line. `json.dumps` escapes any newline
# inside the text, so each task stays on a single line. `ensure_ascii=False`
# keeps non-ASCII characters readable in the UTF-8 output file.
with open(out_path, 'w', encoding='utf-8') as f:
    for t in tasks:
        f.write(json.dumps(t, ensure_ascii=False) + '\n')

print('Wrote', len(tasks), 'tasks to', out_path)

## Labeling config (for Label Studio) — use `labeling_config.xml` in the repo and import it as the project labeling config.

This shows `text_before` and `text_after` and a Choices control with `split` and `no_split`.

## After labeling — converting export to features and labels

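Export your completed tasks from Label Studio (JSON export). Download the file and place it at `labelstudio_export.json` in the workspace root. Then run the cell below, which defines `ls_export_to_training`, and call it (see the commented example at the end of the cell) to convert the export to a CSV suitable for training.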

In [None]:
import math, csv
from statistics import mean

def extract_features(text):
    """Compute the three numeric features used by the split classifier.

    The text is cut at every ``.``, ``!`` or ``?``; fragments that are empty
    after stripping are ignored.

    Args:
        text: The text to describe (a paragraph).

    Returns:
        list: ``[sentence_count, avg_sentence_length, comma_count]`` where
        ``avg_sentence_length`` is the mean number of whitespace-separated
        words per fragment (``0.0`` when there are no fragments).
    """
    fragments = [frag for frag in re.split(r'[.!?]', text) if frag.strip()]
    sentence_count = len(fragments)
    total_words = 0
    for frag in fragments:
        total_words += len(frag.split())
    # max(1, ...) guards the division when the text has no sentence fragments.
    avg_sentence_length = total_words / max(1, sentence_count)
    return [sentence_count, avg_sentence_length, text.count(',')]

def _choices_from_annotation(annotation):
    """Return the choice labels selected in one annotation/prediction, or ``[]``."""
    # Cancelled (skipped) annotations carry no usable label.
    if not isinstance(annotation, dict) or annotation.get('was_cancelled'):
        return []
    for res in annotation.get('result') or []:
        if res.get('type') not in ('choices', 'singlechoice'):
            continue
        value = res.get('value')
        selected = value.get('choices') if isinstance(value, dict) else value
        # Normalise a bare string to a list so the membership test made by the
        # caller is an exact match ('split' in 'no_split' would be True).
        if isinstance(selected, str):
            selected = [selected]
        if selected:
            return list(selected)
    return []


def ls_export_to_training(export_path='labelstudio_export.json', out_csv='training_data.csv'):
    """Convert a Label Studio JSON export into a training CSV.

    For every exported task the first annotation (falling back to predictions)
    that actually contains a choices result is used; cancelled or empty
    annotations are passed over instead of discarding the whole task. Tasks
    without any usable choice, or without any text, are skipped.

    Args:
        export_path: Path of the Label Studio JSON export (a list of tasks).
        out_csv: Path of the CSV file to write.

    Returns:
        None. The CSV is written to ``out_csv`` with the columns ``label``,
        ``sentence_count``, ``avg_sentence_length``, ``comma_count``, ``doc``,
        ``paragraph_index`` and ``boundary_index``; ``label`` is 1 for
        ``split`` and 0 otherwise.
    """
    with open(export_path, 'r', encoding='utf-8') as f:
        records = json.load(f)
    rows = []
    for r in records:
        data = r.get('data') or {}
        # Human annotations come first so they take precedence over predictions.
        candidates = (r.get('annotations') or []) + (r.get('predictions') or [])
        choice = next((c for c in map(_choices_from_annotation, candidates) if c), None)
        if not choice:
            continue
        label = 1 if 'split' in choice else 0
        full = data.get('full_paragraph')
        if full is None:
            # Rebuild the paragraph from its two halves only when it is really
            # missing; a missing half must not raise when it is not needed.
            before = data.get('text_before') or ''
            after = data.get('text_after') or ''
            full = before + ' ' + after
        if not full.strip():
            continue
        # NOTE(review): features describe the whole paragraph, so every boundary
        # of the same paragraph gets identical features; the boundary position
        # itself is not encoded.
        features = extract_features(full)
        rows.append({'label': label, 'features': features, 'doc': data.get('doc'), 'paragraph_index': data.get('paragraph_index'), 'boundary_index': data.get('boundary_index')})
    # newline='' lets the csv module control line endings itself.
    with open(out_csv, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['label', 'sentence_count', 'avg_sentence_length', 'comma_count', 'doc', 'paragraph_index', 'boundary_index'])
        for r in rows:
            writer.writerow([r['label']] + r['features'] + [r['doc'], r['paragraph_index'], r['boundary_index']])
    print('Wrote training CSV with', len(rows), 'rows to', out_csv)

# Example: ls_export_to_training('labelstudio_export.json', 'training_data.csv')

## STEP 5 — Train a tiny ML model (Logistic Regression)

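Run the next cell after `training_data.csv` has been created by the previous step, then call `train_and_export()` (see the commented example at the end of the cell). It trains the model, prints metrics, shows the learned `coef_` weights, and exports the model JSON for Rebar.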

In [None]:
import csv
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import json

def train_and_export(csv_path='training_data.csv', model_out='paragraph_split_model.json'):
    """Train a logistic-regression split classifier and export it as JSON.

    Reads the ``label``, ``sentence_count``, ``avg_sentence_length`` and
    ``comma_count`` columns of ``csv_path``, holds out 20% of the rows for
    evaluation, prints accuracy, a classification report and the learned
    parameters, then writes ``{'weights': [...], 'bias': ...}`` to
    ``model_out``.

    Args:
        csv_path: Training CSV to read.
        model_out: Path of the JSON model file to write.

    Returns:
        dict: The exported payload (``weights`` list and ``bias`` float).

    Raises:
        ValueError: If the CSV has no rows or contains a single label class,
            since a binary classifier cannot be fitted in either case.
    """
    # Weights are exported positionally, so this order must match the order of
    # the feature vector passed to the scorer that consumes the model.
    feature_names = ('sentence_count', 'avg_sentence_length', 'comma_count')
    X = []
    y = []
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            X.append([float(row[name]) for name in feature_names])
            y.append(int(row['label']))
    if not X:
        raise ValueError('No training rows found in ' + str(csv_path))
    X = np.array(X)
    y = np.array(y)
    classes, counts = np.unique(y, return_counts=True)
    if len(classes) < 2:
        raise ValueError('Training data must contain both split and no_split labels')

    test_fraction = 0.2
    # Mirror how train_test_split sizes the hold-out set, to decide whether a
    # stratified split is possible: each class needs at least two rows and both
    # partitions need room for every class.
    n_test = int(np.ceil(test_fraction * len(y)))
    n_train = len(y) - n_test
    can_stratify = counts.min() >= 2 and min(n_test, n_train) >= len(classes)
    # Stratifying keeps the label ratio equal in both partitions, which matters
    # when one label is much rarer than the other.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_fraction, random_state=42,
        stratify=y if can_stratify else None)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))
    print('Weights:', model.coef_, 'Bias:', model.intercept_)
    model_data = {'weights': model.coef_[0].tolist(), 'bias': float(model.intercept_[0])}
    with open(model_out, 'w', encoding='utf-8') as f:
        json.dump(model_data, f)
    print('Exported model to', model_out)
    return model_data

# Example: train_and_export('training_data.csv', 'paragraph_split_model.json')

## STEP 8 — Use this inside Rebar (TypeScript snippet)

Below is the snippet to run inside Rebar after you `git add` the `paragraph_split_model.json` to your assets. Put this into the Rebar scoring / suggestion code.

```ts
/**
 * Linear score of a feature vector: bias + sum(features[i] * weights[i]).
 * Weights are matched to features by position.
 */
function score(features: number[], model: {weights:number[], bias:number}){
  let total = model.bias;
  features.forEach((value, index) => {
    total += value * model.weights[index];
  });
  return total;
}
// For logistic-regression weights, a raw score above 0 corresponds to a
// predicted probability above 0.5 (sigmoid(0) = 0.5), so the sigmoid itself
// does not need to be evaluated. `features`, `model` and
// `suggestSplitParagraph` are expected to be provided by the surrounding code.
if (score(features, model) > 0) { suggestSplitParagraph(); }
```