# Screenshot predictions

## Vocabulary

**Label**: manually assigned categories from the `www/` labeling tool, e.g. "hockey", "food", etc. These are the training targets for sklearn classifiers. These are semantic and contextual, not perceptual.

**Class**: ResNet/ImageNet's 1000 output categories. "ice hockey", "puck", "hot dog", "broccoli". These are perceptual and specific.

**Embedding**: the 512-dim vector ResNet produces after stripping `fc`. A geometric representation of visual structure, no semantic meaning attached.

**Feature**: a dimension or region of signal used as input to a model. Embeddings are features. TF-IDF weights are features. Raw pixels are not.

**Prediction**: the output of a classifier against a defined set of targets. Your sklearn classifiers produce predictions against your labels.

**Probability / score**: the confidence value attached to a prediction.

## Classification Scheme

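Instead of using the 512-dim embedding as features, we can run the full ResNet forward pass and take the 1000-dim class probability vector ("this image is 40% white ice, 30% crowds, 15% faces") and use *that* as features into a sklearn classifier. Note that the image model built later in this notebook still uses the 512-dim embedding; the class-probability features are described here as an alternative.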

The ImageNet classes become a perceptual vocabulary that your label classifier reads.

In [None]:
import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from PIL import Image
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from torchvision import models

Set up paths and load the shared label data

In [None]:
def find_root(marker="data/labels.jsonl"):
    """Locate the project root by searching upward for *marker*.

    The current working directory is checked first, then each of its
    parents from nearest to farthest.

    Args:
        marker: Path, relative to a candidate directory, whose existence
            identifies that directory as the root.

    Returns:
        The first directory (a ``Path``) for which ``directory / marker``
        exists.

    Raises:
        FileNotFoundError: If no directory on the way up contains ``marker``;
            the exception argument is ``marker`` itself.
    """
    start = Path.cwd()
    candidates = [start, *start.parents]
    root = next((folder for folder in candidates if (folder / marker).exists()), None)
    if root is None:
        raise FileNotFoundError(marker)
    return root


# Project root: the first directory at or above the working directory that
# contains data/labels.jsonl, so the notebook runs from any subfolder.
ROOT = find_root()
LABELS_PATH = ROOT / "data/labels.jsonl"

# One JSON object per non-blank line. The file is decoded explicitly as UTF-8
# (the JSON interchange encoding) rather than with the platform default, so
# labels and paths containing non-ASCII characters load the same on every OS.
rows = [
    json.loads(line)
    for line in LABELS_PATH.read_text(encoding="utf-8").splitlines()
    if line.strip()
]
# Image locations resolved against ROOT, in the same order as rows.
paths = [ROOT / r["input_path"] for r in rows]

### Helpers
- `make_df` wraps classifier output into a labeled DataFrame indexed by filename.
- `top_k` slices that DataFrame by label column and returns ranked hits as a list
of `{path, score}` dicts — the currency everything else operates on.
- `top_labels` picks the *n* highest-mass columns by summing probability across
all images. This is useful for a first-pass survey of what the model is confident about.
- `print_hits` prints a list of hits as a table of scores and paths; `plot_hits` renders the same hits as an image gallery titled with scores.

In [None]:
# Seed for the NumPy RandomState that shuffles sample indices in split_indices.
SEED = 42
# Fraction of samples held out for evaluation; split_indices puts the rest in train.
TEST_SIZE = 0.2
# Minimum predicted probability for a label to count as predicted when scoring F1.
THRESHOLD = 0.2


def make_df(probs, binarizer):
    """Wrap a probability matrix in a DataFrame labeled by file and label.

    Args:
        probs: 2-D array-like of scores with one row per entry of the
            notebook-level ``paths`` list and one column per label.
        binarizer: Fitted binarizer whose ``classes_`` supply the column names.

    Returns:
        A ``pandas.DataFrame`` indexed by image filename (``Path.name``) with
        one column per entry of ``binarizer.classes_``.
    """
    row_names = [image_path.name for image_path in paths]
    return pd.DataFrame(probs, index=row_names, columns=binarizer.classes_)


def top_k(df, label, k=12):
    """Return the *k* highest-scoring images for one label.

    Args:
        df: Score DataFrame whose rows are in the same order as the
            notebook-level ``paths`` list.
        label: Column of ``df`` to rank by.
        k: Maximum number of hits to return.

    Returns:
        A list of ``{"path": ..., "score": ...}`` dicts, highest score first.
    """
    column = df[label].to_numpy()
    # argsort is ascending; reverse it so the best score comes first.
    ranked = np.argsort(column)[::-1]
    hits = []
    for position in ranked[:k]:
        hits.append({"path": paths[position], "score": column[position]})
    return hits


def top_labels(df, n=5):
    """Return the *n* columns of ``df`` with the largest column sums.

    Args:
        df: Score DataFrame with one column per label.
        n: Number of labels to return.

    Returns:
        A list of column names ordered from largest to smallest total.
    """
    column_mass = df.sum()
    heaviest = column_mass.nlargest(n)
    return heaviest.index.tolist()


def print_hits(hits):
    """Print one line per hit: the score, then ``<parent dir>/<filename>``.

    Args:
        hits: Iterable of ``{"path": Path, "score": float}`` dicts.
    """
    for entry in hits:
        location = entry["path"]
        folder = location.parent.name
        print(f"{entry['score']:.4f}  {folder}/{location.name}")


def plot_hits(hits, columns=4, label=""):
    """Show a gallery of hit images, each titled with its score and filename.

    Args:
        hits: Sequence of ``{"path": ..., "score": ...}`` dicts, as returned
            by ``top_k``.
        columns: Number of images per gallery row.
        label: Optional figure-level title; omitted when empty.

    Returns:
        None. The figure is displayed with ``plt.show()``. When ``hits`` is
        empty nothing is drawn.
    """
    if not hits:
        # Zero hits would request a zero-row grid, which plt.subplots rejects.
        return
    rows_ = (len(hits) + columns - 1) // columns  # ceiling division
    fig, axes = plt.subplots(rows_, columns, figsize=(3 * columns, 3 * rows_))
    # plt.subplots returns a bare Axes, a 1-D array or a 2-D array depending
    # on the grid shape; flatten so every case iterates the same way.
    axes = np.array(axes).reshape(-1)
    for ax, hit in zip(axes, hits):
        # imshow copies the pixel data, so the file handle can be released
        # right away instead of staying open for every image in the gallery.
        with Image.open(hit["path"]) as img:
            ax.imshow(img)
        ax.set_title(f"{hit['score']:.3f}\n{hit['path'].name}")
        ax.axis("off")
    # Hide the unused cells in the last row.
    for ax in axes[len(hits) :]:
        ax.axis("off")
    if label:
        fig.suptitle(label)
    plt.tight_layout()
    plt.show()


def split_indices(n):
    """Shuffle ``range(n)`` reproducibly and split it into train and test parts.

    The shuffle uses a fresh ``RandomState`` seeded with ``SEED``, so the
    same ``n`` always produces the same split.

    Args:
        n: Number of samples.

    Returns:
        A ``(train_idx, test_idx)`` tuple of index arrays; the first
        ``int(n * (1 - TEST_SIZE))`` shuffled indices go to train.
    """
    rng = np.random.RandomState(SEED)
    shuffled = rng.permutation(n)
    n_train = int(n * (1 - TEST_SIZE))
    train_part = shuffled[:n_train]
    test_part = shuffled[n_train:]
    return train_part, test_part


def ensure_label_coverage(targets, train_idx, test_idx):
    """Move test samples into train so labels are not missing from training.

    For each label column with no positive sample in the training set, one
    test sample carrying that label (if any exists) is moved to train.

    Args:
        targets: 2-D indicator array, samples by labels.
        train_idx: Array of training row indices.
        test_idx: Array of test row indices.

    Returns:
        A ``(train_idx, test_idx)`` tuple of sorted index arrays after the
        moves.
    """
    train_pool = set(train_idx.tolist())
    test_pool = set(test_idx.tolist())
    n_labels = targets.shape[1]
    for label_col in range(n_labels):
        # Re-evaluated on every pass because an earlier move may already
        # have brought this label into the training pool.
        train_has_label = targets[list(train_pool), label_col].sum() > 0
        if not train_has_label:
            donors = [row for row in test_pool if targets[row, label_col] == 1]
            if donors:
                chosen = donors[0]
                test_pool.remove(chosen)
                train_pool.add(chosen)
    return np.array(sorted(train_pool)), np.array(sorted(test_pool))

## Text model

The text pipeline bundle contains a

1. vectorizer,
2. classifier,
3. and binarizer.
   
The notebook then

4. resolves the OCR transcript for each image from `data/ocr/`,
5. transforms the corpus into TF-IDF features,
6. and scores every document against all labels.

This produces `text_df`: one row per image and one column per label. Each element is the predicted probability of that label for that image.

### Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


def ocr_path(r):
    """Return the OCR transcript path that corresponds to a label row.

    The transcript lives under ``ROOT / "data/ocr"``, in a folder named after
    the image's parent directory, and shares the image's stem with a ``.txt``
    extension.

    Args:
        r: Label row (dict) with an ``"input_path"`` entry.

    Returns:
        The ``Path`` of the transcript file.
    """
    source = Path(r["input_path"])
    transcript_name = source.stem + ".txt"
    ocr_dir = ROOT / "data/ocr" / source.parent.name
    return ocr_dir / transcript_name


# Lower-cased OCR transcript for every labeled image, in the same order as rows.
text_ocr = [ocr_path(r).read_text().lower() for r in rows]
# Multi-label indicator matrix: one column per distinct category found in rows.
text_binarizer = MultiLabelBinarizer()
text_targets = text_binarizer.fit_transform([r["categories"] for r in rows])

# TF-IDF over character 3- to 5-grams; n-grams that occur in more than 95% of
# the transcripts are dropped.
# NOTE(review): the vectorizer is fit on every transcript, including the ones
# held out for testing later, so its vocabulary/IDF statistics see test documents.
text_vec = TfidfVectorizer(analyzer="char", ngram_range=(3, 5), max_df=0.95)
text_features = text_vec.fit_transform(text_ocr)

# Quick sanity check of the dataset and feature-matrix sizes.
print(
    "samples:",
    len(rows),
    "labels:",
    len(text_binarizer.classes_),
    "features:",
    text_features.shape[1],
)

### Training on OCR

In [None]:
# Seeded shuffle/split of the row indices; then, for any label with no training
# example, one test sample carrying that label is moved into train.
tr, te = ensure_label_coverage(text_targets, *split_indices(len(rows)))

# One binary logistic regression per label, trained on the TF-IDF rows in tr.
text_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver="liblinear"))
text_clf.fit(text_features[tr], text_targets[tr])

# Held-out probabilities, binarized at THRESHOLD to get multi-label predictions.
text_probs_test = text_clf.predict_proba(text_features[te])
text_preds_test = (text_probs_test >= THRESHOLD).astype(int)

# zero_division=0 scores an undefined precision/recall as 0 instead of warning.
print("micro:", f1_score(text_targets[te], text_preds_test, average="micro", zero_division=0))
print("macro:", f1_score(text_targets[te], text_preds_test, average="macro", zero_division=0))

### Score and save

In [None]:
# Score every image (train and test rows alike) so text_df covers the whole dataset.
text_probs = text_clf.predict_proba(text_features)
text_df = make_df(text_probs, text_binarizer)

# joblib.dump does not create missing parent directories, so make sure
# data/models exists before writing the bundle.
(ROOT / "data/models").mkdir(parents=True, exist_ok=True)
# Persist the three objects needed to score new text: vectorizer, classifier
# and label binarizer.
# NOTE(review): text_clf was fit on the training split only — confirm that is
# the model intended for the saved bundle.
joblib.dump(
    {"vectorizer": text_vec, "classifier": text_clf, "binarizer": text_binarizer},
    ROOT / "data/models/text.joblib",
)

In [None]:
# Preview the first rows, restricted to the labels with the largest summed probability.
text_df[top_labels(text_df)].head()

In [None]:
# Gallery of the 8 highest-scoring images under the text model for a few labels.
for label in ["hockey", "browser", "food"]:
    plot_hits(top_k(text_df, label, k=8), label=label)

## Image model

Rather than training a vision model from scratch, we use ResNet18 as a feature extractor. ResNet18 was pretrained on ImageNet and has learned general visual structure: edges, textures, shapes, compositions. 

The final layer of ResNet18 is a fully connected layer (`fc`) that maps the learned features to one score (logit) per ImageNet class; a softmax over those 1000 scores gives the class probability distribution. 

We replace it with an identity function, which passes its input through unchanged. This stops the network before it commits to ImageNet categories and gives us the raw 512-dimensional feature vector instead. 

Those embeddings were used to train the sklearn classifier in the bundle, so at inference time we run the same extraction and hand the vectors to it. The model never sees pixel values directly, only ResNet's learned representation of them.

In [None]:
# ResNet18 loaded with torchvision's default pretrained weights.
weights = models.ResNet18_Weights.DEFAULT
resnet = models.resnet18(weights=weights)
# Swap the final fully connected layer for a pass-through, so a forward pass
# returns the features that would have fed it rather than class scores.
resnet.fc = torch.nn.Identity()
# Evaluation mode: the network is only used for inference here, never trained.
resnet.eval()
# Preprocessing transform that ships with these weights.
preprocess = weights.transforms()


def embed_images(image_paths):
    """Embed each image with the headless ResNet and stack the results.

    Every image is opened, converted to RGB, run through ``preprocess`` and
    passed to ``resnet`` as a batch of one, with gradient tracking disabled.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A NumPy array with one row per image, in input order.
    """
    embeddings = []
    with torch.no_grad():
        for image_path in image_paths:
            # The file is only needed until preprocessing has built the tensor.
            with Image.open(image_path) as image:
                batch = preprocess(image.convert("RGB")).unsqueeze(0)
            output = resnet(batch)
            # Drop the batch axis so each entry is a single vector.
            embeddings.append(output.cpu().numpy().squeeze(0))
    return np.vstack(embeddings)

### Features

In [None]:
# Multi-label indicator matrix: one column per distinct category found in rows.
image_binarizer = MultiLabelBinarizer()
image_targets = image_binarizer.fit_transform([r["categories"] for r in rows])
# One ResNet feature vector per image, in the same order as rows/paths.
image_embeddings = embed_images(paths)

# Quick sanity check of the dataset size and embedding width.
print(
    "samples:",
    len(rows),
    "labels:",
    len(image_binarizer.classes_),
    "dims:",
    image_embeddings.shape[1],
)

### Training

In [None]:
# Seeded shuffle/split of the row indices; then, for any label with no training
# example, one test sample carrying that label is moved into train.
tr, te = ensure_label_coverage(image_targets, *split_indices(len(rows)))

# One binary logistic regression per label, trained on the embeddings in tr.
image_clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver="liblinear"))
image_clf.fit(image_embeddings[tr], image_targets[tr])

# Held-out probabilities, binarized at THRESHOLD to get multi-label predictions.
image_probs_test = image_clf.predict_proba(image_embeddings[te])
image_preds_test = (image_probs_test >= THRESHOLD).astype(int)

# zero_division=0 scores an undefined precision/recall as 0 instead of warning.
print("micro:", f1_score(image_targets[te], image_preds_test, average="micro", zero_division=0))
print("macro:", f1_score(image_targets[te], image_preds_test, average="macro", zero_division=0))

### Score and save

In [None]:
# Score every image (train and test rows alike) so image_df covers the whole dataset.
image_probs = image_clf.predict_proba(image_embeddings)
image_df = make_df(image_probs, image_binarizer)

# joblib.dump does not create missing parent directories, so make sure
# data/models exists before writing the bundle.
(ROOT / "data/models").mkdir(parents=True, exist_ok=True)
# Persist the classifier together with the label binarizer that names its outputs.
# NOTE(review): image_clf was fit on the training split only — confirm that is
# the model intended for the saved bundle.
joblib.dump(
    {"classifier": image_clf, "binarizer": image_binarizer}, ROOT / "data/models/image.joblib"
)

In [None]:
# Preview the first rows, restricted to the labels with the largest summed probability.
image_df[top_labels(image_df)].head()

In [None]:
# Gallery of the 8 highest-scoring images under the image model for a few labels.
for label in ["hockey", "browser", "food"]:
    plot_hits(top_k(image_df, label, k=8), label=label)