## Preprocessing Data Pipeline
We have a lot of messy data, LLM's with 3b parameters can hallucinate a lot of random data to fit the schema.

In [1]:
import json

### Load Data

In [8]:
reviews = []
with open("./ministral_3b_outputs/amazon_train_output_split_0.jsonl", "r") as f:
    for line in f:
        reviews.append(json.loads(line))
print("Loaded", len(reviews), "reviews")

Loaded 21898 reviews


In [10]:
from collections import Counter

subject_counts = Counter()
bad_triple_types = Counter()

for review in reviews:
    triples = review.get("triples", [])
    if not isinstance(triples, list):
        bad_triple_types[type(triples).__name__] += 1
        continue

    for triple in triples:
        if not isinstance(triple, dict):
            bad_triple_types[type(triple).__name__] += 1
            continue

        subject = triple.get("subject")
        if isinstance(subject, str) and subject.strip():
            subject_counts[subject.strip()] += 1

print("Unique subjects:", len(subject_counts))
print("Total subject mentions:", sum(subject_counts.values()))
print("Bad triple item types:", dict(bad_triple_types))

Unique subjects: 7180
Total subject mentions: 64255
Bad triple item types: {'str': 1957, 'NoneType': 30, 'dict': 7, 'float': 3, 'bool': 14, 'int': 10, 'list': 2}


In [20]:
subject_counts.most_common()

[('product', 23133),
 ('user', 15267),
 ('hair', 1080),
 ('is', 364),
 ('has_attribute', 349),
 ('item', 336),
 ('color', 308),
 ('great', 275),
 ('scent', 236),
 ('price', 230),
 ('perfume', 215),
 ('quality', 196),
 ('video', 173),
 ('smell', 146),
 ('faux', 143),
 ('nice', 133),
 ('wig', 131),
 ('it', 130),
 ('easy', 129),
 ('has_feature', 125),
 ('good', 119),
 ('mask', 119),
 ('kit', 118),
 ('lashes', 116),
 ('brush', 115),
 ('faux_nails', 113),
 ('works', 102),
 ('purchase', 100),
 ('shampoo', 96),
 ('wants', 95),
 ('review', 95),
 ('brand', 91),
 ('set', 91),
 ('mascara', 90),
 ('uses', 86),
 ('material', 85),
 ('skin', 85),
 ('tool', 81),
 ('best', 76),
 ('daughter', 73),
 ('wife', 73),
 ('has', 71),
 ('soap', 71),
 ('smells', 70),
 ('husband', 69),
 ('colors', 68),
 ('easy_to_use', 65),
 ('bottle', 65),
 ('cute', 65),
 ('these', 65),
 ('product_1', 64),
 ('thing', 63),
 ('love', 63),
 ('order', 62),
 ('the', 61),
 ('faux_nail', 61),
 ('perfect', 59),
 ('gift', 59),
 ('clips', 