In [None]:
import json
import matplotlib.pyplot as plt
from pathlib import Path
import random

import numpy as np

### Read dataset

In [None]:
# Input locations, given relative to the notebook's working directory.
link_to_freq_path = Path("../data/processed/2025-08-18-07-41-21/link_to_freq.json")
dataset_dir = Path("../data/final/2025-08-18-08-46-15")

# Files that live inside the dataset directory.
link_expansion_count_path = dataset_dir.joinpath("link_expansion_count.json")
dataset_path = dataset_dir.joinpath("dataset.jsonl")


In [None]:
def read_jsonl_stream(filename):
    """Lazily yield one decoded JSON value per non-blank line of a JSON Lines file.

    This is a generator: the file is opened on the first iteration and only
    one line is held in memory at a time.

    Args:
        filename: Path (``str`` or path-like) of the ``.jsonl`` file to read.

    Yields:
        The value returned by ``json.loads`` for each non-blank line, in file
        order.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which would garble or reject non-ASCII text on some systems.
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record;
                # passing them to json.loads would raise.
                continue
            yield json.loads(line)


# Per-title statistics, filled in a single streaming pass over the dataset.
# A title that appears more than once keeps the values of its last record.
titles = set()
n_links_expanded = {}
n_tokens = {}

for sample in read_jsonl_stream(dataset_path):
    title = sample["title"]
    titles.add(title)
    n_links_expanded.update({title: sample["n_links_expanded"]})
    n_tokens.update({title: sample["n_tokens"]})

### Token Count Distribution

In [None]:
# Token count of every sample, in the iteration order of `n_tokens`.
token_counts = list(n_tokens.values())

# Headline statistics, printed ahead of the histogram.
print(f"Min tokens: {np.min(token_counts):,}")
print(f"Max tokens: {np.max(token_counts):,}")
print(f"Mean tokens: {np.mean(token_counts):,.2f}")

# Draw on explicit figure/axes handles rather than pyplot's implicit current axes.
fig, ax = plt.subplots()
ax.hist(token_counts, bins=50, color="darkorange", edgecolor="black")
ax.set(
    xlabel="Number of Tokens",
    ylabel="Number of Samples",
    title="Distribution of Token Counts per Sample",
)
plt.show()

### Distribution of Number of Links Expanded per Sample

In [None]:
# Number of expanded links for every sample, in the iteration order of `n_links_expanded`.
links_expanded_values = list(n_links_expanded.values())

fig, ax = plt.subplots()
ax.hist(links_expanded_values, bins=30, color="mediumseagreen", edgecolor="black")
ax.set(
    xlabel="Number of Links Expanded",
    ylabel="Number of Samples",
    title="Distribution of Links Expanded per Sample",
)
# Switch the y-axis to a log scale after the histogram has been drawn.
ax.set_yscale("log")
plt.show()

### Link Expansion Counts


In [None]:
# Load the JSON object stored in link_expansion_count.json (decoded as UTF-8).
link_expansion_count = json.loads(link_expansion_count_path.read_text(encoding="utf-8"))

# Only the values of the mapping are plotted; the keys are not used here.
link_expanded_counts = list(link_expansion_count.values())

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(link_expanded_counts, bins=50, color="cornflowerblue", edgecolor="black", log=True)
ax.set(
    xlabel="Number of Times Link Expanded",
    ylabel="Number of Links",
    title="Distribution of Link Expansion Counts",
)
plt.show()



### Print random sample


In [None]:
def get_random_title_and_text(token_threshold=0, max_chars=None):
    """Print the expanded text of a randomly chosen sample above a token threshold.

    Relies on the module-level ``n_tokens`` mapping (title -> token count),
    ``dataset_path`` and ``read_jsonl_stream``. The dataset is re-read from
    disk to fetch the text, stopping at the first record whose title matches.

    Args:
        token_threshold: Only titles whose token count is strictly greater
            than this value are candidates.
        max_chars: If given, print at most this many leading characters of
            the text. ``None`` (the default) prints the whole text.

    Returns:
        None. Everything is reported via ``print`` so that calling this as the
        last expression of a cell does not echo a return value.
    """
    # The loop variable is named `count` so that it does not shadow the
    # module-level `n_tokens` dict that is being iterated.
    titles_over_threshold = [
        title for title, count in n_tokens.items() if count > token_threshold
    ]

    print(f"Number of titles with more than {token_threshold} tokens: {len(titles_over_threshold)}")

    if not titles_over_threshold:
        # Nothing to sample from; the count printed above already says so.
        return

    random_title = random.choice(titles_over_threshold)

    # Scan the dataset for the chosen title and stop at the first match.
    found_text = None
    for sample in read_jsonl_stream(dataset_path):
        if sample.get("title") == random_title:
            found_text = sample.get("expanded_text")
            break

    if found_text is None:
        # Also reached when the matching record has no "expanded_text" value.
        print(f"Title '{random_title}' not found in dataset.")
        return

    print(f"Text for random title '{random_title}':\n")
    print(found_text if max_chars is None else found_text[:max_chars])

# Threshold 0: every title with at least one token is a candidate.
get_random_title_and_text(token_threshold=0)

### Print sample for a specific title


In [None]:
def print_text_for_title(specific_title):
    """Print the ``expanded_text`` of the first dataset record titled ``specific_title``.

    The dataset at ``dataset_path`` is streamed from disk and scanning stops
    at the first record whose ``"title"`` equals ``specific_title``. If no
    record matches, or the matching record has no ``"expanded_text"`` value,
    a "not found" message is printed instead.
    """
    # First matching record's text, or None when nothing matches.
    matching_texts = (
        record.get("expanded_text")
        for record in read_jsonl_stream(dataset_path)
        if record.get("title") == specific_title
    )
    text = next(matching_texts, None)

    if text is None:
        print(f"Title '{specific_title}' not found in dataset.")
        return

    print(f"Text for title '{specific_title}':\n")
    print(text)

# Example: look up one article by its exact title string.
print_text_for_title(specific_title="slaget ved chancellorsville")