In [None]:
import json
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Folder containing the preprocessed artefacts this notebook inspects
# (the folder name looks like a run timestamp — TODO confirm).
processed_dir = Path("../data/processed/2025-08-14-07-58-11")

# Locations of the individual JSON artefacts expected inside that folder.
title_to_text_path = processed_dir.joinpath("title_to_text.json")
link_to_freq_path = processed_dir.joinpath("link_to_freq.json")
title_to_links_path = processed_dir.joinpath("title_to_links.json")
title_to_tokens_path = processed_dir.joinpath("title_to_tokens.json")

### Token Count Distribution

In [None]:
# Parse the title -> tokens JSON file (each value is assumed to be a sized
# collection of tokens — TODO confirm schema against the preprocessing step).
with open(title_to_tokens_path, "r", encoding="utf-8") as tokens_file:
    data = json.load(tokens_file)

# Keep only the number of tokens per article title.
# NOTE(review): `data` still references every token list after this cell;
# consider `del data` if no later cell needs it.
token_counts = {title: len(tokens) for title, tokens in data.items()}

In [None]:
# Histogram of per-article token counts; the y-axis (number of articles)
# is drawn on a logarithmic scale.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(token_counts.values(), bins=50, color="skyblue", edgecolor="black")
ax.set_title("Distribution of Article Token Counts")
ax.set_xlabel("Token Count")
ax.set_ylabel("Number of Articles")
ax.set_yscale("log")
ax.grid(True)
plt.show()

# Rank titles by token count (largest first; ties keep their original
# order) and report the first ten as (title, count) pairs.
titles_longest_first = sorted(token_counts, key=token_counts.get, reverse=True)
top_10 = [(title, token_counts[title]) for title in titles_longest_first[:10]]
print("Top 10 Longest Articles (by token count):")
for i, (title, count) in enumerate(top_10, 1):
    print(f"{i}. {title} ({count} tokens)")

### Link Frequency Distribution


In [None]:
# Load the link -> frequency mapping stored in link_to_freq.json.
link_freq_dict = json.loads(link_to_freq_path.read_text(encoding="utf-8"))

# Histogram of the frequency values; the y-axis (number of links) is drawn
# on a logarithmic scale.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(link_freq_dict.values(), bins=50, color="salmon", edgecolor="black")
ax.set_title("Distribution of Link Frequencies")
ax.set_xlabel("Number of Articles Link Appears In")
ax.set_ylabel("Number of Links")
ax.set_yscale("log")
ax.grid(True)
plt.show()

# Rank links by frequency (highest first; ties keep their original order)
# and report the first ten as (link, freq) pairs.
links_most_frequent_first = sorted(
    link_freq_dict, key=link_freq_dict.get, reverse=True
)
top_10_links = [(link, link_freq_dict[link]) for link in links_most_frequent_first[:10]]
print("Top 10 Most Frequent Links:")
for i, (link, freq) in enumerate(top_10_links, 1):
    print(f"{i}. {link} ({freq} articles)")

### Distribution of Number of Links per Article


In [None]:
# Load the title -> links mapping stored in title_to_links.json.
title_to_links = json.loads(title_to_links_path.read_text(encoding="utf-8"))

# One entry per article: how many links its value holds.
link_counts = list(map(len, title_to_links.values()))

# Report how many articles have no links at all.
num_zero_link_articles = link_counts.count(0)
print(f"Number of articles with 0 links: {num_zero_link_articles}")

# Histogram of links per article; the y-axis (number of articles) is drawn
# on a logarithmic scale.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(link_counts, bins=50, color="mediumseagreen", edgecolor="black")
ax.set_title("Distribution of Number of Links per Article")
ax.set_xlabel("Number of Links in Article")
ax.set_ylabel("Number of Articles")
ax.set_yscale("log")
ax.grid(True)
plt.show()

# Rank titles by their number of links (most first; ties keep their original
# order) and report the first ten as (title, links) pairs.
titles_most_linked_first = sorted(
    title_to_links, key=lambda name: len(title_to_links[name]), reverse=True
)
top_10_link_articles = [
    (name, title_to_links[name]) for name in titles_most_linked_first[:10]
]
print("Top 10 Articles with Most Links:")
for i, (title, links) in enumerate(top_10_link_articles, 1):
    print(f"{i}. {title} ({len(links)} links)")