In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from llm_ol.dataset import data_model

In [None]:
# Choose which serialized graph to analyse. Exactly one assignment is active;
# the commented-out lines are alternative inputs that can be swapped in.
# Paths are relative, so they resolve against the current working directory.
# graph_file = Path("out/data/wikipedia/v1/full/full_graph.json")
graph_file = Path("out/data/wikipedia/v2/full/graph_depth_3.json")
# graph_file = Path("out/data/wikipedia/v2/train_test_split/test_graph.json")

# NOTE(review): later cells call in_degree/out_degree on G and read "title" and
# "pages" node attributes, so G is presumably a networkx directed graph with
# those attributes -- confirm against data_model.load_graph.
G = data_model.load_graph(graph_file)

In [None]:
# Overall size of the graph: node and edge totals, printed with thousands
# separators.
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

print(
    f"Number of nodes: {num_nodes:,}",
    f"Number of edges: {num_edges:,}",
    sep="\n",
)

In [None]:
# Degree sequences read straight from the graph's degree views; each view
# yields (node, degree) pairs in the same order as G.nodes.
in_degrees = [deg for _, deg in G.in_degree()]
out_degrees = [deg for _, deg in G.out_degree()]

# Two side-by-side histograms; counts are on a log scale.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

ax1.hist(in_degrees, bins=21, log=True)
ax1.set(xlabel="In-degree", ylabel="Count")

ax2.hist(out_degrees, bins=100, log=True)
ax2.set(xlabel="Out-degree", ylabel="Count")

In [None]:
# Size (node count) of every strongly connected component of G.
scc_sizes = list(map(len, nx.strongly_connected_components(G)))
component_sizes = pd.DataFrame(scc_sizes, columns=["size"])

# Tabulate how many components there are of each size.
component_sizes.groupby("size").size().reset_index(name="count")

In [None]:
# Pull each node's page list once, then derive both distributions from it.
pages_per_node = [data["pages"] for _, data in G.nodes(data=True)]

# Number of pages attached to each node.
num_pages = [len(pages) for pages in pages_per_node]
# Abstract length in characters, one entry per page across all nodes.
num_text = [len(page["abstract"]) for pages in pages_per_node for page in pages]

page_count = sum(num_pages)
text_count = sum(num_text)

# Summary figures; the token figure is a rough estimate of 4 characters/token.
print(f"Number of pages: {page_count:,}")
print(f"Avg. pages per node: {page_count/num_nodes:.2f}")
print(f"Number of characters: {text_count:,} = ~{int(text_count/4):,} tokens")
print(f"Avg. characters per page: {text_count/page_count:.2f}")
print(f"Avg. characters per node: {text_count/num_nodes:.2f}")

# Left: pages per node. Right: characters per page. Counts are log-scaled.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

ax1.hist(num_pages, bins=50, log=True)
ax1.set(xlabel="Number of pages", ylabel="Number of nodes")

ax2.hist(num_text, bins=50, log=True)
ax2.set(xlabel="Number of characters", ylabel="Number of pages")

In [None]:
from llm_ol.dataset.wikipedia.build_categories import ROOT_CATEGORY_ID

# Shortest-path length (in edges) from the root category to every node that is
# reachable from it.
depth_by_node = nx.single_source_shortest_path_length(G, ROOT_CATEGORY_ID)
lengths = list(depth_by_node.values())

# Tabulate how many nodes sit at each distance from the root.
df = pd.DataFrame(lengths, columns=["length"])
df.groupby("length").size().reset_index(name="count")

In [None]:
# Title of every node, in graph iteration order.
category_names = [G.nodes[node]["title"] for node in G]

# Total number of titles, then the number of distinct titles; a gap between
# the two means some titles occur on more than one node.
print(len(category_names), len(set(category_names)), sep="\n")

In [None]:
# Lower-cased words whose presence marks a category title as "special".
# NOTE(review): these look like Wikipedia meta/maintenance terms -- confirm the
# intended scope with the dataset owner.
special_words = {
    "wikipedia",
    "wikiproject",
    "list",
    "lists",
    "mediawiki",
    "template",
    "templates",
    "user",
    "users",
    "portal",
    "portals",  # plural form, mirroring list/lists, template/templates, user/users
    "category",
    "categories",
    "article",
    "page",
}


def is_special(name):
    """Return True if any word of ``name`` is one of ``special_words``.

    The title is lower-cased and split on whitespace, and each resulting word
    is compared as a whole, so "Lists of rivers" matches (via "lists") while
    "Listed buildings" does not. Punctuation is not stripped: a word such as
    "lists," will not match.

    Args:
        name: Category title to test.

    Returns:
        bool: True when at least one whitespace-separated word of the title is
        in ``special_words``; False otherwise, including for an empty title.
    """
    # The sets share an element exactly when some word of the title is special.
    return not special_words.isdisjoint(name.lower().split())


# Keep only the titles that contain at least one special word.
filtered_names = list(filter(is_special, category_names))

# Report the match count, its share of all titles, and the matching titles.
print(len(filtered_names))
print(f"{len(filtered_names)/len(category_names):.2%}")
print(filtered_names)