# Match algorithm (time-aware semantic clustering)

Goal: cluster articles that are about the same event, mostly based on semantic similarity, with a naive time window/decay to avoid matching far-apart events.

This notebook reads `data/raw/<Country>/<outlet>.jsonl`, builds embeddings, and clusters via a similarity graph + connected components.


In [24]:
from __future__ import annotations

import json
import math
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

In [25]:
# --- config ---
# Input: root folder that is searched recursively for *.jsonl files.
DATA_DIR = Path('../data/raw')
# Keep only the most recent MAX_ARTICLES articles (lower it for faster iterations).
MAX_ARTICLES = 999_999
# Minimum length of the combined title + body text for an article to be kept.
MIN_TEXT_CHARS = 120

# Embedding model passed to SentenceTransformer.
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Time handling: pairs further apart than the window are never compared, and
# the similarity of a pair is multiplied by exp(-delta_hours / TIME_DECAY_HOURS).
TIME_WINDOW_HOURS = 72
TIME_DECAY_HOURS = 36
# A pair becomes an edge when similarity * time weight reaches this value.
SIM_THRESHOLD = 0.90

def parse_dt(value: str | None) -> datetime | None:
    """Parse a timestamp string into a timezone-aware UTC datetime.

    Two formats are tried in order: RFC 822 style dates (as used by RSS/Atom
    feeds), then ISO 8601 (a 'Z' is rewritten to '+00:00' first). A parsed
    value without timezone information is taken to be UTC.

    Returns None for an empty/missing value or when neither format matches.
    """
    if not value:
        return None

    def _from_iso(raw: str) -> datetime:
        return datetime.fromisoformat(raw.replace('Z', '+00:00'))

    for parser in (parsedate_to_datetime, _from_iso):
        try:
            parsed = parser(value)
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc)
        except Exception:
            # Best effort: any failure just means "try the next format".
            continue
    return None

def build_text(item: dict) -> str:
    """Build the text to embed for one article.

    The result is the stripped title, a newline, and the stripped 'full_text'
    if it is non-empty, otherwise the stripped 'summary'. Missing or None
    fields count as empty strings, and the combined text is stripped again.
    """
    title, summary, full_text = (
        (item.get(field) or '').strip()
        for field in ('title', 'summary', 'full_text')
    )
    body = full_text if full_text else summary
    return f"{title}\n{body}".strip()


In [26]:
# --- load jsonl ---
# Read every *.jsonl file under DATA_DIR and keep the articles that have
# enough text and a usable timestamp.
records = []
for path in sorted(DATA_DIR.rglob('*.jsonl')):
    # Decode explicitly instead of relying on the platform default encoding
    # (assumes the files are UTF-8, the standard encoding for JSON).
    # Iterating the file splits on real newlines only; str.splitlines() would
    # also split on U+2028/U+2029/U+0085, which may occur unescaped inside a
    # JSON string and would cut a record in half.
    with path.open(encoding='utf-8') as fh:
        for line in fh:
            if not line.strip():
                continue
            item = json.loads(line)
            text = build_text(item)
            if len(text) < MIN_TEXT_CHARS:
                continue
            published = parse_dt(item.get('published'))
            ingested = parse_dt(item.get('ingested_at'))
            # Fall back to the ingestion time when no publish time is usable.
            ts = published or ingested
            if not ts:
                continue
            records.append({
                'id': item.get('id'),
                'title': item.get('title'),
                'url': item.get('url'),
                'outlet_id': item.get('outlet_id'),
                'outlet_name': item.get('outlet_name'),
                'country': item.get('country'),
                'published': ts,
                'text': text,
            })

# Naming the columns keeps the steps below working even when nothing was loaded.
df = pd.DataFrame(
    records,
    columns=['id', 'title', 'url', 'outlet_id', 'outlet_name', 'country', 'published', 'text'],
)
# A row is dropped only when both its id and its url repeat an earlier row.
df = df.drop_duplicates(subset=['id', 'url'])
# Chronological order; the similarity-graph cell relies on it for its early break.
df = df.sort_values('published')
if len(df) > MAX_ARTICLES:
    # Keep the most recent MAX_ARTICLES rows.
    df = df.iloc[-MAX_ARTICLES:]

print('Loaded', len(df), 'articles')
df.head(3)


Loaded 2387 articles


Unnamed: 0,id,title,url,outlet_id,outlet_name,country,published,text
2068,212bc0a5d9ee5950f683ef6eef5992e5879d918b3331e4...,Senaste nytt om kriget i Ukraina,https://www.svd.se/a/ja1n9q/konflikten-mellan-...,svd,Svenska Dagbladet,Sweden,2022-02-22 08:53:12+00:00,Senaste nytt om kriget i Ukraina\nUkrainska st...
2067,c3d419b36bf834898e77a1e65df45eefb4e305734e0243...,Senaste nytt om konflikterna i Mellanöstern,https://www.svd.se/a/dwel81/senaste-nytt-om-ko...,svd,Svenska Dagbladet,Sweden,2023-10-08 07:10:10+00:00,Senaste nytt om konflikterna i Mellanöstern\nE...
514,265e96ce907bc774dd88f4f30c97305aa959031711daa8...,Kom ut av skapet,https://www.dagbladet.no/sport/dramatisk-beskj...,dagbladet,Dagbladet,Norway,2025-01-02 04:02:29+00:00,Kom ut av skapet\nDramatisk beskjed\nDen polsk...


In [27]:
# --- embeddings ---
# One embedding row per article, in the same order as the rows of df.
texts = df['text'].tolist()

model = SentenceTransformer(MODEL_NAME)
# NOTE(review): the model presumably truncates long inputs to its maximum
# sequence length, so only the start of a long full_text would contribute — confirm.
embeddings = model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    normalize_embeddings=True,
)

print('Embeddings shape:', embeddings.shape)


Batches: 100%|██████████| 75/75 [00:08<00:00,  8.77it/s]

Embeddings shape: (2387, 384)





In [28]:
# --- similarity graph with time decay ---
# Builds `edges`: (i, j, score) for every pair of articles whose time-decayed
# cosine similarity reaches SIM_THRESHOLD. Indices are positions in df.
# The inner loop's early break relies on df being sorted by 'published' ascending.
published = df['published'].tolist()

# Normalise the embeddings once so the cosine similarity of a pair is a plain
# dot product, instead of one sklearn call (with input validation) per pair.
_norms = np.linalg.norm(np.asarray(embeddings), axis=1, keepdims=True)
_norms[_norms == 0.0] = 1.0  # zero vectors get similarity 0, like sklearn's cosine_similarity
_unit_embeddings = np.asarray(embeddings) / _norms

def sim_idx(i: int, j: int) -> float:
    """Cosine similarity between the embeddings of articles i and j."""
    return float(np.dot(_unit_embeddings[i], _unit_embeddings[j]))

def time_weight(dt_a: datetime, dt_b: datetime) -> float:
    """Exponential decay exp(-|dt_b - dt_a| in hours / TIME_DECAY_HOURS)."""
    delta_h = abs((dt_b - dt_a).total_seconds()) / 3600.0
    return math.exp(-delta_h / TIME_DECAY_HOURS)

window_s = TIME_WINDOW_HOURS * 3600

# Cosine similarity is at most 1, so once the time weight alone falls below
# SIM_THRESHOLD no pair can reach the threshold any more. That happens at a gap
# of TIME_DECAY_HOURS * ln(1 / SIM_THRESHOLD), about 3.8 h for 36 h and 0.90,
# which is far inside the 72 h window. Stopping there leaves the edge set
# unchanged; the 1e-5 slack covers float rounding of similarities near 1.
max_gap_s = window_s
if SIM_THRESHOLD > 0:
    decay_limit_s = TIME_DECAY_HOURS * 3600 * math.log((1.0 + 1e-5) / SIM_THRESHOLD)
    max_gap_s = min(window_s, decay_limit_s)

edges = []
n = len(df)

for i in range(n):
    dt_i = published[i]
    for j in range(i + 1, n):
        dt_j = published[j]
        # Articles are in time order, so every later j is at least as far away.
        if (dt_j - dt_i).total_seconds() > max_gap_s:
            break
        score = sim_idx(i, j) * time_weight(dt_i, dt_j)
        if score >= SIM_THRESHOLD:
            edges.append((i, j, score))

print('Edges:', len(edges))


Edges: 258


In [29]:
# --- clustering via connected components (union-find) ---
# Every article starts as its own component; each edge merges two components.
parent = list(range(n))  # parent[x] == x marks the root of a component
rank = [0] * n           # rank of each root, used to pick which root survives a merge

def find(x: int) -> int:
    """Return the root of x's component, shortening the path while walking up."""
    while True:
        up = parent[x]
        if up == x:
            return x
        # Point x at its grandparent, then continue from there.
        parent[x] = parent[up]
        x = parent[x]

def union(a: int, b: int) -> None:
    """Merge the components of a and b; the root with the higher rank is kept."""
    ra, rb = find(a), find(b)
    if ra == rb:
        return
    if rank[ra] < rank[rb]:
        ra, rb = rb, ra
    # ra now has rank >= rb, so rb is attached below it.
    parent[rb] = ra
    if rank[ra] == rank[rb]:
        rank[ra] += 1

for i, j, _score in edges:
    union(i, j)

# Group article positions by the root of their component.
clusters = defaultdict(list)
for i in range(n):
    clusters[find(i)].append(i)

# Number the clusters from largest to smallest: id 0 is the biggest cluster.
roots_by_size = sorted(clusters, key=lambda r: len(clusters[r]), reverse=True)
cluster_ids = {root: idx for idx, root in enumerate(roots_by_size)}
# Assigned by position, which matches the row order used to build the edges.
df['cluster_id'] = [cluster_ids[find(i)] for i in range(n)]

sizes = df['cluster_id'].value_counts().sort_values(ascending=False)
sizes.head(100)


cluster_id
0      8
1      7
2      6
3      6
7      5
      ..
44     2
124    2
45     2
109    2
135    2
Name: count, Length: 100, dtype: int64

In [30]:
# --- inspect clusters ---
def show_cluster(cluster_id: int, limit: int = 10):
    """Return the first `limit` articles of a cluster, oldest first.

    Reads the global df and keeps only the columns useful for eyeballing a
    cluster: publish time, country, outlet name, title and url.
    """
    members = df.loc[df['cluster_id'] == cluster_id]
    ordered = members.sort_values('published')
    return ordered[['published', 'country', 'outlet_name', 'title', 'url']].head(limit)

# Show the largest clusters. Cluster ids run from 0 upwards in order of
# decreasing size, so the cap avoids printing empty tables when fewer than
# N_CLUSTERS_TO_SHOW clusters exist.
N_CLUSTERS_TO_SHOW = 20
for cid in range(min(N_CLUSTERS_TO_SHOW, df['cluster_id'].nunique())):
    print(f"--- Cluster {cid} ---")
    display(show_cluster(cid, limit=5))


--- Cluster 0 ---


Unnamed: 0,published,country,outlet_name,title,url
1061,2025-12-28 09:44:37+00:00,Norway,NRK,Den franske skuespilleren Brigitte Bardot er død,https://www.nrk.no/nyheter/den-franske-skuespi...
434,2025-12-28 09:46:05+00:00,Norway,Aftenposten,Brigitte Bardot er død,https://www.aftenposten.no/verden/i/V6boR4/bri...
2403,2025-12-28 09:46:09+00:00,Sweden,SVT,Brigitte Bardot är död,https://www.svt.se/kultur/tt-flash-birgitte-ba...
2048,2025-12-28 09:46:18+00:00,Sweden,Expressen,Brigitte Bardot är död,https://www.expressen.se/nyheter/expressen-dir...
114,2025-12-28 09:49:00+00:00,Denmark,DR,Skuespiller Brigitte Bardot er død,https://www.dr.dk/nyheder/seneste/skuespiller-...


--- Cluster 1 ---


Unnamed: 0,published,country,outlet_name,title,url
1304,2025-12-23 12:35:48+00:00,Sweden,Aftonbladet,Lastbil och personbil i krock,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1302,2025-12-23 13:06:46+00:00,Sweden,Aftonbladet,Efter avpubliceringen: Mer Epsteinmaterial uppe,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1298,2025-12-23 14:22:16+00:00,Sweden,Aftonbladet,Man hittad död,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1296,2025-12-23 15:35:00+00:00,Sweden,Aftonbladet,Köer på E18 efter bilolycka,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1295,2025-12-23 16:24:55+00:00,Sweden,Aftonbladet,Norsk skidskytt död,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...


--- Cluster 2 ---


Unnamed: 0,published,country,outlet_name,title,url
1331,2025-12-24 03:03:25+00:00,Sweden,Aftonbladet,Nationalgardet till New Orleans,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1330,2025-12-24 03:46:47+00:00,Sweden,Aftonbladet,Sparade 191 lik – uppgörelse i rätten,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1307,2025-12-24 04:34:21+00:00,Sweden,Aftonbladet,USA:s sändebud: Vi ska inte ”erövra” Grönland,https://www.aftonbladet.se/nyheter/a/zOG41O/us...
2096,2025-12-24 04:34:21+00:00,Sweden,Svenska Dagbladet,USA:s sändebud: Vi ska inte ”erövra” Grönland,https://www.svd.se/a/0pnLxB/usa-s-sandebud-vi-...
1329,2025-12-24 04:41:57+00:00,Sweden,Aftonbladet,Sändebudet: USA ska inte ”erövra” Grönland,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...


--- Cluster 3 ---


Unnamed: 0,published,country,outlet_name,title,url
1384,2025-12-25 23:16:18+00:00,Sweden,Aftonbladet,Trump: Attackerat IS-terrorister i Nigeria,https://www.aftonbladet.se/nyheter/a/WvboP2/tr...
2145,2025-12-25 23:16:18+00:00,Sweden,Svenska Dagbladet,Trump: Attackerat IS-terrorister i Nigeria,https://www.svd.se/a/n1amoJ/trump-attackerat-i...
1212,2025-12-25 23:17:05+00:00,Norway,VG,Donald Trump sier USA har angrepet IS i Nigeria,https://www.vg.no/nyheter/i/vrQnoX/donald-trum...
1452,2025-12-25 23:21:34+00:00,Sweden,Aftonbladet,Trump: USA har attackerat IS-terrorister,https://www.aftonbladet.se/nyheter/a/rrAmaR/tr...
1943,2025-12-25 23:26:25+00:00,Sweden,Expressen,USA har bombat IS,https://www.expressen.se/nyheter/expressen-dir...


--- Cluster 4 ---


Unnamed: 0,published,country,outlet_name,title,url
817,2025-12-23 11:33:34+00:00,Norway,NRK,Greta Thunberg arrestert i London,https://www.nrk.no/nyheter/greta-thunberg-arre...
1270,2025-12-23 11:35:37+00:00,Sweden,Aftonbladet,Greta Thunberg gripen under demonstration,https://www.aftonbladet.se/nyheter/a/0pn9v2/gr...
1898,2025-12-23 11:36:56+00:00,Sweden,Expressen,Greta Thunberg gripen i London,https://www.expressen.se/nyheter/expressen-dir...
1269,2025-12-23 11:38:50+00:00,Sweden,Aftonbladet,Greta Thunberg gripen i London,https://www.aftonbladet.se/nyheter/a/gk7vvL/gr...
2076,2025-12-23 11:38:51+00:00,Sweden,Svenska Dagbladet,Greta Thunberg gripen i London,https://www.svd.se/a/RjR66O/greta-thunberg-gri...


--- Cluster 5 ---


Unnamed: 0,published,country,outlet_name,title,url
2071,2025-12-23 13:50:45+00:00,Sweden,Svenska Dagbladet,Hotet mot Tjernobyl: ”En minijordbävning”,https://www.svd.se/a/rrAzya/rysk-attack-kan-fa...
1266,2025-12-23 13:50:45+00:00,Sweden,Aftonbladet,Rysk attack kan få skyddet i Tjernobyl att kol...,https://www.aftonbladet.se/nyheter/a/m0R5nl/ry...
1892,2025-12-23 14:28:39+00:00,Sweden,Expressen,Larmet från Tjernobyl: Riskerar att kollapsa,https://www.expressen.se/nyheter/expressen-dir...
265,2025-12-23 14:32:09+00:00,Norway,Aftenposten,Russisk angrep kan få beskyttelsen i Tsjernoby...,https://www.aftenposten.no/verden/i/oEpkaa/rus...
455,2025-12-23 15:03:54+00:00,Norway,Dagbladet,Slår full alarm: - Kan kollapse,https://www.dagbladet.no/nyheter/slar-alarm-ka...


--- Cluster 6 ---


Unnamed: 0,published,country,outlet_name,title,url
453,2025-12-23 17:24:41.639647+00:00,Norway,Dagbladet,Tiltalt for ytterligere to overgrep,https://www.dagbladet.no/studio/72797-kjendiss...
468,2025-12-23 17:24:46.673980+00:00,Norway,Dagbladet,Arrestert,https://www.dagbladet.no/studio/72720-nyhetsst...
472,2025-12-23 17:24:48.922473+00:00,Norway,Dagbladet,Forfatter siktet for voldtekt: Henlagt,https://www.dagbladet.no/studio/72720-nyhetsst...
481,2025-12-23 17:24:52.402393+00:00,Norway,Dagbladet,Bekrefter ny sesong,https://www.dagbladet.no/studio/72797-kjendiss...
491,2025-12-23 17:24:56.703217+00:00,Norway,Dagbladet,NÅ: To personer skadd,https://www.dagbladet.no/studio/72720-nyhetsst...


--- Cluster 7 ---


Unnamed: 0,published,country,outlet_name,title,url
1324,2025-12-23 18:08:51+00:00,Sweden,Aftonbladet,Explosion i Södertälje,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1338,2025-12-23 19:46:41+00:00,Sweden,Aftonbladet,Tågstopp i Dalarna,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1337,2025-12-23 21:02:57+00:00,Sweden,Aftonbladet,Man fick inte flyga – ställde till med scen,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1336,2025-12-23 21:14:08+00:00,Sweden,Aftonbladet,HD stoppar Trumps order till nationalgardet,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1335,2025-12-23 21:25:17+00:00,Sweden,Aftonbladet,Nytt rekord på Wall Street,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...


--- Cluster 8 ---


Unnamed: 0,published,country,outlet_name,title,url
1499,2025-12-26 15:56:47+00:00,Sweden,Aftonbladet,Man häktad för inblandning i mordet i Oxie,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1498,2025-12-26 16:04:55+00:00,Sweden,Aftonbladet,Efter mordet i Boden – kommunen erbjuder krisstöd,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1497,2025-12-26 17:15:40+00:00,Sweden,Aftonbladet,Zelenskyj: Redo för folkomröstning,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1496,2025-12-26 17:19:10+00:00,Sweden,Aftonbladet,Skadeläget stabilt för brottsoffer i Boden,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1495,2025-12-26 17:54:39+00:00,Sweden,Aftonbladet,Myndigheterna varnar: Ge dig inte ut,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...


--- Cluster 9 ---


Unnamed: 0,published,country,outlet_name,title,url
1280,2025-12-23 10:00:11+00:00,Sweden,Aftonbladet,Hon lämnar SD i riksdagen – byter parti,https://www.aftonbladet.se/nyheter/a/k0J8WB/ho...
2081,2025-12-23 10:00:12+00:00,Sweden,Svenska Dagbladet,SD-politiker byter parti – till KD,https://www.svd.se/a/d4Ov5j/hon-lamnar-sd-i-ri...
1905,2025-12-23 10:03:56+00:00,Sweden,Expressen,SD-politikern byter parti,https://www.expressen.se/nyheter/expressen-dir...
1279,2025-12-23 10:07:38+00:00,Sweden,Aftonbladet,Hon lämnar SD i riksdagen – byter parti,https://www.aftonbladet.se/nyheter/a/5p0bVb/sa...


--- Cluster 10 ---


Unnamed: 0,published,country,outlet_name,title,url
689,2025-12-23 16:20:27+00:00,Norway,NRK,Skiskytter Sivert Guttorm Bakken er død: – Hel...,https://www.nrk.no/sport/sivert-guttorm-bakken...
449,2025-12-23 16:21:15+00:00,Norway,Dagbladet,Funnet død i Italia,https://www.dagbladet.no/sport/funnet-dod/8402...
258,2025-12-23 16:21:33+00:00,Norway,Aftenposten,Skiskytter Sivert Guttorm Bakken (27) er død,https://www.aftenposten.no/sport/skiskyting/i/...
807,2025-12-23 16:22:10+00:00,Norway,NRK,Skiskytter Sivert Guttorm Bakken (27) er død,https://www.nrk.no/nyheter/skiskytter-sivert-g...


--- Cluster 11 ---


Unnamed: 0,published,country,outlet_name,title,url
2100,2025-12-23 23:24:48+00:00,Sweden,Svenska Dagbladet,17-åring gripen för mordplaner i Tyskland,https://www.svd.se/a/9pvqxp/17-aring-gripen-fo...
1313,2025-12-23 23:24:48+00:00,Sweden,Aftonbladet,17-åring gripen för mordplaner i Tyskland,https://www.aftonbladet.se/nyheter/a/y538VE/17...
2331,2025-12-23 23:50:36+00:00,Sweden,SVT,17-åring gripen för mordplaner i Tyskland,https://www.svt.se/nyheter/inrikes/17-aring-gr...
841,2025-12-23 23:59:27+00:00,Norway,NRK,Svensk 17-åring pågrepet for drapsplaner i Tys...,https://www.nrk.no/nyheter/svensk-17-aring-pag...


--- Cluster 12 ---


Unnamed: 0,published,country,outlet_name,title,url
856,2025-12-24 18:58:07+00:00,Norway,NRK,Kan ha avdekket 1 million flere dokumenter kny...,https://www.nrk.no/nyheter/kan-ha-avdekket-1-m...
302,2025-12-24 19:30:59+00:00,Norway,Aftenposten,USAs justisdepartement: En million dokumenter ...,https://www.aftenposten.no/verden/i/XMgdLx/usa...
522,2025-12-24 19:40:21+00:00,Norway,Dagbladet,Nytt sjokkfunn: - Jobber døgnet rundt,https://www.dagbladet.no/nyheter/nytt-sjokkfun...
1208,2025-12-24 19:41:55+00:00,Norway,VG,Justis­departementet fant over en million nye ...,https://www.vg.no/nyheter/i/JOoBy7/justisdepar...


--- Cluster 13 ---


Unnamed: 0,published,country,outlet_name,title,url
883,2025-12-24 19:37:25+00:00,Norway,NRK,Rødt farevarsel for snøskred i Troms julaften,https://www.nrk.no/nyheter/rodt-farevarsel-for...
525,2025-12-24 19:50:04+00:00,Norway,Dagbladet,Slår alarm: Faren øker,https://www.dagbladet.no/nyheter/rodt-farevars...
1206,2025-12-24 19:58:24+00:00,Norway,VG,Rødt farevarsel om snøskred i Troms,https://www.vg.no/nyheter/i/gk7zAq/roedt-farev...
300,2025-12-24 20:04:25+00:00,Norway,Aftenposten,Rødt farevarsel om snøskred,https://www.aftenposten.no/norge/i/BxRbXl/roed...


--- Cluster 14 ---


Unnamed: 0,published,country,outlet_name,title,url
1424,2025-12-25 20:19:36+00:00,Sweden,Aftonbladet,Önation i Stilla havet tar emot USA-migranter,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1423,2025-12-25 20:33:12+00:00,Sweden,Aftonbladet,Hög smäll: ”Vi vet inte vad det är”,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1422,2025-12-25 20:55:38+00:00,Sweden,Aftonbladet,Misstänkt anlagd brand i trapphus – stor sökin...,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...
1421,2025-12-25 21:29:48+00:00,Sweden,Aftonbladet,Vild biljakt på E18: ”Låg i 180”,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...


--- Cluster 15 ---


Unnamed: 0,published,country,outlet_name,title,url
549,2025-12-25 23:34:16.608223+00:00,Norway,Dagbladet,- Ble funnet her,https://www.dagbladet.no/video/dagbladet-i-lav...
561,2025-12-25 23:34:20.342384+00:00,Norway,Dagbladet,Her flyr russiske bombefly over Norskehavet,https://www.dagbladet.no/video/her-flyr-russis...
569,2025-12-25 23:34:28.925298+00:00,Norway,Dagbladet,- Er lei meg,https://www.dagbladet.no/video/har-vaert-gansk...
579,2025-12-25 23:34:35.252129+00:00,Norway,Dagbladet,Funnet drivende,https://www.dagbladet.no/video/gran-canaria-sj...


--- Cluster 16 ---


Unnamed: 0,published,country,outlet_name,title,url
951,2025-12-26 06:56:14+00:00,Norway,NRK,Zelenskyj sier han skal møte Trump snart,https://www.nrk.no/nyheter/zelenskyj-sier-han-...
1443,2025-12-26 07:12:07+00:00,Sweden,Aftonbladet,Zelenskyj ska träffa Trump,https://www.aftonbladet.se/nyheter/a/pBLKpE/ze...
67,2025-12-26 07:39:00+00:00,Denmark,DR,Zelenskyj: Møde med Trump på vej - 'meget kan ...,https://www.dr.dk/nyheder/seneste/zelenskyj-mo...
189,2025-12-26 08:02:23+00:00,Denmark,Jyllands-Posten,Zelenskyj varsler møde med Trump i den nære fr...,https://jyllands-posten.dk/international/ECE18...


--- Cluster 17 ---


Unnamed: 0,published,country,outlet_name,title,url
370,2025-12-26 18:57:46+00:00,Norway,Aftenposten,The Cure-gitarist er død – skulle spilt i Oslo...,https://www.aftenposten.no/kultur/i/k0JdAQ/the...
1233,2025-12-26 19:03:13+00:00,Norway,VG,The Cure-gitarist Perry Bamonte er død,https://www.vg.no/rampelys/i/zOGxd9/the-cure-g...
1157,2025-12-26 19:09:36+00:00,Norway,TV 2,The Cure-gitarist er død,https://www.tv2.no/a/18414215
961,2025-12-26 19:13:43+00:00,Norway,NRK,Gitaristen i The Cure er død,https://www.nrk.no/nyheter/gitaristen-i-the-cu...


--- Cluster 18 ---


Unnamed: 0,published,country,outlet_name,title,url
2247,2025-12-27 03:54:32+00:00,Sweden,Svenska Dagbladet,Kambodja och Thailand undertecknar vapenvila,https://www.svd.se/a/2pGoqv/kambodja-overens-m...
1517,2025-12-27 03:54:32+00:00,Sweden,Aftonbladet,Kambodja och Thailand undertecknar vapenvila,https://www.aftonbladet.se/nyheter/a/oEp19K/ka...
2383,2025-12-27 04:31:10+00:00,Sweden,SVT,Avtal om vapenvila klart mellan Thailand och K...,https://www.svt.se/nyheter/utrikes/avtal-om-va...
2015,2025-12-27 06:24:24+00:00,Sweden,Expressen,Nytt avtal om vapenvila,https://www.expressen.se/nyheter/expressen-dir...


--- Cluster 19 ---


Unnamed: 0,published,country,outlet_name,title,url
269,2025-12-23 12:58:02+00:00,Norway,Aftenposten,USAs justisdepartement: – Epstein-dokumenter i...,https://www.aftenposten.no/verden/i/q6LJbg/usa...
1895,2025-12-23 13:22:38+00:00,Sweden,Expressen,"Epsteinfilerna ""innehåller falska anklagelser ...",https://www.expressen.se/nyheter/expressen-dir...
463,2025-12-23 13:46:10+00:00,Norway,Dagbladet,Nye dokumenter: - Falske anklager,https://www.dagbladet.no/nyheter/nye-dokumente...


## Notes
- Raise `SIM_THRESHOLD` for tighter clusters; lower it to merge more.
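- Increase `TIME_WINDOW_HOURS` to allow longer-running stories to match. The window alone is not enough, though: the score is `similarity * exp(-Δt / TIME_DECAY_HOURS)`, so no pair further apart than `TIME_DECAY_HOURS * ln(1 / SIM_THRESHOLD)` (about 3.8 hours with the defaults of 36 h and 0.90) can ever reach the threshold. Raise `TIME_DECAY_HOURS` or lower `SIM_THRESHOLD` as well.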
- If you want more structure, replace connected-components with DBSCAN or HDBSCAN.


In [57]:
# --- raw jsonl viewer (country/outlet/id) ---
RAW_DIR = Path('../data/raw')
_outlet_cache = {}

def list_countries():
    """Return the sorted names of the sub-directories of RAW_DIR."""
    country_dirs = (entry for entry in RAW_DIR.iterdir() if entry.is_dir())
    return sorted(entry.name for entry in country_dirs)

def list_outlets(country: str):
    """Return the sorted outlet ids (jsonl file stems) found in a country folder.

    Raises FileNotFoundError when RAW_DIR has no folder for `country`.
    """
    country_dir = RAW_DIR / country
    if country_dir.exists():
        return sorted(jsonl.stem for jsonl in country_dir.glob('*.jsonl'))
    raise FileNotFoundError(f'No such country folder: {country_dir}')

def load_outlet_articles(country: str, outlet_id: str):
    """Return all raw article records of one outlet as a list of dicts.

    Reads RAW_DIR/<country>/<outlet_id>.jsonl, skipping blank lines. The parsed
    list is cached per (country, outlet_id) and the same list object is
    returned on later calls.

    Raises FileNotFoundError when the jsonl file does not exist.
    """
    key = (country, outlet_id)
    if key in _outlet_cache:
        return _outlet_cache[key]
    path = RAW_DIR / country / f'{outlet_id}.jsonl'
    if not path.exists():
        raise FileNotFoundError(f'Missing jsonl: {path}')
    # Decode explicitly instead of relying on the platform default encoding
    # (assumes the files are UTF-8, the standard encoding for JSON).
    # Iterating the file splits on real newlines only; str.splitlines() would
    # also split on U+2028/U+2029/U+0085, which may occur unescaped inside a
    # JSON string and would cut a record in half.
    with path.open(encoding='utf-8') as fh:
        records = [json.loads(line) for line in fh if line.strip()]
    _outlet_cache[key] = records
    return records

def list_article_ids(country: str, outlet_id: str, limit: int | None = 20):
    """Return a DataFrame with the id, title and published value of an outlet's articles.

    `limit` caps the number of rows; None returns every article. A limit of 0
    returns an empty frame (it used to be treated like None).
    """
    records = load_outlet_articles(country, outlet_id)
    rows = [
        {
            'id': item.get('id'),
            'title': item.get('title'),
            'published': item.get('published'),
        }
        for item in records
    ]
    # Naming the columns keeps them present even for an outlet with no articles.
    df_ids = pd.DataFrame(rows, columns=['id', 'title', 'published'])
    if limit is None:
        return df_ids
    return df_ids.head(limit)

def get_article_by_id(country: str, outlet_id: str, article_id: str) -> dict:
    """Return the first record of the outlet whose 'id' equals `article_id`.

    Raises ValueError when no record has that id.
    """
    for candidate in load_outlet_articles(country, outlet_id):
        if candidate.get('id') == article_id:
            return candidate
    raise ValueError(f'Article id not found: {article_id}')

# Example: pick a country and outlet, then load one article to inspect.
country = 'Sweden'
outlet = 'aftonbladet'

countries = list_countries()      # available country folders
outlets = list_outlets(country)   # available outlets for the chosen country
article_ids = list_article_ids(country, outlet, limit=None)

# Article to inspect. Set PINNED_ARTICLE_ID to None to view the first listed
# article of the outlet instead.
PINNED_ARTICLE_ID = 'b8d5adda3851d5711f6f6d15172218502b83fdf245c0ba3555220a1bd2192985'
article_id = PINNED_ARTICLE_ID or article_ids.iloc[0]['id']
print('Viewing article id:', article_id)
article = get_article_by_id(country, outlet, article_id)

import textwrap

def print_full_text(article: dict, width: int = 80) -> None:
    """Print an article's 'full_text' wrapped to `width` columns.

    Paragraphs (runs of lines separated by blank lines) are wrapped one by one
    and printed with a blank line between them. Wrapping the whole text in a
    single call would turn every paragraph break into a run of spaces in the
    middle of a line. Prints '(no full_text)' when the field is missing or empty.
    """
    full_text = (article.get("full_text") or "").strip()
    if not full_text:
        print("(no full_text)")
        return

    # Collect paragraphs; a blank line closes the current one.
    paragraphs = []
    current = []
    for line in full_text.splitlines():
        if line.strip():
            current.append(line)
        elif current:
            paragraphs.append(current)
            current = []
    if current:
        paragraphs.append(current)

    wrapped = [
        textwrap.fill(
            # Collapse all whitespace runs so no stray gaps survive inside a line.
            " ".join(" ".join(lines).split()),
            width=width,
            break_long_words=False,
            break_on_hyphens=False,
        )
        for lines in paragraphs
    ]
    print("\n\n".join(wrapped))

# Print the wrapped full text of the article loaded into `article` above.
print_full_text(article)

Viewing article id: b8d5adda3851d5711f6f6d15172218502b83fdf245c0ba3555220a1bd2192985
Tre människor, bland dem ett ettårigt barn, dog när ett tvåvåningshus föll
samman i Soweto i Sydafrika. Ytterligare tre personer skadades i kollapsen och
vårdas nu på sjukhus, enligt myndigheter. Samtliga sex befann sig i byggnaden
när den föll ihop tidigt på söndagen. De överlevande var fastklämda under
rasmassor innan de kunde dras fram. Soweto är en förstad som ligger väster om
Johannesburg. Händelsen utreds, uppger en talesperson för Johannesburgs
katastrofmyndighet.   En man i 20-årsåldern har dött efter en singelolycka i
Smedby, Österåkers kommun. Tre personer har befunnit sig i bilen, alla tre
fördes till sjukhus. Mannens anhöriga har underrättats. Föraren misstänks för
drograttfylleri, enligt polisen.   Det har under söndagsförmiddagen brunnit
kraftigt i en lägenhet i Slottsstaden i centrala Malmö, det skriver polisen på
sin hemsida. Polis, räddningstjänst och ambulans kallades till platsen. En