In [None]:
import re, math, json, os, sys, random, pickle, itertools
from collections import Counter, defaultdict
from dataclasses import dataclass
import numpy as np
import pandas as pd
import torch
import spacy
from spacy.pipeline import EntityRuler
from tqdm.auto import tqdm

from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)

# Fix the global random seed through transformers' `set_seed` helper.
set_seed(42)

# Use the GPU when PyTorch reports one is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression: shows the chosen device as the cell output.
DEVICE


'cuda'

In [None]:
#from google.colab import files
#uploaded = files.upload()


In [None]:
# Workbook with the scraped articles and their (partial) manual labels.
PATH_XLSX = "evrensel_isci_sendika_2024_dec2025_clean_fin_uncorrupted_real.xlsx"  # <-- change this
df = pd.read_excel(PATH_XLSX)

# Fail fast if any column the rest of the notebook reads is absent.
required_cols = ["EVENT_RELEVANT", "EVENT_ID", "title", "content", "date"]
available_cols = set(df.columns)
missing = [name for name in required_cols if name not in available_cols]
if len(missing) > 0:
    raise ValueError(f"Missing columns in XLSX: {missing}")

# Cell output: frame shape and the full column list.
(df.shape, df.columns.tolist())


((9186, 15),
 ['title',
  'date',
  'link',
  'content',
  'EVENT_RELEVANT',
  'EVENT_ID',
  'Unnamed: 6',
  'error',
  'Unnamed: 8',
  'Unnamed: 9',
  'Unnamed: 10',
  'Unnamed: 11',
  'Unnamed: 12',
  650,
  160])

In [None]:
def normalize_text(x):
    """Return `x` as a trimmed, single-spaced string.

    Values that `pd.isna` reports as missing become "". Otherwise the value is
    converted with `str`, non-breaking spaces are replaced by plain spaces,
    every run of whitespace collapses to one space and both ends are stripped.
    Letters (including Turkish ones) are left untouched.
    """
    if not pd.isna(x):
        cleaned = str(x).replace("\u00A0", " ")  # non-breaking space
        return re.sub(r"\s+", " ", cleaned).strip()
    return ""

# Run both free-text columns through the same whitespace cleanup.
df["title"] = df["title"].map(normalize_text)
df["content"] = df["content"].map(normalize_text)

# final text fed into the model: title, a blank line, then the article body
df["text"] = (
    df["title"].fillna("")
    .str.cat(df["content"].fillna(""), sep="\n\n")
    .astype(str)
)

df[["title", "content", "text"]].head(2)

Unnamed: 0,title,content,text
0,Bartın'da Hema'ya ait maden ocağında vagonları...,Bartın'ın Amasra ilçesindeki Hema Enerji şirke...,Bartın'da Hema'ya ait maden ocağında vagonları...
1,Bu soygun düzeni değişmeli,Pendik Marmara Eğitim ve Araştırma Hastanesind...,Bu soygun düzeni değişmeli\n\nPendik Marmara E...


In [None]:
# ----------------------------
# Cell 4 (fixed) — Define labeled rows (accept 0/1 as floats OR strings)
# ----------------------------
import numpy as np
import pandas as pd

def normalize_label(x):
    """Map a raw label cell to 0, 1 or NaN.

    Missing values give NaN. Numeric inputs (Python or NumPy ints/floats) that
    compare equal to 0 or 1 give that integer; any other number gives NaN.
    Everything else is stringified and stripped, and only the spellings
    "0", "1", "0.0" and "1.0" are accepted; anything else gives NaN.
    """
    if pd.isna(x):
        return np.nan

    numeric_types = (int, np.integer, float, np.floating)
    if isinstance(x, numeric_types):
        # 0 / 0.0 -> 0 and 1 / 1.0 -> 1; every other number counts as unlabeled
        for value in (0, 1):
            if x == value:
                return value
        return np.nan

    # textual labels (future-proof)
    text_to_label = {"0": 0, "1": 1, "0.0": 0, "1.0": 1}
    return text_to_label.get(str(x).strip(), np.nan)

# One cleaned label per row; anything that is not a recognisable 0/1 becomes NaN.
df["LABEL_CLEAN"] = df["EVENT_RELEVANT"].map(normalize_label)
# True exactly for rows that carry a usable label.
labeled_mask = ~df["LABEL_CLEAN"].isna()

print("Labeled rows:", labeled_mask.sum())
print(df.loc[labeled_mask, "LABEL_CLEAN"].value_counts())


Labeled rows: 738
LABEL_CLEAN
0.0    578
1.0    160
Name: count, dtype: int64


In [None]:
# ----------------------------
# Cell 5 — Train/validation split (using LABEL_CLEAN from Cell 4)
# ----------------------------

# Keep only labeled rows (LABEL_CLEAN is 0/1, NaN otherwise)
# Work on the labeled rows only; `label` is the integer target the Trainer learns.
df_labeled = df[labeled_mask].copy()
df_labeled["label"] = df_labeled["LABEL_CLEAN"].astype(int)

# Stratify on the target so train and validation keep the same class balance.
split_kwargs = dict(test_size=0.20, random_state=42, stratify=df_labeled["label"])
train_df, val_df = train_test_split(df_labeled, **split_kwargs)

# Size and class balance of both partitions.
print("Train:", len(train_df), "Val:", len(val_df))
print("Train label distribution:\n", train_df["label"].value_counts())
print("Val label distribution:\n", val_df["label"].value_counts())
print("Train positive rate:", train_df["label"].mean())
print("Val positive rate:", val_df["label"].mean())


Train: 590 Val: 148
Train label distribution:
 label
0    462
1    128
Name: count, dtype: int64
Val label distribution:
 label
0    116
1     32
Name: count, dtype: int64
Train positive rate: 0.21694915254237288
Val positive rate: 0.21621621621621623


In [None]:
# Inspect what the raw EVENT_RELEVANT column really contains
s = df["EVENT_RELEVANT"]

print("dtype:", s.dtype)
print("non-null count:", s.count())

# distinct raw values, exactly as stored
u = s.dropna().unique()
print("unique values sample (up to 50):", u[:50])

# the same values after str() + strip(), to reveal text-typed labels
u_str = pd.Series(u).astype(str).str.strip()
print("stringified sample (up to 50):", u_str.iloc[:50].tolist())


dtype: float64
non-null count: 738
unique values sample (up to 50): [0. 1.]
stringified sample (up to 50): ['0.0', '1.0']


In [None]:
# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "dbmdz/bert-base-turkish-cased"  # BERTurk
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Token budget per article passed to the tokenizer as `max_length`.
MAX_LEN = 384  # 512 is allowed but slower

class TextClsDataset(Dataset):
    """Dataset that tokenizes one text per lookup.

    Args:
        texts: iterable of strings; copied into a list.
        labels: optional iterable of labels aligned with `texts`; copied into a
            list, or kept as None when no labels are given.
        tokenizer: callable invoked as
            `tokenizer(text, truncation=True, max_length=..., padding=False, return_tensors=None)`.
        max_len: value forwarded to the tokenizer as `max_length`.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=384):
        self.texts = [*texts]
        self.labels = [*labels] if labels is not None else None
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at `idx`; add an integer `labels` entry when labels exist."""
        # Padding is left off here; the texts are only truncated to `max_len`.
        tokenizer_kwargs = dict(
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors=None,
        )
        enc = self.tokenizer(self.texts[idx], **tokenizer_kwargs)
        if self.labels is None:
            return enc
        enc["labels"] = int(self.labels[idx])
        return enc

# Wrap both partitions; texts are tokenized lazily inside TextClsDataset.__getitem__.
train_ds = TextClsDataset(train_df["text"], train_df["label"], tokenizer, MAX_LEN)
val_ds   = TextClsDataset(val_df["text"],   val_df["label"],   tokenizer, MAX_LEN)

# The datasets return unpadded encodings (padding=False), so batching is left to this collator.
collator = DataCollatorWithPadding(tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Two-class classification head on top of the pretrained encoder, moved to DEVICE.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = model.to(DEVICE)

# Class counts in the training partition (labels are 0/1, so the sum counts positives).
pos = int(train_df["label"].sum())
neg = len(train_df) - pos

# More weight to positive class if positives are rare: weight = neg/pos,
# with the denominator floored at 1 to avoid dividing by zero.
class_weights = torch.tensor([1.0, neg / max(pos, 1)], dtype=torch.float, device=DEVICE)
class_weights


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([1.0000, 3.6094], device='cuda:0')

In [None]:
# ----------------------------
# Cell 8 (updated) — Trainer setup + fine-tune (version compatible)
# ----------------------------
import inspect

def compute_metrics(eval_pred):
    """Return `{"roc_auc": ...}` for a `(logits, labels)` pair.

    The logits are soft-maxed over the last axis and column 1 is used as the
    score. When `labels` holds fewer than two distinct values the AUC is
    reported as NaN instead of calling `roc_auc_score`.
    """
    logits, labels = eval_pred
    positive_probs = torch.softmax(torch.tensor(logits), dim=-1)[:, 1].numpy()
    if len(np.unique(labels)) > 1:
        score = float(roc_auc_score(labels, positive_probs))
    else:
        score = float("nan")
    return {"roc_auc": score}

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        `inputs` is forwarded to `model` without its "labels" entry, and the
        loss is computed from the returned "logits". Returns `(loss, outputs)`
        when `return_outputs` is true, otherwise just `loss`.
        **kwargs absorbs version-specific args like num_items_in_batch.
        """
        labels = inputs.get("labels")
        features = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**features)
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
        # Logits are flattened to (N, 2) to line up with the flattened labels.
        loss = criterion(outputs.get("logits").view(-1, 2), labels.view(-1))
        if return_outputs:
            return loss, outputs
        return loss


# Build TrainingArguments with only the parameters this transformers version supports
ta_params = inspect.signature(TrainingArguments.__init__).parameters

# Arguments set unconditionally.
args_dict = dict(
    output_dir="./berturk_event_detector",
    learning_rate=2e-5,
    per_device_train_batch_size=8 if DEVICE == "cuda" else 4,
    per_device_eval_batch_size=16 if DEVICE == "cuda" else 8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Strategy naming differs by transformers version
if "evaluation_strategy" in ta_params:
    args_dict["evaluation_strategy"] = "epoch"
elif "eval_strategy" in ta_params:
    args_dict["eval_strategy"] = "epoch"

# Optional arguments: each one is passed only when the installed version accepts it.
optional_args = {
    "save_strategy": "epoch",
    "logging_strategy": "steps",
    "logging_steps": 50,
    "load_best_model_at_end": True,
    "metric_for_best_model": "roc_auc",   # key returned by compute_metrics
    "greater_is_better": True,
    "fp16": DEVICE == "cuda",             # mixed precision only on GPU
    "report_to": "none",
}
args_dict.update({name: value for name, value in optional_args.items() if name in ta_params})

training_args = TrainingArguments(**args_dict)

# Newer transformers versions deprecate Trainer(tokenizer=...) in favour of
# `processing_class`; choose whichever keyword the installed Trainer accepts
# so the cell neither warns nor breaks.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()


  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Roc Auc
1,0.6781,0.689853,0.703664
2,0.7614,0.578894,0.786099
3,0.5541,0.524608,0.858702


TrainOutput(global_step=222, training_loss=0.6330845506341608, metrics={'train_runtime': 91.9975, 'train_samples_per_second': 19.24, 'train_steps_per_second': 2.413, 'total_flos': 349279925990400.0, 'train_loss': 0.6330845506341608, 'epoch': 3.0})

In [None]:
# Score the validation partition with the trained model.
val_out = trainer.predict(val_ds)
val_logits, val_labels = val_out.predictions, val_out.label_ids

# Probability of class 1, then hard predictions at the default 0.5 cut-off.
val_probs = torch.softmax(torch.tensor(val_logits), dim=-1)[:, 1].numpy()
val_preds = np.where(val_probs >= 0.5, 1, 0)

# AUC is only computed when both classes occur in the labels.
if len(np.unique(val_labels)) > 1:
    val_auc = roc_auc_score(val_labels, val_probs)
else:
    val_auc = "NA"
print("ROC-AUC:", val_auc)
print(classification_report(val_labels, val_preds, digits=3))


ROC-AUC: 0.8587015086206896
              precision    recall  f1-score   support

           0      0.891     0.914     0.902       116
           1      0.655     0.594     0.623        32

    accuracy                          0.845       148
   macro avg      0.773     0.754     0.763       148
weighted avg      0.840     0.845     0.842       148



In [None]:
# ----------------------------
# Recall-first threshold tuning (on validation set)
# ----------------------------

from sklearn.metrics import classification_report
import numpy as np

target_recall = 0.80   # you can change this if you want

# Scan 91 thresholds from 0.05 to 0.95 and keep those whose positive-class
# recall reaches the target, as (f1, threshold, precision, recall) tuples.
candidates = []
for thr in np.linspace(0.05, 0.95, 91):
    pos_report = classification_report(
        val_labels,
        (val_probs >= thr).astype(int),
        output_dict=True,
        zero_division=0,
    )["1"]
    if pos_report["recall"] >= target_recall:
        candidates.append(
            (pos_report["f1-score"], thr, pos_report["precision"], pos_report["recall"])
        )

# Highest F1 wins; max() returns the first of equal entries, i.e. the lowest
# such threshold. None when no threshold reaches the recall target.
best = max(candidates, key=lambda cand: cand[0]) if candidates else None

print("Best threshold for recall >=", target_recall, ":", best)


Best threshold for recall >= 0.8 : (0.5894736842105263, np.float64(0.33999999999999997), 0.4444444444444444, 0.875)


In [None]:
# ----------------------------
# ML predict on FULL corpus -> EVENT_PRED_ML_PROB
# ----------------------------

# 1) Build a dataset aligned with df (same row order, no index column).
#    Imported under an alias so it does not shadow torch.utils.data.Dataset,
#    which TextClsDataset subclasses (a plain `Dataset` import would break a
#    re-run of that cell).
from datasets import Dataset as HFDataset

full_df = df[["text"]].copy()
full_df["text"] = full_df["text"].fillna("").astype(str)

full_hf = HFDataset.from_pandas(full_df, preserve_index=False)

def tok(batch):
    """Tokenize a batch of texts with the same truncation length used for training."""
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,      # must match the training datasets (MAX_LEN, not a separate literal)
        padding=False
    )

full_ds = full_hf.map(tok, batched=True, remove_columns=["text"])

# 2) Predict
full_out = trainer.predict(full_ds)
full_logits = full_out.predictions

# 3) Save the positive-class probability back to df (row order is unchanged)
df["EVENT_PRED_ML_PROB"] = torch.softmax(torch.tensor(full_logits), dim=-1).numpy()[:, 1]

print("Saved EVENT_PRED_ML_PROB for", len(df), "rows.")
df["EVENT_PRED_ML_PROB"].describe()


Map:   0%|          | 0/9186 [00:00<?, ? examples/s]

Saved EVENT_PRED_ML_PROB for 9186 rows.


Unnamed: 0,EVENT_PRED_ML_PROB
count,9186.0
mean,0.261125
std,0.112281
min,0.080719
25%,0.175132
50%,0.249516
75%,0.334491
max,0.824391


In [None]:
# ----------------------------
# Stage 1: ML gate -> EVENT_PRED (predicted relevant)
# ----------------------------
# Probability cut-off of the ML gate.
THRESH = 0.44  # or set this from your tuning output

# 1 where the predicted probability reaches the cut-off, else 0.
df["EVENT_PRED"] = df["EVENT_PRED_ML_PROB"].ge(THRESH).astype(int)

print("Pred-relevant (ML):", int(df["EVENT_PRED"].sum()), "/", len(df))


Pred-relevant (ML): 2639 / 9186


In [None]:
# ----------------------------
# Stage 2: Narrow to Collective Bargaining / Wage-related (within ML-relevant)
# ----------------------------
import re

# Regex alternatives that mark an article as collective-bargaining / wage related.
# Each entry is bounded by \b, so only whole words / exact phrases match.
# NOTE(review): `\bücret artış\b` and `\bdüşük ücret\b` can only match where
# `\bücret\b` already matches, so they do not widen the filter.
KEEP_PATTERNS = [
    r"\btoplu sözleşme\b", r"\btis\b", r"\btoplu iş sözleşmesi\b",
    r"\bücret\b", r"\bzam\b", r"\bmaaş\b", r"\bücret artış\b",
    r"\bpazarlık\b", r"\bgörüşme\b", r"\bmüzakere\b",
    r"\bgrev\b", r"\bgrevde\b", r"\bgreve çıktı\b",
    r"\bsözleşme süreci\b", r"\bsözleşme görüşmeleri\b", r"\bdüşük ücret\b"
]

# Regex alternatives that veto an article (checked before KEEP_PATTERNS in
# is_cb_wage_article). Entries without a trailing \b act as prefixes.
# NOTE(review): `\bişten çıkarıl`, `\bsendika üyeli` and `\bsendikalaşma\b` are
# already covered by the shorter prefixes `\bişten çıkar`, `\bsendika üye` and
# `\bsendikalaş`.
# NOTE(review): `\bölüm\b.*\b(ölüm|yaralı)\b` needs a standalone "ölüm" followed
# later by "ölüm" or "yaralı" — confirm this is the intended condition.
# NOTE(review): `\bbasın açıklama\b` requires a word boundary right after
# "açıklama", so suffixed forms such as "açıklaması" do not match — confirm.
DROP_PATTERNS = [
    r"\bişten çıkar", r"\bişten at", r"\bişten çıkarıl", r"\bkovuldu\b",
    r"\bsendikalaş", r"\bsendika üye", r"\bsendika üyeli",
    r"\biş kaz", r"\bölüm\b.*\b(ölüm|yaralı)\b", r"\bgöçük\b",
    r"\bgözalt", r"\btutuk", r"\bdava\b", r"\bmahkeme\b",
    r"\bziyaret\b", r"\bdayanışma\b", r"\banma\b", r"\bbasın açıklama\b",
    r"\bsendikalaşma\b"
]

# One case-insensitive alternation per list.
keep_re = re.compile("|".join(KEEP_PATTERNS), flags=re.IGNORECASE)
drop_re = re.compile("|".join(DROP_PATTERNS), flags=re.IGNORECASE)

def is_cb_wage_article(title, content):
    """Return True when the article matches a keep pattern and no drop pattern.

    `title` and `content` are joined with a single space and searched with the
    module-level `drop_re` and `keep_re`; any drop match vetoes the article.
    """
    text = f"{title} {content}"
    return drop_re.search(text) is None and keep_re.search(text) is not None

# Keyword verdict for every article (title and content are stringified first).
df["IS_CB_WAGE"] = [
    is_cb_wage_article(str(title), str(content))
    for title, content in zip(df["title"], df["content"])
]

# Apply gate ONLY to ML-predicted relevant
df["EVENT_PRED_CB"] = (df["EVENT_PRED"].eq(1) & df["IS_CB_WAGE"]).astype(int)

print("Pred-relevant (ML all):", int(df["EVENT_PRED"].eq(1).sum()))
print("Pred-relevant (CB/Wage):", int(df["EVENT_PRED_CB"].eq(1).sum()))


Pred-relevant (ML all): 2639
Pred-relevant (CB/Wage): 955


In [None]:
# ----------------------------
# Cell S2 (updated) — Parse Turkish publication date into PUB_DATE
# ----------------------------

# Turkish month name -> month number. ASCII spellings are listed as well
# because str.lower() turns a capital "I" into "i" (e.g. "KASIM" -> "kasim").
MONTHS_TR = {
    "ocak":1, "şubat":2, "subat":2, "mart":3, "nisan":4, "mayıs":5, "mayis":5,
    "haziran":6, "temmuz":7, "ağustos":8, "agustos":8, "eylül":9, "eylul":9,
    "ekim":10, "kasım":11, "kasim":11, "aralık":12, "aralik":12
}

def parse_tr_pub_date(x):
    """Parse a Turkish date string such as "2 Ocak 2024 11:02" into a Timestamp.

    Only the calendar date is kept. Everything from "güncelleme" (update)
    onwards is discarded, so the publication date is the one parsed.

    Returns:
        pd.Timestamp at midnight of the publication day, or pd.NaT when the
        value is missing, has no "<day> <month name> <year>" part, uses an
        unknown month name, or names an impossible calendar date.
    """
    if pd.isna(x):
        return pd.NaT

    s = str(x).lower()

    # Remove everything after "güncelleme"
    s = re.sub(r"güncelleme.*", "", s)

    # Keep only the date part: "10 şubat 2024"
    m = re.search(r"(\d{1,2})\s+([a-zçğıöşü]+)\s+(\d{4})", s)
    if not m:
        return pd.NaT

    day, month_tr, year = m.groups()
    month = MONTHS_TR.get(month_tr)
    if month is None:
        return pd.NaT

    try:
        return pd.Timestamp(year=int(year), month=month, day=int(day))
    except ValueError:
        # e.g. "31 Şubat 2024": treat an impossible date like any other
        # unparsable value instead of aborting the whole column parse.
        return pd.NaT

# Parse the raw `date` strings; rows that cannot be parsed become NaT.
df["PUB_DATE"] = df["date"].apply(parse_tr_pub_date)

# Coverage report, then a side-by-side sample of raw and parsed values.
print("Parsed PUB_DATE:", df["PUB_DATE"].notna().sum(), "/", len(df))
df[["date", "PUB_DATE"]].head(10)


Parsed PUB_DATE: 9167 / 9186


Unnamed: 0,date,PUB_DATE
0,2 Ocak 2024 11:02 — — Güncelleme: 10:13,2024-01-02
1,2 Ocak 2024 04:30,2024-01-02
2,1 Ocak 2024 10:36,2024-01-01
3,1 Ocak 2024 03:00,2024-01-01
4,31 Aralık 2023 23:07,2023-12-31
5,31 Aralık 2023 15:18,2023-12-31
6,31 Aralık 2023 06:34 — — Güncelleme: 1 Ocak 20...,2023-12-31
7,31 Aralık 2023 05:34 — — Güncelleme: 11:21,2023-12-31
8,30 Aralık 2023 17:58,2023-12-30
9,30 Aralık 2023 17:40,2023-12-30


In [None]:
# Known union / party names; each becomes an identity entry in ALIAS2CANON below.
UNION_TERMS = {
 "birleşik metal iş", "metal iş", "türk metal", "türk iş", "hak iş", "disk", "kesk",
    "tüm bel sen", "tüm bel-sen", "birtek sen", "genel iş", "sağlık iş", "petrol iş",
    "tek gıda iş", "emek partisi", "emep", "chp", "akp", "mhp", "birleşik metal"
}
# Person / byline strings.
# NOTE(review): not referenced anywhere else in the visible part of the notebook.
PERSON_TERMS = {
"hilal tok istanbul", "ramis sağlam içerik", "genel başkan özkan atar",
    "bölge temsilcisi hayrettin çakmak", "izbb başkanı cemil tugay", "hilal tok",
    "hasret gültekin kozan"
}
# Substrings that make a candidate key count as noise (compiled into DROP_RE below).
DROP_IF_CONTAINS = [
    "genel başkan", "başkanı", "içerik", "servisi", "şube başkanı",
    "organize sanayi", "sanayi bölgesi", "devlet hastanesi"
]


def normalize_key(s: str) -> str:
  """Return a canonical lookup key for `s`.

  The text is lower-cased, punctuation is removed (letters, digits, underscore
  and whitespace are kept), whitespace runs collapse to a single space and the
  ends are trimmed.

  Punctuation is removed *before* whitespace is collapsed, so "metal - iş" and
  "metal iş" yield the same key and no double or trailing spaces are left where
  a punctuation mark stood between spaces.
  """
  # str.lower() turns "İ" into "i" + U+0307 (combining dot); fold it back to plain "i".
  s = s.lower().replace("i\u0307", "i")
  s = re.sub(r"[^\w\sçğıöşü0-9]", "", s)
  return re.sub(r"\s+", " ", s).strip()

# Alias -> canonical key. Every union term currently maps to its own normalised form.
ALIAS2CANON = {key: key for key in map(normalize_key, UNION_TERMS)}


# Regex matching any noise substring (normalised and escaped); None when the list is empty.
_drop_terms = [re.escape(normalize_key(term)) for term in DROP_IF_CONTAINS if term.strip()]
DROP_RE = None
if _drop_terms:
    DROP_RE = re.compile("|".join(_drop_terms), flags=re.IGNORECASE)


def drop_noise_key(k: str) -> bool:
  """Return True when `k` should be discarded.

  A key is discarded when it is empty, when it normalises to an empty string,
  or when its normalised form contains one of the noise substrings in DROP_RE.
  """
  normalized = normalize_key(k) if k else ""
  if not normalized:
    return True
  if not DROP_RE:
    return False
  return DROP_RE.search(normalized) is not None


def canonicalize_org(k: str) -> str:
  """Normalise `k` and map it through ALIAS2CANON; unknown keys return their normalised form."""
  normalized = normalize_key(k)
  if normalized in ALIAS2CANON:
    return ALIAS2CANON[normalized]
  return normalized

In [None]:
# Phrase-mining settings:
#   CONTENT_CHARS    - leading characters of each article body that are scanned
#   TITLE_BOOST      - count added per title n-gram occurrence (content n-grams add 1)
#   NGRAM_MIN/MAX    - NOTE(review): defined here, but the visible code calls
#                      iter_ngrams with its own defaults (2 and 4)
#   MIN_FREQ         - minimum accumulated count for a phrase to be kept
#   MAX_PHRASES      - cap on the number of kept phrases
CONTENT_CHARS = 900; TITLE_BOOST = 3; NGRAM_MIN,NGRAM_MAX = 2,4; MIN_FREQ=3; MAX_PHRASES=6000
# Tokens dropped before n-grams are built: function words plus generic labour vocabulary.
STOP = set("""ve veya ile için gibi üzere da de ki mi mı mu mü işçi işçileri grev grevi direniş direnişi eylem açıklama basın sendika sendikası işçilerden işçilerin mücadele talep sözleşme toplu iş emekçi emekçiler emekçileri""".split())


def iter_ngrams(tokens,nmin=2,nmax=4):
  """Yield every contiguous n-gram of `tokens` as a space-joined string.

  Sizes run from `nmin` to `nmax` inclusive; all n-grams of one size are
  yielded left to right before the next size starts.
  """
  total = len(tokens)
  for size in range(nmin, nmax + 1):
    yield from (" ".join(tokens[start:start + size]) for start in range(total - size + 1))


def good_phrase(p):
  """Return True when `p` is a usable candidate phrase.

  A phrase qualifies when it has at least 8 characters, at least two
  whitespace-separated tokens, and fewer than `len(tokens) - 1` all-digit tokens.
  """
  toks = p.split()
  digit_tokens = sum(1 for tok in toks if tok.isdigit())
  return len(p) >= 8 and len(toks) >= 2 and digit_tokens < len(toks) - 1


def _keep_token(tok):
  """True for tokens that are not stop words and have at least 3 characters."""
  return tok not in STOP and len(tok) >= 3


# Accumulate n-gram counts over the corpus: content n-grams add 1 per
# occurrence, title n-grams add TITLE_BOOST.
phrase_counts = Counter()
for title, content in zip(df["title"], df["content"]):
  title_tokens = list(filter(_keep_token, normalize_key(title).split()))
  content_tokens = list(filter(_keep_token, normalize_key(content[:CONTENT_CHARS]).split()))
  phrase_counts.update(ng for ng in iter_ngrams(content_tokens) if good_phrase(ng))
  for ng in filter(good_phrase, iter_ngrams(title_tokens)):
    phrase_counts[ng] += TITLE_BOOST


# Keep phrases that reach MIN_FREQ, most frequent first (stable for ties), capped at MAX_PHRASES.
cands = sorted(
  ((phrase, count) for phrase, count in phrase_counts.items() if count >= MIN_FREQ),
  key=lambda pair: pair[1],
  reverse=True,
)
firm_phrases = [phrase for phrase, _ in cands[:MAX_PHRASES]]

In [None]:
# Blank Turkish pipeline with a single rule-based component that tags every
# mined phrase as ORG.
nlp = spacy.blank("tr")
nlp.max_length = 2_000_000
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": phrase} for phrase in firm_phrases])


ORG_MAX_PER_DOC=6; TEXT_TRUNC=4500; BATCH_SIZE=64
# The patterns were mined from normalize_key() output (lower-cased, no
# punctuation) and the ruler matches phrase patterns on exact token text.
# Normalise the texts the same way so capitalised or punctuated mentions in
# the raw text can match too.
texts = [normalize_key(text[:TEXT_TRUNC]) for text in df["text"].tolist()]


# Per document: canonical ORG keys in order of appearance, de-duplicated,
# noise removed, at most ORG_MAX_PER_DOC entries.
org_keys = []
for doc in nlp.pipe(texts, batch_size=BATCH_SIZE):
    out = []
    seen = set()
    for ent in doc.ents:
        if ent.label_ != "ORG" or drop_noise_key(ent.text):
            continue
        key = canonicalize_org(ent.text)
        if not key or key in seen:
            continue
        seen.add(key)
        out.append(key)
        if len(out) >= ORG_MAX_PER_DOC:
            break
    org_keys.append(out)


df["ORG_KEYS"] = org_keys

In [None]:
# Normalised keys that are never accepted as a firm, whatever their length.
BAD_ORG_EXACT = {
  normalize_key(term)
  for term in [
    "disk", "türk iş", "turk is", "hak iş", "kesk",
    "genel başkanı", "genel baskani", "şube başkanı", "sube baskani",
    "emek partisi", "emep", "organize sanayi",
  ]
}


def is_good_org_key(k):
  """Return True when `k` is usable as a firm key.

  The normalised key must be non-empty, not listed in BAD_ORG_EXACT, contain
  at least two tokens and be at least 7 characters long.
  """
  k2 = normalize_key(k)
  return bool(k2) and k2 not in BAD_ORG_EXACT and len(k2.split()) >= 2 and len(k2) >= 7


# Per-article list of keys that pass the filter (order preserved).
df["ORG_KEYS_FILTERED"] = df["ORG_KEYS"].map(lambda keys: list(filter(is_good_org_key, keys)))

In [None]:
# Maximum gap (in days) between two articles for them to be linked.
LINK_WINDOW_DAYS = 14

# Link only among relevant articles (EVENT_RELEVANT == 1) AND having org keys
relevant_mask = df["EVENT_RELEVANT"].fillna(0).astype(int) == 1
has_keys_mask = df["ORG_KEYS_FILTERED"].map(lambda keys: isinstance(keys, list) and len(keys) > 0)

mask = relevant_mask & has_keys_mask

# IMPORTANT: reset_index(drop=False) keeps the original row labels in an
# "index" column so cluster ids can be mapped back to df later.
df_link = df.loc[mask].sort_values(["PUB_DATE"]).reset_index(drop=False)


# Disjoint-set forest over df_link row positions; each row starts as its own root.
parent = list(range(len(df_link)))


def find(x):
  """Return the root of `x`, re-pointing each visited node at its grandparent on the way."""
  node = x
  while True:
    up = parent[node]
    if up == node:
      return node
    parent[node] = parent[up]
    node = parent[node]


def union(a,b):
  """Merge the sets holding `a` and `b`; the root of `a` becomes the root of the merged set."""
  root_a = find(a)
  root_b = find(b)
  if root_a != root_b:
    parent[root_b] = root_a


# Inverted index: organisation key -> positions (in df_link) of articles mentioning it.
rows_by_org = defaultdict(list)
for i, ks in enumerate(df_link["ORG_KEYS_FILTERED"]):
  for k in ks:
    rows_by_org[k].append(i)


# For each organisation, walk its articles in date order and merge neighbours
# published within LINK_WINDOW_DAYS of each other. Undated articles sort first
# and are never linked.
pub = pd.to_datetime(df_link["PUB_DATE"], errors="coerce")
for k, idxs in rows_by_org.items():
  idxs = sorted(idxs, key=lambda i: pub.iloc[i] if pd.notna(pub.iloc[i]) else pd.Timestamp.min)
  for a, b in zip(idxs, idxs[1:]):
    da, db = pub.iloc[a], pub.iloc[b]
    if pd.isna(da) or pd.isna(db):
      continue
    if abs((db - da).days) <= LINK_WINDOW_DAYS:
      union(a, b)


# Number the clusters 1..K in order of first appearance.
# dict.fromkeys is an order-preserving de-duplication; pd.unique() on a plain
# Python list is deprecated and emits a FutureWarning.
cid = [find(i) for i in range(len(df_link))]
uniq = {c: i + 1 for i, c in enumerate(dict.fromkeys(cid))}
df_link["EVENT_ID_FIRM"] = [uniq[c] for c in cid]

  uniq={c:i+1 for i,c in enumerate(pd.unique(cid))}


In [None]:
df_out = df.copy()
df_out["EVENT_ID_FIRM"] = np.nan

# Write the cluster ids computed on df_link back onto the full frame.
# df_link["index"] holds the original df row labels (kept by
# reset_index(drop=False)). Without this step EVENT_ID_FIRM stays all-NaN and
# every relevant article is treated as an unlinked singleton.
# Assumes df's index labels are unique.
df_out.loc[df_link["index"].to_numpy(), "EVENT_ID_FIRM"] = df_link["EVENT_ID_FIRM"].to_numpy()

# ----------------------------
# Final event id for ALL relevant articles:
# - if linked, use EVENT_ID_FIRM
# - else assign a unique singleton id
# ----------------------------

rel = df_out["EVENT_RELEVANT"].fillna(0).astype(int).eq(1)

# Start EVENT_ID_FINAL as the linked id (NaN for rows without a firm cluster)
df_out["EVENT_ID_FINAL"] = df_out["EVENT_ID_FIRM"]

# Find relevant rows that still have no event id (no ORG link)
missing = rel & df_out["EVENT_ID_FINAL"].isna()

# Choose next ids after the max linked id (0 when nothing is linked)
max_id = int(pd.to_numeric(df_out["EVENT_ID_FINAL"], errors="coerce").max()) if df_out["EVENT_ID_FINAL"].notna().any() else 0

n_missing = int(missing.sum())
df_out.loc[missing, "EVENT_ID_FINAL"] = np.arange(max_id + 1, max_id + 1 + n_missing)

print("Relevant articles:", int(rel.sum()))
print("Linked (firm) articles:", int((rel & df_out["EVENT_ID_FIRM"].notna()).sum()))
print("Singleton (unlinked) articles:", n_missing)
print("Total events (linked clusters + singletons):", int(df_out.loc[rel, "EVENT_ID_FINAL"].nunique()))


Assigned singleton EVENT_ID_FIRM to: 0 rows
Total firm-level events (including singletons): 76


Unnamed: 0_level_0,count
EVENT_ID_FIRM,Unnamed: 1_level_1
59,18
8,14
11,11
44,8
23,4
53,4
58,4
36,4
28,4
29,4


In [None]:
# Quick check of firm-event coverage on the rows flagged as CB/Wage.
mask_cb = df_out["EVENT_PRED_CB"].fillna(0).astype(int) == 1
cb_rows = df_out.loc[mask_cb]

print("CB/Wage articles:", int(mask_cb.sum()))
print("Unique firm-level events:", int(cb_rows["EVENT_ID_FIRM"].nunique()))
print("Rows missing EVENT_ID_FIRM:", int(cb_rows["EVENT_ID_FIRM"].isna().sum()))

# Distribution of articles per firm-level event within the CB/Wage subset.
cb_rows.groupby("EVENT_ID_FIRM").size().describe()


CB/Wage articles: 955
Unique firm-level events: 37
Rows missing EVENT_ID_FIRM: 897


Unnamed: 0,0
count,37.0
mean,1.567568
std,1.38525
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,7.0


In [None]:
# Mark relevant rows with no firm id
rel = df_out["EVENT_RELEVANT"].fillna(0).astype(int).eq(1)
no_firm = df_out["EVENT_ID_FIRM"].isna()

# The column holds floats up to here. Cast it to object explicitly before
# storing a string marker: assigning a str into a float64 column relies on
# silent upcasting, which pandas has deprecated (FutureWarning).
df_out["EVENT_ID_FIRM"] = df_out["EVENT_ID_FIRM"].astype(object)
df_out.loc[rel & no_firm, "EVENT_ID_FIRM"] = "FIRM_UNK"


  df_out.loc[rel & no_firm, "EVENT_ID_FIRM"] = "FIRM_UNK"


In [None]:
from openpyxl import Workbook
import pandas as pd

# ----------------------------
# Export firm-level strike events to Excel for manual checking
# Only exports relevant articles (EVENT_RELEVANT == 1)
# ----------------------------

# Rows to export: manually labelled relevant AND carrying a firm-level event id.
mask = (
    df_out["EVENT_RELEVANT"].fillna(0).astype(int).eq(1)
    & df_out["EVENT_ID_FIRM"].notna()
)


def list_to_str(x):
    """Turn a cell value into plain text.

    Lists become their items joined with "; ", missing scalars become "",
    and anything else is passed through `str`.
    """
    if not isinstance(x, list):
        return "" if pd.isna(x) else str(x)
    return "; ".join(map(str, x))

def _write_sheet(ws, frame):
    """Append a header row, then every row of `frame` with each cell stringified via list_to_str."""
    ws.append(list(frame.columns))
    for _, record in frame.iterrows():
        ws.append([list_to_str(value) for value in record.tolist()])


# ---- Sheet 1: firm-level events ----
# One row per EVENT_ID_FIRM: first/last publication date, inclusive span in
# days ("" when the group has no dates), article count and the union of firm keys.
events_firm = (
    df_out.loc[mask]
    .groupby("EVENT_ID_FIRM")
    .agg(
        start=("PUB_DATE", "min"),
        end=("PUB_DATE", "max"),
        duration=("PUB_DATE", lambda x: (x.max()-x.min()).days + 1 if x.notna().any() else ""),
        n_articles=("title", "count"),
        firms=("ORG_KEYS_FILTERED", lambda x: "; ".join(sorted({k for ks in x for k in (ks or [])}))),
    )
    .reset_index()
)

events_firm["start"] = pd.to_datetime(events_firm["start"], errors="coerce")
events_firm["end"] = pd.to_datetime(events_firm["end"], errors="coerce")

wb = Workbook()
ws1 = wb.active
ws1.title = "Firm_Level_Strikes"
_write_sheet(ws1, events_firm)

# ---- Sheet 2: article-level mapping ----
ws2 = wb.create_sheet("Articles_By_Firm_Event")
cols = ["EVENT_ID_FIRM","PUB_DATE","title","ORG_KEYS_FILTERED","EVENT_ID","EVENT_RELEVANT"]

tmp = df_out.loc[mask, cols].copy()
tmp["PUB_DATE"] = pd.to_datetime(tmp["PUB_DATE"], errors="coerce")
tmp["ORG_KEYS_FILTERED"] = tmp["ORG_KEYS_FILTERED"].apply(list_to_str)
_write_sheet(ws2, tmp)

# Save
path = "firm_level_strikes_6.xlsx"
wb.save(path)
print("Saved:", path)


Saved: firm_level_strikes_6.xlsx
