In [2]:
import torch
torch.cuda.is_available()


True

In [3]:
from google.colab import files
uploaded = files.upload()


Saving evrensel_isci_sendika_2024_dec2025_clean_fin_uncorrupted_real.xlsx to evrensel_isci_sendika_2024_dec2025_clean_fin_uncorrupted_real (1).xlsx


In [4]:
!pip -q install transformers accelerate evaluate openpyxl scikit-learn pandas numpy torch


In [5]:
# If you haven't installed these in this environment, uncomment
!pip -q install "transformers>=4.38"

import re
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)

set_seed(42)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE


'cuda'

In [7]:
PATH_XLSX = "evrensel_isci_sendika_2024_dec2025_clean_fin_uncorrupted_real.xlsx"  # <-- change this
df = pd.read_excel(PATH_XLSX)

required_cols = ["EVENT_RELEVANT", "EVENT_ID", "title", "content","date"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in XLSX: {missing}")

df.shape, df.columns.tolist()


((9186, 15),
 ['title',
  'date',
  'link',
  'content',
  'EVENT_RELEVANT',
  'EVENT_ID',
  'Unnamed: 6',
  'error',
  'Unnamed: 8',
  'Unnamed: 9',
  'Unnamed: 10',
  'Unnamed: 11',
  'Unnamed: 12',
  650,
  160])

In [8]:
def normalize_text(x):
    """Light cleanup: keep Turkish characters, remove weird spaces, collapse whitespace."""
    if pd.isna(x):
        return ""
    x = str(x).replace("\u00A0", " ")  # non-breaking space
    x = re.sub(r"\s+", " ", x).strip()
    return x

df["title"] = df["title"].apply(normalize_text)
df["content"] = df["content"].apply(normalize_text)


# final text fed into the model
df["text"] = (df["title"].fillna("") + "\n\n" + df["content"].fillna("")).astype(str)

df[["title", "content", "text"]].head(2)


Unnamed: 0,title,content,text
0,Bartın'da Hema'ya ait maden ocağında vagonları...,Bartın'ın Amasra ilçesindeki Hema Enerji şirke...,Bartın'da Hema'ya ait maden ocağında vagonları...
1,Bu soygun düzeni değişmeli,Pendik Marmara Eğitim ve Araştırma Hastanesind...,Bu soygun düzeni değişmeli\n\nPendik Marmara E...


In [9]:
# ----------------------------
# Cell 4 (fixed) — Define labeled rows (accept 0/1 as floats OR strings)
# ----------------------------
import numpy as np
import pandas as pd

def normalize_label(x):
    if pd.isna(x):
        return np.nan

    # float/integer case (your current situation: 0.0 / 1.0)
    if isinstance(x, (int, np.integer, float, np.floating)):
        if x == 0 or x == 0.0:
            return 0
        if x == 1 or x == 1.0:
            return 1
        return np.nan

    # string case (future-proof)
    s = str(x).strip()
    if s == "0":
        return 0
    if s == "1":
        return 1
    if s == "0.0":
        return 0
    if s == "1.0":
        return 1

    return np.nan

df["LABEL_CLEAN"] = df["EVENT_RELEVANT"].apply(normalize_label)
labeled_mask = df["LABEL_CLEAN"].notna()

print("Labeled rows:", labeled_mask.sum())
print(df.loc[labeled_mask, "LABEL_CLEAN"].value_counts())


Labeled rows: 738
LABEL_CLEAN
0.0    578
1.0    160
Name: count, dtype: int64


In [10]:
# ----------------------------
# Cell 5 — Train/validation split (using LABEL_CLEAN from Cell 4)
# ----------------------------

# Keep only labeled rows (LABEL_CLEAN is 0/1, NaN otherwise)
df_labeled = df.loc[labeled_mask].copy()

# This is what the Trainer will learn on
df_labeled["label"] = df_labeled["LABEL_CLEAN"].astype(int)

# Stratified split so class balance is preserved in train/val
train_df, val_df = train_test_split(
    df_labeled,
    test_size=0.20,
    random_state=42,
    stratify=df_labeled["label"],
)

print("Train:", len(train_df), "Val:", len(val_df))
print("Train label distribution:\n", train_df["label"].value_counts())
print("Val label distribution:\n", val_df["label"].value_counts())
print("Train positive rate:", train_df["label"].mean())
print("Val positive rate:", val_df["label"].mean())


Train: 590 Val: 148
Train label distribution:
 label
0    462
1    128
Name: count, dtype: int64
Val label distribution:
 label
0    116
1     32
Name: count, dtype: int64
Train positive rate: 0.21694915254237288
Val positive rate: 0.21621621621621623


In [11]:
# Inspect what EVENT_RELEVANCE really looks like
s = df["EVENT_RELEVANT"]

print("dtype:", s.dtype)
print("non-null count:", s.notna().sum())

# show a sample of unique raw values (as-is)
u = s.dropna().unique()
print("unique values sample (up to 50):", u[:50])

# show stringified + stripped sample too
u_str = pd.Series(u).astype(str).str.strip()
print("stringified sample (up to 50):", u_str.head(50).tolist())


dtype: float64
non-null count: 738
unique values sample (up to 50): [0. 1.]
stringified sample (up to 50): ['0.0', '1.0']


In [12]:
MODEL_NAME = "dbmdz/bert-base-turkish-cased"  # BERTurk
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 384  # 512 is allowed but slower

class TextClsDataset(Dataset):
    def __init__(self, texts, labels=None, tokenizer=None, max_len=384):
        self.texts = list(texts)
        self.labels = None if labels is None else list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors=None,
        )
        if self.labels is not None:
            enc["labels"] = int(self.labels[idx])
        return enc

train_ds = TextClsDataset(train_df["text"], train_df["label"], tokenizer, MAX_LEN)
val_ds   = TextClsDataset(val_df["text"],   val_df["label"],   tokenizer, MAX_LEN)

collator = DataCollatorWithPadding(tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(DEVICE)

pos = int(train_df["label"].sum())
neg = int(len(train_df) - pos)

# More weight to positive class if positives are rare
class_weights = torch.tensor([1.0, (neg / max(pos, 1))], dtype=torch.float, device=DEVICE)
class_weights


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([1.0000, 3.6094], device='cuda:0')

In [14]:
# ----------------------------
# Cell 8 (updated) — Trainer setup + fine-tune (version compatible)
# ----------------------------
import inspect

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()[:, 1]
    return {
        "roc_auc": float(roc_auc_score(labels, probs)) if len(np.unique(labels)) > 1 else float("nan")
    }

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        **kwargs absorbs version-specific args like num_items_in_batch
        """
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# Build TrainingArguments with only supported params
ta_params = inspect.signature(TrainingArguments.__init__).parameters

args_dict = dict(
    output_dir="./berturk_event_detector",
    learning_rate=2e-5,
    per_device_train_batch_size=8 if DEVICE == "cuda" else 4,
    per_device_eval_batch_size=16 if DEVICE == "cuda" else 8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Strategy naming differs by transformers version
if "evaluation_strategy" in ta_params:
    args_dict["evaluation_strategy"] = "epoch"
elif "eval_strategy" in ta_params:
    args_dict["eval_strategy"] = "epoch"

if "save_strategy" in ta_params:
    args_dict["save_strategy"] = "epoch"

if "logging_strategy" in ta_params:
    args_dict["logging_strategy"] = "steps"

if "logging_steps" in ta_params:
    args_dict["logging_steps"] = 50

if "load_best_model_at_end" in ta_params:
    args_dict["load_best_model_at_end"] = True

if "metric_for_best_model" in ta_params:
    args_dict["metric_for_best_model"] = "roc_auc"

if "greater_is_better" in ta_params:
    args_dict["greater_is_better"] = True

if "fp16" in ta_params:
    args_dict["fp16"] = True if DEVICE == "cuda" else False

if "report_to" in ta_params:
    args_dict["report_to"] = "none"

training_args = TrainingArguments(**args_dict)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = WeightedTrainer(


Epoch,Training Loss,Validation Loss,Roc Auc
1,0.7285,0.693396,0.763605
2,0.6908,0.492232,0.904903
3,0.4029,0.404981,0.938712


TrainOutput(global_step=222, training_loss=0.573586773228001, metrics={'train_runtime': 141.9868, 'train_samples_per_second': 12.466, 'train_steps_per_second': 1.564, 'total_flos': 349279925990400.0, 'train_loss': 0.573586773228001, 'epoch': 3.0})

In [15]:
val_out = trainer.predict(val_ds)
val_logits = val_out.predictions
val_labels = val_out.label_ids

val_probs = torch.softmax(torch.tensor(val_logits), dim=-1).numpy()[:, 1]
val_preds = (val_probs >= 0.5).astype(int)

print("ROC-AUC:", roc_auc_score(val_labels, val_probs) if len(np.unique(val_labels)) > 1 else "NA")
print(classification_report(val_labels, val_preds, digits=3))


ROC-AUC: 0.9387122844827587
              precision    recall  f1-score   support

           0      0.933     0.957     0.945       116
           1      0.828     0.750     0.787        32

    accuracy                          0.912       148
   macro avg      0.880     0.853     0.866       148
weighted avg      0.910     0.912     0.911       148



In [22]:
# If needed
!pip -q install spacy


In [23]:
# ----------------------------
# Cell S2 (updated) — Parse Turkish publication date into PUB_DATE
# ----------------------------

MONTHS_TR = {
    "ocak":1, "şubat":2, "subat":2, "mart":3, "nisan":4, "mayıs":5, "mayis":5,
    "haziran":6, "temmuz":7, "ağustos":8, "agustos":8, "eylül":9, "eylul":9,
    "ekim":10, "kasım":11, "kasim":11, "aralık":12, "aralik":12
}

def parse_tr_pub_date(x):
    if pd.isna(x):
        return pd.NaT

    s = str(x).lower()

    # Remove everything after "güncelleme"
    s = re.sub(r"güncelleme.*", "", s)

    # Keep only the date part: "10 şubat 2024"
    m = re.search(r"(\d{1,2})\s+([a-zçğıöşü]+)\s+(\d{4})", s)
    if not m:
        return pd.NaT

    day, month_tr, year = m.groups()
    month = MONTHS_TR.get(month_tr, None)
    if month is None:
        return pd.NaT

    return pd.Timestamp(year=int(year), month=month, day=int(day))

df["PUB_DATE"] = df["date"].apply(parse_tr_pub_date)

print("Parsed PUB_DATE:", df["PUB_DATE"].notna().sum(), "/", len(df))
df[["date", "PUB_DATE"]].head(10)


Parsed PUB_DATE: 9167 / 9186


Unnamed: 0,date,PUB_DATE
0,2 Ocak 2024 11:02 — — Güncelleme: 10:13,2024-01-02
1,2 Ocak 2024 04:30,2024-01-02
2,1 Ocak 2024 10:36,2024-01-01
3,1 Ocak 2024 03:00,2024-01-01
4,31 Aralık 2023 23:07,2023-12-31
5,31 Aralık 2023 15:18,2023-12-31
6,31 Aralık 2023 06:34 — — Güncelleme: 1 Ocak 20...,2023-12-31
7,31 Aralık 2023 05:34 — — Güncelleme: 11:21,2023-12-31
8,30 Aralık 2023 17:58,2023-12-30
9,30 Aralık 2023 17:40,2023-12-30


In [24]:
from collections import Counter

# You can tune these
CONTENT_CHARS = 800  # only scan first N chars of content for speed + relevance
MIN_FREQ = 3         # keep phrases that appear at least this many times
MAX_PHRASES = 5000   # cap to avoid huge ruler
NGRAM_MIN = 2
NGRAM_MAX = 4

# Light stopwords (add more if needed)
STOP = set("""
ve veya ile için gibi üzere da de ki mi mı mu mü
işçi işçileri grev grevi direniş direnişi eylem açıklama basın
sendika sendikası işçilerden işçilerin mücadele talep sözleşme toplu iş emekçi
emekçiler emekçileri örgütlü ama fakat lakin çünkü işçiler direnişe
""".split())

def clean_for_phrase_mining(text: str) -> str:
    text = (text or "")
    text = text.replace("\u00A0", " ")
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize_simple(text: str):
    # Keep Turkish letters; split on non-letters/digits
    toks = re.findall(r"[A-Za-zÇĞİÖŞÜçğıöşü0-9]+", text)
    return toks

def is_titlecase_like(tok: str) -> bool:
    # Accept tokens that look like proper nouns in Turkish news:
    # - Starts uppercase + has lowercase later, OR
    # - ALLCAPS acronyms length>=2 (MESS, BMİS, vs.)
    if len(tok) < 2:
        return False
    if tok.isupper() and len(tok) >= 2:
        return True
    return tok[0].isupper() and any(c.islower() for c in tok[1:])

phrase_counts = Counter()

for t, c in zip(df["title"].astype(str), df["content"].astype(str)):
    t = clean_for_phrase_mining(t)
    c = clean_for_phrase_mining(c)[:CONTENT_CHARS]
    text = f"{t} {c}"

    toks = tokenize_simple(text)

    # mark which tokens look like name-like tokens
    flags = [is_titlecase_like(tok) for tok in toks]

    # collect contiguous spans of titlecase-like tokens
    i = 0
    while i < len(toks):
        if not flags[i]:
            i += 1
            continue
        j = i
        while j < len(toks) and flags[j]:
            j += 1

        span = toks[i:j]  # consecutive titlecase-like tokens
        # generate ngrams within span
        for n in range(NGRAM_MIN, NGRAM_MAX + 1):
            for k in range(0, len(span) - n + 1):
                ng = span[k:k+n]
                ng_l = [w.lower() for w in ng]

                # filter: avoid phrases that are mostly stopwords or too generic
                if any(w in STOP for w in ng_l):
                    continue
                if all(w.isdigit() for w in ng):
                    continue

                phrase = " ".join(ng)  # keep original casing for ruler
                phrase_counts[phrase] += 1

        i = j

# filter by frequency
candidates = [(p, cnt) for p, cnt in phrase_counts.items() if cnt >= MIN_FREQ]

# sort: most frequent first
candidates.sort(key=lambda x: x[1], reverse=True)

print("Candidate phrases (freq>=MIN_FREQ):", len(candidates))
print("Top 30:")
for p, cnt in candidates[:30]:
    print(cnt, " - ", p)

# keep capped list for EntityRuler
firm_phrases = [p for p, cnt in candidates[:MAX_PHRASES]]
len(firm_phrases)


Candidate phrases (freq>=MIN_FREQ): 11200
Top 30:
987  -  Emek Partisi
958  -  Türk İş
902  -  Genel Başkanı
816  -  Genel İş
814  -  Şube Başkanı
620  -  Birleşik Metal
606  -  Metal İş
601  -  Birleşik Metal İş
554  -  Organize Sanayi
516  -  BİRTEK SEN
489  -  Petrol İş
416  -  Sağlık İş
405  -  Partisi EMEP
403  -  Emek Partisi EMEP
403  -  Sanayi Bölgesi
400  -  Sosyal Güvenlik
396  -  Organize Sanayi Bölgesi
368  -  Devlet Hastanesi
367  -  DİSK Genel
365  -  Büyükşehir Belediyesi
344  -  İş İzmir
342  -  İzmir Büyükşehir
339  -  İş Genel
331  -  Tüm Bel
329  -  Hak İş
328  -  Bel Sen
328  -  Tüm Bel Sen
327  -  Türk Metal
295  -  Andaç Aydın
293  -  Genel Başkan


5000

In [29]:
# ----------------------------
# Filter ORG_KEYS: remove union/confed/general-institution anchors
# ----------------------------

BAD_ORG = set([
    "disk", "dsk", "dİsk", "türk iş", "turk is", "hak iş", "kesk",
    "genel başkanı", "genel baskani", "genel başkan", "sube baskani", "şube başkanı",
    "emek partisi", "emep", "sosyal guvenlik", "devlet hastanesi",
    "organize sanayi",
    "belediyesi",
    "iş", "genel iş", "metal iş", "petrol iş", "sağlık iş", "türk metal", "hak iş",
    "bel sen", "andaç aydın", "emep", "partisi emep", "sağlık iş"
])

# Also remove keys that are too short/generic (1 token or very short)
def is_good_org_key(k: str) -> bool:
    if not k:
        return False
    k2 = k.strip().lower()
    if k2 in BAD_ORG:
        return False
    toks = k2.split()
    if len(toks) < 2:          # single-token org keys are usually too broad (e.g., "disk")
        return False
    if len(k2) < 7:
        return False
    return True

df["ORG_KEYS_FILTERED"] = df["ORG_KEYS"].apply(lambda ks: [k for k in ks if is_good_org_key(k)])

print("Docs with >=1 ORG_KEY before:", (df["ORG_KEYS"].apply(len) > 0).sum())
print("Docs with >=1 ORG_KEY after :", (df["ORG_KEYS_FILTERED"].apply(len) > 0).sum())

# See top filtered keys
from collections import Counter
cnt = Counter()
for ks in df["ORG_KEYS_FILTERED"]:
    for k in ks:
        cnt[k] += 1
print("Top 30 filtered:")
for k, v in cnt.most_common(30):
    print(v, "-", k)


Docs with >=1 ORG_KEY before: 7947
Docs with >=1 ORG_KEY after : 7872
Top 30 filtered:
303 - birleşik metal
209 - andaç aydın arıduru içerik
201 - nazlıer içerik
171 - volkan pekal içerik
162 - iş izmir
160 - sosyal güvenlik bakanı vedat
157 - ismail cem şimşek içerik
155 - özer akdemir içerik
149 - tüm bel
148 - acil sağlık
115 - devlet hastanesine
112 - ramis sağlam içerik
95 - işçi sendika servisi
92 - belediye başkanı
91 - duygu ayber gültekin içerik
91 - izmir büyükşehir belediyesi
90 - zeliha irmak içerik
89 - sen genel başkanı mehmet
87 - işçi sağlığı
85 - istanbul milletvekili iskender bayhan
85 - dem parti
79 - gaziantep milletvekili sevda karaca
78 - dayanışma günü
76 - eğitim sen
74 - sen izmir
74 - dudu selçuk içerik
73 - deri işçileri
70 - erciyes üniversitesinden
69 - iskender bayhan
69 - izmir büyükşehir


In [30]:
import spacy
from spacy.pipeline import EntityRuler

# We don't need a pretrained tr model; a blank pipeline is enough for rule-based entity matching
nlp = spacy.blank("tr")

ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": p} for p in firm_phrases]
ruler.add_patterns(patterns)

# quick test: pick a few rows and see entities
test_text = df["title"].iloc[0] + " " + df["content"].iloc[0][:400]
doc = nlp(test_text)
[(ent.text, ent.label_) for ent in doc.ents][:20]


[]

In [31]:
def normalize_org(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^\w\sçğıöşü0-9]", "", s, flags=re.UNICODE)
    return s

ORG_MAX_PER_DOC = 5

org_keys = []
for t, c in zip(df["title"].astype(str), df["content"].astype(str)):
    text = (t + " " + c[:CONTENT_CHARS]).strip()
    doc = nlp(text)
    ents = [normalize_org(ent.text) for ent in doc.ents if ent.label_ == "ORG"]
    # keep unique in order
    seen = set()
    uniq = []
    for e in ents:
        if e and e not in seen:
            seen.add(e)
            uniq.append(e)
        if len(uniq) >= ORG_MAX_PER_DOC:
            break
    org_keys.append(uniq)

df["ORG_KEYS"] = org_keys

# sanity checks
print("Docs with >=1 ORG_KEY:", (df["ORG_KEYS"].apply(len) > 0).sum(), "/", len(df))
df[["title", "ORG_KEYS"]].head(15)


Docs with >=1 ORG_KEY: 7947 / 9186


Unnamed: 0,title,ORG_KEYS
0,Bartın'da Hema'ya ait maden ocağında vagonları...,"[acil sağlık, bartın devlet, bilkent şehir]"
1,Bu soygun düzeni değişmeli,[]
2,Gündüz sanayide gece ringde,[teknik üniversitesi]
3,"""Harcama, bağışla!""",[]
4,Ankara Üniversitesi Tıp Fakültesinin 72 işçisi...,"[ankara üniversitesi, üniversitesi tıp fakülte..."
5,İsveç’teki Tesla grevi uluslararası bir mücade...,[]
6,2024’ten beklentimiz insanca yaşamak,"[urfa organize sanayi, öz iplik, özak tekstil]"
7,"""Bir akşam güzel bir yemeğe ayıracak paramız y...",[kübra kirimli ankara]
8,Urfa'da göçük altında kalan bir işçi kurtarıldı,"[mehmet akif inan eğitim, araştırma hastanesine]"
9,Antep’te inşaatın iskelesi çöktü; 2 işçi yaralı,"[acil çağrı merkezi, volkan pekal içerik]"


In [47]:
# ----------------------------
# Cell: Narrow to Collective Bargaining / Wage-related strikes
# ----------------------------

KEEP_PATTERNS = [
    r"\btoplu sözleşme\b", r"\btis\b", r"\btoplu iş sözleşmesi\b",
    r"\bücret\b", r"\bzam\b", r"\bmaaş\b", r"\bücret artış\b",
    r"\bpazarlık\b", r"\bgörüşme\b", r"\bmüzakere\b",
    r"\bgrev\b", r"\bgrevde\b", r"\bgreve çıktı\b",
    r"\bsözleşme süreci\b", r"\bsözleşme görüşmeleri\b", r"\bdüşük ücret\b"
]

DROP_PATTERNS = [
    r"\bişten çıkar", r"\bişten at", r"\bişten çıkarıl", r"\bkovuldu\b",
    r"\bsendikalaş", r"\bsendika üye", r"\bsendika üyeli",
    r"\biş kaz", r"\bölüm\b.*\b(ölüm|yaralı)\b", r"\bgöçük\b",
    r"\bgözalt", r"\btutuk", r"\bdava\b", r"\bmahkeme\b",
    r"\bziyaret\b", r"\bdayanışma\b", r"\banma\b", r"\bbasın açıklama\b",
    r"\bsendikalaşma\b"
]

keep_re = re.compile("|".join(KEEP_PATTERNS), flags=re.IGNORECASE)
drop_re = re.compile("|".join(DROP_PATTERNS), flags=re.IGNORECASE)

def is_cb_wage_article(title, content):
    text = f"{title} {content}"
    if drop_re.search(text):
        return False
    return bool(keep_re.search(text))

df["IS_CB_WAGE"] = df.apply(lambda r: is_cb_wage_article(r["title"], r["content"]), axis=1)

# Apply gate only to predicted relevant
mask_rel = df["EVENT_PRED"] == 1
df["EVENT_PRED_CB"] = ((df["EVENT_PRED"] == 1) & (df["IS_CB_WAGE"])).astype(int)

print("Pred-relevant (all):", int((df["EVENT_PRED"]==1).sum()))
print("Pred-relevant (CB/Wage):", int((df["EVENT_PRED_CB"]==1).sum()))


Pred-relevant (all): 1113
Pred-relevant (CB/Wage): 445


In [52]:
from collections import Counter
import re

def title_tokens(title):
    return re.findall(r"[A-Za-zÇĞİÖŞÜçğıöşü]+", str(title).lower())

mask_cb = df["EVENT_PRED_CB"] == 1
N = int(mask_cb.sum())

dfreq = Counter()
for t in df.loc[mask_cb, "title"]:
    seen = set(w for w in title_tokens(t) if len(w) >= 4)
    for w in seen:
        dfreq[w] += 1

# keep tokens that appear in <= 5% of CB titles
RARE_MAX_FRAC = 0.05
rare_tokens = {w for w,c in dfreq.items() if c / N <= RARE_MAX_FRAC}

print("CB titles:", N)
print("Rare tokens kept:", len(rare_tokens))
print("Examples:", list(sorted(list(rare_tokens)))[:30])


CB titles: 445
Rare tokens kept: 1192
Examples: ['adalet', 'adaletin', 'adapazarı', 'adayı', 'adım', 'aileleri', 'aktif', 'akçul', 'alacak', 'alacakları', 'alacağız', 'alalım', 'alamayız', 'alana', 'alanlarda', 'alanı', 'alanına', 'aldı', 'aldım', 'aldığımız', 'alka', 'alkışlarla', 'almanya', 'altına', 'altında', 'altınyıldız', 'altını', 'alındı', 'alınmadan', 'alınmalı']


In [58]:
# ----------------------------
# Cell: Hybrid linking (CB/Wage subset) with strict identity gating
# Uses EVENT_PRED_CB == 1 instead of EVENT_PRED
# ----------------------------

import re
import numpy as np
from collections import defaultdict

def rare_title_tokens(title, min_len=4):
    toks = re.findall(r"[A-Za-zÇĞİÖŞÜçğıöşü]+", str(title).lower())
    return {t for t in toks if len(t) >= min_len and t in rare_tokens}


def assign_event_ids_hybrid(
    df_in,
    rel_flag_col="EVENT_PRED_CB",   # <-- IMPORTANT: which rows to link
    date_col="PUB_DATE",
    org_col="ORG_KEYS_FILTERED",
    sim_short=0.92,
    sim_org=0.95,
    max_rel=6000,
    ORG_MAX_BUCKET=30,
    TITLE_OVERLAP_K=2
):
    df_out = df_in.copy()

    # Normalize EVENT_ID
    df_out["EVENT_ID"] = df_out["EVENT_ID"].where(df_out["EVENT_ID"].notna(), np.nan)
    df_out["EVENT_ID"] = df_out["EVENT_ID"].apply(lambda x: str(x).strip() if not pd.isna(x) else np.nan)

    # Select only the subset we want to link
    if rel_flag_col not in df_out.columns:
        raise ValueError(f"Missing column {rel_flag_col}. Did you create EVENT_PRED_CB first?")

    rel = df_out[df_out[rel_flag_col] == 1].copy()
    if len(rel) == 0:
        print(f"No rows where {rel_flag_col} == 1. Nothing to link.")
        return df_out

    if len(rel) > max_rel:
        print(f"Too many relevant rows ({len(rel)}). Increase threshold or lower corpus slice.")
        return df_out

    # Stable ordering
    rel["_has_date"] = rel[date_col].notna()
    rel = rel.sort_values(by=[date_col, "_has_date"], ascending=[True, False])

    idx = list(rel.index)
    n = len(idx)

    # Embeddings once
    texts = rel["text"].tolist()
    E = encode_texts(texts, batch_size=32 if DEVICE == "cuda" else 8, max_len=256)

    # Precompute title token sets
    title_tok_sets = [rare_title_tokens(t) for t in rel["title"].tolist()]

    # Inverted index for ORG keys
    org_to_pos = defaultdict(list)
    rel_org_lists = rel[org_col].tolist() if org_col in rel.columns else [None] * n
    for pos, keys in enumerate(rel_org_lists):
        for k in (keys or []):
            if k:
                org_to_pos[k].append(pos)

    edges = set()

    # --- A) ORG-candidate links: require BOTH high sim AND title overlap ---
    for org, positions in org_to_pos.items():
        if len(positions) <= 1:
            continue
        if len(positions) > ORG_MAX_BUCKET:
            continue

        for a_i in range(len(positions)):
            i = positions[a_i]
            for a_j in range(a_i + 1, len(positions)):
                j = positions[a_j]

                sim = float(np.dot(E[i], E[j]))
                if sim >= sim_org:
                    if len(title_tok_sets[i] & title_tok_sets[j]) >= TITLE_OVERLAP_K:
                        edges.add((min(i, j), max(i, j)))

    # --- B) Non-ORG links: require high sim AND title overlap ---
    for i in range(n):
        for j in range(i + 1, n):
            sim = float(np.dot(E[i], E[j]))
            if sim >= sim_short:
                if len(title_tok_sets[i] & title_tok_sets[j]) >= TITLE_OVERLAP_K:
                    edges.add((i, j))

    comps = connected_components(n, list(edges))
    clusters = [[idx[pos] for pos in comp] for comp in comps]

    # Assign EVENT_IDs
    new_counter = 1
    for cluster in clusters:
        existing = df_out.loc[cluster, "EVENT_ID"].dropna()
        if len(existing) > 0:
            chosen = existing.value_counts().idxmax()
        else:
            chosen = f"EV{new_counter:06d}"
            new_counter += 1
        df_out.loc[cluster, "EVENT_ID"] = chosen

    return df_out


# ---- Run linking on CB/Wage relevant subset ----
df_linked = assign_event_ids_hybrid(
    df,
    rel_flag_col="EVENT_PRED_CB",
    date_col="PUB_DATE",
    org_col="ORG_KEYS_FILTERED",
    sim_short=0.94,
    sim_org=0.97,
    ORG_MAX_BUCKET=30,
    TITLE_OVERLAP_K=2
)

mask_rel = df_linked["EVENT_PRED_CB"] == 1
print("Pred-relevant (CB/Wage):", int(mask_rel.sum()))
print("Unique EVENT_ID among predicted relevant:", int(df_linked.loc[mask_rel, "EVENT_ID"].nunique()))
df_linked.loc[mask_rel].groupby("EVENT_ID").size().describe()


Pred-relevant (CB/Wage): 445
Unique EVENT_ID among predicted relevant: 223


Unnamed: 0,0
count,223.0
mean,1.995516
std,8.350701
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,124.0


In [72]:
import re
import pandas as pd

# ----------------------------
# Rebuild EMPLOYER_KEYS (employer-only) from:
# 1) ORG_KEYS_FILTERED (if present)
# 2) Contextual mining from content/text near CB/strike keywords
#
# This removes:
# - unions/confederations
# - journalists/bylines (e.g., "Hilal Tok", "Eda Aktaş")
# - generic institutions/OSB
# ----------------------------

def _as_list(x):
    """ORG_KEYS_FILTERED may be list or ';'-joined string (Excel export). Convert to list."""
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []
    if isinstance(x, list):
        return [str(i).strip() for i in x if str(i).strip()]
    s = str(x).strip()
    if not s:
        return []
    parts = [p.strip() for p in re.split(r"[;|,]\s*", s) if p.strip()]
    return parts

# --- Strong removal patterns (tune as you discover more) ---
UNION_OR_MEDIA_PAT = re.compile(
    r"(sendika|sendikası|konfederasyon|şube|temsilcilik|"
    r"disk|kesk|türk[-\s]?iş|hak[-\s]?iş|memur[-\s]?sen|"
    r"türk\s*metal|birleşik\s*metal|metal\s*iş|genel\s*iş|"
    r"petrol\s*iş|tek\s*gıda\s*iş|"
    r"emep|chp|akp|mhp|hdp|iyi\s*parti|"
    r"servisi|muhabir|haber\s*merkezi|editör|gazetesi|ajans|"
    r"işçi\s*sendika\s*servisi)",
    flags=re.IGNORECASE
)

CITY_WORDS = {
    "istanbul","ankara","izmir","bursa","kocaeli","gebze","antalya","adana","mersin",
    "manisa","eskisehir","denizli","tekirdag","sakarya"
}

def looks_like_person(s: str) -> bool:
    """
    Heuristic: 2-3 word phrases are often bylines:
      "hilal tok", "eda aktaş izmir"
    """
    t = str(s).strip().lower()
    t = re.sub(r"[^a-zçğıöşü\s]", " ", t)
    toks = [w for w in t.split() if w]
    if len(toks) in (2, 3):
        if toks[-1] in CITY_WORDS:
            toks = toks[:-1]
        if len(toks) == 2:
            return True
    return False

def normalize_key(s: str) -> str:
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    return s.lower()

def is_valid_employer_key(k: str) -> bool:
    s = normalize_key(k)
    if not s:
        return False
    if UNION_OR_MEDIA_PAT.search(s):
        return False
    if looks_like_person(s):
        return False
    if "organize sanayi" in s or "sanayi bölgesi" in s:
        return False
    # drop overly generic single tokens
    if len(s.split()) == 1 and s in {"metal","tekstil","enerji","elektrik","belediye","işçiler","işçi"}:
        return False
    return True

# --- Contextual mining from text near CB/wage/strike keywords ---
CB_CONTEXT = re.compile(
    r"(grev|grevde|greve|iş\s*bırak|toplu\s*sözleşme|tis|ücret|zam|maaş|müzakere|sözleşme)",
    re.IGNORECASE
)

TITLECASE_PHRASE = re.compile(
    r"\b([A-ZÇĞİÖŞÜ][\wçğıöşüÇĞİÖŞÜ’'-]+(?:\s+[A-ZÇĞİÖŞÜ][\wçğıöşüÇĞİÖŞÜ’'-]+){0,3})\b"
)

def extract_employers_from_text(title, content, window_chars=260):
    """
    Look around CB_CONTEXT matches and pull TitleCase phrases.
    Works decently for "Purmo Metal", "Schneider Elektrik", "Özak Tekstil", etc.
    """
    text = f"{title}\n{content}"
    text = "" if text is None else str(text)

    out = []
    for m in CB_CONTEXT.finditer(text):
        start = max(0, m.start() - window_chars)
        end = min(len(text), m.end() + window_chars)
        chunk = text[start:end]

        for pm in TITLECASE_PHRASE.finditer(chunk):
            cand = pm.group(1).strip()

            # remove generic phrases
            if re.search(r"(Toplu|Sözleşme|Ücret|Zam|Grev|İşçi|Sendika|Belediye\s*Başkanı)",
                         cand, re.IGNORECASE):
                continue
            if len(cand) < 4:
                continue
            out.append(cand)

    return out

# Choose content column
if "content" in df_linked.columns:
    content_col = "content"
elif "CONTENT" in df_linked.columns:
    content_col = "CONTENT"
else:
    content_col = "text"  # fallback

# Build EMPLOYER_KEYS
employer_keys = []
for _, r in df_linked.iterrows():
    base = _as_list(r.get("ORG_KEYS_FILTERED", []))

    # mine only for CB/Wage predicted articles
    if int(r.get("EVENT_PRED_CB", 0)) == 1:
        mined = extract_employers_from_text(r.get("title",""), r.get(content_col,""))
    else:
        mined = []

    combined = base + mined

    cleaned = []
    seen = set()
    for k in combined:
        nk = normalize_key(k)
        if nk in seen:
            continue
        if is_valid_employer_key(nk):
            cleaned.append(nk)
            seen.add(nk)

    employer_keys.append(cleaned)

df_linked["EMPLOYER_KEYS"] = employer_keys

print("EMPLOYER_KEYS rebuilt.")
print(df_linked.loc[df_linked["EVENT_PRED_CB"]==1, ["title","ORG_KEYS_FILTERED","EMPLOYER_KEYS"]].head(8))


EMPLOYER_KEYS rebuilt.
                                                 title  \
37   Toplu sözleşme sürecindeki Şişecam işçileri: T...   
273  Birlik olduğumuz sürece kimse hakkımızı yiyeme...   
280  Zafer Tekstil ve Alka Polyester işçileri de iş...   
286  Grevdeki Purmo işçileri: Yaşamak için insanca ...   
287  Grevdeki Mersen işçileri Fransız Konsolosluğu ...   
289  Gürdesan'da işçiler TİS'e tepki gösterdi, üret...   
291  Purmo işçileri: Bedeli ne olursa olsun greve d...   
293    İskenderun Yolbulan Metal’de grev kararı alındı   

                                     ORG_KEYS_FILTERED  \
37                                [hilal tok istanbul]   
273                             [işçi sendika servisi]   
280  [zafer tekstil, alka polyester, mazlum ayçiçek...   
286     [eda aktaş izmir, purmo group, birleşik metal]   
287            [gebze organize sanayi, birleşik metal]   
289                                                 []   
291  [izmir kemalpaşa organize sanayi, purmo gro

In [73]:
from collections import Counter

mask_cb = df_linked["EVENT_PRED_CB"] == 1

# Top employers after filtering/mining
c_emp = Counter()
for ks in df_linked.loc[mask_cb, "EMPLOYER_KEYS"]:
    for k in (ks or []):
        c_emp[k] += 1

print("\nTop 30 EMPLOYER_KEYS:")
for k,v in c_emp.most_common(30):
    print(v, "-", k)

# What got removed from ORG_KEYS_FILTERED
def _as_list_diag(x):
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return []
    if isinstance(x, list):
        return [str(i).strip().lower() for i in x if str(i).strip()]
    s = str(x).strip().lower()
    if not s:
        return []
    return [p.strip() for p in re.split(r"[;|,]\s*", s) if p.strip()]

c_removed = Counter()
for orgs, emps in zip(df_linked.loc[mask_cb, "ORG_KEYS_FILTERED"], df_linked.loc[mask_cb, "EMPLOYER_KEYS"]):
    org_set = set(_as_list_diag(orgs))
    emp_set = set(emps or [])
    for k in (org_set - emp_set):
        if k:
            c_removed[k] += 1

print("\nTop 30 REMOVED keys (ideally unions/people/media):")
for k,v in c_removed.most_common(30):
    print(v, "-", k)

# UNK risk: how many CB/Wage articles still have no employer key
no_emp = df_linked.loc[mask_cb, "EMPLOYER_KEYS"].apply(lambda x: len(x)==0).sum()
print("\nCB/Wage articles with ZERO employer keys:", int(no_emp), "out of", int(mask_cb.sum()))



Top 30 EMPLOYER_KEYS:
101 - evrensel
94 - fotoğraf
75 - bugün
73 - bizim
67 - ancak
50 - bunun
46 - mess
45 - yüzde
43 - daha
41 - şimdi
39 - burada
38 - eylem nazlıer i̇çerik
37 - çünkü
37 - iş izmir
36 - bize
35 - genel-i̇ş i̇zmir
34 - patronun
34 - kamu
33 - bizler
33 - fabrika
33 - genel-i̇ş
33 - sodemsen
32 - ocak
31 - asgari
31 - özer akdemir i̇çerik
31 - andaç aydın arıduru i̇çerik
30 - artık
30 - hitachi
29 - yapılan
29 - patron

Top 30 REMOVED keys (ideally unions/people/media):
75 - birleşik metal
20 - arıtaş kriyojenik
16 - schneider elektrik
15 - hitachi energy
12 - temel conta
9 - yolbulan metal
9 - iş sendikasının
9 - tüm bel
9 - buca belediyesi
8 - iş sözleşmesi
8 - schneider electric
8 - green transfo
7 - grid solutions
6 - işçi sendika servisi
6 - emirhan durmaz
6 - maltepe belediyesi
5 - iş sendikasında
5 - başpınar organize sanayi bölgesinde
5 - emep genel başkanı seyit
5 - murat uysal istanbul
5 - şube başkanı ercan gül
5 - belediye başkanı
5 - karşıyaka belediyesi

In [76]:
import pandas as pd
import re

# --- Edit these lists as you discover new false "firms" ---
UNION_TERMS = {
    "birleşik metal iş", "metal iş", "türk metal", "türk iş", "hak iş", "disk", "kesk",
    "tüm bel sen", "tüm bel-sen", "birtek sen", "genel iş", "sağlık iş", "petrol iş",
    "tek gıda iş", "emek partisi", "emep", "chp", "akp", "mhp", "birleşik metal"
}

# Common journalist / author names you saw (add more as needed)
PERSON_TERMS = {
    "hilal tok istanbul", "ramis sağlam içerik", "genel başkan özkan atar",
    "bölge temsilcisi hayrettin çakmak", "izbb başkanı cemil tugay", "hilal tok",
    "hasret gültekin kozan"
}

# Words that indicate "this is not an employer"
DROP_IF_CONTAINS = [
    "genel başkan", "başkanı", "içerik", "servisi", "şube başkanı",
    "organize sanayi", "sanayi bölgesi", "devlet hastanesi"
]

def is_bad_key(k: str) -> bool:
    s = str(k).strip().lower()
    if not s:
        return True

    # union patterns
    if s in UNION_TERMS:
        return True
    if s.endswith("sen") or s.endswith(" iş") or "sendika" in s:
        return True

    # person patterns
    if s in PERSON_TERMS:
        return True

    # generic patterns
    for w in DROP_IF_CONTAINS:
        if w in s:
            return True

    return False

def keep_employers(keys):
    out = []
    for k in (keys or []):
        if not is_bad_key(k):
            out.append(str(k).strip().lower())
    # de-duplicate while preserving order
    seen = set()
    cleaned = []
    for x in out:
        if x not in seen:
            cleaned.append(x)
            seen.add(x)
    return cleaned

df_linked["EMPLOYER_KEYS"] = df_linked["ORG_KEYS_FILTERED"].apply(keep_employers)

print("Example EMPLOYER_KEYS (10):")
df_linked.loc[df_linked["EVENT_PRED_CB"]==1, ["title","ORG_KEYS_FILTERED","EMPLOYER_KEYS"]].head(10)


Example EMPLOYER_KEYS (10):


Unnamed: 0,title,ORG_KEYS_FILTERED,EMPLOYER_KEYS
37,Toplu sözleşme sürecindeki Şişecam işçileri: T...,[hilal tok istanbul],[]
273,Birlik olduğumuz sürece kimse hakkımızı yiyeme...,[işçi sendika servisi],[]
280,Zafer Tekstil ve Alka Polyester işçileri de iş...,"[zafer tekstil, alka polyester, mazlum ayçiçek...","[zafer tekstil, alka polyester, mazlum ayçiçek..."
286,Grevdeki Purmo işçileri: Yaşamak için insanca ...,"[eda aktaş izmir, purmo group, birleşik metal]","[eda aktaş izmir, purmo group]"
287,Grevdeki Mersen işçileri Fransız Konsolosluğu ...,"[gebze organize sanayi, birleşik metal]",[]
289,"Gürdesan'da işçiler TİS'e tepki gösterdi, üret...",[],[]
291,Purmo işçileri: Bedeli ne olursa olsun greve d...,"[izmir kemalpaşa organize sanayi, purmo group,...",[purmo group]
293,İskenderun Yolbulan Metal’de grev kararı alındı,"[dilek omaklilar hatay, yolbulan metal, iş söz...","[dilek omaklilar hatay, yolbulan metal, iş söz..."
294,Birşelik Metal-İş üyesi HMS işçileri greve çık...,"[birleşik metal, manisa organize sanayi, izmir...","[izmir şube, haziran çarşamba]"
298,Sumitomo işçileri: 3 tekerlek parasını çok gör...,"[damla kirmizitaş, sumitomo rubber ako lastik]","[damla kirmizitaş, sumitomo rubber ako lastik]"


In [78]:
from collections import Counter
import pandas as pd

# ----------------------------
# Split each wave-level EVENT_ID into firm-level strike IDs using EMPLOYER_KEYS
# Requires df_linked["EMPLOYER_KEYS"] to exist (built in the previous cells).
# ----------------------------

def split_wave_into_employers(
    df_in,
    wave_col="EVENT_ID",
    emp_col="EMPLOYER_KEYS",
    min_emp_mentions=1,       # keep single-mention employers (e.g., "purmo metal")
    multi_firm_mode=False     # if True, duplicates rows for multi-employer articles
):
    out = df_in.copy()

    if emp_col not in out.columns:
        raise ValueError(f"Missing {emp_col}. Run the EMPLOYER_KEYS rebuild cell first.")

    if multi_firm_mode:
        # Expand rows: one row per (article × employer) when multiple employers appear
        rows = []
        for idx, r in out.iterrows():
            emps = r[emp_col] or []
            if len(emps) == 0:
                r2 = r.copy()
                r2["EVENT_ID_FIRM"] = f"{r[wave_col]}_UNK"
                rows.append(r2)
            else:
                for e in emps:
                    r2 = r.copy()
                    r2["EVENT_ID_FIRM"] = f"{r[wave_col]}_{str(e).replace(' ','_')}"
                    rows.append(r2)
        out = pd.DataFrame(rows)
        return out

    # Default: one employer assignment per article (most common employer within the wave)
    out["EVENT_ID_FIRM"] = None

    for wave_id, g in out.groupby(wave_col):
        # employer frequency within wave
        emp_counter = Counter()
        for ks in g[emp_col]:
            for k in (ks or []):
                emp_counter[k] += 1

        employers = {e for e,c in emp_counter.items() if c >= min_emp_mentions}

        for i, row in g.iterrows():
            ks = set(row[emp_col] or [])
            candidates = list(ks & employers)

            if len(candidates) == 0:
                out.loc[i, "EVENT_ID_FIRM"] = f"{wave_id}_UNK"
            elif len(candidates) == 1:
                chosen = candidates[0]
                out.loc[i, "EVENT_ID_FIRM"] = f"{wave_id}_{str(chosen).replace(' ','_')}"
            else:
                # choose the most common employer in this wave (stable)
                chosen = max(candidates, key=lambda e: emp_counter[e])
                out.loc[i, "EVENT_ID_FIRM"] = f"{wave_id}_{str(chosen).replace(' ','_')}"

    return out


# ---- RUN IT ----
# Option 1 (recommended): one employer per article, stable
df_linked = split_wave_into_employers(
    df_linked,
    wave_col="EVENT_ID",
    emp_col="EMPLOYER_KEYS",
    min_emp_mentions=1,
    multi_firm_mode=False
)

# Quick check
mask_cb = df_linked["EVENT_PRED_CB"] == 1
print("CB/Wage articles:", int(mask_cb.sum()))
print("Unique firm-level events:", df_linked.loc[mask_cb, "EVENT_ID_FIRM"].nunique())
print("UNK rows:", int(df_linked.loc[mask_cb, "EVENT_ID_FIRM"].str.endswith("_UNK", na=False).sum()))
df_linked.loc[mask_cb].groupby("EVENT_ID_FIRM").size().describe()


CB/Wage articles: 445
Unique firm-level events: 365
UNK rows: 66


Unnamed: 0,0
count,365.0
mean,1.219178
std,1.019677
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,13.0


In [79]:
import numpy as np
import pandas as pd

def absorb_unk_into_employers(
    df_in,
    wave_col="EVENT_ID",
    firm_col="EVENT_ID_FIRM",
    date_col="PUB_DATE",
    text_col="text",
    sim_thresh=0.90,
    max_days=14
):
    out = df_in.copy()
    out[date_col] = pd.to_datetime(out[date_col], errors="coerce")

    # process only CB/Wage subset
    base = out[out["EVENT_PRED_CB"]==1].copy()

    for wave_id, g in base.groupby(wave_col):
        firm = g[~g[firm_col].str.endswith("_UNK", na=False)].copy()
        unk  = g[g[firm_col].str.endswith("_UNK", na=False)].copy()

        if len(firm) == 0 or len(unk) == 0:
            continue

        # encode firm + unk texts once
        firm_texts = firm[text_col].tolist()
        unk_texts = unk[text_col].tolist()

        E_firm = encode_texts(firm_texts, batch_size=32 if DEVICE=="cuda" else 8, max_len=256)
        E_unk  = encode_texts(unk_texts,  batch_size=32 if DEVICE=="cuda" else 8, max_len=256)

        firm_dates = firm[date_col].tolist()
        unk_dates  = unk[date_col].tolist()
        firm_ids   = firm[firm_col].tolist()

        # assign each UNK to best firm if close in time + high similarity
        for u_i, row_idx in enumerate(unk.index):
            ud = unk_dates[u_i]
            best_sim = -1.0
            best_firm = None

            for f_i in range(len(firm)):
                fd = firm_dates[f_i]
                if pd.isna(ud) or pd.isna(fd):
                    continue
                if abs((ud - fd).days) > max_days:
                    continue

                sim = float(np.dot(E_unk[u_i], E_firm[f_i]))
                if sim > best_sim:
                    best_sim = sim
                    best_firm = firm_ids[f_i]

            if best_firm is not None and best_sim >= sim_thresh:
                out.loc[row_idx, firm_col] = best_firm

    return out

df_linked = absorb_unk_into_employers(
    df_linked,
    sim_thresh=0.90,
    max_days=14
)

mask_cb = df_linked["EVENT_PRED_CB"]==1
print("UNK count after absorption:", int(df_linked.loc[mask_cb, "EVENT_ID_FIRM"].str.endswith("_UNK", na=False).sum()))
print("Unique firm events:", df_linked.loc[mask_cb, "EVENT_ID_FIRM"].nunique())
df_linked.loc[mask_cb].groupby("EVENT_ID_FIRM").size().describe()


UNK count after absorption: 47
Unique firm events: 359


Unnamed: 0,0
count,359.0
mean,1.239554
std,0.889728
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,10.0


In [80]:
from openpyxl import Workbook
import pandas as pd

# ----------------------------
# Export firm-level strike events to Excel for manual checking
# Converts list-type cells (like ORG_KEYS_FILTERED) into strings.
# ----------------------------

mask = df_linked["EVENT_PRED_CB"] == 1

def list_to_str(x):
    if isinstance(x, list):
        return "; ".join(str(i) for i in x)
    if pd.isna(x):
        return ""
    return str(x)

# ---- Sheet 1: firm-level events ----
events_firm = (
    df_linked[mask]
    .groupby("EVENT_ID_FIRM")
    .agg(
        start=("PUB_DATE","min"),
        end=("PUB_DATE","max"),
        duration=("PUB_DATE", lambda x: (x.max()-x.min()).days + 1),
        n_articles=("title","count"),
        firms=("ORG_KEYS_FILTERED", lambda x: "; ".join(sorted({k for ks in x for k in (ks or [])})))
    )
    .reset_index()
)

# make sure datetimes are Excel-friendly (optional but good)
for c in ["start", "end"]:
    events_firm[c] = pd.to_datetime(events_firm[c], errors="coerce")

wb = Workbook()
ws1 = wb.active
ws1.title = "Firm_Level_Strikes"
ws1.append(list(events_firm.columns))

for _, row in events_firm.iterrows():
    ws1.append([list_to_str(v) for v in row.tolist()])

# ---- Sheet 2: article-level mapping ----
ws2 = wb.create_sheet("Articles_By_Firm_Event")
cols = ["EVENT_ID_FIRM","PUB_DATE","title","ORG_KEYS_FILTERED","EVENT_ID","EVENT_PRED_CB"]
ws2.append(cols)

tmp = df_linked.loc[mask, cols].copy()
tmp["PUB_DATE"] = pd.to_datetime(tmp["PUB_DATE"], errors="coerce")
tmp["ORG_KEYS_FILTERED"] = tmp["ORG_KEYS_FILTERED"].apply(list_to_str)

for _, r in tmp.iterrows():
    ws2.append([list_to_str(v) for v in r.tolist()])

# Save
path = "firm_level_strikes_3.xlsx"
wb.save(path)
print("Saved:", path)


Saved: firm_level_strikes_3.xlsx


üst kısım şimdisi, aşağısı önceki şeyler

In [59]:
# ----------------------------
# Split wave-level EVENT_ID into firm-level strike events
# ----------------------------

def split_wave_into_firms(df_linked, org_col="ORG_KEYS_FILTERED"):
    out = df_linked.copy()
    out["EVENT_ID_FIRM"] = None

    for wave_id, g in out.groupby("EVENT_ID"):
        # collect firm names in this wave
        firm_counter = Counter()
        for ks in g[org_col]:
            for k in ks:
                firm_counter[k] += 1

        # remove generic unions etc.
        firms = [f for f,c in firm_counter.items() if c >= 2]

        if not firms:
            # fallback: keep wave id
            out.loc[g.index, "EVENT_ID_FIRM"] = f"{wave_id}_UNK"
            continue

        # assign each article to the most likely firm
        for i, row in g.iterrows():
            ks = row[org_col]
            if ks:
                intersect = [f for f in ks if f in firms]
                if intersect:
                    chosen = intersect[0]
                    out.loc[i, "EVENT_ID_FIRM"] = f"{wave_id}_{chosen.replace(' ','_')}"
                else:
                    out.loc[i, "EVENT_ID_FIRM"] = f"{wave_id}_UNK"
            else:
                out.loc[i, "EVENT_ID_FIRM"] = f"{wave_id}_UNK"

    return out


df_linked = split_wave_into_firms(df_linked, org_col="ORG_KEYS_FILTERED")


In [60]:
mask = df_linked["EVENT_PRED_CB"] == 1

print("CB/Wage articles:", int(mask.sum()))
print("Unique firm-level strike events:", df_linked.loc[mask, "EVENT_ID_FIRM"].nunique())

df_linked.loc[mask].groupby("EVENT_ID_FIRM").size().describe()


CB/Wage articles: 445
Unique firm-level strike events: 292


Unnamed: 0,0
count,292.0
mean,1.523973
std,2.02292
min,1.0
25%,1.0
50%,1.0
75%,1.0
max,24.0


In [66]:
mask = (df_linked["EVENT_PRED_CB"]==1) & (df_linked["EVENT_ID_FIRM"].notna())
mask_named = mask & (~df_linked["EVENT_ID_FIRM"].str.endswith("_UNK", na=False))

df_linked[mask_named].groupby("EVENT_ID_FIRM").head(2)[["EVENT_ID_FIRM","PUB_DATE","title"]].sample(20, random_state=42)


Unnamed: 0,EVENT_ID_FIRM,PUB_DATE,title
348,E0018_schneider_elektrik,2025-01-14,GE Grid Solutions’ta da işçilerin grevi kazanı...
3670,E0018_emep_genel_başkanı_seyit,2024-10-17,"EMEP Genel Başkanı Aslan: İşçilerin, kazanımla..."
3754,EV000051_iş_izmir,2024-10-22,İki belediyede daha grev kararı asıldı
6058,E0018_başkan_yardımcısı,2025-04-28,TÜPRAŞ işçilerinin eylemleri sürüyor
291,E0018_birleşik_metal,2024-05-30,Purmo işçileri: Bedeli ne olursa olsun greve d...
2539,E0018_tüm_bel,2024-07-05,İzBB’de çalışan kamu emekçileri tam gün iş bır...
4449,E0018_ge_grid_solutions,2024-12-17,Yasak kâr etmiyor metal işçilerinin grevi sürüyor
6885,E0018_buca_belediyesi,2025-06-19,İş bırakma eylemi 2'nci gününde
772,E0018_portakal_plastik,2024-02-06,Portakal Plastik'te işçiler TİS'in uygulanmama...
4066,E0018_iş_istanbul_anadolu_yakası,2024-11-19,Genel-İş işçiye sormadan sözleşmeyi imzaladı


In [65]:
from openpyxl import Workbook

# ----------------------------
# Export firm-level strike events to Excel for manual checking
# ----------------------------

mask = df_linked["EVENT_PRED_CB"] == 1

events_firm = (
    df_linked[mask]
    .groupby("EVENT_ID_FIRM")
    .agg(
        start=("PUB_DATE","min"),
        end=("PUB_DATE","max"),
        duration=("PUB_DATE", lambda x: (x.max()-x.min()).days + 1),
        n_articles=("title","count"),
        firms=("ORG_KEYS_FILTERED", lambda x: ", ".join(sorted({k for ks in x for k in ks})))
    )
    .reset_index()
)

wb = Workbook()
ws1 = wb.active
ws1.title = "Firm_Level_Strikes"

# Header
ws1.append(list(events_firm.columns))

# Rows
for _, row in events_firm.iterrows():
    ws1.append(list(row))

# Second sheet: article-level mapping (for manual validation)
ws2 = wb.create_sheet("Articles_By_Firm_Event")
cols = ["EVENT_ID_FIRM","PUB_DATE","title","ORG_KEYS_FILTERED","EVENT_ID","EVENT_PRED_CB"]
ws2.append(cols)

for _, r in df_linked[mask][cols].iterrows():
    ws2.append(list(r))

# Save
path = "firm_level_strikes.xlsx"
wb.save(path)

print("Saved:", path)

ValueError: Cannot convert ['hilal tok istanbul'] to Excel

In [63]:
df_linked[mask].groupby("EVENT_ID_FIRM").head(2)[["EVENT_ID_FIRM","PUB_DATE","title"]].sample(20)


Unnamed: 0,EVENT_ID_FIRM,PUB_DATE,title
4135,EV000080_UNK,2024-11-23,"Akçul, ""Bakanlık işçilerin taleplerini kabul e..."
361,E0018_plastikçiler_organize_sanayi,2025-02-10,Chinatool Otomotiv’de grev başladı
315,E0015_UNK,2024-08-08,"Tek Gıda-İş, Kristal Yağ’da anlaşma sağlandığı..."
7399,EV000171_UNK,2025-07-21,"""Güvenceli istihdam derhal sağlanmalıdır"""
758,EV000011_UNK,2024-02-02,'Mücadelede kararlıyız'
4465,E0018_hilal_tok_istanbul,2024-12-19,Metal işçileri greve kararlılıkla devam ediyor...
971,EV000022_UNK,2024-02-16,İZENERJİ işçileri ikramiyelerinin zamanında öd...
3129,EV000053_UNK,2024-08-28,"Öz Büro İş Sendikası üyesi işçiler, 69 Sarar m..."
6420,EV000136_UNK,2025-05-15,Ücret ve sosyal haklardan vergi kesintilerini ...
3462,EV000058_UNK,2024-09-25,İşçiye sarı duvar


In [67]:
from openpyxl import Workbook
import pandas as pd

# ----------------------------
# Export firm-level strike events to Excel for manual checking
# Converts list-type cells (like ORG_KEYS_FILTERED) into strings.
# ----------------------------

mask = df_linked["EVENT_PRED_CB"] == 1

def list_to_str(x):
    if isinstance(x, list):
        return "; ".join(str(i) for i in x)
    if pd.isna(x):
        return ""
    return str(x)

# ---- Sheet 1: firm-level events ----
events_firm = (
    df_linked[mask]
    .groupby("EVENT_ID_FIRM")
    .agg(
        start=("PUB_DATE","min"),
        end=("PUB_DATE","max"),
        duration=("PUB_DATE", lambda x: (x.max()-x.min()).days + 1),
        n_articles=("title","count"),
        firms=("ORG_KEYS_FILTERED", lambda x: "; ".join(sorted({k for ks in x for k in (ks or [])})))
    )
    .reset_index()
)

# make sure datetimes are Excel-friendly (optional but good)
for c in ["start", "end"]:
    events_firm[c] = pd.to_datetime(events_firm[c], errors="coerce")

wb = Workbook()
ws1 = wb.active
ws1.title = "Firm_Level_Strikes"
ws1.append(list(events_firm.columns))

for _, row in events_firm.iterrows():
    ws1.append([list_to_str(v) for v in row.tolist()])

# ---- Sheet 2: article-level mapping ----
ws2 = wb.create_sheet("Articles_By_Firm_Event")
cols = ["EVENT_ID_FIRM","PUB_DATE","title","ORG_KEYS_FILTERED","EVENT_ID","EVENT_PRED_CB"]
ws2.append(cols)

tmp = df_linked.loc[mask, cols].copy()
tmp["PUB_DATE"] = pd.to_datetime(tmp["PUB_DATE"], errors="coerce")
tmp["ORG_KEYS_FILTERED"] = tmp["ORG_KEYS_FILTERED"].apply(list_to_str)

for _, r in tmp.iterrows():
    ws2.append([list_to_str(v) for v in r.tolist()])

# Save
path = "firm_level_strikes.xlsx"
wb.save(path)
print("Saved:", path)


Saved: firm_level_strikes.xlsx


In [55]:
from collections import Counter

# Most common ORG keys in the big cluster
c_org = Counter()
for ks in big["ORG_KEYS_FILTERED"]:
    for k in ks:
        c_org[k] += 1
print("Top ORG keys in big cluster:")
for k,v in c_org.most_common(20):
    print(v, "-", k)

# Most common title tokens in the big cluster
def toks(title):
    return re.findall(r"[A-Za-zÇĞİÖŞÜçğıöşü]+", str(title).lower())

c_tok = Counter()
for t in big["title"]:
    for w in toks(t):
        if len(w) >= 4:
            c_tok[w] += 1

print("\nTop title tokens in big cluster:")
for w,v in c_tok.most_common(30):
    print(v, "-", w)


Top ORG keys in big cluster:
38 - birleşik metal
10 - ge grid solutions
10 - arıtaş kriyojenik
9 - schneider elektrik
8 - green transfo
7 - iş izmir
7 - hitachi energy
7 - temel conta
6 - tüm bel
5 - genel başkanı özkan atar
5 - grid solutions
5 - sosyal demokrat kamu işverenleri
5 - schneider electric
4 - emep genel başkanı seyit
4 - türkiye metal sanayicileri
4 - ramis sağlam içerik
4 - sen izmir
4 - karşıyaka belediyesi
4 - konak belediyesi
4 - andaç aydın arıduru içerik

Top title tokens in big cluster:
30 - işçileri
19 - işçilerinin
18 - karşı
17 - devam
16 - grev
15 - sürüyor
14 - grevi
13 - belediyesi
11 - genel
10 - greve
10 - metal
10 - için
10 - dayatmasına
9 - gününde
9 - ediyor
8 - toplu
8 - sözleşme
8 - yüzde
8 - mess
8 - sefalet
7 - emep
7 - green
7 - transfo
7 - başkanı
7 - temel
7 - conta
6 - emekçileri
5 - anlaşma
5 - sağlandı
5 - işçiler


In [41]:
# Find the biggest cluster
sizes = df_linked.loc[df_linked["EVENT_PRED"]==1].groupby("EVENT_ID").size()
biggest_id = sizes.idxmax()
print("Biggest EVENT_ID:", biggest_id, "size:", sizes.max())

big = df_linked[df_linked["EVENT_ID"] == biggest_id]
big[["PUB_DATE","title","ORG_KEYS_FILTERED"]].head(50)


Biggest EVENT_ID: E0018 size: 1077


Unnamed: 0,PUB_DATE,title,ORG_KEYS_FILTERED
24,2024-01-16,Özak Tekstil işçilerinin direnişi sürüyor: Bir...,[özak tekstil]
37,2024-01-25,Toplu sözleşme sürecindeki Şişecam işçileri: T...,[hilal tok istanbul]
88,2024-05-28,"KESK, direnişteki Fernas işçilerini ziyaret etti","[fernas otoyol, inceoğulları inşaat, kesk deni..."
99,2024-07-10,Sözde kalmasın,"[iş genel başkanı ergün, iş genel başkanı mahm..."
119,2024-07-27,EMEP Genel Başkanı Seyit Aslan'dan direnişteki...,"[emep genel başkanı seyit, nazımiye belediye, ..."
157,2024-12-17,Polonez işçileri Maltepe sahilden yürüyüş başl...,"[anayasal hak, çatalca adliyesi]"
163,2024-12-16,Polonez işçileri: Bize destek olması gereken d...,"[çatalca adliyesi, örgütlenme uzmanı suat karl..."
170,2024-12-18,Polonez işçileri Ankara yürüyüşünü sonlandırdı,"[çalışma bakanlığı, anayasal hak yürüyüşü, iş ..."
224,2025-02-06,İzBB işçilerinin işe dönüş mücadelesi 4’üncü g...,"[izmir büyükşehir, izmir büyükşehir belediyesi..."
239,2025-02-07,İzmir’de işten atılan belediye işçilerinin dir...,"[izmir büyükşehir, kıbrıs şehitleri, emirhan d..."


In [42]:
# sample 50 articles from giant cluster
sample = big.sample(50, random_state=42)
E = encode_texts(sample["text"].tolist(), batch_size=32, max_len=256)

# pairwise similarities
sims = E @ E.T
print("Similarity stats:", sims[np.triu_indices_from(sims,1)].min(),
                      sims[np.triu_indices_from(sims,1)].mean(),
                      sims[np.triu_indices_from(sims,1)].max())


Similarity stats: 0.9020879 0.94557834 0.980029


In [28]:
OUT_XLSX = "berturk_event_detection_and_linking.xlsx"
OUT_CSV  = "berturk_event_detection_and_linking.csv"

df_linked.to_excel(OUT_XLSX, index=False)
df_linked.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

OUT_XLSX, OUT_CSV

('berturk_event_detection_and_linking.xlsx',
 'berturk_event_detection_and_linking.csv')