In [1]:
from dotenv import load_dotenv
import os
from post_processing_newest import semantic_search, fetch_wikidata_info, fast_clustering, agglomerative_clustering
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModel
import torch
load_dotenv()

data_path = Path("/home/yazici/playground/new-prompts/output/gpt-4o-2024-08-06_event2_newest_report_20241107-035313.json")
anchor_file_path = Path("/mnt/datasets/dop-position-mining/wiki-anchor/anchor_target_counts.csv")

  from tqdm.autonotebook import tqdm, trange


In [2]:
df = pd.read_json(data_path)
device = "cuda:1" if torch.cuda.is_available() else "cpu"

In [3]:
batch_size = 64 * 4 if device == "cuda" else 64
# Explode the 'positions' field if it contains a list
df_report = df.explode("positions").reset_index(drop=True)

# Normalize the 'positions' field into a separate dataframe
df_positions = pd.json_normalize(df_report["positions"])
# drop rows that have the targets as empty lists
df_positions = df_positions[df_positions["targets"].apply(len) > 0]
# --- Stakeholder Clustering ---
print(
    f"{len(df_positions['stakeholder'].unique())} stakeholders"
    " before clustering..."
)
stakeholders = df_positions["stakeholder"].tolist()
stakeholders = [stakeholder.lower().strip() for stakeholder in stakeholders]
# does df_max_views.parquet exist?
anchor_file_path_dir = Path(anchor_file_path).parent
if (anchor_file_path_dir / "df_max_views.parquet").exists():
    print("Anchor file found.")
    df_max_views = pd.read_parquet(anchor_file_path_dir / "df_max_views.parquet")
    df_embeddings = [
        torch.stack([torch.tensor(emb) for emb in df_max_views["jina_embeddings"].values]),
        torch.stack([torch.tensor(emb) for emb in df_max_views["sentence_transformer_embeddings"].values]),
    ]
else:
    raise FileNotFoundError("Anchor file not found.")

3836 stakeholders before clustering...
Anchor file found.


In [4]:
df_max_views.head()

Unnamed: 0,normalized_anchor_text,target_page_id,target_item_id,target_page_title,target_page_views,anchor_target_count,p_anchor_given_target,p_target_given_anchor,jina_embeddings,sentence_transformer_embeddings
364147,!,10606,120976,Factorial,30517,28,0.074271,0.459016,"[0.120287396, -0.17119974, 0.018252913, 0.0800...","[-0.04682187, 0.028049402, -0.04723211, 0.1036..."
743613,!!!,600744,371,!!!,3428,108,0.923077,1.0,"[0.11590401, -0.16914222, 0.025198031, 0.08789...","[-0.04671332, 0.05583068, -0.05227658, 0.00920..."
743614,!!! (chk chk chk),600744,371,!!!,3428,4,0.034188,1.0,"[0.14810045, -0.098733634, 0.04295554, 0.09873...","[0.0060914885, 0.021325106, -0.014094437, 0.02..."
743615,!!! chk chk chk,600744,371,!!!,3428,3,0.025641,1.0,"[0.12620306, -0.11048431, 0.031981107, 0.10473...","[-0.015853163, 0.021743488, -0.015280079, -0.0..."
4602208,!!!fuck you!!!,4838455,1106400,Fuck You (EP),362,4,1.0,1.0,"[0.06759844, -0.12256393, 0.030473959, 0.01470...","[-0.031093562, 0.01322756, -0.036412507, -0.02..."


In [5]:
import pickle
from tqdm import tqdm
from typing import List, Tuple

# stakeholders: List[str],
# df_embeddings: List[torch.Tensor],
# device: str,
# df_max_views: pd.DataFrame,
# models: Tuple[AutoModel, SentenceTransformer] = None,
# threshold=0.15,
# clustering_method="fast",
# output_dir=None,
# event_name="",

models = (
    AutoModel.from_pretrained(
        "arkohut/jina-embeddings-v3", trust_remote_code=True
    ).to(device),
    SentenceTransformer("all-mpnet-base-v2", device=device),
)
threshold = 0.15
clustering_method = "fast"
output_dir = Path("/home/yazici/playground/new-prompts/temp-files")
# output_dir = None
event_name = "event2_multimodel"

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

In [6]:
assert len(df_embeddings) == 2, "Two models are required for wiki_anchor"
assert df_embeddings[0].shape[1] == 1024, "Model 1 should output 1024 dim embeddings"
assert df_embeddings[1].shape[1] == 768, "Model 2 should output 768 dim embeddings"
batch_size = 64 * 4 if device == "cuda" else 64
unique_stakeholders = list(set(stakeholders))

In [7]:
if models is None:
    models = (
        AutoModel.from_pretrained(
            "arkohut/jina-embeddings-v3", trust_remote_code=True
        ).to(device),
        SentenceTransformer("all-mpnet-base-v2", device=device),
    )
# Initial wiki matches
unique_stakeholder_embeddings = (
    models[0].encode(
        unique_stakeholders,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
        batch_size=batch_size,
    ),
    models[1].encode(
        unique_stakeholders,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
        batch_size=batch_size,
    )
)
search_results = (
    semantic_search(
        unique_stakeholder_embeddings[0],  # query embeddings
        df_embeddings[0],  # database embeddings (can remain on CPU)
        top_k=1,  # Retrieve the top match only, adjust as needed
        device=device,
    ),
    semantic_search(
        unique_stakeholder_embeddings[1],  # query embeddings
        df_embeddings[1],  # database embeddings (can remain on CPU)
        top_k=1,  # Retrieve the top match only, adjust as needed
        device=device,
    )
)

del unique_stakeholder_embeddings

Encoding:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

Query chunks: 100%|██████████| 39/39 [03:35<00:00,  5.53s/it]
Query chunks: 100%|██████████| 39/39 [02:41<00:00,  4.13s/it]


In [8]:
from collections import defaultdict
# Process search results to get the best matches above the threshold
all_results = {}
stakeholder_index_to_target_item_id = [None] * len(stakeholders)

for idx in range(len(unique_stakeholders)):
    best_match_model1 = search_results[0][idx][0]
    best_match_model2 = search_results[1][idx][0]
    if best_match_model1["corpus_id"] != best_match_model2["corpus_id"]:
        continue
    similarity_score1 = best_match_model1["score"]
    similarity_score2 = best_match_model2["score"]
    best_match_idx = best_match_model1["corpus_id"]
    best_match_row = df_max_views.iloc[best_match_idx]
    stakeholder = unique_stakeholders[idx]
    if similarity_score1 >= (1 - float(threshold)) and similarity_score2 >= (1 - float(threshold)):
        # Store the best match for the stakeholder
        for idx_orig_stakeholder, orig_stakeholder in enumerate(stakeholders):
            if orig_stakeholder == stakeholder:
                assert stakeholder_index_to_target_item_id[idx_orig_stakeholder] is None
                stakeholder_index_to_target_item_id[idx_orig_stakeholder] = (best_match_row["target_item_id"], best_match_row["target_page_id"])

    all_results[unique_stakeholders[idx]] = (
        best_match_row["normalized_anchor_text"],
        best_match_row["target_page_id"],
        best_match_row["target_item_id"],
        similarity_score1,
        similarity_score2,
    )

hit_count = sum([1 for item_id in stakeholder_index_to_target_item_id if item_id is not None])
print(
    f"Hit count: {hit_count} | Hit percentage before wiki: {hit_count / len(stakeholders) * 100:.2f}%"
)
unique_wiki_items = set([
    val for val in stakeholder_index_to_target_item_id if val is not None
])

if output_dir is not None:
    with open(output_dir / f"all_results_{event_name}.pkl", "wb") as f:
        pickle.dump(all_results, f)

# del all_results
# del search_results
# del unique_stakeholders
# del hit_count

Hit count: 9833 | Hit percentage before wiki: 68.58%


In [9]:
# Extend with wikidata info
all_wiki_info = []
wiki_info_index_to_itemid = []
wiki_info_index_to_page_id = []
itemid_to_wiki_info = {}
for val in tqdm(unique_wiki_items, desc="Fetching wiki info"):
    target_item_id, target_page_id = val
    wikidata_info = fetch_wikidata_info(target_item_id)
    if wikidata_info:
        itemid_to_wiki_info[target_item_id] = wikidata_info
        labels = wikidata_info["labels"]  # list of labels
        descriptions = (
            wikidata_info["main_label"].lower() + " " + wikidata_info["description"]
        )  # string
        aliases = wikidata_info["aliases"]  # list of aliases
        all_wiki_info += labels + [descriptions] + aliases
        wiki_info_index_to_itemid += [target_item_id] * (len(labels) + len(aliases) + 1)
        wiki_info_index_to_page_id += [target_page_id] * (len(labels) + len(aliases) + 1)
print(f"Length of wiki corpus: {len(all_wiki_info)}")
# del unique_wiki_items

Fetching wiki info: 100%|██████████| 1359/1359 [05:20<00:00,  4.24it/s]

Length of wiki corpus: 26431





In [10]:
missing_stakeholders = list(set(
    [stakeholder for idx, stakeholder in enumerate(stakeholders) if stakeholder_index_to_target_item_id[idx] is None]
))
print(f"Length of missing stakeholders: {len(missing_stakeholders)}")

Length of missing stakeholders: 2095


In [11]:
missing_stakeholder_embeddings = (
    models[0].encode(
        missing_stakeholders,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
        batch_size=batch_size,
    ),
    models[1].encode(
        missing_stakeholders,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
        batch_size=batch_size,
    ),
)
all_wiki_info_embeddings = (
    models[0].encode(
        all_wiki_info,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
        batch_size=batch_size,
    ),
    models[1].encode(
        all_wiki_info,
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
        batch_size=batch_size,
    ),
)
# semantic search
search_results = (
    semantic_search(
        missing_stakeholder_embeddings[0],  # query embeddings
        all_wiki_info_embeddings[0],  # database embeddings (can remain on CPU)
        top_k=1,  # Retrieve the top match only, adjust as needed
        device=device,
    ),
    semantic_search(
        missing_stakeholder_embeddings[1],  # query embeddings
        all_wiki_info_embeddings[1],  # database embeddings (can remain on CPU)
        top_k=1,  # Retrieve the top match only, adjust as needed
        device=device,
    ),
)

del missing_stakeholder_embeddings
del all_wiki_info_embeddings

Encoding:   0%|          | 0/33 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Encoding:   0%|          | 0/413 [00:00<?, ?it/s]

Batches:   0%|          | 0/413 [00:00<?, ?it/s]

Query chunks: 100%|██████████| 21/21 [00:00<00:00, 479.19it/s]
Query chunks: 100%|██████████| 21/21 [00:00<00:00, 520.98it/s]


In [12]:
missing_stakeholder_matches = {}
for idx in range(len(missing_stakeholders)):
    best_match_model1 = search_results[0][idx][0]
    best_match_model2 = search_results[1][idx][0]
    similarity_score1 = best_match_model1["score"]
    similarity_score2 = best_match_model2["score"]
    best_match_idx1 = best_match_model1["corpus_id"]
    best_match_idx2 = best_match_model2["corpus_id"]
    missing_stakeholder = missing_stakeholders[idx]
    item_id1 = wiki_info_index_to_itemid[best_match_idx1]
    item_id2 = wiki_info_index_to_itemid[best_match_idx2]
    missing_stakeholder_matches[stakeholder] = (
        similarity_score1,
        all_wiki_info[best_match_idx1],
        itemid_to_wiki_info[item_id1]["main_label"],
        similarity_score2,
        all_wiki_info[best_match_idx2],
        itemid_to_wiki_info[item_id2]["main_label"],
    )
    if item_id1 != item_id2:
        continue
    if similarity_score1 >= (1 - float(threshold)) and similarity_score2 >= (1 - float(threshold)):
        for stakeholder_index, stakeholder in enumerate(stakeholders):
            if stakeholder == missing_stakeholder:
                assert stakeholder_index_to_target_item_id[stakeholder_index] is None, f"Stakeholder {stakeholder} already has a match"
                stakeholder_index_to_target_item_id[stakeholder_index] = (item_id1, wiki_info_index_to_page_id[best_match_idx1])

if output_dir != None:
    with open(output_dir / f"stakeholders_{event_name}.pkl", "wb") as f:
        pickle.dump(stakeholders, f)
    with open(output_dir / f"stakeholder_index_to_target_item_id_{event_name}.pkl", "wb") as f:
        pickle.dump(stakeholder_index_to_target_item_id, f)
    with open(output_dir / f"missing_stakeholder_matches_{event_name}.pkl", "wb") as f:
        pickle.dump(missing_stakeholder_matches, f)


hit_count = sum([1 for item_id in stakeholder_index_to_target_item_id if item_id is not None])
print(
    f"Hit count: {hit_count} | Hit percentage after wiki: {hit_count / len(stakeholders) * 100:.2f}%"
)

del missing_stakeholders
del search_results
del missing_stakeholder_matches

Hit count: 10497 | Hit percentage after wiki: 73.22%


In [13]:
augmented_stakeholders = []
unique_stakeholders = list(set(stakeholders))
for stakeholder in tqdm(
    unique_stakeholders, total=len(unique_stakeholders), desc="Augmenting stakeholders"
):
    i = stakeholders.index(stakeholder)
    if stakeholder_index_to_target_item_id[i] is not None:
        stakeholder_itemid = stakeholder_index_to_target_item_id[i][0]
        wikidata_info = itemid_to_wiki_info[stakeholder_itemid]
        # Append label, description, and aliases to stakeholder name
        augmented_text = wikidata_info["main_label"]
        augmented_text += f" ({wikidata_info['description']})"
        if wikidata_info["en_aliases"]:
            augmented_text += (
                f" | Aliases: {', '.join(wikidata_info['en_aliases'])}"
            )
        augmented_stakeholders.append(augmented_text.lower())
    else:
        augmented_stakeholders.append(stakeholder.lower())  # If no match, keep original

if output_dir != None:
    with open(output_dir / f"stakeholder_to_wiki_info_{event_name}.pkl", "wb") as f:
        pickle.dump(itemid_to_wiki_info, f)

del all_wiki_info

Augmenting stakeholders: 100%|██████████| 3819/3819 [00:00<00:00, 7076.15it/s]


In [14]:
# clear cuda cache
if device == "cuda":
    torch.cuda.empty_cache()

stakeholder_embeddings = (
    (models[0].encode(
    augmented_stakeholders,
    device=device,
    show_progress_bar=True,
    convert_to_tensor=True,
    batch_size=batch_size,
    ).cpu()), 
    (models[1].encode(
    augmented_stakeholders,
    device=device,
    show_progress_bar=True,
    convert_to_tensor=True,
    batch_size=batch_size,
    ).cpu())
)

stakeholder_embeddings = (
    util.normalize_embeddings(stakeholder_embeddings[0]),
    util.normalize_embeddings(stakeholder_embeddings[1]),
)

stakeholder_cos_scores = (
    stakeholder_embeddings[0] @ stakeholder_embeddings[0].T,
    stakeholder_embeddings[1] @ stakeholder_embeddings[1].T,
)

del stakeholder_embeddings

stakeholder_cos_scores = (stakeholder_cos_scores[0] + stakeholder_cos_scores[1]) / 2

Encoding:   0%|          | 0/60 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

In [15]:
stakeholder_cos_scores.shape

torch.Size([3819, 3819])

In [16]:
print("Clustering stakeholders/targets threshold" f" {threshold}...")
# Use Agglomerative Clustering for stakeholders
if clustering_method == "agglomerative":
    stakeholder_replacement, clusters = agglomerative_clustering(
        unique_stakeholders, stakeholder_cos_scores, float(threshold)
    )
elif clustering_method == "fast":
    stakeholder_replacement, clusters = fast_clustering(
        stakeholder_cos_scores,
        unique_stakeholders,
        threshold=1 - float(threshold),
    )
else:
    raise ValueError("Invalid clustering method. Choose 'agglomerative' or 'fast'")

del stakeholder_cos_scores

Clustering stakeholders/targets threshold 0.15...
Fast clustering start


Finding clusters: 100%|██████████| 4/4 [00:00<00:00,  8.73it/s]

Clustering done after 0.47 sec





In [17]:
stakeholder_index_to_target_item_id_copy = stakeholder_index_to_target_item_id.copy()

In [18]:
itemids_per_cluster = []
clusters_with_multiple_itemids = []
for stakeholder_ids in clusters:
    stakeholders_in_cluster = [unique_stakeholders[idx] for idx in stakeholder_ids]
    # check if any of the stakeholders in the cluster exist in stakeholder_index_to_target_item_id
    itemids_in_cluster = []
    for stakeholder_in_cluster in stakeholders_in_cluster:
        i = stakeholders.index(stakeholder_in_cluster)
        if stakeholder_index_to_target_item_id[i] is not None:
            itemids_in_cluster.append(stakeholder_index_to_target_item_id[i][0])
    itemids_per_cluster.append(set(itemids_in_cluster))
    # print(f"Cluster: {set(stakeholders_in_cluster)} | Itemids: {set(itemids_in_cluster)}")
    if len(set(itemids_in_cluster)) > 1:
        clusters_with_multiple_itemids.append(
            (set(stakeholders_in_cluster), set(itemids_in_cluster))
        )
    for stakeholder_in_cluster in stakeholders_in_cluster:
        i = stakeholders.index(stakeholder_in_cluster)
        if stakeholder_index_to_target_item_id[i] is not None:
            for orig_stakeholder_id, orig_stakeholder in enumerate(stakeholders):
                if orig_stakeholder in stakeholders_in_cluster:
                    if stakeholder_index_to_target_item_id[orig_stakeholder_id] is not None and stakeholder_index_to_target_item_id[i] != stakeholder_index_to_target_item_id[orig_stakeholder_id]:
                        # skip the replacement if the stakeholder is already replaced
                        print(f"Skipping replacement for {stakeholder_in_cluster}")
                        continue
                    stakeholder_index_to_target_item_id[orig_stakeholder_id] = stakeholder_index_to_target_item_id[i]
            break

Skipping replacement for central elections commission of russia
Skipping replacement for central elections commission of russia
Skipping replacement for central elections commission of russia
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for ukrainian air forces
Skipping replacement for russian civilians
Skipping replacement for russian civilians
Skipping replacement for russian

In [19]:
for i in range(len(stakeholder_index_to_target_item_id)):
    if stakeholder_index_to_target_item_id[i] is not None:
        wiki_info = itemid_to_wiki_info[stakeholder_index_to_target_item_id[i][0]]
        main_label = wiki_info["main_label"]
        orig_stakeholder = stakeholders[i]
        # copy the original stakeholder to the replacement
        old_replacement = stakeholder_replacement[orig_stakeholder]
        stakeholder_replacement[orig_stakeholder] = main_label

In [20]:
stakeholder_replacement_grouped = defaultdict(list)

for k,v in stakeholder_replacement.items():
    stakeholder_replacement_grouped[v].append(k)

In [27]:
stakeholder_eval_set_answers[7060:7090]

'"akhmat grozny", "akhmat"],]'

In [28]:
import json

# read the stakeholder_eval_set_answers.txt file
stakeholder_eval_set_answers = ""
with open("stakeholder_eval_set_answers.txt", "r") as f:
    for line in f:
        stakeholder_eval_set_answers += line.strip()

stakeholder_eval_set_answers = json.loads(stakeholder_eval_set_answers)

[['sergei karaganov'],
 ['kajsa ollongren'],
 ['mark n. katz'],
 ['chuck schumer'],
 ['president vladimir putin',
  'vladimir putin',
  'russian president vladimir putin'],
 ['vitaliy vavryshchuk'],
 ['the west'],
 ['vladimir zelenskiy',
  'volodimir zelenski',
  'wolodimir zelenski',
  'ukrainian president volodymyr zelensky',
  'volodymyr zelenskiyy',
  'vlodomyr zelensky',
  'volodimir zelensky',
  'volodymyr zelenskyj',
  'volodymyr zelensky',
  'wolodymyr selenskyj',
  'vladimir aleksandrovich zelensky'],
 ['ihor kolomoisky'],
 ['jan lipavsky'],
 ['oleksandr tarnawskyi',
  'oleksandr tarnawskyj',
  'oleksandr tarnavskiy',
  'oleksandr tarnavskyi',
  'oleksander tarnawskyi',
  'oleksandr tarnawski',
  'oleksander tarnavskyi',
  'oleksandr tarnavsky',
  'olexander tarnawskyj'],
 ['dmitry ivanovich kuleba'],
 ['oleksiy makeyev', 'oleksii makeiev'],
 ['new people'],
 ['united nations high commissioner for human rights',
  'united nations human rights council'],
 ['violeta artemchuk'],

In [29]:
# Positive samples (pairs of items within the same list)
positive_samples = []
for sublist in stakeholder_eval_set_answers:
    for i in range(len(sublist)):
        for j in range(i + 1, len(sublist)):
            positive_samples.append((sublist[i], sublist[j]))

# Negative samples (pairs of items from different sublists)
negative_samples = []
for i in range(len(stakeholder_eval_set_answers)):
    for j in range(i + 1, len(stakeholder_eval_set_answers)):
        # Create all possible pairs between sublist[i] and sublist[j]
        for element1 in stakeholder_eval_set_answers[i]:
            for element2 in stakeholder_eval_set_answers[j]:
                negative_samples.append((element1, element2))

print(f"Number of positive samples: {len(positive_samples)}")
print(f"Number of negative samples: {len(negative_samples)}")

Number of positive samples: 398
Number of negative samples: 42088


In [30]:
negative_samples[:10]

[('sergei karaganov', 'kajsa ollongren'),
 ('sergei karaganov', 'mark n. katz'),
 ('sergei karaganov', 'chuck schumer'),
 ('sergei karaganov', 'president vladimir putin'),
 ('sergei karaganov', 'vladimir putin'),
 ('sergei karaganov', 'russian president vladimir putin'),
 ('sergei karaganov', 'vitaliy vavryshchuk'),
 ('sergei karaganov', 'the west'),
 ('sergei karaganov', 'vladimir zelenskiy'),
 ('sergei karaganov', 'volodimir zelenski')]

In [31]:
stakeholder_clusters_final = [
    [stakeholder for stakeholder in cluster] for cluster in stakeholder_replacement_grouped.values()
]

stakeholder_clusters_final

[['central elections commission of russia',
  "russia's elections commission",
  'russian central election commission',
  'the central election commission of russia',
  'central election commission of the russian federation',
  'central electoral commission of the russian federation',
  'central election commission of russia',
  'moscow regional election committee',
  'russian election commission',
  "russia's central election commission",
  "russia's election commission",
  'russian central electoral commission',
  "russia's central elections commission",
  'russian electoral commission',
  'russian election authorities',
  'central electoral commission of russia',
  'election commission of tsentralny of the russian federation',
  "moscow's electoral commission",
  "moscow's election commission",
  'moscow city election commission'],
 ['ukrainian central election commission',
  "ukraine's central election commission",
  'central election commission of ukraine'],
 ['volodymyr zelenskyy

In [32]:
positive_results = []
for sublist in stakeholder_clusters_final:
    for i in range(len(sublist)):
        for j in range(i + 1, len(sublist)):
            positive_results.append((sublist[i], sublist[j]))

positive_results

[('central elections commission of russia', "russia's elections commission"),
 ('central elections commission of russia',
  'russian central election commission'),
 ('central elections commission of russia',
  'the central election commission of russia'),
 ('central elections commission of russia',
  'central election commission of the russian federation'),
 ('central elections commission of russia',
  'central electoral commission of the russian federation'),
 ('central elections commission of russia',
  'central election commission of russia'),
 ('central elections commission of russia',
  'moscow regional election committee'),
 ('central elections commission of russia', 'russian election commission'),
 ('central elections commission of russia',
  "russia's central election commission"),
 ('central elections commission of russia', "russia's election commission"),
 ('central elections commission of russia',
  'russian central electoral commission'),
 ('central elections commission of 

In [33]:
positive_samples_set = set(positive_samples)
negative_samples_set = set(negative_samples)

In [34]:
# Convert the samples to sorted tuples (to handle unordered pairs)
positive_samples_set = {tuple(sorted(sample)) for sample in positive_samples}
negative_samples_set = {tuple(sorted(sample)) for sample in negative_samples}

# Calculate true positives (TP): positive samples that exist in positive_results
true_positives_results = [sample for sample in positive_samples_set if tuple(sorted(sample)) in positive_results]

# Calculate false negatives (FN): positive samples that do not exist in positive_results
false_negatives_results = [sample for sample in positive_samples_set if tuple(sorted(sample)) not in positive_results]

# Calculate false positives (FP): negative samples that exist in positive_results
false_positives_results = [sample for sample in negative_samples_set if tuple(sorted(sample)) in positive_results]

# Calculate true negatives (TN): negative samples that do not exist in positive_results
true_negatives_results = [sample for sample in negative_samples_set if tuple(sorted(sample)) not in positive_results]


true_positives = len(true_positives_results)
false_negatives = len(false_negatives_results)
false_positives = len(false_positives_results)
true_negatives = len(true_negatives_results)

# Output the results
print("True Positives (TP):", true_positives)
print("False Negatives (FN):", false_negatives)
print("False Positives (FP):", false_positives)
print("True Negatives (TN):", true_negatives)


True Positives (TP): 146
False Negatives (FN): 252
False Positives (FP): 8
True Negatives (TN): 41518


In [35]:
# Calculate the metrics based on TP, FP, TN, FN
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
fpr = false_positives / (false_positives + true_negatives) if (false_positives + true_negatives) != 0 else 0
specificity = true_negatives / (true_negatives + false_positives) if (true_negatives + false_positives) != 0 else 0

# Output the results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"False Positive Rate (FPR): {fpr:.4f}")
print(f"Specificity: {specificity:.4f}")

Precision: 0.9481
Recall: 0.3668
F1 Score: 0.5290
Accuracy: 0.9938
False Positive Rate (FPR): 0.0002
Specificity: 0.9998


In [37]:
false_positives_results

[("ukraine's defense forces", "ukraine's military"),
 ("ukraine's defense forces", 'ukrainian troops'),
 ('abbas galljamov', 'abbas galyamov'),
 ("ukraine's defense forces", 'ukrainian army'),
 ("ukraine's defense forces", 'ukrainian defense forces'),
 ("ukraine's defense forces", 'ukrainian military'),
 ("ukraine's armed forces", "ukraine's defense forces"),
 ("ukraine's defense forces", 'ukrainian armed forces')]

In [38]:
false_negatives_results

[("ukraine's military", 'ukrainian troops'),
 ("russia's defense ministry", 'russian defense ministry'),
 ("ukraine's military", 'ukrainian defence forces'),
 ('centre of national resistance of ukraine',
  "ukraine's national resistance center"),
 ('ukraine military', 'ukrainian troops'),
 ('ukrainian defense forces', 'ukrainian forces'),
 ("ukraine's defense forces", 'ukrainian defence forces'),
 ('national resistance center of the special operations of the ukrainian armed forces',
  "ukraine's national resistance centre"),
 ('ukrainian president volodymyr zelensky', 'vladimir zelenskiy'),
 ('defense forces of ukraine', 'ukrainian troops'),
 ('ukraine military', "ukraine's armed forces"),
 ('russian defence ministry', 'russian ministry of defence'),
 ('vladimir zelenskiy', 'volodymyr zelensky'),
 ("ukraine's military", 'ukrainian military'),
 ('volodimir zelensky', 'volodymyr zelenskyj'),
 ("russia's defense ministry", 'russian ministry of defence'),
 ('centre of national resistance o

In [None]:
del itemid_to_wiki_info