# Setup

In [172]:
import hashlib
import json
import os
import shutil
from urllib.parse import urlparse

import geopandas
import ipywidgets as widgets
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from IPython.display import HTML, display
from ipywidgets import interact, interact_manual
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from umap import UMAP


In [178]:
# Location of the parsed-claims JSON file, relative to the notebook's working directory.
claims_path = "../claims_parse_urls-6m-hard_clean.json"

In [190]:
# Run configuration for the topic-model pipeline. The whole mapping is hashed by
# get_config_id() to name the output folder of a run.
config = dict(
    text_columns=["claimReviewed"],
    filter_by=["language"],
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
    # Keyword arguments unpacked into UMAP(...)
    umap_model=dict(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
    ),
    # Keyword arguments unpacked into HDBSCAN(...)
    hdbscan_model=dict(
        min_cluster_size=5,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    ),
)

In [173]:
# Register tqdm's progress-bar helpers on pandas objects.
tqdm.pandas()

# date_formatter = lambda ax : mdates.ConciseDateFormatter(ax.xaxis.get_major_locator())
# Use fully-qualified option names: pandas resolves abbreviated names by pattern
# matching, which can turn ambiguous when options with similar names are added.
pd.set_option("display.max_colwidth", None)  # never truncate long text cells
pd.set_option("display.max_columns", None)  # show every column of wide frames

In [174]:
# current_palette = sns.color_palette()
# first = current_palette[0]
# second = current_palette[1]

# Helper Functions

In [175]:
def value_counts(series):
    """Summarise the distinct values of a pandas Series.

    Returns a DataFrame indexed by the unique values of ``series`` with three
    columns: ``count`` (occurrences), ``percentage`` (share of the total, as a
    fraction between 0 and 1) and ``cumsum`` (running total of that share).
    """
    absolute = series.value_counts().rename("count")
    share = series.value_counts(normalize=True)
    columns = [
        absolute,
        share.rename("percentage"),
        share.cumsum().rename("cumsum"),
    ]
    return pd.concat(columns, axis=1)

In [176]:
def top_k_claims(df,topic,k=3,asc=False):
    """Return at most ``k`` rows of ``df`` whose ``cluster`` equals ``topic``.

    Rows are ordered by their ``probability`` column, highest first unless
    ``asc`` is true.
    """
    in_topic = df["cluster"] == topic
    ranked = df.loc[in_topic].sort_values(by="probability", ascending=asc)
    return ranked.head(k)

In [177]:
def topic_summary(topic,sample=2,asc=False):
    """Print a topic's size and keywords, then display a few of its claims.

    Works on the notebook-level ``claims`` DataFrame and ``topic_model``.
    ``sample`` is the number of claims shown; ``asc`` is forwarded to
    ``top_k_claims`` to control the ordering by probability.
    """
    n_articles = len(claims[claims.cluster == topic])
    print(f"Topic {topic}: {n_articles} articles")

    keywords = topic_model.get_topic(topic)
    print(keywords)

    examples = top_k_claims(claims, topic, sample, asc)
    display(examples[["country", "claimReviewed_t"]])

In [192]:
def get_config_id(config):
    """Return a hexadecimal identifier derived from a configuration.

    The config is serialised to JSON with sorted keys, so two dicts holding
    the same content yield the same id whatever their key insertion order.
    The id is the MD5 hex digest of that JSON text encoded as UTF-8.
    """
    canonical = json.dumps(config, sort_keys=True).encode("utf-8")
    digest = hashlib.md5(canonical)
    return digest.hexdigest()

# Data


<div class="alert alert-block alert-info">
Download the parsed claims from https://drive.google.com/file/d/1QlCtKtSwdUX6NWOcUXijmxi4JoSdp49O/view?usp=sharing
</div>

In [179]:
# Load the parsed claims file into a DataFrame.
claims = pd.read_json(claims_path)

In [180]:
# The outlet ("media") of a claim is the hostname of its URL.
claims["media"] = claims["url"].map(lambda link: urlparse(link).hostname)

### Filter by language


<div class="alert alert-block alert-info">
Language ISO list from https://publications.europa.eu/code/en/en-5000800.htm
</div>

In [181]:
# Two-letter language codes kept by the language filter.
european_languages = (
    "bg hr cs da nl en et fi fr de el hu "
    "ga it lv lt mt pl pt ro sk sl es sv"
).split()

In [182]:
# Keep only the claims whose language code is in `european_languages`.
claims = claims[claims["languageISO"].isin(european_languages)]

### Filter by continent


<div class="alert alert-block alert-info">
The library geopandas contains a map of countries and continents. See https://geopandas.org/en/stable/ </div>

In [183]:
# Country table ('naturalearth_lowres') shipped with geopandas; its `name`, `iso_a3`
# and `continent` columns are used below.
# NOTE(review): `geopandas.datasets` is deprecated in recent geopandas releases and
# removed in 1.0 — confirm the pinned version, or load the dataset from a local file.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

Some media outlets had no country assigned. We filled it in manually for the outlets with the most fact-checks.

In [184]:
# Manual hostname -> country overrides for outlets whose claims have no country.
# The country names are later resolved by name against the `world` table, so they
# must be spelled the way that table spells them.
media_country = {
    "politica.estadao.com.br": "Brazil",
    "www.bufale.net": "Italy",
    "www.telugupost.com": "India",
    "www.tf1info.fr": "France",
    "projetocomprova.com.br": "Brazil",
    "www.polygraph.info": "United States of America",
    # Was misspelled "Sir Lanka", which can never match a country name.
    "srilanka.factcrescendo.com": "Sri Lanka",
    "www.20minutes.fr": "France",
    "verifica.efe.com": "Spain",
    "defacto-observatoire.fr": "France",
    "dpa-factchecking.com": "Germany",
    "www.e-farsas.com": "Brazil",
    "info-veritas.com": "Spain",
    "factcheckgreek.afp.com": "Greece",
    "www1.folha.uol.com.br":"Brazil",
    "www.rtve.es": "Spain"
}

In [185]:
# Keep a country that is already set; otherwise fall back to the manual
# per-outlet mapping (None when the outlet is not listed there).
claims["country"] = claims.apply(
    lambda row: row["country"] if row["country"] else media_country.get(row["media"]),
    axis=1,
)

In [186]:
# Resolve every country name found in the claims to the `iso_a3` code of the
# matching row in `world`. Names without any match are left out of the mapping.
_world_names = world.name.str.lower()  # lower-cased once instead of twice per country


def _lookup_iso_a3(country):
    """Return the ``iso_a3`` of the ``world`` row matching ``country``, or None.

    An exact, case-insensitive name match wins. Only when there is none does
    the lookup fall back to the first row whose name contains ``country`` as a
    literal substring (needed for abbreviated or longer names in the table).
    Taking the first substring hit unconditionally would let a short name
    resolve to a different country whose name merely contains it.
    """
    needle = country.lower()
    exact = _world_names == needle
    if exact.any():
        return world[exact].iloc[0]["iso_a3"]
    # regex=False: the country name is plain text, not a pattern.
    partial = _world_names.str.contains(needle, regex=False, na=False)
    if partial.any():
        return world[partial].iloc[0]["iso_a3"]
    return None


# Non-string / empty country values (missing data) are skipped rather than looked up.
_iso_by_country = {
    c: _lookup_iso_a3(c)
    for c in claims.country.unique()
    if isinstance(c, str) and c
}
countries = {c: iso for c, iso in _iso_by_country.items() if iso is not None}

In [187]:
# Attach the ISO alpha-3 code of each claim's country; countries that are not a key
# of `countries` end up as missing values.
claims["iso_a3"] = claims.country.map(countries)

In [188]:
# Look up each claim's continent through its ISO code. The lookup dict holds one
# continent per code, so if an `iso_a3` value repeats in `world` the last row wins.
# NOTE(review): confirm the table has no placeholder codes shared by several countries.
claims["continent"] = claims.iso_a3.map(world.set_index("iso_a3").continent.to_dict())

In [19]:
# Keep only claims located in Europe or the Americas; claims without a resolved
# continent are dropped as well.
claims = claims[claims["continent"].isin(["Europe", "North America", "South America"])]

## EDA

In [189]:
# Distribution of claims per language; show the five most frequent.
language_count = value_counts(claims["languageISO"])
language_count.head()

Unnamed: 0,count,percentage,cumsum
en,6849,0.492415,0.492415
pt,2815,0.202387,0.694802
es,1487,0.106909,0.801711
fr,634,0.045582,0.847293
it,629,0.045223,0.892516


# BERTOPIC

## Model using scrape article

In [191]:
# Notebook-level state that infer_topic() declares `global` and updates:
# the text columns the current embeddings were built from, and the embeddings
# themselves (None until they are first computed).
text_columns = ["claimReviewed"]
embeddings = None

In [195]:
def infer_topic(df,config,save_dir = "topic_runs"):
    """Fit a BERTopic model on ``df`` and store the run under ``save_dir``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document. Must contain every column listed in
        ``config["text_columns"]``; their values are joined with ``". "`` to
        form the document text.
    config : dict
        Must provide ``text_columns``, ``embedding_model`` (name passed to
        ``SentenceTransformer``), ``umap_model`` and ``hdbscan_model``
        (keyword arguments for ``UMAP`` and ``HDBSCAN``).
    save_dir : str
        Parent folder of the run. The run itself is written to
        ``<save_dir>/<get_config_id(config)>`` and replaces any earlier run
        with the same id.

    Returns
    -------
    tuple
        ``(topic_model, topics_df, docs)`` where ``topics_df`` has the columns
        ``cluster`` and ``probability`` indexed like ``df``, and ``docs`` is
        the Series of document texts.

    Side effects
    ------------
    * Adds/overwrites the ``cluster`` and ``probability`` columns of ``df``.
    * Updates the notebook-level ``embeddings`` / ``text_columns`` cache.
    * Writes ``config.json``, ``topics_df.json`` and ``model`` to the run folder.
    """
    global embeddings,text_columns

    # Always build the documents from the columns requested by *this* config.
    wanted_columns = list(config["text_columns"])
    docs = df[wanted_columns].apply(lambda x: '. '.join(x.values),axis=1)
    # The models index their input positionally; a plain list avoids any
    # label-based lookup on the Series' index.
    doc_list = docs.tolist()

    # Re-encode when there is no cache yet, or when the cache was built from
    # other text columns, another embedding model, or a different number of
    # documents. Embeddings of unknown origin (no model recorded) are trusted.
    cached_model = getattr(infer_topic, "_embedding_model", None)
    stale = (
        embeddings is None
        or wanted_columns != text_columns
        or cached_model not in (None, config["embedding_model"])
        or len(embeddings) != len(doc_list)
    )
    if stale:
        sentence_model = SentenceTransformer(config["embedding_model"])
        embeddings = sentence_model.encode(doc_list, show_progress_bar=True)
        infer_topic._embedding_model = config["embedding_model"]
    text_columns = wanted_columns

    # Setup run
    # NOTE(review): no random_state is set here; unless config["umap_model"]
    # provides one, two runs of the same config may yield different topics.
    topic_model = BERTopic(
        umap_model=UMAP(**config["umap_model"]),
        hdbscan_model=HDBSCAN(**config["hdbscan_model"]),
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
        verbose=True
    )

    # Infer topic models
    topics, probs = topic_model.fit_transform(doc_list,embeddings)
    # Write the assignments to the frame that was passed in (not to a global),
    # so the function also works for frames other than the notebook's `claims`.
    df["cluster"] = topics
    df["probability"] = probs
    topics_df = pd.DataFrame({
        "cluster": topics,
        "probability": probs
    },index=df.index)

    # Save Results
    save_path = os.path.join(save_dir, get_config_id(config))
    # Overwrite
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    os.makedirs(save_path)
    with open(os.path.join(save_path, "config.json"), "w") as f:
        json.dump(config, f)
    topics_df.to_json(os.path.join(save_path, "topics_df.json"))
    topic_model.save(os.path.join(save_path, "model"))

    return topic_model, topics_df, docs

In [196]:
# Fit the topic model on the filtered claims, then copy each document's topic
# and probability from the returned frame onto `claims`.
topic_model, topics_df, docs = infer_topic(claims,config)
claims["cluster"] = topics_df["cluster"]
claims["probability"] = topics_df["probability"]

Batches:   0%|          | 0/435 [00:00<?, ?it/s]

2023-03-28 12:20:40,545 - BERTopic - Reduced dimensionality
2023-03-28 12:20:41,013 - BERTopic - Clustered reduced embeddings


Query any term to find which topics have it in their documents

In [240]:
query = "zelenski"
# case=False makes the search case-insensitive whatever the capitalisation of
# `query`; na=False counts rows with a missing `claimReviewed_t` as non-matching,
# so the mask is always boolean and safe to index with.
condition = claims.claimReviewed_t.str.contains(query, case=False, na=False)
# Number of matching claims per topic.
value_counts(claims[condition].cluster)

Unnamed: 0,count,percentage,cumsum
140,4,0.333333,0.333333
-1,3,0.25,0.583333
201,2,0.166667,0.75
141,2,0.166667,0.916667
47,1,0.083333,1.0


Summary of a topic

In [241]:
# Show the keywords of topic 140 together with five of its claims.
topic_summary(140, sample=5)

Topic 140: 19 articles
[('volodimir', 0.5128774243662303), ('rueda', 0.5056073638814902), ('zelenski', 0.4762470728256101), ('butt', 0.4671000312853512), ('mientras', 0.4398478776702831), ('abandona', 0.4398478776702831), ('prensa', 0.4398478776702831), ('volodymyr', 0.43082447516251055), ('elnki', 0.40418651582345316), ('ongepast', 0.40418651582345316)]


Unnamed: 0,country,claimReviewed_t
63ab8d3fee23070012696282,France,"Joe Biden ""se fue"" en medio del discurso de Volodymyr Zelensky"
63a9bb45dc86e20012051542,Italy,Biden toca el trasero de Zelensky
63bffb24309cf700130b4b4a,,Joe Biden agarró el trasero de Volodymyr Zelensky durante la visita presidencial
63e0489cd3957e00129e3b45,,Joe Biden tocó a Zelensky de manera inapropiada
63a5f0c68ac0140012b343af,Spain,Joe Biden le toca el culo a Zelenski durante su viaje a Estados Unidos
