# Building a topic/facet model using transformers

In [None]:
import os
import sys
from importlib import reload
from multiprocessing import cpu_count
import joblib

import numpy as np
import pandas as pd
import umap
import umap.plot
import matplotlib.pyplot as plt
import matplotlib
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Root of the local project checkout. It is put on sys.path so that the
# `experiments` package imported right after this resolves.
PROJECT_PATH = "/home/romain/projects/socsemics"
# Guard the append: re-running this cell must not keep adding duplicate
# entries to sys.path.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

from experiments.scripts.topic_modelling.st_tm.st_tm import STTopicModel
import experiments.scripts.topic_modelling.st_tm.helpers as helpers

In [None]:
# Locations of the experiment artefacts, all rooted at PROJECT_PATH/experiments.
_EXPERIMENTS_DIR = os.path.join(PROJECT_PATH, "experiments")
DATA_DIR = os.path.join(_EXPERIMENTS_DIR, "data")
MODELS_DIR = os.path.join(_EXPERIMENTS_DIR, "models")
# Directory used for the topic-model artefacts (e.g. the cached 2D UMAP reducer).
TM_DIR = os.path.join(MODELS_DIR, "topic_modelling", "st_tm")

In [None]:
# Seed passed as `random_state` to topic extraction, facet extraction and the 2D UMAP projection.
RANDOM_STATE = 42

## Import and preprocess data

In [None]:
# Corpus handed to the topic model: a handful of short messages in several
# languages, containing @user mentions, hashtags and truncated links.
docs = [
    "نوال الزغبي (الشاب خالد ليس عالمي) هههههههه أتفرجي على ها الفيديو يا مبتدئة http vía @user",
    "Trying to have a conversation with my dad about vegetarianism is the most pointless infuriating thing ever #caveman ",
    """Royal: le président n'aime pas les pauvres? "c'est n'importe quoi" http …""",
    "@user korrekt! Verstehe sowas nicht...",
    "CONGRESS na ye party kabhi bani hoti na india ka partition hota nd na hi humari country itni khokhli hoti   @ ",
    "@user @user Ma Ferrero? il compagno Ferrero? ma il suo partito esiste ancora? allora stiamo proprio frecati !!!",
    "todos os meus favoritos na prova de eliminação #MasterChefBR",
    "@user jajajaja dale, hacete la boluda vos jajaja igual a vos nunca se te puede tomar en serio te mando un abrazo desde Perú!"
]

## Embed corpus

In [None]:
# Identifier of the embedding model handed to STTopicModel.
MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"

# Build the topic model over the corpus; no explicit document ids are supplied.
model = STTopicModel(
    embedding_model=MODEL_NAME,
    documents=docs,
    document_ids=None,
    verbose=True,
)

In [None]:
# Embed the corpus using the "max" pooling method. The return value is bound to
# a throwaway name (presumably to keep it out of the cell output — confirm).
__ = model.embed_corpus(pooling_method="max")

## Train topic/facet model

In [None]:
# Extract topics from the embedded corpus; the result is discarded and the
# notebook reads the outcome from attributes of `model` afterwards.
# NOTE(review): n_neighbors=50 and min_topic_size=500 look tuned for a corpus
# far larger than the sample above — confirm before running on it.
_ = model.topic_extraction(
    n_components=5,
    n_neighbors=50,
    min_topic_size=500,
    min_samples=15,
    n_words=30,
    random_state=RANDOM_STATE,
)

In [None]:
# Extract facets with the same seed as the topic extraction; the result is
# discarded and later cells read `model.topic_facets`.
_ = model.facet_extraction(
    n_components=5,
    n_neighbors=15,
    min_facet_size=10,
    min_samples=10,
    n_words=30,
    random_state=RANDOM_STATE,
)

In [None]:
# Persist the fitted topic/facet model.
# PATH_MODEL was not defined anywhere in this notebook, so the save raised a
# NameError; define it under the topic-modelling directory.
# NOTE(review): the file name "st_tm_model" is a placeholder — adjust it to the
# project's naming convention and to whatever format model.save expects.
PATH_MODEL = os.path.join(TM_DIR, "st_tm_model")
# Make sure the target directory exists before writing.
os.makedirs(TM_DIR, exist_ok=True)
model.save(PATH_MODEL)

## Topic exploration

In [None]:
# Distinct topic labels assigned to the documents, with the number of documents per label.
topics, sizes = np.unique(model.doc_topic, return_counts=True)

sizes_str = " ".join(map(str, sizes))
print(f"Number of topics  : {len(topics)}")
print()
print(f"Sizes : {sizes_str}")

### Clustering visualization

In [None]:
# 2D UMAP reducer fitted on the document vectors, cached on disk with joblib.
# The cache file name depends only on the n_neighbors value used for topic extraction.
# NOTE(review): the cache key ignores the corpus and the embedding model, so
# delete the cached file after changing either, or a stale projection is reused.
N_NEIGHBORS = model.topic_extraction_parameters["n_neighbors"]
UMAP_2D_NAME = f"umap_2d_{N_NEIGHBORS}_neighbors"
PATH_UMAP_2D = os.path.join(TM_DIR, UMAP_2D_NAME)

if os.path.exists(PATH_UMAP_2D):
    # joblib.load unpickles the file: only load caches written by this notebook.
    umap_model_2d = joblib.load(PATH_UMAP_2D)
else:
    # Create the cache directory up front so that the fit below is not wasted
    # on a dump that fails because the directory is missing.
    os.makedirs(TM_DIR, exist_ok=True)
    umap_model_2d = umap.UMAP(n_neighbors=N_NEIGHBORS,
                              n_components=2,
                              min_dist=0,
                              metric='cosine',
                              low_memory=True,
                              random_state=RANDOM_STATE).fit(model.document_vectors)
    joblib.dump(umap_model_2d, PATH_UMAP_2D)

In [None]:
# Topics passed to the plot. None is passed as-is (presumably meaning "all
# topics" — confirm in helpers.plot_topics); a list such as [1, 9, 27, 43]
# can be used instead.
topics_subset = None

axs = helpers.plot_topics(
    model,
    topics_subset=topics_subset,
    mark_noisy=False,
    umap_model_2d=umap_model_2d,
)

### Topic characterization

In [None]:
# Print the words of every topic, one topic per paragraph.
for topic, top_words in enumerate(model.topic_words):
    joined_words = " ".join(top_words)
    print(f"Topic {topic} : {joined_words}")
    print()

### Explore a particular topic

In [None]:
# Print the documents that helpers.get_topic_docs returns for one topic,
# each followed by its similarity score.
topic = 1

# Bound to `topic_docs` rather than `docs`, so the corpus list the model was
# built from is not overwritten when this cell runs.
topic_docs, sims, docs_ids = helpers.get_topic_docs(model, topic)
t_words_str = " ".join(model.topic_words[topic][:10])

print(f"Topic {topic} : {t_words_str}")
print()
for doc, sim in zip(topic_docs, sims):
    print(f"{doc} (sim={sim:.2f})")
    print()

### Topic similarity

In [None]:
# Iterate over the (topic, topic, similarity) triples returned by
# helpers.most_similar_topics. `method` selects the topic representation:
#   "tv"     : embedding space topic vectors
#   "tf_idf" : tf-idf topic vectors
top_top_sims = helpers.most_similar_topics(model, method="tv")

for first, second, score in top_top_sims:
    first_words = " ".join(model.topic_words[first][:10])
    second_words = " ".join(model.topic_words[second][:10])
    print(f"Topic {first} : {first_words}")
    print(f"Topic {second} : {second_words}")
    print(f"sim : {score:.2f}")
    print()

## Facet exploration

### Characterization of a topic's facets

In [None]:
# For one topic, print each facet's first ten words, its size and the first
# document returned by helpers.get_topic_docs for that facet.
topic = 2
t_words_str = " ".join(model.topic_words[topic][:10])

print(f"Topic {topic} : {t_words_str}")
print()

facets_of_topic = model.topic_facets[topic]
for facet in facets_of_topic.keys():
    facet_info = facets_of_topic[facet]
    f_words_str = " ".join(facet_info["words"][:10])
    size = facet_info["size"]
    # First element of the first return value (the documents).
    facet_docs_found = helpers.get_topic_docs(model, topic, facet)[0]
    f_rep_claim = facet_docs_found[0]
    print(f"Facet {facet} ({size} docs) : {f_words_str}")
    print(f"Representative claim : {f_rep_claim}")
    print()

### Explore a particular facet

In [None]:
# Print the documents that helpers.get_topic_docs returns for one facet of a
# topic, each followed by its similarity score.
topic = 1
facet = 33

# Bound to `facet_docs` rather than `docs`, so the corpus list the model was
# built from is not overwritten when this cell runs.
facet_docs, sims, docs_ids = helpers.get_topic_docs(model, topic, facet)
t_words_str = " ".join(model.topic_words[topic][:10])
f_size = model.topic_facets[topic][facet]["size"]
f_words_str = " ".join(model.topic_facets[topic][facet]["words"][:10])

print(f"Topic {topic} : {t_words_str}")
print(f"Facet {facet} ({f_size} docs) : {f_words_str}")
print()
for doc, sim in zip(facet_docs, sims):
    print(f"{doc} (sim={sim:.2f})")
    print()

### Visualization of a topic's facets

In [None]:
# Reload the helpers module so that edits made to it on disk are picked up
# without restarting the kernel.
helpers = reload(helpers)
# Facet plot for topic 25, in interactive mode.
axs = helpers.plot_facets(model, topic=25, mark_noisy=False, interactive=True)

### Distribution of facets per topic

In [None]:
# Bar chart of the number of facets recorded for each topic label.
facet_counts = [len(model.topic_facets[t].keys()) for t in topics]
n_facets_per_topic = np.array(facet_counts)

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(topics, n_facets_per_topic)
# White, enlarged tick labels on both axes.
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, colors='white', labelsize=15)

In [None]:
# Topics with fewer than 10 facets: print their words and highlight their
# documents on the 2D projection.
# Select the topic ids themselves with a boolean mask. np.where(mask)[0] would
# return positions inside `topics`, which only coincide with the ids when the
# ids are exactly 0..n-1.
topics_few_facets = topics[n_facets_per_topic < 10]

for topic in topics_few_facets:
    print(f"Topic {topic} : {' '.join(model.topic_words[topic])}")
    print()

# Documents of the selected topics keep their topic label; all others get -1.
doc_topic = np.asarray(model.doc_topic)
labels = np.where(np.isin(doc_topic, topics_few_facets), doc_topic, -1)
axs = umap.plot.points(umap_model_2d, labels=labels, background='black',
                       height=1200, width=1200, show_legend=True)

## Document pairs sampling for fine-tuned facet detection

In [None]:
# Settings for sampling document pairs: a fraction applied to each topic's
# size, and the similarity quantiles handed to the sampler.
quantiles_sim = [0.2, 0.4, 0.6, 0.8, 0.99]
pct_per_topic = 0.05

# One integer per quantile: 0 .. len(quantiles_sim) - 1.
digits_q = [*range(len(quantiles_sim))]
# Rounded product of the fraction and the topic sizes.
n_to_sample = (pct_per_topic * model.topic_sizes).round()

dict_samples = helpers.sample_doc_pairs(model, pct_per_topic, quantiles_sim)