# Topic modeling à partir des données de Reddit

In [59]:
import os
import json
import re
import time
import csv
from datetime import datetime
import requests

import numpy as np
import pandas as pd
import umap
import hdbscan
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine as cosine_distance
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex
from bokeh.layouts import row, column, layout
from bokeh.models import (
    ColumnDataSource, HoverTool, Range1d, Title, TableColumn, DataTable,
    StringFormatter, CustomJS, Div, Button
    )
from bokeh.models.widgets import Tabs, Panel
from bokeh.plotting import figure, curdoc, show
from bokeh.io import output_notebook

## Récupération des données

### Configuration pour requêter l'API

In [53]:
# Placeholder value: replace "changeMe" with a real API access token before
# running the data retrieval cells.
access_token = "changeMe"

In [54]:
def get_batch(sub, after, before, token):
    """Fetch one batch of submissions from the Pushshift search API.

    Args:
        sub: Subreddit name, sent as the ``subreddit`` query parameter.
        after: Lower bound of the time window, sent as ``after``.
        before: Upper bound of the time window, sent as ``before``.
        token: Access token, sent as ``access_token``.

    Returns:
        The value stored under the ``"data"`` key of the decoded JSON body.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the decoded body has no ``"data"`` key.
    """
    params = {
        "subreddit": sub,
        "access_token": token,
        "after": after,
        "before": before,
        "size": 500,
    }
    # Let requests build the query string so that every value is URL-encoded
    # (a token or subreddit containing "&", "#" or spaces would otherwise
    # corrupt the request).
    response = requests.get(
        "https://api.pushshift.io/reddit/search/submission/", params=params)
    # Decode with json.loads so callers can keep catching json.JSONDecodeError.
    return json.loads(response.text)['data']

### Extraction et nettoyage du texte

In [55]:
def convert_datetime(dto):
    """Return ``(timestamp, yyyymmdd)`` for a Python datetime object.

    The first element is the POSIX timestamp of ``dto`` truncated to an
    integer; the second packs the calendar date into a single integer of the
    form YYYYMMDD (e.g. 20150101).
    """
    yyyymmdd = (dto.year * 100 + dto.month) * 100 + dto.day
    epoch_seconds = int(dto.timestamp())
    return epoch_seconds, yyyymmdd

def preprocess_self_text(text):
    """Strip recurring noise from a submission self-text.

    Pipes are replaced by spaces, the footnote introduced by a run of
    underscores is dropped, the "[deleted]" / "[removed]" placeholders are
    blanked out, and surrounding whitespace is trimmed.
    """
    # A literal pipe would clash with the CSV separator.
    cleaned = text.replace("|", " ")
    # Footnote: five underscores, whitespace, then an HTML-escaped ">" and the
    # rest of that line.
    cleaned = re.sub(r"(_____\s+&gt.+)", " ", cleaned)
    for placeholder in ("[deleted]", "[removed]"):
        cleaned = cleaned.replace(placeholder, " ")
    return cleaned.strip()

def extract_info(subm):
    """Extract the relevant fields from the data dictionary of a submission.

    Only submissions whose title contains a ``CMV:`` / ``cmv:`` marker
    followed by some non-blank text are kept.

    Args:
        subm: Dictionary describing one submission. The keys ``title``,
            ``id``, ``author``, ``created_utc``, ``num_comments`` and
            ``score`` are read; ``selftext`` is optional.

    Returns:
        A tuple ``(id, title, author, created_utc, num_comments, score,
        self_text)``, or ``None`` when the title has no usable CMV statement.
        The title is stripped and always ends with a period.
    """
    regexp_match = re.search(r"(CMV|cmv):\s?(.+)", subm['title'])
    if regexp_match is None:
        return None

    # Extract proper title
    title_sub = regexp_match.group(2).strip()
    if not title_sub:
        # The marker was followed by whitespace only (e.g. "CMV: "): there is
        # no statement to keep, and indexing the last character would fail.
        return None
    if not title_sub.endswith("."):
        title_sub += "."

    # A missing, None or empty self-text all yield an empty string.
    raw_self_text = subm.get("selftext")
    self_text = preprocess_self_text(raw_self_text) if raw_self_text else ""

    return (subm["id"], title_sub, subm["author"], subm["created_utc"],
            subm["num_comments"], subm["score"], self_text)

### Récupération des documents

In [56]:
def get_all_titles(sub, start_timestamp, end_timestamp, token, path_save,
                   max_retries=1, retry_delay=5):
    """Extract all titles from a sub between two dates.

    Batches are fetched with ``get_batch`` one after the other, each new
    request starting at the ``created_utc`` of the last submission of the
    previous batch, until an empty batch is returned.

    Args:
        sub: Subreddit name.
        start_timestamp: Start of the time window, passed to ``get_batch``.
        end_timestamp: End of the time window, passed to ``get_batch``.
        token: API access token.
        path_save: Accepted for interface compatibility; this function does
            not write anything to disk.
        max_retries: Number of extra attempts made when a batch response
            cannot be decoded as JSON.
        retry_delay: Seconds to sleep before each extra attempt.

    Returns:
        A DataFrame with one row per kept submission. If a batch still fails
        after the retries, collection stops and the rows gathered so far are
        returned.
    """
    # Column order follows the tuple returned by extract_info.
    columns = ["id", "title", "author", "timestamp", "nb_comments", "score",
               "self_text"]

    data = get_batch(sub, start_timestamp, end_timestamp, token)
    batch_data = []
    while data:
        # Get current batch timerange
        batch_start = str(datetime.fromtimestamp(data[0]['created_utc']))
        batch_end = str(datetime.fromtimestamp(data[-1]['created_utc']))

        # Extract data from current batch
        for submission in data:
            subm_data = extract_info(submission)
            if subm_data is not None:
                batch_data.append(subm_data)
        new_start_timestamp = data[-1]['created_utc']  # Start date of next batch

        print(f"Batch processed : {batch_start} - {batch_end}. "
              f"Next batch start timestamp : {new_start_timestamp}.")

        # Get next batch. `data` is reset first so that a definitive failure
        # ends the loop instead of re-processing the batch handled above.
        data = None
        for attempt in range(max_retries + 1):
            if attempt:
                # Retry after sleep time
                time.sleep(retry_delay)
            try:
                data = get_batch(sub, new_start_timestamp, end_timestamp, token)
            except json.JSONDecodeError:
                continue
            break
        else:
            print(f"Batch failed after retry : {batch_start} - {batch_end}.")

    df_titles = pd.DataFrame(data=batch_data, columns=columns)

    return df_titles

In [None]:
# Subreddit and time window of the corpus.
SUB = "changemyview"
START_DATE = datetime(2015, 1, 1)
END_DATE = datetime(2020, 12, 1)

# The API is queried with timestamps, not datetime objects.
START_DATE_TMSTP, START_DATE_INT = convert_datetime(START_DATE)
END_DATE_TMSTP, END_DATE_INT = convert_datetime(END_DATE)

# get_all_titles does not write the file itself: the export is done in the
# next cell, using the same path.
df_titles = get_all_titles(sub=SUB,
                           start_timestamp=START_DATE_TMSTP,
                           end_timestamp=END_DATE_TMSTP,
                           token=access_token,
                           path_save="data/titles_cmv_2015_2020.csv"
                           )

### Export des données

In [None]:
# "|" is used as separator; pipes are removed from self-texts during cleaning.
df_titles.to_csv("data/titles_cmv_2015_2020.csv", sep="|", index=False)

In [61]:
# Reload the exported corpus and preview its first ten titles.
df_titles = pd.read_csv(
    "data/titles_cmv_2015_2020.csv",
    sep="|",
)
print(df_titles["title"].head(10).values)

['Serial killers cannot change and cannot be rehabilitated.'
 'Serial killers cannot be rehabilitated.'
 'The minimum wage should not be raised.'
 'The most financially successful film released in 2014 was likely Planes: Fire and Rescue.'
 'Copyright law should include a "continued use" provision.'
 'Mother fatally shot by concealed gun from 2 year old son is at fault for her incident.'
 'I believe the police abuse people.'
 'UN Peacekeepers should have the power to engage in offensive operations.'
 "Timed essays like the SAT don't properly measure one's writing skills, nor their critical thinking skills."
 "I believe in a supernatural force and don't believe in the Big Bang, evolution, or gravity."]


## Modélisation : topic modeling

![proj-illustration](img/doc_word_embedding.svg)

source : https://github.com/ddangelov/Top2Vec

### Projection dans un espace sémantique (Sentence Transformer)

In [None]:
def embed_corpus(documents, embedding_model):
    """Compute document embeddings using a sentence-transformer model.

    Args:
        documents: Documents to embed, passed as-is to the model's ``encode``.
        embedding_model: Model name or path given to ``SentenceTransformer``.

    Returns:
        The document vectors returned by ``encode``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(embedding_model)
    return model.encode(documents)

### Réduction de la dimension (UMAP)

In [None]:
def compute_umap(document_vectors, n_components, n_neighbors,
                random_state=None):
    """Fit a UMAP model on the document vectors and return the fitted model.

    The model uses the cosine metric and a ``min_dist`` of 0. The fitted
    estimator itself is returned, not the low-dimensional coordinates.
    """
    settings = {
        "n_neighbors": n_neighbors,
        "n_components": n_components,
        "min_dist": 0,  # Pack points as densely as possible
        "metric": 'cosine',
        "low_memory": True,
        "random_state": random_state,
    }
    reducer = umap.UMAP(**settings)
    reducer.fit(document_vectors)
    return reducer

### Clustering par densité (HDBSCAN)

In [None]:
def compute_hdbscan(document_vectors, min_cluster_size, min_samples,
                    cluster_selection_method, prediction_data=False):
    """Fit an HDBSCAN density-based clustering and return the fitted model.

    When ``min_samples`` is None it falls back to ``min_cluster_size``.
    Distances are Euclidean.
    """
    effective_min_samples = (min_cluster_size if min_samples is None
                             else min_samples)

    settings = {
        "min_cluster_size": min_cluster_size,
        "min_samples": effective_min_samples,
        "metric": 'euclidean',
        "cluster_selection_method": cluster_selection_method,
        "prediction_data": prediction_data,
    }
    clusterer = hdbscan.HDBSCAN(**settings)
    clusterer.fit(document_vectors)
    return clusterer

### Caractérisation des topics (Class TF-IDF)

In [None]:
def topic_characterization(corpus, doc_topic, subset_docs=None, n_words=30,
                           token_pattern=r"(?u)\b\w\w+\b"):
    """Characterize each topic by the top n words with the highest tf-idf score.

    A tf-idf model is fitted on the whole corpus. For every topic, all of its
    documents are joined into a single "meta-document" which is then
    transformed with that model.

    Args:
        corpus: Sequence of document strings.
        doc_topic: Topic label of each document, aligned with ``corpus``.
        subset_docs: Accepted for interface compatibility; not used here.
        n_words: Number of top-scoring words kept per topic.
        token_pattern: Regular expression defining a token, forwarded to
            ``TfidfVectorizer``. The default is scikit-learn's own default.

    Returns:
        A tuple ``(topic_rep_words, topic_vectors_tfidf)`` of arrays with one
        row per distinct label of ``doc_topic``, in ``np.unique`` order: the
        top words of the topic and its tf-idf vector.
    """
    # Array inputs are required for the boolean mask and fancy indexing below.
    corpus = np.asarray(corpus)
    doc_topic = np.asarray(doc_topic)

    # Compute tf-idf matrix on the relevant corpus
    vectorizer = TfidfVectorizer(strip_accents="ascii",
                                 lowercase=True,
                                 token_pattern=token_pattern,
                                 stop_words="english"
                                 )
    vectorizer.fit(corpus)
    # get_feature_names() no longer exists in recent scikit-learn releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = np.asarray(vectorizer.get_feature_names_out())
    else:
        words = np.asarray(vectorizer.get_feature_names())

    topic_rep_words = []
    topic_vectors_tfidf = []
    for topic in np.unique(doc_topic):
        # Join all documents of the topic as a single string
        doc_idxs = np.where(doc_topic == topic)[0]
        big_doc_topic = " ".join(corpus[doc_idxs])
        # Compute tf-idf embedding of this meta-document
        topic_tfidf = vectorizer.transform([big_doc_topic]).toarray()[0]
        topic_vectors_tfidf.append(topic_tfidf)
        # Word indices sorted by decreasing tf-idf score
        top_scores = np.flip(np.argsort(topic_tfidf))
        topic_rep_words.append(words[top_scores][:n_words])

    topic_rep_words = np.array(topic_rep_words)
    topic_vectors_tfidf = np.array(topic_vectors_tfidf)

    return topic_rep_words, topic_vectors_tfidf

## Sélection de modèles et évaluation

In [None]:
# Grid search over the UMAP / HDBSCAN hyper-parameters: every configuration is
# scored against reference labels with four clustering agreement metrics.
# NOTE(review): `true_topics`, `model`, `min_samples` and `RANDOM_STATE` are
# not defined in the visible cells; confirm they are set before running this.
y_true = true_topics

n_comp_range = [3, 5, 7]
n_neigh_range = [30, 50, 70]
min_size_range = [10, 15, 20]
method_range = ["eom", "leaf"]

# Parallel lists: entry i of each score list belongs to configs[i].
configs = []
adj_rands = []
amis = []
v_measures = []
fm_scores = []
for n_comp in n_comp_range:
    for n_neigh in n_neigh_range:
        for min_size in min_size_range:
            for method in method_range:
                _ = model.topic_extraction(n_components=n_comp,
                                           n_neighbors=n_neigh,
                                           min_size=min_size,
                                           min_samples=min_samples,
                                           cluster_selection_method=method,
                                           random_state=RANDOM_STATE
                                           )
                # Predicted topic of each document, in the iteration order of
                # model.doc_topic_facet.
                y_pred = np.array([model.doc_topic_facet[idx]["topic"]
                                   for idx in model.doc_topic_facet])
                adj_rand = metrics.adjusted_rand_score(y_true, y_pred)
                ami = metrics.adjusted_mutual_info_score(y_true, y_pred)
                v_measure = metrics.v_measure_score(y_true, y_pred)
                fm_score = metrics.fowlkes_mallows_score(y_true, y_pred)

                configs.append((n_comp, n_neigh, min_size, method))
                adj_rands.append(adj_rand)
                amis.append(ami)
                v_measures.append(v_measure)
                fm_scores.append(fm_score)

# For each metric, report the first configuration reaching the maximum score.
print(f"Best config with Adjusted Rand : {configs[adj_rands.index(max(adj_rands))]}")
print(f"Best config with AMI : {configs[amis.index(max(amis))]}")
print(f"Best config with V-measure : {configs[v_measures.index(max(v_measures))]}")
print(f"Best config with FM-score : {configs[fm_scores.index(max(fm_scores))]}")

## Visualisation

In [83]:
def map_themes(themes_dict):
    """Return the theme of every topic, in ascending topic order.

    ``themes_dict`` maps a theme name to the list of topics it groups. The
    topics it lists, once sorted, must be exactly the distinct values of the
    module-level ``app_input_data["doc_topic"]``.

    Raises:
        ValueError: If the listed topics do not match the topics of the data.
    """
    topics = list(np.unique(app_input_data["doc_topic"]))
    listed_topics = sorted(
        topic for group in themes_dict.values() for topic in group)

    # Validate mapping from theme to topics
    if topics != listed_topics:
        raise ValueError("Invalid mapping from theme to topics provided.")

    # Invert the mapping: topic -> theme
    theme_of = {topic: theme
                for theme, group in themes_dict.items()
                for topic in group}
    return [theme_of[topic] for topic in topics]

def build_compute_topic_map(app_input_data, theme_to_topics=None):
    """Build the interactive Bokeh figure showing the topics as polygons.

    The figure displays a background image behind one polygon per topic, with
    a hover tooltip giving the topic label, its size and its top words.
    Display settings are read from the module-level constants ``MARGIN_L``,
    ``MARGIN_R``, ``MARGIN_T``, ``MARGIN_B``, ``BG_URL``, ``PALETTE`` and
    ``N_WORDS_TM_TOOLTIPS``.

    Args:
        app_input_data: Mapping providing ``umap_2d_embeddings``,
            ``doc_topic``, ``topic_polygons_xs``, ``topic_polygons_ys``,
            ``topic_words`` and ``topic_sizes``.
        theme_to_topics: Optional mapping from theme name to list of topics.
            When given, topics are colored by theme and a legend is shown;
            otherwise all topics share one color and the legend is hidden.

    Returns:
        A tuple ``(plot, t_colors)``: the figure and the color of each topic.
    """
    topics = list(np.unique(app_input_data["doc_topic"]))

    # Create interactive figure
    x_coords = app_input_data["umap_2d_embeddings"][:, 0]
    y_coords = app_input_data["umap_2d_embeddings"][:, 1]
    x_min, x_max = min(x_coords), max(x_coords)
    y_min, y_max = min(y_coords), max(y_coords)
    x_margin_l, x_margin_r = MARGIN_L*(x_max - x_min), MARGIN_R*(x_max - x_min)
    # NOTE(review): the vertical margins are scaled by the x extent, not the
    # y extent. Kept as-is because the background image may have been rendered
    # with the same bounds; confirm whether (y_max - y_min) was intended.
    y_margin_t, y_margin_b = MARGIN_T*(x_max - x_min), MARGIN_B*(x_max - x_min)
    plot = figure(
        plot_width=600, plot_height=600,
        x_range=Range1d(start=x_min-x_margin_l, end=x_max+x_margin_r),
        y_range=Range1d(start=y_min-y_margin_b, end=y_max+y_margin_t),
        x_axis_location=None, y_axis_location=None,
        tools="pan,wheel_zoom,tap", toolbar_location=None,
        active_drag="pan", active_scroll="wheel_zoom"
    )
    plot.grid.grid_line_color = None

    # Set background image (scatter plot of all documents)
    # (using bokeh's Scatter slows down the app excessively)
    plot.image_url(url=[BG_URL],
                   x=x_min-x_margin_l, y=y_min-y_margin_b,
                   w=x_max-x_min+x_margin_l+x_margin_r,
                   h=y_max-y_min+y_margin_b+y_margin_t,
                   anchor="bottom_left")

    # Color topics
    if theme_to_topics is None:
        t_colors = ["#FFDECB" for _ in topics]
        topic_to_theme = t_colors   # Hack, legend will be hidden anyway
    else:
        # NOTE(review): map_themes is not given app_input_data; confirm it
        # validates the mapping against the same data as this function.
        topic_to_theme = map_themes(theme_to_topics)
        themes = np.unique(topic_to_theme)
        themes_to_int = {theme: i for i, theme in enumerate(themes)}
        palette = plt.get_cmap(PALETTE)(np.linspace(0, 1, len(themes)))
        t_colors = [to_hex(palette[themes_to_int[theme]])
                    for theme in topic_to_theme]

    # Plot topic glyphs. The data source is created here: no `topic_glyphs`
    # object exists before this point.
    topic_glyphs = ColumnDataSource(data=dict(
        x=app_input_data["topic_polygons_xs"],
        y=app_input_data["topic_polygons_ys"],
        topic=topics,
        theme=topic_to_theme,
        color=t_colors,
        words=[" ".join(list(w)[:N_WORDS_TM_TOOLTIPS])
               for w in app_input_data["topic_words"]],
        size=np.sqrt(app_input_data["topic_sizes"]) / 1.3,
        size_str=[f"{s} documents" for s in app_input_data["topic_sizes"]]
        ))

    plot.multi_polygons(name="topics", source=topic_glyphs, xs="x", ys="y",
                        color="color", fill_alpha=0.5, line_alpha=0.3,
                        hover_color="white", hover_alpha=1,
                        selection_fill_alpha=0.7, selection_line_color="white",
                        selection_line_width=2, selection_line_alpha=0.6,
                        nonselection_fill_alpha=0.5, nonselection_line_alpha=0.3,
                        legend_field="theme")
    plot.legend.background_fill_alpha = 0.8
    if theme_to_topics is None:
        plot.legend.visible = False

    # Topic hovering
    hover = HoverTool(names=["topics"])
    hover.tooltips = """
    <font size="+1">
    <strong>Topic</strong>: @topic <br>
    <strong>Size</strong>: @size_str <br>
    <strong>Words</strong>: @words
    </font>
    """
    hover.point_policy = "follow_mouse"
    plot.add_tools(hover)

    return plot, t_colors

In [84]:
# Import input data
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load "input_data.pickle" when it comes from a trusted source.
app_input_data = joblib.load("input_data.pickle")
BG_URL = "https://minio.lab.sspcloud.fr/avouacr/diffusion/mise-en-prod/points.png"
# Explicit encoding so that theme names are decoded identically on every
# platform instead of depending on the locale default.
with open("viz_app/themes2topics.json", "r", encoding="utf-8") as json_file:
    theme_dict = json.load(json_file)

# Display parameters
PALETTE = "tab20"
MARGIN_L, MARGIN_R, MARGIN_B, MARGIN_T = 0.025, 0.15, 0.025, 0.05

# Topic map
N_WORDS_TM_TOOLTIPS = 15
# Only the figure is kept; the per-topic colors are discarded.
topic_map, __ = build_compute_topic_map(app_input_data, theme_dict)

In [85]:
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [86]:
# Display the topic map figure built in the previous cells.
show(topic_map)

Questions : 

- comment s'assurer que mon projet va correctement fonctionner sur d'autres environnements d'exécution ?
- comment déployer mon app sur le web, et automatiser cette procédure ?
- comment récupérer les nouveaux documents à échéance régulière ?
- comment permettre à des utilisateurs de requêter mon modèle ?