In [1]:
# Set up imports and load the AI-focused dataset used to compare PCA, t-SNE and UMAP
import sys
from pathlib import Path

# Make project-local packages importable: the project root is taken to be the
# parent of the current working directory and is placed first on sys.path
# unless it is already listed there.
root_dir = Path().resolve().parent
if sys.path.count(str(root_dir)) == 0:
    sys.path.insert(0, str(root_dir))

# Imports standards
from sentence_transformers import SentenceTransformer
from utils.helper_functions import clean_text
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings
import umap
import time
import os

# Silence every library warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Read the semicolon-separated source file and keep only the text column.
file_path = root_dir / Path("data/importation-635-focus-AI.csv")
df = pd.read_csv(file_path, sep=";")[["sentences"]]

# Run the project's cleaning helper on every sentence before embedding.
texts = df["sentences"].apply(clean_text).to_list()

  from .autonotebook import tqdm as notebook_tqdm


# Context

This dataset was extracted from another dataset collected from the Twitter/X platform as part of a study aimed at analyzing trends at the intersection of **AI and climate**. The goal is to gain deeper insights into the specific themes and narratives emerging from posts that relate to both domains.

The data was retrieved using the **official X API**, ensuring compliance with platform constraints and metadata integrity.

In summary, this is a **real-world, multilingual, and noisy dataset**, making it a valuable benchmark to demonstrate the **robustness of our clustering and automatic annotation pipeline**.

## Objective

The goal here is **not to develop a new NLP method or model**, but rather to **justify the choice of UMAP** over other dimensionality reduction techniques within our pipeline.

To achieve this, we will follow the steps below:

1. Compute **sentence embeddings** using the `"all-mpnet-base-v2"` model, then deduplicate the dataset on the resulting embeddings.
2. Perform **dimensionality reduction** using three methods: **PCA (ACP)**, **t-SNE**, and **UMAP**.
3. **Visualize, describe, and compare** the results from each method to highlight the advantages of UMAP in capturing meaningful structure.

> **Remark:**  
> We have dedicated an entire notebook to demonstrate the importance of **data deduplication** — not only for reducing computational complexity, but also for **improving model performance**.  
> Refer to that notebook for a more detailed analysis and empirical evidence supporting this claim.



# Vectorization


In [2]:
# Sentence-embedding model used for the whole comparison.
model_embedding = "all-mpnet-base-v2"
# No device is hard-coded: SentenceTransformer picks a GPU when one is
# available and falls back to CPU otherwise, so this cell also runs on
# machines without CUDA.
model = SentenceTransformer(model_embedding)

# Encode the cleaned texts (one embedding row per text).
embeddings = model.encode(texts, show_progress_bar=True, batch_size=256)
assert len(embeddings) == len(df), "expected exactly one embedding per row of df"

shape_before_deduplication = len(df)

# Deduplicate on the embedding vectors, keeping the first occurrence.
# Tuples are used as comparison keys because the list values stored in the
# "embeddings" column are unhashable.
embedding_keys = pd.Series(
    [tuple(vector) for vector in embeddings.tolist()], index=df.index
)
is_first_occurrence = ~embedding_keys.duplicated(keep="first")

# .copy() yields an independent frame, so the columns added here and in later
# cells are not written onto a slice of the loaded data.
df = df.loc[is_first_occurrence].copy()
df["embeddings"] = embeddings[is_first_occurrence.to_numpy()].tolist()
embeddings = np.vstack(df["embeddings"].to_numpy())

# Share of rows removed; guarded so an empty input does not divide by zero.
removed_rows = shape_before_deduplication - len(df)
removed_pct = (
    round(removed_rows * 100 / shape_before_deduplication, 2)
    if shape_before_deduplication
    else 0.0
)
print("We have deduplicated the embeddings, reducing the dataset from", shape_before_deduplication, "to", len(df), "that means we have removed", removed_pct, "% of initial data.")

Batches: 100%|██████████| 8/8 [00:05<00:00,  1.59it/s]

We have deduplicated the embeddings, reducing the dataset from 1853 to 1217 that means we have removed 34.32 % of initial data.





# Dimensionality Reduction
## UMAP

In [3]:
# Project the embeddings to 2D with UMAP and time the operation.
# fit_transform is used instead of fit() followed by transform() on the same
# data, so the measured work matches what the PCA and t-SNE cells time.
# perf_counter is a monotonic clock intended for measuring durations.
start = time.perf_counter()
reducer = umap.UMAP(n_components=2, random_state=123)
reduced_embeddings = reducer.fit_transform(embeddings)
end = time.perf_counter()
umap_time = end - start
print(f"UMAP took {umap_time:.2f} seconds to reduce the embeddings to 2D.")

# Print the shape of the reduced embeddings as a sanity check
print(f"Reduced embeddings shape: {reduced_embeddings.shape}")

# Store the two UMAP coordinates next to the texts for plotting
df['x_umap'] = reduced_embeddings[:, 0]
df['y_umap'] = reduced_embeddings[:, 1]

UMAP took 8.12 seconds to reduce the embeddings to 2D.
Reduced embeddings shape: (1217, 2)


## ACP

In [None]:
# 1. Fit PCA and project the embeddings to 2D, timing the operation.
# random_state is fixed so repeated runs give the same projection when
# scikit-learn selects a randomized SVD solver.
start = time.perf_counter()
pca = PCA(n_components=2, random_state=123)
reduced_embeddings_pca = pca.fit_transform(embeddings)
end = time.perf_counter()
acp_time = end - start
print(f"PCA took {acp_time:.2f} seconds to reduce the embeddings to 2D.")
# Print the shape of the projected data as a sanity check
print(f"Shape des embeddings réduits par ACP : {reduced_embeddings_pca.shape}")

# 2. Store the first two principal components in the DataFrame
df['x_pca'] = reduced_embeddings_pca[:, 0]
df['y_pca'] = reduced_embeddings_pca[:, 1]

PCA took 0.03 seconds to reduce the embeddings to 2D.
Shape des embeddings réduits par ACP : (1217, 2)


## t-SNE

In [5]:
# 1. Fit t-SNE and project the embeddings to 2D, timing the operation.
# t-SNE is stochastic; random_state is fixed so the projection is reproducible
# across runs (the UMAP cell uses the same seed).
start = time.perf_counter()
tsne = TSNE(n_components=2, random_state=123)
reduced_embeddings_tsne = tsne.fit_transform(embeddings)
end = time.perf_counter()
tsne_time = end - start
print(f"t-SNE took {tsne_time:.2f} seconds to reduce the embeddings to 2D.")
print(f"Shape des embeddings réduits par t-SNE : {reduced_embeddings_tsne.shape}")

# 2. Store the two t-SNE coordinates in the DataFrame
df['tsne_x'] = reduced_embeddings_tsne[:, 0]
df['tsne_y'] = reduced_embeddings_tsne[:, 1]


t-SNE took 4.51 seconds to reduce the embeddings to 2D.
Shape des embeddings réduits par t-SNE : (1217, 2)


# Comparative Analysis of Dimensionality Reduction Methods

In [7]:
# Side-by-side comparison of the three 2D projections.
#
# Layout: a 2x3 grid with PCA at (1,1), t-SNE at (1,3) and UMAP centred at
# (2,2). The remaining cells are declared as None in `specs`, so no axes are
# created for them and nothing has to be hidden afterwards.
panels = [
    # (label, x column, y column, grid row, grid col, elapsed seconds)
    ("PCA", "x_pca", "y_pca", 1, 1, acp_time),
    ("t‑SNE", "tsne_x", "tsne_y", 1, 3, tsne_time),
    ("UMAP", "x_umap", "y_umap", 2, 2, umap_time),
]

# 1) Build the grid. Titles are listed in the same order as the populated
#    cells. Plotly renders "<br>" (not "\n") as a line break in titles.
fig = make_subplots(
    rows=2, cols=3,
    specs=[
        [{}, None, {}],
        [None, {}, None]
    ],
    subplot_titles=[
        f"{label} Projection<br>(Default settings – {elapsed:.2f}s)"
        for label, _, _, _, _, elapsed in panels
    ],
    row_heights=[2, 2],
    column_widths=[1.5/3, 1.5/3, 1.5/3],
    horizontal_spacing=0,
    vertical_spacing=0.05
)

# 2) One scatter trace per method, with axis titles set on the same cell the
#    trace is drawn in.
for label, x_col, y_col, row, col, _ in panels:
    fig.add_trace(
        go.Scatter(x=df[x_col], y=df[y_col], mode='markers', marker=dict(opacity=0.6)),
        row=row, col=col
    )
    fig.update_xaxes(title_text=f"{label} Dim 1", row=row, col=col)
    fig.update_yaxes(title_text=f"{label} Dim 2", row=row, col=col)

# 3) Global layout: 3 columns x 400px wide, 2 rows x 400px high
fig.update_layout(
    width=1200,
    height=800,
    margin=dict(l=20, r=50, t=50, b=50),
    showlegend=False
)

fig.show()


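*(Dataset: 1,217 texts; CPU execution times. Timings vary between runs: the values in the table below come from a different run than the cell outputs recorded above, which show PCA 0.03 s, t-SNE 4.51 s and UMAP 8.12 s. The first UMAP call in a fresh kernel typically includes Numba JIT compilation, which inflates its measured time.)*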

| Method   | Time (≈) | Algorithmic Complexity                   | Structure Preserved      | Scalability                  |
|:--------:|:--------:|:----------------------------------------:|:------------------------:|:----------------------------:|
| **PCA**  | 0.02 s   | O(min(n, d)²·max(n, d))                  | Global (linear)          | Excellent                    |
| **t-SNE**| 2.09 s   | O(n²)                                    | Local                    | Limited                      |
| **UMAP** | 0.70 s   | O(n log n)            | Local & global           | Good (≈ O(n log n))          |


## Why the time differences?

1. **PCA (≈ 0.02 s)**  
   - **Principle**: Eigen-decomposition of the covariance matrix.  

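2. **t-SNE (≈ 2.09 s)**  
   - **Principle**: Compute pairwise similarities between points (O(n²) for the exact algorithm; scikit-learn's default Barnes–Hut approximation reduces this to roughly O(n log n)), then optimize the embedding by gradient descent. The cost can become prohibitive once n exceeds a few thousand points.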

3. **UMAP (≈ 0.70 s)**  
   - **Principle**:  
     - Build a nearest-neighbor graph (≈ O(n log n)) via NN-descent.  
     - Optimize embedding with SGD (linear in n for fixed iterations).  
   - **Outcome**: A balanced trade-off between speed and quality (local and global structure). In the figure above, it produces the most clearly separated clusters of the three methods.


## Why choose UMAP in the topic modeling pipeline?

UMAP is a leading choice in topic modeling pipelines because it combines **performance**, **scalability**, and **structural quality**. It is significantly faster than t-SNE and separates clusters more effectively than both PCA and t-SNE. Its key strength lies in its ability to preserve both **local** structure (creating dense, coherent groups) and **global** structure (maintaining the relationships between clusters), which is crucial for a faithful data representation. The method is also highly scalable, handling thousands to over a million documents, especially when accelerated on GPUs with RAPIDS AI to avoid costly quadratic complexity. When applied before a clustering step such as HDBSCAN, UMAP significantly improves the semantic coherence of topics by providing a richer, better-structured data representation.

But the algorithm's output is **sensitive to its hyperparameters**, such as `n_neighbors` and `min_dist`, requiring careful tuning. Additionally, while fast, its runtime can still range from seconds to minutes on very large datasets, a practical limitation to keep in mind.

> **Note**:  
> With **RAPIDS AI** and GPUs, even hundreds of thousands of texts are no longer a bottleneck.  
> Be careful: the quality of the result depends not only on the dimensionality reduction method, but also on the embedding model.

See More: ["UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction"](https://arxiv.org/pdf/1802.03426)
