In [2]:
from sentence_transformers import SentenceTransformer
import umap
from sklearn.cluster import HDBSCAN
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

  from .autonotebook import tqdm as notebook_tqdm


## Load text and LLM-generated summaries

In [7]:
# Read the raw documents: one list entry per line of the file, with
# leading/trailing whitespace (including the newline) stripped.
with open('sample.txt', 'r', encoding='utf-8') as raw_fh:
    texts = list(map(str.strip, raw_fh))

# Read the summaries file the same way: one stripped entry per line.
with open('summaries.txt', 'r', encoding='utf-8') as summary_fh:
    summaries = list(map(str.strip, summary_fh))

## Initialise Topic Modelling Workflow

In [8]:
# Sentence-embedding model used to turn each text into a vector.
model = SentenceTransformer('all-MiniLM-L6-v2')

# HDBSCAN clusterer that will be fitted on the embeddings.
clusterer = HDBSCAN(
    metric='euclidean',
    min_cluster_size=2,
)

# UMAP reducer projecting the embeddings onto two components for plotting;
# random_state is pinned to 42.
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=2,
    metric='cosine',
    random_state=42,
)

### Implement on raw text

In [None]:
# Embed, cluster and project the raw texts, then build the scatter trace
# (`scatter_text`) that visualises them in 2D.

# Encode the raw texts into embeddings using the pre-trained SentenceTransformer model
embeddings = model.encode(texts, show_progress_bar=True)

# Perform clustering on the embeddings using HDBSCAN
cluster_labels = clusterer.fit_predict(embeddings)

# Reduce the dimensionality of the embeddings to 2D for visualization using UMAP
embedding_2d = reducer.fit_transform(embeddings)

# Create a DataFrame to store the 2D embeddings, raw texts, and their cluster labels
df_text = pd.DataFrame({
    'x': embedding_2d[:, 0],   # x-coordinate of the 2D embedding
    'y': embedding_2d[:, 1],   # y-coordinate of the 2D embedding
    'text': texts,             # original raw texts
    'cluster': cluster_labels  # cluster labels assigned by HDBSCAN
})

# Create a scatter trace for visualizing the 2D embeddings of the raw texts.
# NOTE: go.Scatter is a trace, not a figure, so it has no `update_layout`
# method (that lives on go.Figure). The label is therefore attached to the
# trace itself through `name`, which is what the legend displays.
scatter_text = go.Scatter(
    x=df_text['x'],            # x-coordinates for the scatter plot
    y=df_text['y'],            # y-coordinates for the scatter plot
    mode='markers',            # draw points only
    name='Raw text',           # trace label
    marker=dict(
        size=6,                      # size of the markers
        color=df_text['cluster'],    # color markers by cluster labels
        colorscale='YlGnBu',         # color scale for the clusters
    ),
    text=df_text['text'],      # text to display on hover
    hoverinfo='text'           # display only that text on hover
)

NameError: name 'texts' is not defined

### Implement on LLM-generated summaries

In [None]:
# Embed, cluster and project the LLM-generated summaries, then build the
# scatter trace (`scatter_sum`) that visualises them in 2D.

# Encode the summaries into embeddings
embeddings = model.encode(summaries, show_progress_bar=True)

# Cluster the summary embeddings with HDBSCAN
cluster_labels = clusterer.fit_predict(embeddings)

# Project the summary embeddings to 2D with UMAP
embedding_2d = reducer.fit_transform(embeddings)

# Store coordinates, the summaries themselves, and their cluster labels.
# The 'text' column holds `summaries` (the data that was actually embedded)
# so that the hover text matches the plotted points.
df_sum = pd.DataFrame({
    'x': embedding_2d[:, 0],
    'y': embedding_2d[:, 1],
    'text': summaries,
    'cluster': cluster_labels
})

# Build the scatter trace. go.Scatter is a trace, not a figure, so it has no
# `update_layout` method (that lives on go.Figure); the label is attached to
# the trace through `name` instead.
scatter_sum = go.Scatter(
    x=df_sum['x'],
    y=df_sum['y'],
    mode='markers',
    name='LLM-Generated Summaries',
    marker=dict(
        size=6,
        color=df_sum['cluster'],   # color markers by cluster labels
        colorscale='YlGnBu',
    ),
    text=df_sum['text'],           # text to display on hover
    hoverinfo='text'
)

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.26it/s]


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Place the two scatter traces side by side for comparison. Panel titles are
# supplied through `subplot_titles` because titles are a figure-level
# property and cannot be carried by the traces themselves.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Raw text', 'LLM-Generated Summaries'),
)

# Left panel: raw texts; right panel: LLM-generated summaries.
fig.add_trace(scatter_text, row=1, col=1)
fig.add_trace(scatter_sum, row=1, col=2)

# Leave the figure as the cell's last expression so the notebook renders it.
fig