# In [3]:
import pandas as pd
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
import plotly.express as px

# In [4]:
# Pipeline: embed each allegation description with a sentence-BERT model,
# project the embeddings to 2-D with UMAP, cluster the 2-D projection with
# HDBSCAN, then draw an interactive scatter plot coloured by cluster.

TEXT_COLUMN = 'allegation_desc'
# UMAP's optimisation is stochastic; a fixed seed makes the layout (and the
# clusters derived from it) repeatable between runs.
# NOTE(review): per the umap-learn docs a fixed seed trades away some
# parallelism -- drop it if speed matters more than reproducibility.
RANDOM_STATE = 42

df = pd.read_csv("../data/input/brady_2016_2023.csv")

# Empty CSV cells are loaded by pandas as float NaN, but the encoder expects
# strings. Replace missing values with an empty string and coerce to str.
# Rows are kept (not dropped) so embeddings stay aligned with df row-for-row.
texts = df[TEXT_COLUMN].fillna('').astype(str).tolist()

model = SentenceTransformer('bert-base-nli-mean-tokens')

# Generate BERT embeddings for the 'allegation_desc' column
embeddings = model.encode(texts)

# Perform dimensionality reduction using UMAP (cosine distance on the
# sentence embeddings, 2 output components for plotting)
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    metric='cosine',
    random_state=RANDOM_STATE,
)
embedding = reducer.fit_transform(embeddings)

# Perform clustering using HDBSCAN on the 2-D UMAP coordinates
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=10,
    max_cluster_size=30,
    metric='euclidean',
    cluster_selection_method='eom',
)
cluster_labels = clusterer.fit_predict(embedding)

# Add cluster labels and embedding coordinates to the DataFrame
df['cluster'] = cluster_labels
df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

# Cluster ids are categories, not quantities (HDBSCAN uses -1 for points it
# treats as noise). Plot from a copy whose labels are strings so plotly draws
# a discrete legend instead of a continuous colour bar; df['cluster'] itself
# stays integer for any later numeric use.
plot_df = df.assign(cluster=df['cluster'].astype(str))
cluster_order = [str(label) for label in sorted(df['cluster'].unique())]

fig = px.scatter(plot_df, x='x', y='y', color='cluster', hover_data=['allegation_desc'],
                 category_orders={'cluster': cluster_order},
                 title='UMAP Projection with HDBSCAN Clustering (BERT Embeddings)')
fig.show()

# [cell output, truncated]   _torch_pytree._register_pytree_node(
