# In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objs as go

# Example corpus: six short sentences that are vectorized as Bag-of-Words below.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    "The cat is playing in the garden.",
    "Dogs are barking in the street."
]

# Step 1: Convert the corpus to a dense Bag-of-Words count matrix
# (one row per document, one column per vocabulary term).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()

# Step 2: Perform KMeans clustering (you can change n_clusters as needed).
# n_init is passed explicitly because its default changed between
# scikit-learn releases; pinning it keeps the cluster assignments the same
# across versions for the fixed random_state and avoids the FutureWarning
# that some releases emit when it is left unset.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

# Step 3: Reduce the BoW features to 3 principal components.
# This projection is used for plotting only; the clustering above was fitted
# on the full BoW matrix, not on the PCA output.
# NOTE: n_components=3 needs at least 3 documents and 3 vocabulary terms.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

# Step 4: Interactive 3D scatter plot using Plotly
fig = go.Figure()

# Add one trace per cluster so each cluster gets its own legend entry.
for cluster in np.unique(labels):
    # Row indices of the documents in this cluster, computed once and reused
    # for both the coordinates and the hover text so they stay aligned.
    member_idx = np.flatnonzero(labels == cluster)
    cluster_points = X_pca[member_idx]
    fig.add_trace(go.Scatter3d(
        x=cluster_points[:, 0],
        y=cluster_points[:, 1],
        z=cluster_points[:, 2],
        mode='markers',
        marker=dict(size=8),
        name=f"Cluster {cluster}",
        text=[corpus[i] for i in member_idx],  # Original sentence shown on hover
    ))

# Customize plot layout: figure title, axis titles for the 3D scene, legend on.
fig.update_layout(
    title="3D Clustering of Bag-of-Words",
    scene=dict(
        xaxis_title='PCA Component 1',
        yaxis_title='PCA Component 2',
        zaxis_title='PCA Component 3'
    ),
    showlegend=True
)

# Show plot
fig.show()
