# Clustering Movie Reviews Using Sentence-BERT and Nearest Neighbors

In this notebook, we load a dataset of movie reviews, compute sentence embeddings with a pretrained 
Sentence-BERT model, and use a nearest-neighbor search to find semantically similar reviews. 
We then optionally cluster the reviews with DBSCAN and visualize the result with a 2-D PCA projection.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

ModuleNotFoundError: No module named 'sentence_transformers'

## 1. Load the Data

In [1]:
# SECURITY: unpickling can execute arbitrary code, so only load a pickle file
# that comes from a source you trust.
reviews_path = "movie_reviews.pkl"  # Replace with your actual file path
df = pd.read_pickle(reviews_path)

# Extract the review texts as a plain Python list and report how many were loaded.
reviews = df["review"].tolist()
review_total = len(reviews)
print(f"Loaded {review_total} reviews.")

NameError: name 'pd' is not defined

## 2. Compute Sentence Embeddings

In [None]:
# Load the pretrained sentence-embedding model by name, then encode every
# review in one call (a progress bar is shown while encoding).
encoder_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(encoder_name)
embeddings = model.encode(
    reviews,
    show_progress_bar=True,
)
print("Embeddings computed.")

## 3. Find Nearest Neighbors

In [None]:
# Build a cosine-distance nearest-neighbor index over all review embeddings.
n_neighbors = 5
nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
nn_model.fit(embeddings)

# For a sample review (e.g., at index 0), get its neighbors.
sample_idx = 0

# The query vector is itself part of the fitted index, so it comes back as its
# own closest match (distance ~0). Request one extra hit so that n_neighbors
# *other* reviews remain once the sample is dropped, and cap the request at the
# index size because kneighbors rejects n_neighbors larger than the number of
# fitted samples.
n_query = min(n_neighbors + 1, len(embeddings))
raw_distances, raw_indices = nn_model.kneighbors(
    [embeddings[sample_idx]], n_neighbors=n_query
)

# Drop the sample by index rather than by position: an exact duplicate review
# can tie at distance 0 and be ranked ahead of it. Then keep the top n_neighbors.
is_other = raw_indices[0] != sample_idx
distances = raw_distances[:, is_other][:, :n_neighbors]
indices = raw_indices[:, is_other][:, :n_neighbors]

print("Sample review:")
print(reviews[sample_idx])
print("\nNearest neighbors:")
for idx, dist in zip(indices[0], distances[0]):
    print(f"\nIndex: {idx}, Cosine distance: {dist:.4f}")
    print(reviews[idx])

## 4. Optional: Cluster Reviews with DBSCAN

In [None]:
# Density-based clustering of the embeddings under cosine distance.
# DBSCAN assigns the label -1 to points it treats as noise.
dbscan = DBSCAN(
    eps=0.5,
    min_samples=5,
    metric='cosine',
)
cluster_labels = dbscan.fit(embeddings).labels_

# Store one label per review on the frame, then report the size of each cluster.
df["cluster"] = cluster_labels
cluster_sizes = df["cluster"].value_counts()
print("Cluster distribution:", cluster_sizes, sep="\n")

## 5. Visualize Clusters Using PCA

In [None]:
# Project the embeddings onto their first two principal components.
# random_state pins the result for the solvers that use randomness (scikit-learn
# may select a randomized SVD for large inputs), so the figure is reproducible
# across Restart & Run All.
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings)

# The DBSCAN section is optional: color by cluster only if it has been run,
# instead of failing with a KeyError on the missing "cluster" column.
has_clusters = "cluster" in df.columns

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    hue=df["cluster"] if has_clusters else None,
    palette="tab10" if has_clusters else None,
    s=60,
    ax=ax,
)
ax.set_title("Movie Review Clusters (PCA Projection)")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
if has_clusters:
    # Only draw a legend when there are cluster labels to describe.
    ax.legend(title="Cluster", loc="best")
plt.show()