In [4]:
# SECTION 1: Imports & Setup
import os
import pandas as pd
import re
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from datasketch import MinHash, MinHashLSH

# Kaggle API credentials (disabled: kept inside a string literal; fill in and run them as code before the first download)
'''
os.environ['KAGGLE_USERNAME'] = "xxxx"
os.environ['KAGGLE_KEY'] = "xxxx"
'''

# To download dataset via Kaggle API on first run
# !kaggle datasets download -d mohamedbakhet/amazon-books-reviews --unzip

# Number of reviews kept after loading. Modify this to test scalability
# (e.g., 1000, 5000, 10000, 50000, etc.).
SUBSAMPLE_SIZE = 5000

# SECTION 2: Data Loading and Preprocessing

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Shared helpers used by preprocess(): a Porter stemmer and the English
# stop-word list held as a set for fast membership tests.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Load the reviews, drop rows that have no review text, renumber the index
# from zero, then keep only the first SUBSAMPLE_SIZE rows.
# The CSV is read from the working directory; adjust the path if it lives
# somewhere else.
books = (
    pd.read_csv("Books_rating.csv")
    .dropna(subset=['review/text'])
    .reset_index(drop=True)
    .head(SUBSAMPLE_SIZE)
)

# Text preprocessing to tokenize and normalize
def preprocess(text):
    """Turn a review string into a set of stemmed, non-stop-word tokens.

    The text is lower-cased and split into runs of word characters. Tokens
    found in the module-level ``stop_words`` set are dropped (the check is
    made on the raw token, before stemming); the rest are reduced with the
    module-level ``stemmer``. Duplicates collapse because a set is returned.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    stems = set()
    for word in words:
        if word in stop_words:
            continue
        stems.add(stemmer.stem(word))
    return stems
# Tokenize every review: the new 'tokens' column holds the set returned by
# preprocess() for that row's 'review/text'.
books['tokens'] = books['review/text'].apply(preprocess)

# SECTION 3: MinHash Signature Generation
# --------------------------------------------
def create_minhash(tokens, num_perm=128):
    """Build a MinHash sketch from an iterable of string tokens.

    Args:
        tokens: Iterable of ``str``; each one is UTF-8 encoded before being
            fed to the sketch.
        num_perm: Number of permutations passed to ``MinHash``.

    Returns:
        The ``MinHash`` object updated with every token.
    """
    signature = MinHash(num_perm=num_perm)
    encoded_tokens = (tok.encode('utf8') for tok in tokens)
    for data in encoded_tokens:
        signature.update(data)
    return signature

# Generating MinHash signatures: one MinHash per review, built from its token
# set with create_minhash's default num_perm.
books['minhash'] = books['tokens'].apply(create_minhash)

# SECTION 4: LSH Indexing and Similarity Search

# Jaccard cut-off shared by the LSH index and the final filter, so the two
# values cannot drift apart when one of them is tuned.
JACCARD_THRESHOLD = 0.5

# Initialize LSH with the chosen Jaccard threshold. num_perm has to match the
# value the signatures were created with (create_minhash's default, 128).
lsh = MinHashLSH(threshold=JACCARD_THRESHOLD, num_perm=128)

# Pull both columns into plain lists once: list indexing inside the pair loop
# is much cheaper than repeated label lookups on a pandas Series/DataFrame.
# List positions equal the index labels because the index was reset to 0..n-1.
minhashes = books['minhash'].tolist()
review_texts = books['review/text'].tolist()

# Index all documents into the LSH
for i, m in enumerate(minhashes):
    lsh.insert(f"doc_{i}", m)

# Find similar pairs with approximate Jaccard similarity.
# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for measuring an elapsed interval. Only the query phase is timed.
similar_pairs = []
start_time = time.perf_counter()

for i, m in enumerate(minhashes):
    for key in lsh.query(m):
        j = int(key.split('_')[1])
        # Requiring i < j drops the self-match and reports every unordered
        # pair exactly once (each i is queried a single time), so no extra
        # "already seen" bookkeeping is needed.
        if i >= j:
            continue
        # LSH only yields candidates; confirm with the MinHash estimate.
        sim = m.jaccard(minhashes[j])
        if sim > JACCARD_THRESHOLD:
            similar_pairs.append({
                'index_1': i,
                'index_2': j,
                'similarity': sim,
                # Truncate the texts so the printed preview stays readable.
                'review_1': review_texts[i][:300],
                'review_2': review_texts[j][:300]
            })

elapsed = time.perf_counter() - start_time

# SECTION 5: Results Output
# --------------------------------------------
# Summary: number of pairs found and how long the search took.
print(f"\n✅ Found {len(similar_pairs)} similar review pairs (Jaccard > 0.5)")
print(f"⏱️ Time taken: {elapsed:.2f} seconds")

# Preview at most the first three pairs, each followed by a separator line.
preview = similar_pairs[:3]
for i, pair in enumerate(preview):
    print(f"\n--- Pair {i+1} ---")
    print(f"Similarity: {pair['similarity']:.2f}")
    for label, field in (("Review 1:", 'review_1'), ("Review 2:", 'review_2')):
        print(label, pair[field])
    print("-" * 80)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/azatbekovna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



✅ Found 3207 similar review pairs (Jaccard > 0.5)
⏱️ Time taken: 1.05 seconds

--- Pair 1 ---
Similarity: 0.99
Review 1: Kurt Seligmann, Surrealist artist par excellence, admitted &amp; unashamed bibliophile, has ravaged his occult library in a miraculous marriage giving birth to this classic historical account of Magic and Occultism; entirely written for the proverbial 'man about the street', and a very cosmic avenue
Review 2: Kurt Seligmann, Surrealist artist par excellence, admitted &amp; unashamed bibliophile, has ravaged his occult library in a miraculous marriage giving birth to this classic historical account of Magic and Occultism; entirely written for the proverbial 'man about the street', and a very cosmic avenue
--------------------------------------------------------------------------------

--- Pair 2 ---
Similarity: 0.61
Review 1: Dr Baker explains clearly and engagingly how one can improve one's life by changing your subconscious pattern through the spiritual technique 