In [None]:
# Install the libraries needed to run this notebook on Colab
!pip install pandas numpy scikit-learn kaggle --quiet


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/antoniosstsip/massive-datasets-project/blob/main/project1.ipynb)



In [None]:
##  Kaggle Dataset Download (Not Executed Locally)
⚠️ Due to SSL library limitations on the local environment (macOS with LibreSSL),
the dataset was manually downloaded from Kaggle and extracted into the working directory.

However, for completeness and reproducibility, the following cell includes
the correct code to download the dataset programmatically via the Kaggle API
when executed on environments like Google Colab or Linux.


In [None]:
# Kaggle API setup --- NOT RUN 
import os

os.environ['KAGGLE_USERNAME'] = "xxxxxx"
os.environ['KAGGLE_KEY'] = "xxxxxx"

# Replace with correct dataset ID if needed
!kaggle datasets download -d rajeevw/amazon-books-reviews

# Unzip (standard location)
import zipfile

with zipfile.ZipFile("amazon-books-reviews.zip", 'r') as zip_ref:
    zip_ref.extractall("amazon_books")


In [None]:
## Extract the manually downloaded Kaggle archive (local run)
import os
import zipfile


# Location of the downloaded archive
dataset_zip = "/Users/antonistsipoulakos/Downloads/archive.zip"

# Destination folder for the extracted files
extract_dir = "/Users/antonistsipoulakos/Desktop/project1/amazon_books"

# Report a missing archive first; otherwise unpack everything into extract_dir.
if not os.path.exists(dataset_zip):
    print("zip file not found.check path and try again.")
else:
    with zipfile.ZipFile(dataset_zip, 'r') as archive:
        archive.extractall(extract_dir)
    print("✅ Extraction completed")


✅ Extraction completed


In [None]:
##Load and preview the dataset (Books_rating.csv)

import pandas as pd

csv_path = "/Users/antonistsipoulakos/Desktop/project1/amazon_books/Books_rating.csv"

# Read only the review text column, drop rows without text,
# and renumber the remaining rows from 0.
df = (
    pd.read_csv(csv_path, usecols=["review/text"])
    .dropna(subset=["review/text"])
    .reset_index(drop=True)
)

print(f"{len(df)} reviews loaded.")
df.head(3)



2999992 reviews loaded.


Unnamed: 0,review/text
0,This is only for Julie Strain fans. It's a col...
1,I don't care much for Dr. Seuss but after read...
2,"If people become the books they read and if ""t..."


In [None]:
## Preprocessing: clean and tokenize reviews

import re

# cleaning function
def preprocess(text):
    """Turn a review into a set of lowercase word tokens.

    The value is converted with ``str``, lowercased, stripped of every
    character that is neither a word character nor whitespace, and then
    split on whitespace.
    """
    lowered = str(text).lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    tokens = without_punctuation.split()
    return set(tokens)

# Sample processing: use the first 500 reviews unless subsampling is disabled.
USE_SUBSAMPLE = True
# Clamp to the number of loaded reviews so N never exceeds the rows actually
# available; otherwise code that indexes processed_reviews[0..N-1] would fail
# and the message below would report a wrong count.
N = min(500, len(df)) if USE_SUBSAMPLE else len(df)

processed_reviews = df["review/text"][:N].apply(preprocess)

print(f" {N} reviews processed .")
processed_reviews.head(3)


 500 reviews processed .


0    {pages, fans, collection, find, theres, isnt, ...
1    {oh, yeats, industry, hesitate, seussgeisel, c...
2    {an, prof, find, via, cubist, avant, verse, is...
Name: review/text, dtype: object

In [None]:
## Compute Jaccard similarity between pairs of reviews


def jaccard_similarity(set1, set2):
    """Return the size of the intersection divided by the size of the union.

    Returns 0.0 when either set is empty.
    """
    if set1 and set2:
        shared = len(set1 & set2)
        total = len(set1 | set2)
        return shared / total
    return 0.0

# Brute force: score every unordered pair (i, j) with i < j and keep the
# pairs whose Jaccard similarity reaches the threshold.
threshold = 0.5  # minimum similarity for a pair to be kept; adjustable
similar_pairs = []

index_pairs = ((i, j) for i in range(N) for j in range(i + 1, N))
for i, j in index_pairs:
    sim = jaccard_similarity(processed_reviews[i], processed_reviews[j])
    if sim >= threshold:
        similar_pairs.append((i, j, sim))

print(f" {len(similar_pairs)} pairs found with similarity ≥ {threshold}")


 7 pairs found with similarity ≥ 0.5


In [None]:
##  Display similar review pairs

# Print the first five matching pairs, each with a 300-character excerpt
# of both reviews.
review_texts = df["review/text"]
top_pairs = similar_pairs[:5]
for i, j, sim in top_pairs:
    print(f"\nPair {i}-{j} (Similarity: {sim:.2f})")
    print("📘 Review 1:", review_texts[i][:300], "...")
    print("📘 Review 2:", review_texts[j][:300], "...")



🔗 Pair 162-164 (Similarity: 0.99)
📘 Review 1: Kurt Seligmann, Surrealist artist par excellence, admitted &amp; unashamed bibliophile, has ravaged his occult library in a miraculous marriage giving birth to this classic historical account of Magic and Occultism; entirely written for the proverbial 'man about the street', and a very cosmic avenue ...
📘 Review 2: Kurt Seligmann, Surrealist artist par excellence, admitted &amp; unashamed bibliophile, has ravaged his occult library in a miraculous marriage giving birth to this classic historical account of Magic and Occultism; entirely written for the proverbial 'man about the street', and a very cosmic avenue ...

🔗 Pair 198-201 (Similarity: 0.65)
📘 Review 1: Dr Baker explains clearly and engagingly how one can improve one's life by changing your subconscious pattern through the spiritual technique called treatment. The essence of treatment is this: When the conscious mind of the individual deliberately selects a creative idea and deliver

In [None]:
## Subsampling: control execution time and scalability

To ensure scalability and fast execution, especially in environments like Google Colab, we use a global flag `USE_SUBSAMPLE`.
If `True`, only the first `SAMPLE_SIZE` reviews of the dataset are used (default: 100 reviews).  
If `False`, the entire dataset is processed.

This allows us to demonstrate correctness on small samples while ensuring that the code can scale up easily.


In [7]:
# Global toggle for subsampling
import re

USE_SUBSAMPLE = True     # Set to False to process the full dataset
SAMPLE_SIZE = 100        # Number of reviews to use when subsampling

# Select the data size based on the toggle. The sample size is clamped to the
# number of loaded reviews so N never exceeds the rows actually available;
# otherwise indexing processed_reviews[0..N-1] later would fail and the
# message below would report a wrong count.
N = min(SAMPLE_SIZE, len(df)) if USE_SUBSAMPLE else len(df)

# Show what is being processed
print(f"Processing {'a sample of' if USE_SUBSAMPLE else 'all'} {N} reviews.")




# Preprocessing: lowercase, strip punctuation, split into unique words
def preprocess(text):
    """Return the set of distinct lowercase words found in ``text``.

    ``text`` is passed through ``str`` first; characters that are neither
    word characters nor whitespace are removed before splitting.
    """
    cleaned = re.sub(r"[^\w\s]", "", str(text).lower())
    return {word for word in cleaned.split()}

# Tokenize the first N reviews with preprocess
selected_reviews = df["review/text"][:N]
processed_reviews = selected_reviews.apply(preprocess)
print(f"✅ Preprocessed {N} reviews.")



# Jaccard similarity of two sets
def jaccard_similarity(set1, set2):
    """Return len(set1 & set2) / len(set1 | set2); 0.0 if either set is empty."""
    either_empty = not set1 or not set2
    return 0.0 if either_empty else len(set1 & set2) / len(set1 | set2)

# Score every unordered pair (i, j) with i < j among the first N reviews and
# keep the pairs whose similarity reaches the threshold.
threshold = 0.5
similar_pairs = []

index_pairs = ((i, j) for i in range(N) for j in range(i + 1, N))
for i, j in index_pairs:
    sim = jaccard_similarity(processed_reviews[i], processed_reviews[j])
    if sim >= threshold:
        similar_pairs.append((i, j, sim))

print(f"✅ Found {len(similar_pairs)} pairs with similarity ≥ {threshold}")






Processing a sample of 100 reviews.
✅ Preprocessed 100 reviews.
✅ Found 0 pairs with similarity ≥ 0.5


In [9]:
# Jaccard similarity computed "from scratch"
def jaccard_similarity(set1, set2):
    """Calculate the Jaccard similarity between two sets of words.

    Returns 0.0 when the union of the two sets is empty.
    """
    combined = set1 | set2
    if len(combined) == 0:
        return 0.0
    common = set1 & set2
    return len(common) / len(combined)

# === calculate similarities for all reviews ===

import pandas as pd
from itertools import combinations



# Use only the first 100 reviews
SAMPLE_SIZE = 100
reviews = df["review/text"].iloc[:SAMPLE_SIZE].tolist()

# Convert each review to a set of lowercase, whitespace-separated tokens.
# NOTE(review): unlike preprocess(), punctuation is not stripped here, so
# "book." and "book" count as different tokens -- confirm this is intended.
tokenized_reviews = [set(str(review).lower().split()) for review in reviews]

# Score every unordered pair of reviews as ((i, j), similarity)
similarities = [
    ((i, j), jaccard_similarity(tokenized_reviews[i], tokenized_reviews[j]))
    for i, j in combinations(range(len(tokenized_reviews)), 2)
]

# Sort by score, descending
similarities.sort(key=lambda entry: entry[1], reverse=True)

# Print the five most similar pairs with a 200-character excerpt of each review
print("\n🔝 Top 5 Similar Reviews:")
for idx, ((i, j), score) in enumerate(similarities[:5], start=1):
    print(f"\n#{idx} - Similarity: {score:.3f}")
    print(f"Review {i}: {reviews[i][:200]}...")
    print(f"Review {j}: {reviews[j][:200]}...")


🔝 Top 5 Similar Reviews:

#1 - Similarity: 0.247
Review 21: I loved Whisper of the wicked saints. The story was amazing and I was pleasantly surprised at the changes in the book. I am not normaly someone who is into romance novels, but the world was raving abo...
Review 31: I happen to love romance novels, but only if they are goos romance. I am not one who loves everything, but this made my heart rejoice. I absolutlely could not put it down. Wow what a book !! You have ...

#2 - Similarity: 0.230
Review 18: I read the review directly under mine and I have to say I laughed. How can someone write a honest review on a book they read only three pages of? That was funny but also sad that people are mean enoug...
Review 21: I loved Whisper of the wicked saints. The story was amazing and I was pleasantly surprised at the changes in the book. I am not normaly someone who is into romance novels, but the world was raving abo...

#3 - Similarity: 0.225
Review 50: This is a very useful and thoro