In [2]:
import pandas as pd
import datasets
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
datasets.logging.set_verbosity_error()

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aryap\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aryap\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

LOAD

In [3]:
DATASET = 'McAuley-Lab/Amazon-Reviews-2023'
SELECTED_CATEGORY = 'Video_Games'

dataset = datasets.load_dataset(DATASET, 'raw_review_'+SELECTED_CATEGORY, trust_remote_code=True)
raw_df = dataset['full'].to_pandas()

In [4]:
raw_df.rename(columns={'parent_asin': 'product_id'}, inplace=True)

# Remove duplicates (same as original)
raw_df.drop_duplicates(['user_id', 'product_id'], inplace=True)

Enforce Minimum Interaction Threshold

In [5]:
def filter_data(df, min_interactions=5):
    while True:
        user_counts = df['user_id'].value_counts()
        df = df[df['user_id'].isin(user_counts[user_counts >= min_interactions].index)]

        item_counts = df['product_id'].value_counts()
        df = df[df['product_id'].isin(item_counts[item_counts >= min_interactions].index)]

        # Check if filtering is complete
        if (user_counts[df['user_id'].unique()].min() >= min_interactions and 
            item_counts[df['product_id'].unique()].min() >= min_interactions):
            break
    return df.reset_index(drop=True)

filtered_df = filter_data(raw_df, min_interactions=5)

Numerical User and Product ID

In [6]:
user_ids = filtered_df['user_id'].unique()
encode_user_id = {id: i for i, id in enumerate(user_ids)}
decode_user_id = {i: id for i, id in enumerate(user_ids)}
filtered_df['user_id_numeric'] = filtered_df['user_id'].map(encode_user_id)

item_ids = filtered_df['product_id'].unique()
encode_item_id = {id: i for i, id in enumerate(item_ids)}
decode_item_id = {i: id for i, id in enumerate(item_ids)}
filtered_df['product_id_numeric'] = filtered_df['product_id'].map(encode_item_id)

# Convert the ratings to float
filtered_df['rating'] = filtered_df['rating'].astype(float)

# Create a simplified dataset for TF-IDF clustering
tfidf_dataset = filtered_df[['user_id_numeric', 'product_id_numeric', 
                            'user_id', 'product_id',
                            'title', 'text', 'rating', 
                            'timestamp', 'helpful_vote']]

Text Cleaning Functions

In [7]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and extra whitespace
    text = re.sub(r'[^\w\s]', ' ', text)  # Replace special chars with space
    text = re.sub(r'\d+', ' ', text)      # Replace numbers with space
    text = re.sub(r'\s+', ' ', text)      # Replace multiple spaces with single space
    text = text.strip()                   # Remove leading/trailing whitespace
    
    return text

def tokenize_and_remove_stopwords(text):
    if not text:
        return ""
    
    # Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Apply stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    return ' '.join(tokens)

Apply Text Cleaning

In [8]:
tfidf_dataset['title_clean'] = tfidf_dataset['title'].fillna('').apply(clean_text)
tfidf_dataset['text_clean'] = tfidf_dataset['text'].fillna('').apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tfidf_dataset['title_clean'] = tfidf_dataset['title'].fillna('').apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tfidf_dataset['text_clean'] = tfidf_dataset['text'].fillna('').apply(clean_text)


In [9]:
tfidf_dataset['combined_text'] = (tfidf_dataset['title_clean'] + ' ' + 
                                 tfidf_dataset['text_clean'])
tfidf_dataset['processed_text'] = tfidf_dataset['combined_text'].apply(tokenize_and_remove_stopwords)

Print Data Stats

In [10]:
num_users = user_ids.shape[0]
num_items = item_ids.shape[0]
num_reviews = filtered_df.shape[0]

print(f'Number of users: {num_users}')
print(f'Number of items: {num_items}')
print(f'Number of reviews: {num_reviews}')
print(f'Avg reviews per user: {num_reviews/num_users:.2f}')
print(f'Avg reviews per item: {num_reviews/num_items:.2f}')

Number of users: 113741
Number of items: 28113
Number of reviews: 893040
Avg reviews per user: 7.85
Avg reviews per item: 31.77


Save

In [11]:
tfidf_dataset.to_parquet('amazon_reviews_for_tfidf.parquet')