# Processing Raw Review Data for Collaborative Filtering

**Author**: Stella Zarei <br>
**Created**: 2025/03/02

In [56]:
import pandas as pd

import datasets
datasets.logging.set_verbosity_error()

In [57]:
# Where the raw reviews come from: the Hub repository and the product category.
DATASET = 'McAuley-Lab/Amazon-Reviews-2023'
SELECTED_CATEGORY = 'Video_Games'

# The configuration name is 'raw_review_' followed by the selected category.
# NOTE(review): trust_remote_code=True presumably allows `datasets` to execute
# the loading script shipped with the dataset repository -- confirm the source
# is trusted before running.
dataset = datasets.load_dataset(
    path=DATASET,
    name=f'raw_review_{SELECTED_CATEGORY}',
    trust_remote_code=True,
)

# Convert the 'full' split into a pandas DataFrame for the steps below.
raw_df = dataset['full'].to_pandas()

## Pre-processing

In [58]:
# Clean the raw frame in a single chained expression instead of mutating it
# in place. This keeps the cell re-runnable: on a second run the columns to
# drop are already gone (hence errors='ignore'), while rename and
# drop_duplicates simply have nothing left to do.
raw_df = (
    raw_df
    # Remove unwanted columns
    .drop(columns=['images', 'asin', 'verified_purchase'], errors='ignore')
    # Rename columns
    .rename(columns={'parent_asin': 'product_id'})
    # Remove duplicates: keep the first row seen for each (user, product) pair
    .drop_duplicates(['user_id', 'product_id'])
)

print(raw_df.head())

   rating                                              title  \
0     4.0                     It’s pretty sexual. Not my fav   
1     5.0                                   Good. A bit slow   
2     5.0  ... an order for my kids & they have really en...   
3     5.0                        Great alt to pro controller   
4     5.0                                      solid product   

                                                text  product_id  \
0  I’m playing on ps5 and it’s interesting.  It’s...  B07DK1H3H5   
1  Nostalgic fun.  A bit slow.  I hope they don’t...  B07SRWRH5D   
2  This was an order for my kids & they have real...  B07MFMFW34   
3  These work great, They use batteries which is ...  B0BCHWZX95   
4  I would recommend to anyone looking to add jus...  B00HUWA45W   

                        user_id      timestamp  helpful_vote  
0  AGCI7FAH4GL5FI65HYLKWTMFZ2CQ  1608186804795             0  
1  AGCI7FAH4GL5FI65HYLKWTMFZ2CQ  1587051114941             1  
2  AGXVBIUFLFGMVL

## Enforce Minimum Interaction Threshold

In [59]:
def filter_data(df, min_interactions=5):
    """Iteratively drop users and products with too few interactions.

    Removing sparse users can push a product below the threshold (and the
    other way round), so both filters are re-applied until a complete pass
    removes no rows.

    Args:
        df: DataFrame with at least the columns ``user_id`` and
            ``product_id``; each row counts as one interaction.
        min_interactions: Minimum number of rows that every remaining user
            and every remaining product must have.

    Returns:
        A new DataFrame with its index reset to 0..n-1 in which every
        ``user_id`` and every ``product_id`` occurs at least
        ``min_interactions`` times. The frame passed in is not modified.
    """
    while True:
        rows_before = len(df)

        # Keep only users that currently meet the threshold.
        user_counts = df['user_id'].value_counts()
        df = df[df['user_id'].isin(user_counts[user_counts >= min_interactions].index)]

        # Keep only products that meet the threshold among the remaining rows.
        item_counts = df['product_id'].value_counts()
        df = df[df['product_id'].isin(item_counts[item_counts >= min_interactions].index)]

        # Any user or product below the threshold owns at least one row, so a
        # pass that removes nothing means both constraints already hold.
        # Comparing row counts avoids iterating over the count Series in Python.
        if len(df) == rows_before:
            break
    return df.reset_index(drop=True)

# Keep only the users and products that have at least 5 interactions each.
filtered_df = filter_data(raw_df, min_interactions=5)

## Numerical User and Product ID

This step is completed after the "inactive" users and products have been filtered out, so that numerical IDs are assigned only to the users and products that remain.

In [60]:
# Replace the original user and product identifiers with dense integer codes.
# Codes are assigned in order of first appearance in filtered_df; the
# encode_* / decode_* dictionaries translate in either direction.
user_ids = filtered_df['user_id'].unique()
decode_user_id = dict(enumerate(user_ids))
encode_user_id = {original: code for code, original in decode_user_id.items()}
filtered_df['user_id'] = filtered_df['user_id'].map(encode_user_id)

item_ids = filtered_df['product_id'].unique()
decode_item_id = dict(enumerate(item_ids))
encode_item_id = {original: code for code, original in decode_item_id.items()}
filtered_df['product_id'] = filtered_df['product_id'].map(encode_item_id)

# Make sure the ratings are stored as floats
filtered_df['rating'] = filtered_df['rating'].astype(float)

print(filtered_df.head())

   rating                                              title  \
0     4.0               Nice consistent coloring. Rough edge   
1     5.0                         Nice quality for the money   
2     4.0  Comfortable, Nice fit, no other accessories th...   
3     5.0      Really like the controller look with these on   
4     4.0   Looks good, not sure how functional it is though   

                                                text  product_id  user_id  \
0  I really like how consistent the bright was is...           0        0   
1  I ordered these in black and white and they bo...           1        0   
2  This is pretty comfortable and seems to do the...           2        0   
3  The straps are very functional. I do wish they...           3        0   
4  See pics, I have a Vive DAS and rubber insulat...           4        0   

       timestamp  helpful_vote  
0  1630594913298             0  
1  1620231368468             0  
2  1617641445475             0  
3  1613702112995    

In [61]:
# Headline figures for the filtered dataset: sizes first, then rating range.
num_users = len(user_ids)
num_items = len(item_ids)
num_reviews = len(filtered_df)
min_rating, max_rating = filtered_df['rating'].min(), filtered_df['rating'].max()

# Absolute counts and the observed rating range
print(
    f'Number of users: {num_users}',
    f'Number of items: {num_items}',
    f'Number of ratings: {num_reviews}',
    f'Minimum rating: {min_rating}',
    f'Maximum rating: {max_rating}',
    sep='\n',
)

# Density: average number of reviews per user and per item
print(
    f'Avg reviews per user: {num_reviews/num_users:.2f}',
    f'Avg reviews per item: {num_reviews/num_items:.2f}',
    sep='\n',
)

Number of users: 94762
Number of items: 25612
Number of ratings: 814586
Minimum rating: 1.0
Maximum rating: 5.0
Avg reviews per user: 8.60
Avg reviews per item: 31.80


## Normalization

Ratings are normalized by zero-centring them on each user's mean rating. This removes the user bias.

In [62]:
# User bias: the mean rating given by each user, indexed by user_id.
# NOTE(review): the mean is taken over all of a user's ratings, including the
# ones that the split further down assigns to the validation and test sets --
# confirm this look-ahead is acceptable for the evaluation.
user_bias = filtered_df['rating'].groupby(filtered_df['user_id']).mean()

# Centre every rating on the mean rating of the user who gave it.
filtered_df['rating_norm'] = filtered_df['rating'].sub(
    filtered_df['user_id'].map(user_bias)
)

## Collaborative Filtering Dataset

For user-centric collaborative filtering, only a subset of the features in the raw data is required. The full dataset can be condensed to include only `user_id`, `product_id`, `timestamp`, and `rating_norm`.

The filtering performed in previous steps ensures that every `user_id` in the dataset is associated with at least $N$ ratings. <br>
To partition the data into training, validation, and test sets, we take $T$ examples per user into the test set, $V$ examples into the validation set, and the remaining examples (at least $N-T-V$) into the training set.


In [63]:
# Partition size: number of each user's most recent reviews held out for the
# test set and, just before those, for the validation set.
num_test = 1
num_val = 1

# Per-user slices are collected in lists and concatenated once at the end.
train_parts = []
val_parts = []
test_parts = []

# Extract the reviews for each user and assign to partitions
for user, user_reviews in filtered_df.groupby('user_id'):
    # order the reviews by timestamp (oldest first)
    user_reviews = user_reviews.sort_values('timestamp')

    # Split points are counted from the start of the user's history and
    # clamped at 0. Negative offsets are avoided on purpose: with a partition
    # size of 0, iloc[-0:] would select every row instead of none.
    n_reviews = len(user_reviews)
    test_start = max(n_reviews - num_test, 0)
    val_start = max(test_start - num_val, 0)

    # the oldest reviews are used for training
    train_parts.append(user_reviews.iloc[:val_start])
    # the reviews just before the test reviews are used for validation
    val_parts.append(user_reviews.iloc[val_start:test_start])
    # most recent reviews are used for testing
    test_parts.append(user_reviews.iloc[test_start:])

# Concatenate the partitions and reset the index
train_set = pd.concat(train_parts).reset_index(drop=True)
val_set = pd.concat(val_parts).reset_index(drop=True)
test_set = pd.concat(test_parts).reset_index(drop=True)

# Every review must land in exactly one partition.
assert len(train_set) + len(val_set) + len(test_set) == len(filtered_df)

# Separate the features and label
feature_columns = ['user_id', 'product_id', 'timestamp']

X_train = train_set[feature_columns].values
y_train = train_set['rating_norm'].values

X_val = val_set[feature_columns].values
y_val = val_set['rating_norm'].values

X_test = test_set[feature_columns].values
y_test = test_set['rating_norm'].values

# Print the shape of the datasets
print(f'Training set: X={X_train.shape}, y={y_train.shape}')
print(f'Validation set: X={X_val.shape}, y={y_val.shape}')
print(f'Test set: X={X_test.shape}, y={y_test.shape}')

Training set: X=(625062, 3), y=(625062,)
Validation set: X=(94762, 3), y=(94762,)
Test set: X=(94762, 3), y=(94762,)
