In [1]:
import warnings 
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import json
import glob
import ast
from sklearn.preprocessing import LabelEncoder

## Meta dataset preprocessing

In [2]:
# Read the product metadata CSV from the Kaggle input directory.
meta_csv_file_path = '/kaggle/input/meta-pet-supplies-csv/meta_pet_supplies_csv.csv'
df_meta = pd.read_csv(meta_csv_file_path)

# Report the dimensions of the loaded frame.
n_rows, n_cols = df_meta.shape
print("Number of rows in the dataset:", n_rows)
print("Number of columns in the dataset:", n_cols)

Number of rows in the dataset: 205999
Number of columns in the dataset: 19


In [3]:
# Preview the first rows of the metadata frame.
df_meta.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"['Pet Supplies', 'Dogs', 'Health Supplies', 'H...",,['Dr. Rexy hemp oil has powerful anti-inflamma...,,DR.REXY Hemp Oil for Dogs and Cats - 100% Orga...,[],,DR.REXY,['Made strictly with organic derived ingredien...,"['>#93,463 in Grocery &amp; Gourmet Food (See ...",[],Amazon Home,,,$19.90,061539972X,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,
1,"['Pet Supplies', 'Dogs']",,['Know exactly what your dog is eating with 10...,,Pet Qwerks Treat Cookbook with Cutters,[],,Pet Qwerks,"['Recipe book', 'Cookie cutters', 'Indoor/outd...","190,234 in Pet Supplies (","['B075DYQ1PH', '1604334657', '1604336919', '16...",Pet Supplies,,,$7.86,0615553605,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,
2,"['Pet Supplies', 'Dogs', 'Food', 'Dry']",,"['', '', '', '']",,The Best of 101 Dog Tricks,"['0760339740', '1592533256', '1592535305', '15...",,,[],"76,847 in Movies &amp; TV (","['1592533256', '0760339740', '1592537308', '15...",Movies &amp; TV,,,,0760339597,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,
3,[],,"['', 'The venomous but beautiful scorpionfishe...",,Lionfishes and Other Scorpionfishes: The Compl...,"['1911142186', '1514291983']",,Tfh/Nylabone,['Used Book in Good Condition'],"435,039 in Pet Supplies (",[],Pet Supplies,,,$24.99,0793816793,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,
4,"['Pet Supplies', 'Top Selection from AmazonPets']",,['Volume 1: 96 Words &amp; Phrases! This is th...,,Pet Media Feathered Phonics The Easy Way To Te...,"['B0002FP328', 'B0002FP32S', 'B0002FP32I', 'B0...",,Pet Media,['Award-winning audio CD teaches parrots how t...,"52,435 in Pet Supplies (","['B0002FP328', 'B00CAMARXG', 'B0002FP32S', 'B0...",Pet Supplies,,,$6.97,0972585419,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,


#### Drop duplicates

In [4]:
# Compare the total row count with the number of distinct ASINs.
total_rows = df_meta.shape[0]
unique_asins = df_meta['asin'].unique()
print('Number of products/rows in meta dataset:', total_rows)
print('Number of unique products in meta dataset:', len(unique_asins))

# Rows that are identical, in every column, to an earlier row.
duplicate_rows = df_meta.loc[df_meta.duplicated()]
print('Number of duplicated rows:', duplicate_rows.shape[0])

# Keep only the first occurrence of each fully duplicated row.
df_meta = df_meta.drop_duplicates()
print('Number of products/rows in meta dataset after dropping duplicated rows:', len(df_meta))

Number of products/rows in meta dataset: 205999
Number of unique products in meta dataset: 198265
Number of duplicated rows: 7734
Number of products/rows in meta dataset after dropping duplicated rows: 198265


#### Creating a new column with one selected category per product: the third entry of the category list when it has at least three entries, otherwise the second

In [5]:
def get_category(category_str):
    """Pick one representative sub-category from a stringified category list.

    Parameters
    ----------
    category_str : str
        Python-literal representation of a list of category names, e.g.
        "['Pet Supplies', 'Dogs', 'Food']".

    Returns
    -------
    str or None
        The third entry when the list has at least three entries, the second
        entry when it has exactly two, and None when the list has fewer than
        two entries or when the input is not a string (e.g. a missing value
        read from the CSV as NaN).
    """
    # A missing cell is not a string; treat it like an empty category list
    # instead of letting ast.literal_eval raise.
    if not isinstance(category_str, str):
        return None
    category_list = ast.literal_eval(category_str)
    # Fewer than two entries means there is no sub-category to select; a
    # single-entry list previously raised IndexError on category_list[1].
    if len(category_list) < 2:
        return None
    if len(category_list) < 3:
        return category_list[1]
    return category_list[2]

# Run get_category on every value of the 'category' column and store the
# results in a new 'selected_category' column.
df_meta['selected_category'] = df_meta['category'].apply(get_category)

#### Some records now contain a None value in selected_category because their list in the category column is empty; we will remove these records

In [6]:
# Report how many rows ended up without a selected category.
print('Number of products/rows in meta dataset:', df_meta.shape[0])
none_count = df_meta['selected_category'].isna().sum()
print("Number of None values in selected_category column:", none_count)

# Keep only rows that have a selected category. The explicit .copy() makes
# df_meta an independent frame, so later column assignments on it are not
# chained assignments on a filtered slice (SettingWithCopyWarning).
df_meta = df_meta[df_meta['selected_category'].notna()].copy()

print("Number of None values in selected_category column after dropping rows with None values:", df_meta['selected_category'].isna().sum())
print('Number of products/rows in meta dataset after dropping rows with None values:', df_meta.shape[0])

Number of products/rows in meta dataset: 198265
Number of None values in selected_category column: 16097
Number of None values in selected_category column after dropping rows with None values: 0
Number of products/rows in meta dataset after dropping rows with None values: 182168


#### Some of the selected categories are badly formatted: they contain '\&amp;' instead of '&', so the same category appears under two different names

#### For example: 
- Fish \&amp; Aquatic Pets - Fish & Aquatic Pets
- Cat Food \&amp; Supplies - Cat Food & Supplies

#### We will replace '\&amp;' with '&'

In [7]:
# Replace the literal text '&amp;' with '&' in every selected category;
# regex=False makes this a plain substring replacement.
df_meta['selected_category'] = df_meta['selected_category'].str.replace('&amp;', '&', regex=False)

In [8]:
# Independent two-column frame: product ID and its selected category.
selected_columns = ['asin', 'selected_category']
df_meta_selected = df_meta[selected_columns].copy()
df_meta_selected

Unnamed: 0,asin,selected_category
0,061539972X,Health Supplies
1,0615553605,Dogs
2,0760339597,Food
4,0972585419,Top Selection from AmazonPets
5,0975412868,Grooming
...,...,...
205994,B01HJ9ULQW,"Collars, Harnesses & Leashes"
205995,B01HJ9OZZA,Carriers & Travel Products
205996,B01HJABKBQ,Apparel & Accessories
205997,B01HJCJ8KO,"Collars, Harnesses & Leashes"


-----

## All users who had at least 3 purchases

In [9]:
# glob.glob returns matches in an arbitrary, filesystem-dependent order.
# Sorting the paths makes the row order of the concatenated frame (and of
# everything built by iterating over it) reproducible between runs.
file_paths = sorted(glob.glob('/kaggle/input/all-users-min3reviews/all_users_min3reviews/min3reviews-seqlen2-users-*.json'))

# Read every shard and stack them into a single frame with a fresh index.
dfs = [pd.read_json(fp) for fp in file_paths]
df_users = pd.concat(dfs, ignore_index=True)
df_users = df_users.rename(columns={
    'bought_before_1': 'purchased_before_1',
    'bought_before_2': 'purchased_before_2'
})

print("Number of records (seguences and targets) collected from all users who had at least 3 purchases:", df_users.shape[0])
df_users.head()

Number of records (seguences and targets) collected from all users who had at least 3 purchases: 2302063


Unnamed: 0,reviewerID,productID,purchased_before_1,purchased_before_2
0,AFK148CK2KIX,B0009YD8OC,B0002H3ZLM,B0006UJW1W
1,AFK148CK2KIX,B0009YD8OC,B0006UJW1W,B0009YD8OC
2,A3VI1QO3F914NE,B001B5AU4Y,B001B57KYW,B0009YJWW4
3,A3VI1QO3F914NE,B00B5I7CO8,B0009YJWW4,B001B5AU4Y
4,A3HHG99SLZOJIG,B000MCVGD6,B001P3PR5O,B002CZLNPC


#### Removing sequences and targets where any of the products the user purchased is missing from the metadata dataset

In [10]:
# Product IDs still present in the metadata frame at this point.
valid_product_ids = df_meta['asin']
print("Number of sequences and targets before removal:", df_users.shape[0])

# Keep a record only when the target product and both previously purchased
# products are known to the metadata. The explicit .copy() detaches the result
# from df_users, because later cells overwrite columns of filtered_df_users;
# without it those writes are chained assignments on a slice.
filtered_df_users = df_users[
    (df_users['productID'].isin(valid_product_ids)) &
    (df_users['purchased_before_1'].isin(valid_product_ids)) &
    (df_users['purchased_before_2'].isin(valid_product_ids))
].copy()

print("Number of sequences and targets after removal:", filtered_df_users.shape[0])

Number of sequences and targets before removal: 2302063
Number of sequences and targets after removal: 2210735


In [11]:
# Preview the first rows of the filtered user records.
filtered_df_users.head()

Unnamed: 0,reviewerID,productID,purchased_before_1,purchased_before_2
0,AFK148CK2KIX,B0009YD8OC,B0002H3ZLM,B0006UJW1W
1,AFK148CK2KIX,B0009YD8OC,B0006UJW1W,B0009YD8OC
2,A3VI1QO3F914NE,B001B5AU4Y,B001B57KYW,B0009YJWW4
3,A3VI1QO3F914NE,B00B5I7CO8,B0009YJWW4,B001B5AU4Y
4,A3HHG99SLZOJIG,B000MCVGD6,B001P3PR5O,B002CZLNPC


### Creating dictionary with non encoded sequences and targets

In [12]:
# Maps each reviewer ID to a list of product IDs: the two "purchased before"
# products of the reviewer's first record, followed by the productID of every
# record of that reviewer, in row order.
users_sequences_non_encoded = {}

# Walk the four columns in lockstep instead of using DataFrame.iterrows(),
# which builds a Series object for every row and is very slow on a large frame.
for reviewer_id, before_1, before_2, product_id in zip(
    filtered_df_users['reviewerID'],
    filtered_df_users['purchased_before_1'],
    filtered_df_users['purchased_before_2'],
    filtered_df_users['productID'],
):
    sequence = users_sequences_non_encoded.get(reviewer_id)
    if sequence is None:
        # First record seen for this reviewer: seed with the two earlier purchases.
        sequence = [before_1, before_2]
        users_sequences_non_encoded[reviewer_id] = sequence
    # Every record contributes its target product.
    sequence.append(product_id)

len(users_sequences_non_encoded)

601263

In [17]:
# Convert every reviewer ID and product ID to a plain `str` before serialising.
converted_users_sequences_non_encoded = {}
for reviewer_key, product_ids in users_sequences_non_encoded.items():
    converted_users_sequences_non_encoded[str(reviewer_key)] = list(map(str, product_ids))

In [18]:
# Write the raw-ID sequences to a JSON file in the working directory.
with open('users_sequences_non_encoded.json', 'w') as json_out:
    json.dump(converted_users_sequences_non_encoded, json_out)

### Creating dictionary with encoded sequences and targets and encoding dictionaries for product IDs and user IDs

In [19]:
# One encoder per ID space; only the product encoder is fitted in this cell.
le_products = LabelEncoder()
le_reviewer = LabelEncoder()

product_columns = ['productID', 'purchased_before_1', 'purchased_before_2']
# Distinct product IDs across the target column and both history columns.
unique_products = pd.unique(filtered_df_users[product_columns].values.ravel('K'))

# Number the fitted classes starting from 1 (not 0), keyed by the ID as a string.
le_products.fit(unique_products)
product_encoding = {
    str(product): code
    for code, product in enumerate(le_products.classes_, start=1)
}

# Overwrite each product column with its integer codes.
for col_name in product_columns:
    filtered_df_users[col_name] = filtered_df_users[col_name].map(product_encoding)

In [20]:
# Fit the reviewer encoder on the distinct reviewer IDs.
unique_reviewers = pd.unique(filtered_df_users['reviewerID'])
le_reviewer.fit(unique_reviewers)

# Number the fitted classes starting from 1 (not 0).
reviewer_encoding = {
    reviewer: code
    for code, reviewer in enumerate(le_reviewer.classes_, start=1)
}

# Overwrite the reviewer column with its integer codes.
encoded_reviewers = filtered_df_users['reviewerID'].map(reviewer_encoding)
filtered_df_users['reviewerID'] = encoded_reviewers

In [21]:
# Display the frame after reviewer and product IDs were replaced by integer codes.
filtered_df_users

Unnamed: 0,reviewerID,productID,purchased_before_1,purchased_before_2
0,511352,9335,5250,8489
1,511352,9335,8489,9335
2,457053,23029,22998,9463
3,457053,67738,9463,23029
4,395146,14270,26998,30293
...,...,...,...,...
2302058,196548,108535,5089,108533
2302059,196548,22344,108533,108535
2302060,196548,18742,108535,22344
2302061,196548,129650,22344,18742


In [22]:
# Write the reviewer ID -> integer code mapping to a JSON file.
with open('users_encoding.json', 'w') as json_out:
    json.dump(reviewer_encoding, json_out)

In [23]:
# Write the product ID -> integer code mapping to a JSON file.
with open('products_encoding.json', 'w') as json_out:
    json.dump(product_encoding, json_out)

In [24]:
# Maps each encoded reviewer ID to a list of encoded product IDs: the two
# "purchased before" products of the reviewer's first record, followed by the
# productID of every record of that reviewer, in row order.
users_sequences = {}

# Walk the four columns in lockstep instead of using DataFrame.iterrows(),
# which builds a Series object for every row and is very slow on a large frame.
for reviewer_id, before_1, before_2, product_id in zip(
    filtered_df_users['reviewerID'],
    filtered_df_users['purchased_before_1'],
    filtered_df_users['purchased_before_2'],
    filtered_df_users['productID'],
):
    sequence = users_sequences.get(reviewer_id)
    if sequence is None:
        # First record seen for this reviewer: seed with the two earlier purchases.
        sequence = [before_1, before_2]
        users_sequences[reviewer_id] = sequence
    # Every record contributes its target product.
    sequence.append(product_id)

len(users_sequences)

601263

In [25]:
# Cast every reviewer code and product code to a built-in `int` before serialising.
converted_users_sequences = {}
for reviewer_code, product_codes in users_sequences.items():
    converted_users_sequences[int(reviewer_code)] = list(map(int, product_codes))

In [26]:
# Write the encoded sequences to a JSON file in the working directory.
with open('users_sequences.json', 'w') as json_out:
    json.dump(converted_users_sequences, json_out)