# Convert raw data to 'strict' JSON

In [1]:
import json
import gzip
import os
dataset_name = "Beauty"
os.makedirs(dataset_name, exist_ok=True)

def parse(path):
  """Yield every record of a gzipped raw dump as a strict-JSON string.

  Each line of the file is evaluated as a Python expression and the
  resulting object is re-serialized with ``json.dumps``.

  Args:
    path: Path to a gzip-compressed file holding one record per line.

  Yields:
    str: The JSON encoding of one record (no trailing newline).
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted or closed, instead of leaving it open.
  with gzip.open(path, 'r') as g:
    for l in g:
      # SECURITY NOTE: eval() executes whatever expression the line contains.
      # Only run this on dumps from a trusted source; if the records are plain
      # literals, ast.literal_eval would be a safer drop-in (verify first).
      yield json.dumps(eval(l))

# Beauty dataset: rewrite the raw review dump as one strict-JSON record per line.
# The `with` block flushes and closes the output file, so cells that read it
# afterwards see every record (the handle was previously never closed).
with open(f"./{dataset_name}/{dataset_name}.json", 'w') as f:
  for l in parse(f"reviews_{dataset_name}_5.json.gz"):
    f.write(l + '\n')

In [17]:
# Print the number of lines in the file and the first line.
# The context manager closes the handle even if counting or reading fails.
with open(f"./{dataset_name}/{dataset_name}.json", 'r') as data:
    print("Number of lines:", sum(1 for _ in data))
    data.seek(0)  # Reset file pointer to the beginning
    print("First line:", data.readline().strip())

Number of lines: 198502
First line: {"reviewerID": "A1YJEY40YUW4SE", "asin": "7806397051", "reviewerName": "Andrea", "helpful": [3, 4], "reviewText": "Very oily and creamy. Not at all what I expected... ordered this to try to highlight and contour and it just looked awful!!! Plus, took FOREVER to arrive.", "overall": 1.0, "summary": "Don't waste your money", "unixReviewTime": 1391040000, "reviewTime": "01 30, 2014"}


In [29]:
import numpy as np
import pandas as pd

# Initialize mapping dictionaries (raw ID string -> integer ID, numbered from 1
# in order of first appearance in the file)
userID_mapping = {}
itemID_mapping = {}

# Initialize lists to store userID, itemID, and timestamp (one entry per review)
userIDs = []
itemIDs = []
timestamps = []

# Process each line in the JSON file. The file is only needed for this loop, so
# it is opened in a `with` block and closed as soon as reading finishes (or
# fails); calling data.close() again later on the closed handle is a no-op.
with open(f"./{dataset_name}/{dataset_name}.json", 'r') as data:
    for line in data:
        review = json.loads(line.strip())
        userID = review['reviewerID']
        itemID = review['asin']
        timestamp = review['unixReviewTime']

        # Map userID to an integer starting from 1
        if userID not in userID_mapping:
            userID_mapping[userID] = len(userID_mapping) + 1

        # Map itemID to an integer starting from 1
        if itemID not in itemID_mapping:
            itemID_mapping[itemID] = len(itemID_mapping) + 1

        # Append mapped values and timestamp to lists
        userIDs.append(userID_mapping[userID])
        itemIDs.append(itemID_mapping[itemID])
        timestamps.append(timestamp)

# Persist both ID mappings as .npy files and report their size plus a sample.
# NOTE(review): np.save on a dict presumably stores a pickled object array, so
# reading it back likely needs np.load(..., allow_pickle=True).item() - confirm.
mapping_outputs = (
    (f'./{dataset_name}/user_mapping.npy', userID_mapping,
     "user_num:", "the first five userID mapping:"),
    (f'./{dataset_name}/item_mapping.npy', itemID_mapping,
     "item_num:", "the first five itemID mapping:"),
)
for npy_path, id_map, count_label, sample_label in mapping_outputs:
    np.save(npy_path, id_map)
    print(count_label, len(id_map))
    print(sample_label, list(id_map.items())[:5])

# Collect every user's (itemID, timestamp) interactions, in file order
user_item_mapping = {}
for userID, itemID, timestamp in zip(userIDs, itemIDs, timestamps):
    user_item_mapping.setdefault(userID, []).append((itemID, timestamp))

# Replace each user's interaction list with the item IDs alone, ordered by
# timestamp; the sort is stable, so equal timestamps keep their file order
for userID, interactions in user_item_mapping.items():
    chronological = sorted(interactions, key=lambda pair: pair[1])
    user_item_mapping[userID] = [pair[0] for pair in chronological]

# Print a sample of the results
print("user-item mapping:", list(user_item_mapping.items())[:5])

# Leave-one-out split. Every split keeps a prefix of the user's sequence:
#   train -> everything except the last two items
#   val   -> everything except the last item
#   test  -> the full sequence
train_data = {user: sequence[:-2] for user, sequence in user_item_mapping.items()}
val_data = {user: sequence[:-1] for user, sequence in user_item_mapping.items()}
test_data = dict(user_item_mapping)

# Print a sample of the split data
for label, split in (
    ("training data:", train_data),
    ("validation data:", val_data),
    ("testing data:", test_data),
):
    print(label, list(split.items())[:5])

# Prepare data for train, validation, and test sets
def prepare_data(data_dict):
    """Turn per-user item sequences into (user, history, target) rows.

    For every user the last item of the sequence becomes the target and all
    preceding items become the history.

    Args:
        data_dict: Mapping of user ID -> list of item IDs.

    Returns:
        pandas.DataFrame with columns 'user', 'history' and 'target', one row
        per user with a non-empty sequence. Users with an empty sequence have
        no target and are skipped (indexing them used to raise IndexError).
        The columns are present even when no rows are produced.
    """
    columns = ['user', 'history', 'target']
    rows = [
        {'user': userID, 'history': item_sequence[:-1], 'target': item_sequence[-1]}
        for userID, item_sequence in data_dict.items()
        if item_sequence  # an empty sequence has no last item to use as target
    ]
    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# Build one dataframe per split, reporting its shape and first rows as we go
split_reports = (
    (train_data, "\nTraining data shape:", "the first 3 rows of training data:\n"),
    (val_data, "\nValidation data shape:", "the first 3 rows of validation data:\n"),
    (test_data, "\nTesting data shape:", "the first 3 rows of testing data:\n"),
)
split_frames = []
for split_dict, shape_label, head_label in split_reports:
    frame = prepare_data(split_dict)
    print(shape_label, frame.shape)
    print(head_label, frame.head(3))
    split_frames.append(frame)
train_df, val_df, test_df = split_frames

# Write every split to its own parquet file
parquet_targets = (
    (train_df, f'./{dataset_name}/train.parquet'),
    (val_df, f'./{dataset_name}/valid.parquet'),
    (test_df, f'./{dataset_name}/test.parquet'),
)
for frame, parquet_path in parquet_targets:
    frame.to_parquet(parquet_path, index=False)

print("Data saved to parquet files.")

# Release the handle on the reviews file opened earlier in this cell
data.close()


user_num: 22363
the first five userID mapping: [('A1YJEY40YUW4SE', 1), ('A60XNB876KYML', 2), ('A3G6XNM240RMWA', 3), ('A1PQFP6SAJ6D80', 4), ('A38FVHZTNQ271F', 5)]
item_num: 12101
the first five itemID mapping: [('7806397051', 1), ('9759091062', 2), ('9788072216', 3), ('9790790961', 4), ('9790794231', 5)]
user-item mapping: [(1, [6846, 7873, 4585, 1, 5406]), (2, [816, 10406, 11194, 11651, 9716, 1, 233]), (3, [1, 6050, 7977, 5252, 4211, 243, 11204, 5863, 6609]), (4, [5522, 439, 5161, 11140, 1, 7849]), (5, [1, 10470, 10064, 9403, 10362, 4758, 6500, 11444, 11390])]
training data: [(1, [6846, 7873, 4585]), (2, [816, 10406, 11194, 11651, 9716]), (3, [1, 6050, 7977, 5252, 4211, 243, 11204]), (4, [5522, 439, 5161, 11140]), (5, [1, 10470, 10064, 9403, 10362, 4758, 6500])]
validation data: [(1, [6846, 7873, 4585, 1]), (2, [816, 10406, 11194, 11651, 9716, 1]), (3, [1, 6050, 7977, 5252, 4211, 243, 11204, 5863]), (4, [5522, 439, 5161, 11140, 1]), (5, [1, 10470, 10064, 9403, 10362, 4758, 6500, 11444]

# Generate Item Semantic Embeddings

In [30]:
# Beauty metadata: rewrite the raw metadata dump as one strict-JSON record per line.
# The `with` block flushes and closes the output file, so the cell that reads it
# next sees every record (the handle was previously never closed).
with open(f"./{dataset_name}/{dataset_name}_metadata.json", 'w') as f:
  for l in parse(f"meta_{dataset_name}.json.gz"):
    f.write(l + '\n')

In [38]:
# Create a reverse mapping from itemID to asin
reverse_itemID_mapping = {v: k for k, v in itemID_mapping.items()}

# Metadata fields kept for every item; empty or missing values are stored as None
metadata_fields = ('title', 'price', 'salesRank', 'brand', 'categories')

# Initialize a dictionary to store the extracted information (itemID -> field dict)
item_info = {}

# Open the metadata file for reading
with open(f"./{dataset_name}/{dataset_name}_metadata.json", 'r') as metadata_file:
    # Process each line in the metadata file
    for line in metadata_file:
        metadata = json.loads(line.strip())
        asin = metadata.get('asin')

        # Keep only items that occur in the review data. Testing membership on
        # the asin -> itemID dict is a hash lookup; the previous check scanned
        # reverse_itemID_mapping.values() (the same set of asins) for every line.
        if asin in itemID_mapping:
            itemID = itemID_mapping[asin]
            # `or None` turns any falsy value (missing key, empty string,
            # empty list, 0) into None, as the original conditional did.
            item_info[itemID] = {
                field: metadata.get(field) or None for field in metadata_fields
            }

# Print the information for the first 5 items
for itemID, info in list(item_info.items())[:5]:
    print(f"ItemID: {itemID}, Info: {info}")

ItemID: 1, Info: {'title': 'WAWO 15 Color Professionl Makeup Eyeshadow Camouflage Facial Concealer Neutral Palette', 'price': 5.04, 'salesRank': {'Beauty': 10486}, 'brand': 'COKA', 'categories': [['Beauty', 'Makeup', 'Face', 'Concealers & Neutralizers']]}
ItemID: 2, Info: {'title': 'Xtreme Brite Brightening Gel 1oz.', 'price': 19.99, 'salesRank': {'Beauty': 52254}, 'brand': 'Xtreme Brite', 'categories': [['Beauty', 'Hair Care', 'Styling Products', 'Creams, Gels & Lotions']]}
ItemID: 3, Info: {'title': 'Prada Candy By Prada Eau De Parfum Spray 1.7 Oz For Women', 'price': 65.86, 'salesRank': {'Beauty': 78916}, 'brand': 'Prada', 'categories': [['Beauty', 'Fragrance', "Women's", 'Eau de Parfum']]}
ItemID: 4, Info: {'title': 'Versace Bright Crystal Eau de Toilette Spray for Women, 3 Ounce', 'price': 52.33, 'salesRank': {'Beauty': 764}, 'brand': 'Versace', 'categories': [['Beauty', 'Fragrance', "Women's", 'Eau de Toilette']]}
ItemID: 5, Info: {'title': 'Stella McCartney Stella', 'price': Non

In [None]:
from sentence_transformers import SentenceTransformer

# Initialize the SentenceTransformer model
#  modelscope download --model sentence-transformers/sentence-t5-base  --local_dir ./dir
model = SentenceTransformer('./sentence-t5-base')

# Prepare data for embedding: one text per item, in item_info order
item_ids = []
item_texts = []
for itemID, info in item_info.items():
    # Combine relevant fields into a single text for embedding.
    # NOTE(review): item_info always holds all five keys (None when a value is
    # missing), so the '' defaults below never apply and missing fields are
    # embedded as the literal text "None" - confirm this is intended.
    semantics = f"'title':{info.get('title', '')}\n 'price':{info.get('price', '')}\n 'salesRank':{info.get('salesRank', '')}\n 'brand':{info.get('brand', '')}\n 'categories':{info.get('categories', '')}"
    item_ids.append(itemID)
    item_texts.append(semantics)

# Encode all texts in one call so the model can batch them, instead of invoking
# encode() once per item; the result has one embedding row per input text.
# Batching may change values at floating-point rounding level only.
embeddings = model.encode(item_texts)
item_embeddings = [
    {'ItemID': itemID, 'embedding': embedding.tolist()}
    for itemID, embedding in zip(item_ids, embeddings)
]

# Convert to DataFrame
item_emb_df = pd.DataFrame(item_embeddings)

print("\nItem embeddings DataFrame shape:", item_emb_df.shape)
print("The first 3 rows of item embeddings DataFrame:\n", item_emb_df.head(3))

# Save to parquet file
item_emb_df.to_parquet(f'./{dataset_name}/item_emb.parquet', index=False)

print("Item embeddings saved to item_emb.parquet.")
# embeddings = np.array([item['embedding'] for item in item_embeddings])
# np.save(f'./{dataset_name}/item_emb.npy', embeddings)

# print("Item embeddings saved to item_emb.npy.")


Item embeddings DataFrame shape: (12101, 2)
The first 3 rows of item embeddings DataFrame:
    ItemID                                          embedding
0       1  [0.005812718532979488, 0.0014312762068584561, ...
1       2  [-0.005331065971404314, -0.0387347936630249, 0...
2       3  [-0.03325002267956734, -0.030644146725535393, ...
Item embeddings saved to item_emb.parquet.
