# user params

In [1]:
file_name_review = "Subscription_Boxes_review.json"
file_name_metadata = "Subscription_Boxes_metadata.json"
embedder_name = 'all-MiniLM-L6-v2'  # dim : 384, max_len : 256 (probably too short for some cases)

max_samples = 1_000

# Utility functions

In [2]:
import pandas as pd
import numpy as np
import os

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

model = SentenceTransformer(embedder_name)  # A good balance between performance and dimensionality


  from tqdm.autonotebook import tqdm, trange


### Paths manipulation

In [3]:
from multiprocessing import process


base_folder = os.path.join('..', 'data', 'raw')

# create paths
review_file_path = os.path.join(base_folder, file_name_review)
metadata_file_path = os.path.join(base_folder, file_name_metadata)

# read json file into dataframe
df_metadata = pd.read_json(metadata_file_path)
df_review = pd.read_json(review_file_path)


### Data cleaning

**Reviews**

In [4]:
# Expand timestamp
df_review['timestamp'] = pd.to_datetime(df_review['timestamp'], unit='ms')  # Convert Unix timestamp to datetime
df_review['year'] = df_review['timestamp'].dt.year
df_review['month'] = df_review['timestamp'].dt.month
df_review['day'] = df_review['timestamp'].dt.day
df_review['hour'] = df_review['timestamp'].dt.hour

# Handle missing values (ugly, to fix later)
# -> if str fill with empty string
# -> if numeric or categorical: fill with a dummy value
df_review['year'] = df_review['year'].fillna(-1).astype('int')
df_review['month'] = df_review['month'].fillna(-1).astype('int')
df_review['day'] = df_review['day'].fillna(-1).astype('int')
df_review['hour'] = df_review['hour'].fillna(-1).astype('int')
df_review['rating'] = df_review['rating'].fillna(-1).astype('int')
df_review['title'] = df_review['title'].fillna('').astype('str')
df_review['text'] = df_review['text'].fillna('').astype('str')
df_review['user_id'] = df_review['user_id'].fillna('').astype('str')
df_review['helpful_vote'] = df_review['helpful_vote'].fillna(-1).astype('int')
df_review['verified_purchase'] = df_review['verified_purchase'].fillna(-1).astype('int')

# concatenate some text together
# -> this way less features because each embeddings is of high dimensionality
df_review['review_text'] = df_review['title'] + '/n/n' + df_review['text']

# drop some features:
# - timestamp: tranformed
# - images: too complex
# - user_id: not informative
features_drop = ['timestamp', 'text', 'title']
features_drop += ['images', 'user_id']
df_review = df_review.drop(columns=features_drop)

# Drop some elements that cannot be used
df_review.drop_duplicates(inplace=True)
df_review = df_review.dropna(subset=['asin', 'parent_asin'])  # drop if 'asin' or 'parent_asin'  not filled


**Metadata**

In [5]:
# Handle missing values
# features -> concatenated in a string
df_metadata['main_category'] = df_metadata['main_category'].fillna('').astype('category')
df_metadata['title'] = df_metadata['title'].fillna('').astype('str')
df_metadata['average_rating'] = df_metadata['average_rating'].fillna(-1).astype('float')
df_metadata['rating_number'] = df_metadata['rating_number'].fillna(-1).astype('int')
df_metadata['features'] = df_metadata['features'].apply(lambda x: ' '.join(x) if isinstance(x, list) and x else '').astype('str')
df_metadata['store'] = df_metadata['store'].fillna('').astype('category')
df_metadata['parent_asin'] = df_metadata['parent_asin'].fillna('').astype('str')
#df_metadata['description'] = df_metadata['description'].fillna('')
#df_metadata['price'] = df_metadata['price'].fillna(0)

# concatenate some text together
# -> this way less features because each embeddings is of high dimensionality
df_metadata['metadata_text'] = df_metadata[['title','features']].astype(str).agg('/n/n'.join, axis=1)

# Drop some features
# - `images` (list of str): too complex
# - `videos` (list of str): too complex
# - `details` (dict): mostly empty
# - `categories`: mostly empty
# - `bought_together` (boolean): mostly empty
# - `price` (float): mostly empty
# - `description` (list of str): mostly empty
features_to_drop = ['title', 'features']
features_to_drop += ['images', 'videos', 'details', 'categories', 'bought_together', 'price', 'description']
df_metadata = df_metadata.drop(columns=features_to_drop)

# Drop duplicates
df_metadata.drop_duplicates(inplace=True)

# Drop rows if 'asin' or 'parent_asin' are not filled
df_metadata = df_metadata.dropna(subset=['parent_asin'])

### Merge

In [6]:
# Merge the datasets on 'parent_asin' with suffixes for duplicate columns
merged_df = pd.merge(df_review, df_metadata, on='parent_asin', how='inner', suffixes=('_review', '_metadata'))
display(merged_df.head(1))

# limit to wanted sample size
merged_df = merged_df.sample(n=max_samples, random_state=42)  # random_state for reproducibility

# list feature types
# Automatically retrieve column types
numerical_features = merged_df.select_dtypes(include=['number']).columns.tolist()
categorical_features = merged_df.select_dtypes(include=['category', 'bool']).columns.tolist()
textual_features = merged_df.select_dtypes(include=['object']).columns.tolist()
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)
print("Textual features:", textual_features)


Unnamed: 0,rating,asin,parent_asin,helpful_vote,verified_purchase,year,month,day,hour,review_text,main_category,average_rating,rating_number,store,metadata_text
0,1,B07G584SHG,B09WC47S3V,2,1,2020,10,8,5,USELESS/n/nAbsolutely useless nonsense and a c...,SUBSCRIPTION BOXES,4.1,2962,KitNipBox,KitNipBox | Happy Cat Box | Monthly Cat Subscr...


Numerical features: ['rating', 'helpful_vote', 'verified_purchase', 'year', 'month', 'day', 'hour', 'average_rating', 'rating_number']
Categorical features: ['main_category', 'store']
Textual features: ['asin', 'parent_asin', 'review_text', 'metadata_text']


### Embeddings of textual features

In [7]:
review_embeddings = model.encode(merged_df['review_text'].tolist(), show_progress_bar=True)
metadata_embeddings = model.encode(merged_df['metadata_text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [None]:


# Scale Text Embeddings: 
# - possibility A:Ensure the text embeddings have unit variance as a group. + weighting
# - possibility B:if already in [-1, 1], just do weighting to keep hyperspace geometry
embedding_size = model.get_sentence_embedding_dimension()
review_embeddings_scaled = review_embeddings/embedding_size
metadata_embeddings_scaled = metadata_embeddings/embedding_size

# Other solution : add some similarity score such as
# - positive_review
# - similarity to description
# - language
# ...


### data encoding and scaling

In [45]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Directly scale numerical data : remove the mean and scale to unit variance
scaler_numerical = StandardScaler()
X_numerical = merged_df[numerical_features].values
X_numerical_standardized = scaler_numerical.fit_transform(X_numerical)

# Scale Encoded Categorical Features
# After encoding (ex for OH), scale each group of categorical features so that the entire group has unit variance.
# Note : TODO could use different scaler if too big number of categories
# TODO: save the encoders for future use (inference)

categorical_encoder_list = []
X_categorical_scaled = []
for cat_feature in categorical_features:
    encoder = OneHotEncoder(sparse_output=False)
    encoded_data = encoder.fit_transform(merged_df[[cat_feature]])
    num_categories = encoded_data.shape[1]
    scaled_data = encoded_data / num_categories
    X_categorical_scaled.append(scaled_data)
    categorical_encoder_list.append(encoder)


In [50]:
# Combine all features into a single dataset 
# Note : 
# - if sparse use hstack of scipy, else numpy
# - X_categorical_scaled is a list of arrays -> need to put it in a single array

X_combined = np.hstack((
    X_numerical_standardized, 
    np.hstack(X_categorical_scaled),
    review_embeddings_scaled,
    metadata_embeddings_scaled,
    ))
combined_df = pd.DataFrame(X_combined, columns=[f'feature_{i}' for i in range(X_combined.shape[1])])
 

In [51]:
# print some stats
n_samples = combined_df.shape[0]
print(f'length of combined dataset: {combined_df.shape}')
display(combined_df.head(1))

# Save the combined dataset (if needed)
file_name = os.path.join('..', 'data', 'processed', f'combined_dataset_{n_samples}.parquet')
combined_df.to_parquet(file_name, index=False)

length of combined dataset: (1000, 943)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_933,feature_934,feature_935,feature_936,feature_937,feature_938,feature_939,feature_940,feature_941,feature_942
0,0.782901,-0.158229,-2.645751,2.044837,-0.719922,-0.430744,0.741478,-0.140808,-0.828933,1.0,...,-6.5e-05,1.1e-05,-9.9e-05,-0.000115,0.000431,-9e-05,7e-05,-0.000157,-4.3e-05,-2e-06
