# user params

In [12]:
# Raw input files; the loading cell joins these names onto the raw-data folder
file_name_metadata = "Subscription_Boxes_metadata.json"
file_name_review = "Subscription_Boxes_review.json"

# Name handed to SentenceTransformer to select the text-embedding model
embedder_name = "all-MiniLM-L6-v2"

# NOTE(review): not referenced by any cell visible in this notebook section
max_samples = 1_000

# Imports and model loading

In [13]:
import pandas as pd
import numpy as np
import os

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the sentence-embedding model selected by `embedder_name` (set in the user-params cell).
# The same `model` object is used later to encode both the review text and the metadata text.
model = SentenceTransformer(embedder_name)  # A good balance between performance and dimensionality


### Paths and data loading

In [22]:
# Raw data folder, relative to the notebook's working directory
base_folder = os.path.join('..', 'data', 'raw')

# Build both input paths from the user-supplied file names
review_file_path, metadata_file_path = (
    os.path.join(base_folder, name)
    for name in (file_name_review, file_name_metadata)
)

# Load each JSON file into its own dataframe (metadata first, then reviews)
df_metadata = pd.read_json(metadata_file_path)
df_review = pd.read_json(review_file_path)


### Data cleaning

In [None]:
def _drop_duplicate_rows(frame):
    """Return `frame` without repeated rows, keeping the first occurrence.

    `DataFrame.drop_duplicates` hashes cell values, so it raises
    ``TypeError: unhashable type`` as soon as a column holds lists or dicts.
    Rows are therefore compared on their string representation, which is
    defined for every cell type. Original index labels are preserved.

    Caveat: two dicts with the same items in a different insertion order
    produce different strings and are not treated as duplicates.
    """
    duplicate_mask = frame.astype(str).duplicated(keep='first')
    return frame.loc[~duplicate_mask]


# Drop duplicates (rebinding instead of inplace=True keeps the cell safe to re-run)
df_metadata = _drop_duplicate_rows(df_metadata)
df_review = _drop_duplicate_rows(df_review)

# Handle missing values
# TODO


### Data filtering

In [23]:

# removed fields :
# - reviews : 'images'
# - metadata : 'images', 'videos'

# fields to keep
keep_labels_reviews = ['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id',
                       'timestamp', 'helpful_vote', 'verified_purchase']

keep_labels_metadata = ['main_category', 'title', 'average_rating', 'rating_number', 'features',
                        'description', 'price', 'store', 'categories', 'details', 'parent_asin', 'bought_together']

# filter dataframes
# The frames loaded from disk are df_metadata / df_review; no merged_df_* frame
# exists before the merge below, so the filter must be applied to the loaded frames.
# Selecting the same columns again on a re-run is harmless.
df_metadata = df_metadata[keep_labels_metadata]
df_review = df_review[keep_labels_reviews]

# Merge the datasets on 'parent_asin' with suffixes for duplicate columns.
# Apart from the join key, 'title' is the only name present in both keep-lists,
# so it is the only column that receives a suffix (title_review / title_metadata).
merged_df = pd.merge(df_review, df_metadata, on='parent_asin', how='inner', suffixes=('_review', '_metadata'))
display(merged_df.head(1))

# list feature types
date_feature = ['timestamp']
# 'rating' exists only on the review side, so it keeps its plain name after the merge
numerical_features = ['rating', 'helpful_vote', 'average_rating', 'price', 'rating_number']
categorical_features = ['main_category', 'store', 'verified_purchase', 'bought_together']
textual_features = ['title_review', 'title_metadata', 'text', 'description', 'features', 'categories', 'details']


Unnamed: 0,rating,title_review,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,title_metadata,average_rating,rating_number,features,description,price,store,categories,details,bought_together
0,1,USELESS,Absolutely useless nonsense and a complete was...,B07G584SHG,B09WC47S3V,AEMJ2EG5ODOCYUTI54NBXZHDJGSQ,2020-10-08 05:10:57.705,2,True,SUBSCRIPTION BOXES,KitNipBox | Happy Cat Box | Monthly Cat Subscr...,4.1,2962,[The best-selling cat subscription box! Get a ...,[],,KitNipBox,[],{},


### Data processing

In [29]:
# Expand timestamp into calendar components, then discard the raw column
timestamps = pd.to_datetime(merged_df['timestamp'], unit='ms')  # Unix milliseconds -> datetime
merged_df = merged_df.drop(columns=['timestamp']).assign(
    year=timestamps.dt.year,
    month=timestamps.dt.month,
    day=timestamps.dt.day,
    hour=timestamps.dt.hour,
)

# Process categorical features
# TODO


In [None]:

# Text columns grouped by origin; each group is embedded on its own
textual_features_review = ['title_review', 'text']
textual_features_metadata = ['title_metadata', 'description', 'features', 'categories', 'details']


def _join_text_columns(frame, columns):
    """Return one string per row: the str() of each listed column, joined by single spaces."""
    return frame[columns].astype(str).agg(' '.join, axis=1)


# One combined text column per group
merged_df['combined_text_review'] = _join_text_columns(merged_df, textual_features_review)
merged_df['combined_text_metadata'] = _join_text_columns(merged_df, textual_features_metadata)

# Encode each combined column with the sentence-embedding model (one vector per row)
review_texts = merged_df['combined_text_review'].tolist()
metadata_texts = merged_df['combined_text_metadata'].tolist()
review_embeddings = model.encode(review_texts, show_progress_bar=True)
metadata_embeddings = model.encode(metadata_texts, show_progress_bar=True)

In [None]:
# process numerical features
# Missing values are replaced by 0 before scaling.
X_numerical = merged_df[numerical_features].fillna(0).values
scaler = StandardScaler()
X_numerical_standardized = scaler.fit_transform(X_numerical)

# Combine all features into a single dataset.
# - `review_embeddings` is the name produced by the encoding cell
#   (the previous `reviews_embeddings` was never defined).
# - The standardized numeric block is stacked, not the raw one; otherwise
#   the scaler fitted above would have no effect on the saved features.
X_combined = np.hstack((review_embeddings, metadata_embeddings, X_numerical_standardized))
combined_df = pd.DataFrame(X_combined, columns=[f'feature_{i}' for i in range(X_combined.shape[1])])


In [None]:
# Persist the combined feature table as Parquet in the current working directory,
# leaving the dataframe index out of the file (if needed)
output_path = 'combined_dataset.parquet'
combined_df.to_parquet(output_path, index=False)