# User parameters

In [1]:
# Input files: joined onto the raw-data folder when the paths are built further down.
file_name_review = "Subscription_Boxes_review.json"
file_name_metadata = "Subscription_Boxes_metadata.json"

# Sentence-embedding model name.
# dim: 384, max_len: 256 (probably too short for some cases)
embedder_name = 'all-MiniLM-L6-v2'

# Number of merged rows sampled for this run.
max_samples = 1_000

# Imports and embedding model

In [2]:
import pandas as pd
import numpy as np
import os

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Embedding model instance, built once here from `embedder_name`; it is later handed to
# encode_textual in the encoding cell.
model = SentenceTransformer(embedder_name)  # A good balance between performance and dimensionality


  from tqdm.autonotebook import tqdm, trange


### Paths manipulation

In [3]:
from multiprocessing import process


# Raw inputs are expected under ../data/raw, relative to the notebook's working directory.
base_folder = os.path.join('..', 'data', 'raw')

# Resolve both input paths from the file names chosen in the user-params cell.
review_file_path, metadata_file_path = (
    os.path.join(base_folder, raw_name)
    for raw_name in (file_name_review, file_name_metadata)
)

# Load each JSON file into its own dataframe.
df_review = pd.read_json(review_file_path)
df_metadata = pd.read_json(metadata_file_path)


### Data cleaning

In [4]:
from src.utils.preprocessing_cleaning import clean_enrich_reviews, clean_enrich_metadata
# Pass each loaded frame through its project helper (behaviour is defined in
# src.utils.preprocessing_cleaning and is not visible from this notebook).
# The results overwrite df_review / df_metadata, so re-running this cell on its own
# feeds already-processed frames back into the helpers.
# NOTE(review): confirm both helpers tolerate that, or re-run the loading cell first.
df_review = clean_enrich_reviews(df_review)
df_metadata = clean_enrich_metadata(df_metadata)


### Merge

In [6]:
# Merge the datasets on 'parent_asin' with suffixes for duplicate columns.
# The inner join keeps only reviews whose product also appears in the metadata.
merged_df = pd.merge(df_review, df_metadata, on='parent_asin', how='inner', suffixes=('_review', '_metadata'))
merged_df = merged_df.drop(columns=['parent_asin', 'asin'])  # those features will be OOD at inference for new data

# Limit to the wanted sample size.
# DataFrame.sample raises ValueError when n exceeds the number of rows (replace=False),
# which happens whenever the join yields fewer than max_samples rows, so cap the request.
# When the cap applies, every row is kept (in shuffled order).
n_to_sample = min(max_samples, len(merged_df))
merged_df = merged_df.sample(n=n_to_sample, random_state=42)  # random_state for reproducibility

display(merged_df.head(1))


Unnamed: 0,rating,asin,parent_asin,helpful_vote,verified_purchase,year,month,day,hour,review_text,main_category,average_rating,rating_number,store,metadata_text
0,1,B07G584SHG,B09WC47S3V,2,1,2020,10,8,5,USELESS/n/nAbsolutely useless nonsense and a c...,SUBSCRIPTION BOXES,4.1,2962,KitNipBox,KitNipBox | Happy Cat Box | Monthly Cat Subscr...


Numerical features: ['rating', 'helpful_vote', 'verified_purchase', 'year', 'month', 'day', 'hour', 'average_rating', 'rating_number']
Categorical features: ['main_category', 'store']
Textual features: ['asin', 'parent_asin', 'review_text', 'metadata_text']


### Data encoding and scaling

In [50]:
from src.utils.preprocessing_encoding import encode_numerical, encode_categorical, encode_textual
# Encode each feature family with its project helper; the text encoder also needs the
# embedding model created earlier.
X_numerical_standardized = encode_numerical(merged_df)
X_categorical_scaled = encode_categorical(merged_df)
X_textual_scaled = encode_textual(merged_df, model)

# Stack the three encoded blocks side by side: numerical, then categorical, then textual.
feature_blocks = (X_numerical_standardized, X_categorical_scaled, X_textual_scaled)
X_combined = np.hstack(feature_blocks)

# Wrap the stacked array in a dataframe with positional column names feature_0 .. feature_{n-1}.
feature_names = [f'feature_{i}' for i in range(X_combined.shape[1])]
combined_df = pd.DataFrame(X_combined, columns=feature_names)
 

In [51]:
# Print some stats: the shape of the combined dataset and a one-row preview.
n_samples = combined_df.shape[0]
print(f'length of combined dataset: {combined_df.shape}')
display(combined_df.head(1))

# Save the combined dataset (if needed). The row count is embedded in the file name,
# so runs with different sample sizes do not overwrite each other.
processed_dir = os.path.join('..', 'data', 'processed')
# to_parquet does not create missing parent folders, so make sure the target exists first.
os.makedirs(processed_dir, exist_ok=True)
file_name = os.path.join(processed_dir, f'combined_dataset_{n_samples}.parquet')
combined_df.to_parquet(file_name, index=False)

length of combined dataset: (1000, 943)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_933,feature_934,feature_935,feature_936,feature_937,feature_938,feature_939,feature_940,feature_941,feature_942
0,0.782901,-0.158229,-2.645751,2.044837,-0.719922,-0.430744,0.741478,-0.140808,-0.828933,1.0,...,-6.5e-05,1.1e-05,-9.9e-05,-0.000115,0.000431,-9e-05,7e-05,-0.000157,-4.3e-05,-2e-06
