# User params

In [1]:
# Tunable notebook parameters; the cells below read these names.
bool_debug = False  # debug switch (was assigned twice in this cell; one assignment is enough)
embedder_name = 'all-MiniLM-L6-v2'  # dim : 384, max_len : 256 (probably too short for some cases)
max_samples = 1_000  # sample size drawn from each split; also embedded in the encoded file names

In [2]:
import os
import sys

# NOTE(review): the directory appended here is `../src`, yet the import below
# uses the `src.` package prefix -- confirm this is the entry that actually
# makes `src.config` importable. The relative path is resolved against the
# current working directory, which this same cell changes further down.
sys.path.append(os.path.abspath('../src'))
from src.config import BASE_PATH_DATA, PATH_PROJECT

# Make PATH_PROJECT the working directory for the rest of the notebook.
os.chdir(PATH_PROJECT)

# Every file used below sits in the `processed` folder under BASE_PATH_DATA.
_processed_dir = os.path.join(BASE_PATH_DATA, 'processed')

# Input splits.
file_path_train_raw = os.path.join(_processed_dir, 'train.parquet')
file_path_test_raw = os.path.join(_processed_dir, 'test.parquet')

# Encoded outputs; the sample size is part of the file name.
file_path_train_encoded = os.path.join(_processed_dir, f'df_train_encoded{max_samples}.parquet')
file_path_test_encoded = os.path.join(_processed_dir, f'df_test_encoded{max_samples}.parquet')


# Data cleaning, enrichment and encoding

In [3]:
import pandas as pd
from src.utils.io import load_dataframe

# Load the train/test splits from the parquet paths defined in the setup cell.
df_train = load_dataframe(file_path_train_raw)
df_test = load_dataframe(file_path_test_raw)

# Down-sample each split to at most `max_samples` rows. `DataFrame.sample`
# raises ValueError when asked for more rows than the frame holds (sampling
# is without replacement), so the request is capped at the frame length.
# random_state is fixed for reproducibility.
df_train = df_train.sample(n=min(max_samples, len(df_train)), random_state=42)
df_test = df_test.sample(n=min(max_samples, len(df_test)), random_state=42)

# Quick look at one sampled row.
display(df_train.head(1))

2024-08-01 14:15:29,751 - src.config - DEBUG - calling load_dataframe
2024-08-01 14:15:29,752 - src.config - INFO - Current working directory: C:\Users\alexi\WORKSPACE\interview_abwaab\ReviewOutliers
2024-08-01 14:15:29,871 - src.config - DEBUG - calling load_dataframe
2024-08-01 14:15:29,872 - src.config - INFO - Current working directory: C:\Users\alexi\WORKSPACE\interview_abwaab\ReviewOutliers


Unnamed: 0,main_category,title_review,average_rating,rating_number,features,store,rating,title_metadata,text,user_id,timestamp,helpful_vote,verified_purchase
5030,SUBSCRIPTION BOXES,TheraBox Self Care Subscription Box - Self Car...,4.2,2108,"[𝗔𝗦 𝗦𝗘𝗘𝗡 𝗢𝗡 - Oprah, Forbes, Today Show, ABC N...",TheraBox,5,Great stuff!,My wife loved it!,AE3NCR3QW5O3QD7PDGDURQKG7K5A,2021-01-09 03:46:51.392,1,False


In [4]:
from src.utils.preprocessing.preprocessing import preprocess_data

# Run the preprocessing pipeline on both splits: the train split is processed
# first with training=True, then the test split with training=False.
df_train_encoded = preprocess_data(df_train, training=True)
df_test_encoded = preprocess_data(df_test, training=False)

# Report the shape of each encoded frame, then preview one encoded row.
print(
    f'length of df_train_encoded dataset: {df_train_encoded.shape}',
    f'length of df_test_encoded dataset: {df_test_encoded.shape}',
    sep='\n',
)
display(df_train_encoded.head(1))

# Write each encoded frame to its own parquet file, without the index.
for _frame, _path in (
    (df_train_encoded, file_path_train_encoded),
    (df_test_encoded, file_path_test_encoded),
):
    _frame.to_parquet(_path, index=False)

  from tqdm.autonotebook import tqdm, trange
2024-08-01 14:15:36,018 - datasets - INFO - PyTorch version 2.3.1 available.
2024-08-01 14:15:36,349 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cpu
2024-08-01 14:15:36,350 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-08-01 14:15:36,354 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443
2024-08-01 14:15:36,738 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0
2024-08-01 14:15:37,007 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/11" 200 0
2024-08-01 14:15:37,303 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-08-01 14:15:50,958 - src.config - DEBUG - calling preprocess_data
2024-08-01 14:15:50,960 - src.config - DEBUG - calling clean_enrich
2024-08-01 14:15:50,998 - src.config - DEBUG - calling encode_data


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

length of df_train_encoded dataset: (100, 833)
length of df_test_encoded dataset: (100, 833)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_823,feature_824,feature_825,feature_826,feature_827,feature_828,feature_829,feature_830,feature_831,feature_832
0,0.031573,-0.192871,0.791257,-0.50033,-3.179797,0.708972,-1.468312,-0.77397,-1.325515,1.0,...,7.7e-05,0.000176,-0.000265,-8.8e-05,-0.000275,7.7e-05,7e-06,4e-06,6e-06,6e-06
