In [1]:
import pandas as pd


In [2]:
# Read the cleaned dataset CSV into a DataFrame.
cleaned_data_path = "dataset/cleaned_data.csv"
df = pd.read_csv(cleaned_data_path)

In [3]:
# Print column names, non-null counts, and dtypes to sanity-check the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Product URL        210 non-null    object
 1   Product Name       210 non-null    object
 2   Product Price      209 non-null    object
 3   Rating             208 non-null    object
 4   Number of reviews  208 non-null    object
 5   Manufacturer       154 non-null    object
 6   ASIN               156 non-null    object
 7   product_name       210 non-null    object
dtypes: object(8)
memory usage: 13.3+ KB


In [4]:
from transformers import BertTokenizer, BertModel
import torch

# Both the tokenizer and the encoder come from the same pre-trained BERT
# checkpoint, so the checkpoint name is bound once and reused.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def get_word_embeddings(text, tokenizer, model):
    """Return the tokens of ``text`` together with their model embeddings.

    The text is encoded with ``tokenizer`` (PyTorch tensors, truncated to at
    most 512 positions) and passed through ``model`` with gradient tracking
    disabled. The first and last positions are then dropped from both the
    token list and the hidden states; the tokenizer is expected to place its
    special tokens (e.g. [CLS] and [SEP]) there.

    Args:
        text: The text to embed. Intended for a single string, so that the
            leading batch dimension has size 1 and can be squeezed away.
        tokenizer: Callable tokenizer that also provides
            ``convert_ids_to_tokens``.
        model: Callable model whose output exposes ``last_hidden_state``.

    Returns:
        A ``(tokens, token_embeddings)`` pair: the token strings and the
        matching rows of the last hidden state, both without the first and
        last positions.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only, so no gradients need to be recorded.
    with torch.no_grad():
        model_output = model(**encoded)

    # Remove the batch dimension from both the hidden states and the ids.
    hidden_states = model_output.last_hidden_state.squeeze(0)
    token_ids = encoded["input_ids"].squeeze(0)
    word_pieces = tokenizer.convert_ids_to_tokens(token_ids)

    # Trim the first and last positions so only the text's own tokens remain.
    return word_pieces[1:-1], hidden_states[1:-1]


In [6]:
# Compute tokens and embeddings for every product name, then attach them as
# two new object-dtype columns.
#
# Results are collected in plain lists and assigned in one step. Writing with
# df.at[position, ...] inside an enumerate() loop treats the loop counter as
# an index *label*, which is only correct while df has a default 0..n-1
# index; after any filtering or sorting it would write to the wrong rows or
# append new ones.
token_lists = []
embedding_arrays = []

for name in df['product_name']:
    tokens, embeddings = get_word_embeddings(name, tokenizer, model)

    # Store NumPy arrays rather than torch tensors in the DataFrame.
    if isinstance(embeddings, torch.Tensor):
        embeddings = embeddings.detach().cpu().numpy()

    token_lists.append(tokens)
    embedding_arrays.append(embeddings)

# dtype=object keeps each list / 2-D array as a single cell value, and
# index=df.index aligns the results with the rows they were computed from.
df['tokens'] = pd.Series(token_lists, index=df.index, dtype=object)
df['embeddings'] = pd.Series(embedding_arrays, index=df.index, dtype=object)


In [7]:
# Persist the DataFrame as a pickle file instead of CSV: when written to CSV
# the stored embeddings were converted to strings.
preprocessed_path = 'dataset/preprocessed_data.pkl'
df.to_pickle(preprocessed_path)
