In [7]:
training_user_data_loc = r"drive/MyDrive/MSUoA/raw_user_data.csv"

In [9]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd

import nltk

# Ensure required packages are downloaded
nltk.download('punkt', download_dir='../nltk_data')
nltk.download('punkt_tab', force=True, download_dir='../nltk_data')
nltk.download('stopwords', download_dir='../nltk_data')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.data.path.append('../nltk_data')


users_data = pd.read_csv(training_user_data_loc)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess_reviews(reviews):
    lemmatized_reviews = []
    for review in reviews:
        tokens = word_tokenize(review)
        tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
        lemmatized = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
        lemmatized_reviews.append(' '.join(lemmatized))
    return lemmatized_reviews

users_data['user_reviews'] = users_data['user_reviews'].apply(preprocess_reviews)
users_data.to_csv('users_data_without_bert_text_processed.csv', index=False)

[nltk_data] Downloading package punkt to ../nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to ../nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to ../nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import ast

users_data = pd.read_csv('users_data_without_bert_text_processed.csv', index_col=0)

# BERT Embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(reviews):
    inputs = tokenizer(reviews, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Shape: [batch_size, 768]
    # Average if multiple embeddings exist
    if cls_embeddings.shape[0] > 1:
        return torch.mean(cls_embeddings, dim=0)  # Shape: [768]
    else:
        return cls_embeddings.squeeze(0)  # Shape: [768]


# Iterating over rows
train_data_with_embeddings = pd.DataFrame(columns=users_data.columns)
train_data_with_embeddings.to_csv('users_data_bert_embeddings.csv', header=True)
batch_data = []
batch_size = 10
for index, row in users_data.iterrows():
    user_reviews = row['user_reviews']
    embedding = get_bert_embeddings(user_reviews)
    row_copy = row.copy()
    row_copy['user_reviews'] = embedding
    print(f"Index: {index}, row {row_copy}")
    batch_data.append(row_copy)
    if (index + 1) % batch_size == 0:
        batch_df = pd.DataFrame(batch_data)
        batch_df.to_csv('users_data_bert_embeddings.csv', mode='a', header=False)
        batch_data = []  # Clear memory



Index: 0, row user_id                                                 ur0000006
movie_ids                                           ['tt0048613']
user_ratings                                                 [10]
user_reviews    [tensor(-0.0130), tensor(0.8864), tensor(0.553...
Name: 0, dtype: object
Index: 1, row user_id                                                 ur0000011
movie_ids       ['tt0029712', 'tt0030337', 'tt0031270', 'tt003...
user_ratings               [6, 10, 6, 5, 7, 6, 6, 6, 4, 5, 7, 10]
user_reviews    [tensor(-0.0104), tensor(0.9601), tensor(0.410...
Name: 1, dtype: object
Index: 2, row user_id                                                 ur0000157
movie_ids       ['tt0050480', 'tt0054911', 'tt0057273', 'tt005...
user_ratings                                     [4, 10, 4, 7, 7]
user_reviews    [tensor(0.0305), tensor(0.9307), tensor(0.4076...
Name: 2, dtype: object
Index: 3, row user_id                                                 ur0000206
movie_ids        