In [6]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import numpy as np
import os
from sklearn.preprocessing import normalize

data = pd.read_csv('Data/serenlens_with_features_cleaned_filtered.csv', dtype={'item_id': str}, sep=";")

unique_items = data.drop_duplicates(subset=['item_id'])

# This can be changed to different approachs to include other texts like description
unique_items['combined_text'] = unique_items['title'] + ": " + unique_items['features']

embedding_dir = 'embeddings'
os.makedirs(embedding_dir, exist_ok=True)

print("creating embeddings with all-mpnet-base-v2")
model_mpnet_base = SentenceTransformer('all-mpnet-base-v2')
embeddings_mpnet_base = model_mpnet_base.encode(unique_items['combined_text'].tolist())
embeddings_mpnet_base = normalize(embeddings_mpnet_base, norm='l2')
np.save(os.path.join(embedding_dir, 'all_mpnet_base_embeddings.npy'), embeddings_mpnet_base)

print("All embeddings salved")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_items['combined_text'] = unique_items['title'] + ": " + unique_items['features']


creating embeddings with all-mpnet-base-v2


In [10]:
data = pd.read_csv('Data/serenlens_with_features_cleaned_filtered.csv', dtype={'item_id': str}, sep=";")

unique_items = data.drop_duplicates(subset=['item_id'])

# This can be changed to different approachs to include other texts like description
unique_items['combined_text'] = unique_items['title'] + ": " + unique_items['features']

embedding_dir = 'embeddings'
os.makedirs(embedding_dir, exist_ok=True)

print("creating embeddings with bert-base-nli-mean-tokens")
model_bert_base = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings_bert_base = model_bert_base.encode(unique_items['combined_text'].tolist())
embeddings_bert_base = normalize(embeddings_bert_base, norm='l2')
np.save(os.path.join(embedding_dir, 'bert_base_embeddings.npy'), embeddings_bert_base)

print("creating embeddings with all-distilroberta-v1")
model_distilroberta = SentenceTransformer('all-distilroberta-v1')
embeddings_distilroberta = model_distilroberta.encode(unique_items['combined_text'].tolist())
embeddings_distilroberta = normalize(embeddings_distilroberta, norm='l2')
np.save(os.path.join(embedding_dir, 'distilroberta_embeddings.npy'), embeddings_distilroberta)

print("creating embeddings with BM25...")
tokenized_texts = [text.split() for text in unique_items['combined_text'].tolist()]
bm25 = BM25Okapi(tokenized_texts)
bm25_embeddings = np.array([bm25.get_scores(text.split()) for text in unique_items['combined_text'].tolist()])
bm25_embeddings = normalize(bm25_embeddings, norm='l2')
np.save(os.path.join(embedding_dir, 'bm25_embeddings.npy'), bm25_embeddings)

print("All embeddings salved")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_items['combined_text'] = unique_items['title'] + ": " + unique_items['features']


creating embeddings with bert-base-nli-mean-tokens
creating embeddings with all-distilroberta-v1
creating embeddings with BM25...
All embeddings salved


In [9]:
# divide train and test dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

data = pd.read_csv('Data/serenlens_with_features_cleaned_filtered.csv', dtype={'item_id': str}, sep=";")

data.drop(['description', 'features', 'title'], axis=1, inplace=True)

# consider serendipity is important
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['serendipity'])

train_data.to_csv('train_data.csv', index=False, sep=';')
test_data.to_csv('test_data.csv', index=False, sep=';')
