In [1]:
import sys
import os
import pandas as pd 
import tensorflow as tf
# Raise the TensorFlow logger threshold to ERROR so lower-severity messages are not emitted.
tf.get_logger().setLevel('ERROR')

from recommenders.datasets.amazon_reviews import get_review_data
from recommenders.datasets.split_utils import filter_k_core
from recommenders.models.sasrec.model import SASREC
from recommenders.models.sasrec.ssept import SSEPT
from recommenders.models.sasrec.sampler import WarpSampler
from recommenders.models.sasrec.util import SASRecDataSet
from recommenders.utils.notebook_utils import store_metadata
from recommenders.utils.timer import Timer


# Record the interpreter and TensorFlow versions this notebook was executed with.
print(
    f"System version: {sys.version}",
    f"Tensorflow version: {tf.__version__}",
    sep="\n",
)

System version: 3.9.19 (main, Mar 21 2024, 17:21:27) [MSC v.1916 64 bit (AMD64)]
Tensorflow version: 2.12.0


In [2]:
# Directory, relative to the working directory, in which the prepared dataset file is written.
data_dir = "./"

# Base name of the generated dataset file (original author's note: "SerenLens modified").
# NOTE(review): the name refers to "reviews_Books_5", while the data read further down comes
# from a SerenLens CSV -- confirm that keeping this name is intentional.
dataset = "reviews_Books_5"

In [3]:
# File name of the exported dataset: the dataset base name with a ".txt" extension.
outfile = f"{dataset}.txt"

In [11]:
# Load the semicolon-separated SerenLens CSV. `item_id` is forced to `str` so the
# identifiers are kept as text instead of being parsed as numbers.
df = pd.read_csv(
    "../Data/serenlens_with_features_cleaned_filtered.csv",
    sep=";",
    dtype={'item_id': str},
)
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,user_id,item_id,timestamp,rating,serendipity,title,description,features
0,a10f5lmypxqydf,380761319,1140048000,5,0,the shadow and the star,"from publishers weekly, kinsale the prince of...",from nationally acclaimed bestselling author l...
1,a10f5lmypxqydf,373294395,1173484800,4,0,beau crusoe,"about the author, i started writing regencies ...","shipwrecked!stranded alone on a desert island,..."
2,a10f5lmypxqydf,60090383,1176595200,5,1,rachels holiday,"about the author, marian keyes is the author o...",the fast lane is much too slow for rachel wals...
3,a10f5lmypxqydf,60724560,1183852800,3,0,"princess on the brink princess diaries, vol. 8","from, booklist, princess mia, now a highschool...","at last, mia is a junior. an upperclassperson...."
4,a10f8dlu94ackp,545123283,1302048000,4,0,linger,"amazon.com review, amazon exclusive a qa with ...","in maggie stiefvaters shiver, grace and sam fo..."


In [12]:
# Derive the userID / itemID / time / label columns (IDs lower-cased) and discard
# the raw columns they were built from, together with the text and rating columns.
df = df.assign(
    userID=df['user_id'].str.lower(),
    itemID=df['item_id'].str.lower(),
    time=df['timestamp'],
    label=df['serendipity'],
).drop(
    columns=[
        'user_id',
        'item_id',
        'timestamp',
        'serendipity',
        'title',
        'features',
        'description',
        'rating',
    ]
)
# Preview the reduced frame.
df.head()

Unnamed: 0,userID,itemID,time,label
0,a10f5lmypxqydf,380761319,1140048000,0
1,a10f5lmypxqydf,373294395,1173484800,0
2,a10f5lmypxqydf,60090383,1176595200,1
3,a10f5lmypxqydf,60724560,1183852800,0
4,a10f8dlu94ackp,545123283,1302048000,0


In [13]:
# Map the raw user and item identifiers to consecutive integer IDs starting at 1,
# then order every user's interactions by time.
user_set, item_set = set(df['userID'].unique()), set(df['itemID'].unique())

# Enumerate in sorted order rather than in set-iteration order: the iteration order
# of a set of strings depends on per-process hash randomization, so the ID assignment
# (and therefore the exported file) would otherwise change between kernel restarts.
# `key=str` keeps the ordering well-defined even if a value is not a string
# (e.g. a missing ID).
user_map = {user: idx for idx, user in enumerate(sorted(user_set, key=str), start=1)}
item_map = {item: idx for idx, item in enumerate(sorted(item_set, key=str), start=1)}

# Every value in the columns is a key of the corresponding map by construction.
df["userID"] = df["userID"].map(user_map)
df["itemID"] = df["itemID"].map(item_map)

# Sort by user and then by timestamp; the timestamp is only needed for this
# ordering and is dropped afterwards.
df = df.sort_values(by=["userID", "time"]).drop(columns=["time"])

In [15]:
# Number of rows whose label equals 1.0.
count_label_1_seren = len(df.loc[df['label'] == 1.0])

# Display the result
print("Número de entradas com label igual a 1.0 seren:", count_label_1_seren)

Número de entradas com label igual a 1.0 seren: 494


In [16]:
# Export the prepared frame as a tab-separated text file with neither a header
# row nor the DataFrame index.
df.to_csv(
    os.path.join(data_dir, outfile),
    index=False,
    header=False,
    sep="\t",
)