In [36]:
import pandas as pd
serenlens = pd.read_csv("Data/SerenLens_Books.csv")
# convert label to serendipity for better readibility
serenlens[['serendipity']] = serenlens[['label']].astype(int)
# remove not used serenlens fields
serenlens.drop(['timestamp', 'review', 'rating', 'label'], axis=1, inplace=True)
serenlens.head()

Unnamed: 0,user_id,item_id,serendipity
0,a10e3f50diujee,61148512,0
1,a10e3f50diujee,786718617,0
2,a10e3f50diujee,446617687,0
3,a10e3f50diujee,375725601,0
4,a10e3f50diujee,141800356,0


In [39]:
num_records = len(serenlens)
print(f"The inital number of records is «: {num_records}")

num_records_serendipitous = (serenlens['serendipity'] == 1).sum()
print("Number of serendipituos products:", num_records_serendipitous)

num_users = serenlens['user_id'].nunique()
print("Number of users:", num_users)

num_products = serenlens['item_id'].nunique()
print("Number of products:", num_products)

The inital number of records is «: 265037
Number of serendipituos products: 2557
Number of users: 2346
Number of products: 113876


In [40]:
import json

serenlens['item_id'] = serenlens['item_id'].astype(str).str.lower()
serenlens_ids = set(serenlens['item_id'])

filtered_data = []

with open('Data/meta_Books.jsonl', 'r') as fp:
    for line in fp:
        item = json.loads(line.strip())
        book_id = str(item['parent_asin']).lower()  #parent_asin is item_id in the metadata
        if book_id in serenlens_ids:
            filtered_data.append(item)

df_products = pd.DataFrame(filtered_data)

print(df_products.head())

  main_category                                              title  \
0         Books                        Service: A Navy SEAL at War   
1         Books  A Most Dangerous Book: Tacitus's Germania from...   
2         Books   Out of Sheer Rage: Wrestling with D. H. Lawrence   
3         Books                              One September Morning   
4         Books                  Looking for Peyton Place: A Novel   

                               subtitle  \
0               Hardcover – May 8, 2012   
1  Hardcover – Illustrated, May 2, 2011   
2         Paperback – November 10, 2009   
3           Paperback – January 1, 2009   
4             Hardcover – July 12, 2005   

                                              author  average_rating  \
0  {'avatar': 'https://m.media-amazon.com/images/...             4.7   
1  {'avatar': 'https://m.media-amazon.com/images/...             4.2   
2  {'avatar': 'https://m.media-amazon.com/images/...             4.2   
3  {'avatar': 'https://m.media-a

In [41]:
# dictionary to map items title, description and features
item_title_map = dict(zip(df_products['parent_asin'], df_products['title']))

item_description_map = dict(zip(df_products['parent_asin'], df_products['description']))

item_feature_map = dict(zip(df_products['parent_asin'], df_products['features']))

# Adicionar a nova coluna 'description' na base serenhance
serenlens['title'] = serenlens['item_id'].map(item_title_map)

# Adicionar a nova coluna 'description' na base serenhance
serenlens['description'] = serenlens['item_id'].map(item_description_map)

# Adicionar a nova coluna 'description' na base serenhance
serenlens['features'] = serenlens['item_id'].map(item_feature_map)

# Exibir as primeiras linhas do DataFrame resultante para verificar se a operação foi bem-sucedida
print(serenlens.head())

# Salvar o DataFrame resultante em um novo arquivo CSV
serenlens.to_csv('Data/serenlens_with_metadata.csv', index=False)

          user_id     item_id  serendipity  \
0  a10e3f50diujee  0061148512            0   
1  a10e3f50diujee  0786718617            0   
2  a10e3f50diujee  0446617687            0   
3  a10e3f50diujee  0375725601            0   
4  a10e3f50diujee  0141800356            0   

                                               title  \
0                                                NaN   
1  Lover of Unreason: Assia Wevill, Sylvia Plath'...   
2                                   Sex and the City   
3  The Devil in the White City: Murder, Magic, an...   
4                                                NaN   

                                         description  \
0                                                NaN   
1  [From Publishers Weekly, The "other woman" in ...   
2  [Excerpt. © Reprinted by permission. All right...   
3  [Review, “Engrossing . . . exceedingly well do...   
4                                                NaN   

                                            featu

In [42]:
import pandas as pd
serenlens = pd.read_csv("Data/serenlens_with_metadata.csv")
print(len(serenlens))

265037


In [43]:
# remove nulls
serenlens = serenlens.dropna(subset=['features'])
serenlens = serenlens.dropna(subset=['description'])

print(len(serenlens))
serenlens.head()

217757


Unnamed: 0,user_id,item_id,serendipity,title,description,features
1,a10e3f50diujee,786718617,0,"Lover of Unreason: Assia Wevill, Sylvia Plath'...","['From Publishers Weekly', 'The ""other woman"" ...",['The failure of the marriage between Sylvia P...
2,a10e3f50diujee,446617687,0,Sex and the City,['Excerpt. © Reprinted by permission. All righ...,['Enter a world where the sometimes shocking a...
3,a10e3f50diujee,375725601,0,"The Devil in the White City: Murder, Magic, an...","['Review', '“Engrossing . . . exceedingly well...",['#1 NATIONAL BESTSELLER • NATIONAL BOOK AWARD...
5,a10e3f50diujee,743249895,0,Female Chauvinist Pigs: Women and the Rise of ...,"['Amazon.com Review', 'Ariel Levy\x92s debut b...","[""A contributing editor at New York magazine e..."
6,a10e3f50diujee,812817079,0,My Story,[],['Marilyn Monroe presents a revealing portrait...


In [45]:
import re
from bs4 import BeautifulSoup

def list_to_str(l):
    if isinstance(l, list):
        return list_to_str(', '.join(l))
    else:
        return l

def feature_process(feature):
    sentence = ""
    if isinstance(feature, float):
        sentence += str(feature)
        sentence += '.'
    elif isinstance(feature, list) and len(feature) > 0:
        for v in feature:
            sentence += clean_text(v)
            sentence += ', '
        sentence = sentence[:-2]
        sentence += '.'
    else:
        sentence = clean_text(feature)
    return sentence


def clean_text(text):
    if not isinstance(text, str) or not text.strip():
        return ""
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'http\S+', '', text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    return text

serenlens['clean_title'] = serenlens['title'].apply(feature_process)
serenlens['clean_features'] = serenlens['features'].apply(feature_process)
serenlens['clean_description'] = serenlens['description'].apply(feature_process)

# Remove empty text after preprocessing
serenlens = serenlens[serenlens['clean_description'].str.strip() != '']
serenlens = serenlens[serenlens['clean_features'].str.strip() != '']

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [46]:
serenlens.head()

Unnamed: 0,user_id,item_id,serendipity,title,description,features,clean_title,clean_features,clean_description
1,a10e3f50diujee,786718617,0,"Lover of Unreason: Assia Wevill, Sylvia Plath'...","['From Publishers Weekly', 'The ""other woman"" ...",['The failure of the marriage between Sylvia P...,"lover of unreason assia wevill, sylvia plaths ...",the failure of the marriage between sylvia pla...,"from publishers weekly, the other woman in the..."
2,a10e3f50diujee,446617687,0,Sex and the City,['Excerpt. © Reprinted by permission. All righ...,['Enter a world where the sometimes shocking a...,sex and the city,enter a world where the sometimes shocking and...,excerpt. reprinted by permission. all rights ...
3,a10e3f50diujee,375725601,0,"The Devil in the White City: Murder, Magic, an...","['Review', '“Engrossing . . . exceedingly well...",['#1 NATIONAL BESTSELLER • NATIONAL BOOK AWARD...,"the devil in the white city murder, magic, and...",1 national bestseller national book award fin...,"review, engrossing . . . exceedingly well docu..."
5,a10e3f50diujee,743249895,0,Female Chauvinist Pigs: Women and the Rise of ...,"['Amazon.com Review', 'Ariel Levy\x92s debut b...","[""A contributing editor at New York magazine e...",female chauvinist pigs women and the rise of r...,a contributing editor at new york magazine exa...,"amazon.com review, ariel levyx92s debut book i..."
8,a10e3f50diujee,345439104,0,Drowning Ruth: A Novel (Oprah's Book Club),"['Review', '“Gripping . . . A story of deep fa...",['“POWERFUL . . . SUSPENSEFUL . . . RICHLY TEX...,drowning ruth a novel oprahs book club,powerful . . . suspenseful . . . richly textur...,"review, gripping . . . a story of deep family ..."


In [47]:
num_records_final = len(serenlens)
print(f"The final number of records is «: {num_records_final}")

num_records_serendipitous_final = (serenlens['serendipity'] == 1).sum()
print("Number of serendipituos products:", num_records_serendipitous_final)

num_users_final = serenlens['user_id'].nunique()
print("Number of users:", num_users_final)

num_products_final = serenlens['item_id'].nunique()
print("Number of products:", num_products_final)

The final number of records is «: 184028
Number of serendipituos products: 1898
Number of users: 2346
Number of products: 71830


In [48]:
serenlens.to_csv('Data/serenlens_with_metadata_cleaned.csv', index=False, sep = ";")

In [7]:
import pandas as pd

serenlens = pd.read_csv("Data/serenlens_with_metadata_cleaned.csv", sep = ";")

serenlens['title'] = serenlens['clean_title']
serenlens['description'] = serenlens['clean_description']
serenlens['features'] = serenlens['clean_features']

serenlens.drop(['clean_title', 'clean_features', 'clean_description'], axis=1, inplace=True)

min_text_length = 200
max_text_length = 2100

# filter description and feature to meet minimal character number
# limit character length for performance purpose
filtered_data = serenlens[(serenlens['description'].apply(len) + serenlens['features'].apply(len) >= min_text_length) & 
                           (serenlens['description'].apply(len) +  serenlens['features'].apply(len) <= max_text_length)]


# filter users with with at least one serendipitious item
users_with_positive_labels = filtered_data[filtered_data['serendipity'] == 1]['user_id'].unique()
filtered_data = filtered_data[filtered_data['user_id'].isin(users_with_positive_labels)]

num_records_final = len(filtered_data)
print(f"The final number of records is «: {num_records_final}")

num_records_serendipitous_final = (filtered_data['serendipity'] == 1).sum()
print("Number of serendipituos products:", num_records_serendipitous_final)

num_users_final = filtered_data['user_id'].nunique()
print("Number of users:", num_users_final)

num_products_final = filtered_data['item_id'].nunique()
print("Number of products:", num_products_final)
filtered_data.head()

The final number of records is «: 9169
Number of serendipituos products: 390
Number of users: 382
Number of products: 7330


Unnamed: 0,user_id,item_id,serendipity,title,description,features
997,a10xn3j1jexltj,1423111451,0,"deadly little lies touch, book 2","about the author, laurie faria stolarz, is the...","last fall, sixteenyearold camelia fell for ben..."
1001,a10xn3j1jexltj,802720749,0,"hearts at stake drake chronicles, book 1","about the author, alyxandra harvey studied cre...","on solanges sixteenth birthday, she is going t..."
1004,a10xn3j1jexltj,1416978909,0,devoured,"from school library journal, grade 8 upever si...",chapter 1
1006,a10xn3j1jexltj,545140315,0,the lonely hearts club,"from, booklist, after a devastating betrayal b...",love is all you need... or is it? pennys about...
1009,a10xn3j1jexltj,316154431,0,secrets of my hollywood life,"about the author, its no secret how jen caloni...",for fans of the princess diaries and famous in...


In [58]:
filtered_data.to_csv('Data/serenlens_with_metadata_cleaned_filtered.csv', index=False, sep = ";")