## Load Needed Libraries

In [None]:
### Just a reminder of the minimal libraries that we need for pre-processing
# %pip install pandas
# %pip install top2vec
# %pip install umap-learn
# %pip install scipy==1.12.0

In [None]:
import ast
import json
from pathlib import Path

import numpy as np
import pandas as pd
from top2vec import Top2Vec
from umap.umap_ import UMAP


## Load the Top2Vec Model and transfer to Pandas DataFrames

#### Get the Model

In [None]:
# Folder under the user's home directory that holds the model and the
# metadata/mapping files read by the cells below.
BASE_DIR = Path.home().joinpath("SEED_DATA/impact_and_fiction")

# Load the previously trained Top2Vec model from disk.
model_whole = Top2Vec.load(
    BASE_DIR.joinpath(
        "t2v_model-speed_learn-book_lemmas-uncased-min_df_0.01-max_df_0.5-content.model"
    )
)

# Cell output: number of topics in the loaded model.
model_whole.get_num_topics()

In [None]:
# Print the shape of each array stored on the model, one per line, in this
# order: topic vectors, document vectors, word vectors, document ids.
for attribute in (
    "topic_vectors",
    "document_vectors",
    "word_vectors",
    "document_ids",
):
    print(getattr(model_whole, attribute).shape)

#### Get the Topic Sizes

In [None]:
# One row per topic (indexed by topic number) with the topic's size.
topic_sizes, topic_nums = model_whole.get_topic_sizes()
df = pd.DataFrame(data=topic_sizes, index=topic_nums, columns=["topic_size"])

# The number of rows is the number of topics; reused by later cells.
total_topics_found = len(df.index)

print(f"Num of Documents = {df['topic_size'].sum()}")
print(f"Num of Topics = {total_topics_found}")
df

#### Get the most frequent topic words and their scores per topic

In [None]:
topic_words, word_scores, topic_nums = model_whole.get_topics(total_topics_found)

# Build one record per topic, keeping only the first 20 words and scores.
topic_dict = []
for words, scores in zip(topic_words, word_scores):
    topic_dict.append({"topic_words": words[:20], "word_scores": scores[:20]})

df_topic_words = pd.DataFrame(
    topic_dict,
    index=topic_nums,
    columns=["topic_words", "word_scores"],
)

# Put the size column and the word/score columns side by side.
df_score_words = pd.concat([df, df_topic_words], axis="columns")
df_score_words

#### Get the Document IDs (ISBNs) that belong to each topic and their membership scores (probability)

In [None]:

# For every topic, fetch all of its documents (as many as the topic's size)
# and record both the per-topic document lists and an ISBN -> topic lookup.
topic_documents = []
doc2topic = {}  # ISBN -> id of the topic the document was retrieved for
for topic_id, topic_row in df_score_words.iterrows():
    document_scores, document_ids = model_whole.search_documents_by_topic(
        topic_num=topic_id, num_docs=topic_row["topic_size"]
    )
    # Model ids look like '<some/path>/<timestamp>_<isbn>'. Take the file name
    # first (as remove_book_duplicates does) so that an underscore in a
    # directory name cannot shift which piece is read as the ISBN.
    isbn_only = [doc_id.split("/")[-1].split("_")[1] for doc_id in document_ids]
    topic_documents.append({"doc_ids": isbn_only, "doc_scores": list(document_scores)})
    # If an ISBN shows up more than once, the last topic seen wins.
    doc2topic.update(dict.fromkeys(isbn_only, topic_id))

df_topic_docs = pd.DataFrame(topic_documents, index=df_score_words.index)
df_topic_docs = pd.concat([df_score_words, df_topic_docs], axis=1)

df_topic_docs.to_csv("data/topic_docs_table.csv", index=True)

df_topic_docs

#### Compress the Document Vectors into 2-dimensions using UMAP

In [None]:
# Project the document vectors with UMAP. Only `verbose` is set, so the output
# dimensionality is UMAP's default (the heading above expects 2 -- later cells
# read components 0 and 1 of every reduced vector).
umap = UMAP(verbose=True)
reduced_document_vectors = umap.fit_transform(
    model_whole.document_vectors,
)

# Cell output: shape of the reduced matrix.
reduced_document_vectors.shape

#### Filter and Clean the Table 
So we only keep the deduplicated books that are in `novels_tokens`.
We also create the division between `Non-fiction` and `Fiction` books

In [None]:
import json

def remove_book_duplicates(top2vec_doc_ids, doc_vecs, valid_files_mapping):
    """Keep only the documents whose token file is listed in ``valid_files_mapping``.

    The Top2Vec model was created with some duplicated books (e.g. different
    publishers of the same book). To index every book by a single ISBN without
    losing track of which vector belongs to which book, the documents are
    filtered against a predefined collection of deduplicated file names.

    Args:
        top2vec_doc_ids: Document ids as saved in the Top2Vec model, e.g.
            ``'/home/wfa010/out/txt/IP1574938630912/20191029143048_9789045020860'``.
        doc_vecs: One vector per document, aligned with ``top2vec_doc_ids``.
            Only components 0 and 1 of each vector are read.
        valid_files_mapping: Container supporting ``in`` whose keys are the
            deduplicated file names, in the form
            ``'<timestamp>_<isbn>-tokens.txt.gz'``.

    Returns:
        A tuple ``(clean_ids, clean_vecs_x, clean_vecs_y)`` of three lists of
        equal length: the ISBNs of the kept documents and the first and second
        component of their vectors.
    """
    clean_ids, clean_vecs_x, clean_vecs_y = [], [], []
    for doc_id, doc_vec in zip(top2vec_doc_ids, doc_vecs):
        file_stem = doc_id.split("/")[-1]  # '<timestamp>_<isbn>'
        if f"{file_stem}-tokens.txt.gz" not in valid_files_mapping:
            continue
        # Parse the ISBN only for documents that are kept, so that a malformed
        # id which is discarded anyway cannot raise an IndexError.
        clean_ids.append(file_stem.split("_")[1])
        clean_vecs_x.append(doc_vec[0])
        clean_vecs_y.append(doc_vec[1])

    return clean_ids, clean_vecs_x, clean_vecs_y

# Load the deduplicated file names; the context manager closes the file
# instead of leaving the handle open for the rest of the session.
with open(BASE_DIR / "filename2isbn.json", encoding="utf-8") as mapping_file:
    valid_files_mapping = json.load(mapping_file)

# The reduced vectors are iterated row by row, so no list() copy is needed.
clean_doc_ids, clean_doc_vecs_x, clean_doc_vecs_y = remove_book_duplicates(
    model_whole.document_ids, reduced_document_vectors, valid_files_mapping
)
print(len(clean_doc_ids))

# Coordinates per ISBN, and topic id per ISBN, joined side by side on the ISBN index.
df_doc_vectors = pd.DataFrame({"doc_x": clean_doc_vecs_x, "doc_y": clean_doc_vecs_y}, index=clean_doc_ids)
df_doc2topic = pd.DataFrame(doc2topic.items(), columns=["doc_id", "topic_id"])
df_doc2topic = df_doc2topic.set_index("doc_id")
book_db = pd.concat([df_doc_vectors, df_doc2topic], axis=1)
book_db

### Add NUR Information (Genres of interest) columns

In [None]:
# def get_isbn_nur(isbn_mappings_file):
#     dtype = {
#         'isbn': str,
#         'nur': str
#     }

#     temp_df = pd.read_csv(isbn_mappings_file, index_col=False, sep='\t', dtype=dtype)
#     isbn_nur = temp_df[['isbn', 'nur']].set_index('isbn') # .drop_duplicates() ?
#     isbn_nur['nur'] = isbn_nur.nur.apply(lambda x: int(x) if isinstance(x, str) else 0)
#     return pd.get_dummies(isbn_nur.nur)

# BASE_DIR = Path.home() / "SEED_DATA/impact_and_fiction"
# isbn_mappings_file = BASE_DIR / "work-isbn-mapping.tsv"
# assert Path.is_file(isbn_mappings_file)  
# isbn_nur = get_isbn_nur(isbn_mappings_file)
# print(isbn_nur.shape)
# isbn_nur.to_csv("data/isbn_nur_table.csv")
# isbn_nur

In [None]:
# This cell does not match all of the ISBNs from the files with the isbn_nur_table (more than 14K are missing), so we skip this step. The next table already has the NURs.

# def get_nur_from_isbn(isbn_nur_df, isbn):
#     valid_row = isbn_nur_df[isbn_nur_df.index == isbn]
#     if valid_row.empty:
#         return []
#     else:
#         return valid_row.columns[isbn_nur_df.loc[isbn_nur_df.index == isbn].all(axis=0)].tolist()

# # print(get_nur_from_isbn(isbn_nur, "9789028426214"))

# book_db["nur_id"] = book_db.apply(lambda row: get_nur_from_isbn(isbn_nur, row.name), axis=1)
# book_db.to_csv("book_nur_table.csv")
# book_db



In [None]:
book_metadata = BASE_DIR / "book-metadata.csv"
# Read ISBNs as text: left to type inference, a numeric-looking column is
# parsed as integers, which drops leading zeros and can never match the
# string ISBNs used as the index of book_db.
metadata_df = pd.read_csv(book_metadata, dtype={"isbn": str})
metadata_df = metadata_df.set_index("isbn")

# Keep the first row per ISBN so every label selects exactly one row below.
metadata_df = metadata_df.loc[~metadata_df.index.duplicated(keep="first")]

# Restrict the metadata to the books present in book_db.
common_indices = metadata_df.index.intersection(book_db.index)
meta_filtered = metadata_df.loc[common_indices].sort_index()
meta_filtered

In [None]:
# NUR code -> English genre label for the genres of interest.
# Code 0 is the placeholder for an unknown genre.
nur_labels_EN = {
    0: "unknown",
    285: "Young adult fiction",
    300: "literary fiction general",
    301: "Dutch literary novel, novella",
    302: "translated literary novel, novella",
    305: "literary thriller",
    311: "literary fiction, pocket",
    312: "popular fiction, pocket",
    313: "suspense pocket",
    315: "translated pocket",
    330: "suspense general",
    331: "detective",
    332: "thriller",
    333: "science fiction",
    334: "fantasy",
    335: "scary- and ghost stories, horror",
    336: "adventure novel",
    337: "war and resistance novel",
    338: "spy novel",
    340: "popular fiction general",
    342: "historical novel (popular)",
    343: "romance",
}

In [None]:
def get_nur_names(nur_ids):
    """Translate NUR codes into their English labels from ``nur_labels_EN``.

    Args:
        nur_ids: Either a list of NUR codes, or a string holding the Python
            literal of such a list (e.g. ``"[301, 302]"``). Any other value
            (such as NaN for a missing cell) is treated as "no codes".

    Returns:
        The labels of the codes found in ``nur_labels_EN``, in input order.
        Codes that are not in the table are skipped silently.

    Raises:
        ValueError, SyntaxError: If a string argument is not a valid Python
            literal, or a code cannot be converted with ``int``.
    """
    if isinstance(nur_ids, str):
        # The strings come from a CSV file: parse them as plain literals
        # rather than executing them as arbitrary code with eval().
        nur_ids = ast.literal_eval(nur_ids)
    elif not isinstance(nur_ids, list):
        nur_ids = []

    return [
        nur_labels_EN[code]
        for code in map(int, nur_ids)
        if code in nur_labels_EN
    ]


# Attach the bibliographic columns to every row of book_db (join on the
# ISBN index), then derive the readable genre labels from the NUR codes.
metadata_df = book_db.join(meta_filtered[["title", "author", "publisher", "nur"]])
metadata_df["nur_names"] = metadata_df["nur"].map(get_nur_names)

metadata_df

In [None]:
# ## Don't need this cell anymore but it is a very nice trick to merge duplicated rows into a single row with all values in a list per row

# df_books = pd.read_csv(isbn_mappings_file, sep="\t")
# print(df_books[df_books["isbn"] == "9789044630039"])

# df_books[df_books["isbn"] == "0312347324"]
# df_books[df_books["isbn"] == "9789023449348"]

# df_books_mini = df_books[["isbn", "title", "author", "publisher"]]

# def merge_to_list(group):
#     id_value = group['isbn'].iloc[0]
#     merged_values = [id_value]
#     merged_values.append(group['title'].tolist())
#     merged_values.append(group['author'].tolist())
#     merged_values.append(group['publisher'].tolist())
    
#     return pd.Series(merged_values, index=['isbn', 'title', 'author', 'publisher'])

# # Apply the function to each group
# transformed_df = df_books_mini.groupby('isbn').apply(merge_to_list).reset_index(drop=True)

# transformed_df[transformed_df["isbn"] == "9789044630039"]
# # transformed_df[transformed_df["isbn"] == "9789023449348"]


### Add the ISBN -> Work-Id Mapping

The known books are the 18,467 valid files with the full book content inside `novels_tokens` folder

In [None]:
work_isbn_path = BASE_DIR / "work_isbn_title_genre.tsv"
work_isbn_db = pd.read_csv(work_isbn_path, sep="\t")

# Keep only the records identified by an ISBN, reduce them to the work id,
# and index the table by the record id.
work_isbn_db = work_isbn_db.loc[
    work_isbn_db["record_id_type"] == "isbn", ["work_id", "record_id"]
].set_index("record_id")

# Strip surrounding whitespace from the ids and call the index 'isbn'.
work_isbn_db.index = work_isbn_db.index.str.strip().rename("isbn")
work_isbn_db

In [None]:
import json
# Filter to only have the relevant ISBNs for the dataset (18,467)
common_indices = metadata_df.index.intersection(work_isbn_db.index)
relevant_work_isbns = work_isbn_db.loc[common_indices]
relevant_work_isbns.to_csv("data/isbn_to_work_ids.csv")

# Save as dictionaries to re-use later by other notebooks
isbn_dict = relevant_work_isbns.to_dict(orient='index')
isbn2workId = {k: v['work_id'] for k, v in isbn_dict.items()}
# NOTE: the inversion is lossy -- when several ISBNs share one work id, only
# the last ISBN encountered is kept for that work id.
workId2isbn = {v: k for k, v in isbn2workId.items()}

# Write through context managers so both files are flushed and closed.
with open("data/isbn2workId.json", "w", encoding='utf-8') as out_file:
    json.dump(isbn2workId, out_file)
with open("data/workId2isbn.json", "w", encoding='utf-8') as out_file:
    json.dump(workId2isbn, out_file)

relevant_work_isbns

## Final Book Table

In [None]:
def process_element_as_list(elem, get_only_first):
    """Normalise a table cell holding a list, optionally reducing it to one value.

    Args:
        elem: The cell value: a list, a string holding the Python literal of
            a list (e.g. ``"['b', 'a']"``), a float (NaN marks a missing
            cell) or ``None``.
        get_only_first: When true, return a single element -- the first one
            in sorted order -- instead of the whole list.

    Returns:
        * ``elem`` unchanged when it is a float or ``None`` (missing value);
        * the parsed list when ``get_only_first`` is false;
        * the smallest element of the list when ``get_only_first`` is true,
          or ``None`` when the list is empty.

    Raises:
        TypeError: If ``elem`` is of any other type.
        ValueError, SyntaxError: If a string is not a valid Python literal.
    """
    if elem is None or isinstance(elem, float):  # NaN is a float
        return elem

    if isinstance(elem, str):
        # The strings come from a CSV file: parse them as plain literals
        # rather than executing them as arbitrary code with eval().
        new_elem = ast.literal_eval(elem)
    elif isinstance(elem, list):
        new_elem = elem
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise TypeError(
            f"Unsupported cell value of type {type(elem).__name__}: {elem!r}"
        )

    if not get_only_first:
        return new_elem

    # NOTE: with several candidates the choice is arbitrary -- simply the
    # first one in sorted order.
    if len(new_elem) == 0:
        return None
    return sorted(new_elem)[0]

    
# Inner join on ISBN: only books that have a work id are kept. There are
# actually only 18,465 valid records; the other two have NaN values anyway,
# so we get rid of them.
book_db = metadata_df.join(relevant_work_isbns, how="inner")

# WARN!!! For title, author and publisher we arbitrarily keep a single value:
# the first one in sorted order.
for column in ("title", "author", "publisher"):
    book_db[column] = book_db[column].apply(process_element_as_list, get_only_first=True)

# The NUR codes are only parsed here; the complete list is kept.
book_db["nur"] = book_db["nur"].apply(process_element_as_list, get_only_first=False)

# WARN!!! The genre is, again arbitrarily, the first NUR name in sorted order.
book_db["genre"] = book_db["nur_names"].apply(process_element_as_list, get_only_first=True)

book_db.to_csv("data/book_topic.tsv", sep="\t")

book_db

### Get the Unique Values to use them later as drop-down filters

In [None]:

# We force the explode because sometimes the fields are stringified lists and
# they need to be Python lists before their values can be spread out.
def force_explode(column_value):
    """Return the values of a list-like table cell as a ``pd.Series``.

    Args:
        column_value: A list, a string holding the Python literal of a list
            (e.g. ``"[301, 302]"``), or anything else (such as NaN), which is
            treated as an empty list.

    Returns:
        A Series with one entry per list element; empty for missing cells.

    Raises:
        ValueError, SyntaxError: If a string is not a valid Python literal.
    """
    if isinstance(column_value, str):
        # The strings come from a CSV file: parse them as plain literals
        # rather than executing them as arbitrary code with eval().
        vals = ast.literal_eval(column_value)
    elif isinstance(column_value, list):
        # Already a list: use it as is.
        vals = column_value
    else:
        vals = []
    return pd.Series(vals)

# For each field: spread the list-valued cells into individual values, count
# how often each value occurs, and keep the values seen more than twice.
nur_counts = metadata_df['nur'].apply(force_explode).stack().value_counts()
nur_counts = nur_counts[nur_counts > 2]
nur_counts.to_csv("data/unique_counts_nur.csv")

# Count the 'author' column itself, not 'nur'.
author_counts = metadata_df['author'].apply(force_explode).stack().value_counts()
author_counts = author_counts[author_counts > 2]
author_counts.to_csv("data/unique_counts_author.csv")

# Count the 'publisher' column itself, not 'nur'.
publisher_counts = metadata_df['publisher'].apply(force_explode).stack().value_counts()
publisher_counts = publisher_counts[publisher_counts > 2]
publisher_counts.to_csv("data/unique_counts_publisher.csv")