## book_reviews

In [None]:
## book_reviews
from kagglehub import KaggleDatasetAdapter
import kagglehub
import pandas as pd

# `kagglehub.load_dataset` is deprecated and warns on every call;
# `dataset_load` is its replacement and accepts the same arguments.
book_reviews = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "mohamedbakhet/amazon-books-reviews",
    path="Books_rating.csv",
)

def preprocess_books_rating(df):
    """Clean the raw ``Books_rating.csv`` frame and return a new DataFrame.

    Transformations:
      * ``review/helpfulness`` ("helpful/total" strings) becomes
        ``review_helpfulness``: the share of helpful votes as a percentage,
        rounded to two decimals. Rows whose total is 0 or missing get NaN.
      * ``review/time`` (Unix seconds) becomes ``review_time`` holding
        ``datetime.date`` objects.
      * ``review/helpfulness``, ``review/time`` and ``profileName`` are
        dropped, and the remaining columns are renamed to snake_case.

    The input frame is not modified; all work happens on derived objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews with at least the columns ``review/helpfulness``,
        ``review/time`` and ``profileName``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with ``review_helpfulness`` and ``review_time``
        appended after the surviving original columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a helpfulness value is not of the form "<number>/<number>".
    """
    # n=1 keeps malformed values such as "1/2/3" from silently passing:
    # the remainder "2/3" fails the float conversion instead.
    votes = df['review/helpfulness'].str.split('/', n=1, expand=True).astype(float)
    helpful_votes, total_votes = votes[0], votes[1]

    # Vectorised percentage; `.where` blanks out rows with no votes so the
    # inf/NaN produced by dividing by zero never reaches the result.
    review_helpfulness = (
        (helpful_votes / total_votes * 100)
        .where(total_votes > 0)
        .round(2)
    )

    review_time = pd.to_datetime(df['review/time'], unit='s').dt.date

    cleaned = (
        df
        .drop(columns=['review/helpfulness', 'review/time', 'profileName'])
        .rename(columns={
            'review/score': 'review_score',
            'review/summary': 'review_summary',
            'review/text': 'review_text',
            'Id': 'id',
            'Title': 'title',
            'User_id': 'user_id',
            'Price': 'price',
        })
    )
    cleaned['review_helpfulness'] = review_helpfulness
    cleaned['review_time'] = review_time

    return cleaned

# Swap the raw reviews frame for its cleaned counterpart.
book_reviews = book_reviews.pipe(preprocess_books_rating)

  book_reviews = kagglehub.load_dataset(


## books_metadata

In [None]:
## books_metadata
import numpy as np

# `kagglehub.load_dataset` is deprecated and warns on every call;
# `dataset_load` is its replacement and accepts the same arguments.
books_metadata = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "mohamedbakhet/amazon-books-reviews",
    path="books_data.csv",
)

def preprocess_books_data(books_metadata, books_rating):
    """Clean the raw ``books_data.csv`` frame and return a new DataFrame.

    Transformations:
      * ``published_year`` is the first four-digit year (1000-2099) found in
        ``publishedDate``, kept as a string; NaN when none is found.
      * ``publishedDate`` and ``previewLink`` are dropped, and ``infoLink``,
        ``ratingsCount`` and ``Title`` are renamed to snake_case.
      * Missing ``ratings_count`` values are filled with the number of rows
        the title has in ``books_rating`` (0 if it has none), and the column
        is cast to ``int64``.

    The input frames are not modified. Previously the drop/rename steps
    altered the caller's ``books_metadata`` while the merged result was a
    separate object, leaving the argument half-processed.

    Parameters
    ----------
    books_metadata : pandas.DataFrame
        Raw metadata with at least ``Title``, ``publishedDate`` and
        ``ratingsCount``.
    books_rating : pandas.DataFrame
        Reviews frame with a ``title`` column (one row per review).

    Returns
    -------
    pandas.DataFrame
        Cleaned metadata, one row per input row, in the input order.
    """
    published_year = (
        books_metadata['publishedDate']
        .astype(str)
        .str.extract(r'\b(1[0-9]{3}|20[0-9]{2})\b')[0]
    )

    cleaned = (
        books_metadata
        .assign(published_year=published_year)
        .drop(columns=['publishedDate', 'previewLink'], errors='ignore')
        .rename(columns={
            'infoLink': 'info_link',
            'ratingsCount': 'ratings_count',
            'Title': 'title',
        })
    )

    # Number of reviews per title, used as a fallback rating count.
    review_counts = books_rating.groupby('title').size().reset_index(name='review_count')

    merged = cleaned.merge(review_counts, on='title', how='left')
    merged['ratings_count'] = (
        merged['ratings_count']
        .fillna(merged['review_count'])
        .fillna(0)  # titles with neither a stored count nor any review
        .astype(np.int64)
    )

    # `review_count` was only needed for the fill above.
    return merged.drop(columns='review_count')

# Swap the raw metadata frame for its cleaned counterpart; the reviews frame
# supplies the per-title review counts.
books_metadata = books_metadata.pipe(preprocess_books_data, book_reviews)

  books_metadata = kagglehub.load_dataset(


In [None]:
# Create categories table
import ast
import re

def safe_eval(val):
    """Parse a stringified Python literal (e.g. ``"['Fiction']"``).

    Parameters
    ----------
    val : object
        The cell value to parse.

    Returns
    -------
    object
        * Non-string values are returned unchanged.
        * Strings holding a valid Python literal are returned parsed.
        * Strings that are not valid literals are returned wrapped in a
          one-element list, so a single malformed cell is kept as one raw
          value instead of aborting an entire ``Series.apply`` pass.
    """
    if not isinstance(val, str):
        return val
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError):
        # Not a Python literal (for example a bare "Fiction & Art").
        return [val]

# Missing cells are replaced with the literal "[]" before every cell is
# parsed with safe_eval.
books_metadata['categories'] = (
    books_metadata['categories']
    .fillna('[]')
    .apply(safe_eval)
)

def split_categories(cat_list):
    """Flatten a list of category labels into individual category names.

    Every label is split on ``&`` and ``,``; each piece is stripped of
    surrounding whitespace and empty pieces are discarded. Pieces keep the
    order in which they appear.
    """
    return [
        piece.strip()
        for label in cat_list
        for piece in re.split(r'&|,', label)
        if piece.strip()
    ]

# Break compound labels inside every book's category list into single names.
books_metadata['categories'] = books_metadata['categories'].apply(split_categories)

# Long format: one row per (title, category) pair.
expand = books_metadata.loc[:, ['title', 'categories']].explode('categories')

# Lookup table: each distinct category gets a 1-based id in first-seen order.
unique_categories = pd.unique(expand['categories'].dropna())
categories = pd.DataFrame({
    'category_id': range(1, len(unique_categories) + 1),
    'category_name': unique_categories,
})

# The list column has been exploded into `expand`; remove it from the books table.
del books_metadata['categories']

## book_categories

In [None]:
## book_categories relationship
# Map every exploded category name to its surrogate key.
category_map = dict(zip(categories['category_name'], categories['category_id']))
expand['category_id'] = expand['categories'].map(category_map)

# Rows without a category map to NaN, which makes the column float. Once those
# rows are dropped the ids are cast back to integers so the link table (and
# its CSV export) holds 1, 2, ... rather than 1.0, 2.0, ...
book_categories = (
    expand[['title', 'category_id']]
    .dropna()
    .drop_duplicates()
    .astype({'category_id': 'int64'})
)
book_categories

Unnamed: 0,title,category_id
0,Its Only Art If Its Well Hung!,1.0
0,Its Only Art If Its Well Hung!,2.0
1,Dr. Seuss: American Icon,3.0
1,Dr. Seuss: American Icon,4.0
2,Wonderful Worship in Smaller Churches,5.0
...,...,...
212397,The Magic of the Soul: Applying Spiritual Powe...,25.0
212398,Autodesk Inventor 10 Essentials Plus,54.0
212399,The Orphan Of Ellis Island (Time Travel Advent...,34.0
212400,Red Boots for Christmas,34.0


In [None]:
from google.colab import files
# Write the book-to-category link table to CSV (row index omitted), then hand
# the file to Colab's download helper.
book_categories.to_csv('book_categories.csv', index=False)
files.download('book_categories.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## isbndb_book

In [None]:
!pip install datasets
from datasets import load_dataset
isbndb_book = load_dataset("P1ayer-1/isbndb-full-database")

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/671 [00:00<?, ?B/s]

(…)-00000-of-00011-8c76aaae89c32750.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

(…)-00001-of-00011-0570d9f1f07ee5ea.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

(…)-00002-of-00011-43bde59cf032f28e.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00003-of-00011-cc6f9f6e8311bb1d.parquet:   0%|          | 0.00/300M [00:00<?, ?B/s]

(…)-00004-of-00011-c0423bfbe2df7a76.parquet:   0%|          | 0.00/321M [00:00<?, ?B/s]

(…)-00005-of-00011-bfd8f039d2714da0.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

(…)-00006-of-00011-18be629cdddd6f97.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

(…)-00007-of-00011-ea8e09c51301bce7.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

(…)-00008-of-00011-bff7d5ac18f4238c.parquet:   0%|          | 0.00/297M [00:00<?, ?B/s]

(…)-00009-of-00011-84ef5d0ca46f90be.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

(…)-00010-of-00011-dfef6d372d63bc4c.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28086774 [00:00<?, ? examples/s]

In [None]:
## isbndb_book
import re

def preprocess_function(example):
    """Normalise one ISBNdb record (a dict-like row) and return it.

    * ``published_year``: the first four-digit year (1000-2099) found in
      ``date_published``, as a string, or ``None`` when there is none.
    * ``language``: reduced to the part before the first underscore
      (``"en_US"`` -> ``"en"``). A missing (``None``) language stays ``None``
      instead of being turned into the literal string ``"None"``.

    The record is updated in place and also returned.
    """
    # Standardize date
    match = re.search(r'\b(1[0-9]{3}|20[0-9]{2})\b', str(example.get('date_published', '')))
    example['published_year'] = match.group(0) if match else None

    # Simplify language; split() returns the whole string when there is no
    # underscore, so no separate membership test is needed.
    lang = example.get('language', '')
    example['language'] = None if lang is None else str(lang).split('_')[0]

    return example

# Normalise every record, then drop the columns that the derived fields replace.
isbndb_book['train'] = (
    isbndb_book['train']
    .map(preprocess_function)
    .remove_columns(['date_published', 'title_long'])
)

# Persist the cleaned books table.
isbndb_book['train'].to_csv('isbndb_book.csv')

Map:   0%|          | 0/28086774 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
# Create authors table
# Materialise the split as a DataFrame and keep only rows that have authors.
clean_df = isbndb_book['train'].to_pandas()
clean_df = clean_df.dropna(subset=['authors']).copy()

# One row per entry of each record's `authors` value.
expanded_df = clean_df.explode(column='authors')

# Split authors to individual names
def smart_split(author_str):
    """Split a raw author string into a list of individual author names.

    Rules, applied in order:
      1. Non-string or blank input yields ``[]``.
      2. If the string contains ``;`` or a whitespace-delimited ``&`` / ``and``,
         it is split on those separators.
      3. Otherwise, if it consists of exactly two comma-separated parts that
         each contain at least two words, those are treated as two authors
         ("John Smith, Jane Doe"). A single "Smith, John" is left intact.
      4. Otherwise the whole stripped string is one author.

    Returned names are stripped; empty names are never returned.
    """
    if not isinstance(author_str, str):
        return []

    # A blank string used to come back as [''] and leak an empty author name.
    if not author_str.strip():
        return []

    # re.split returns more than one piece exactly when a separator matched,
    # so the pattern only needs to be evaluated once.
    pieces = re.split(r'\s+&\s+|\s+and\s+|;', author_str)
    if len(pieces) > 1:
        return [p.strip() for p in pieces if p.strip()]

    parts = [p.strip() for p in author_str.split(',')]

    if len(parts) == 2 and all(len(p.split()) >= 2 for p in parts):
        return parts

    return [author_str.strip()]

# Each raw author entry may still hold several names; split it into a list.
expanded_df['cleaned_author'] = expanded_df['authors'].apply(smart_split)

# One row per individual author name.
final_authors = expanded_df.explode('cleaned_author')

# Placeholder values that are not real names; each is matched
# case-insensitively against the whole stripped string.
bad_terms = ['Unknown', 'Last', 'First', 'Na', 'Author', 'Editor']
pattern = '|'.join([f'^{re.escape(term)}$' for term in bad_terms])

final_authors = final_authors[
    final_authors['cleaned_author'].notna() &
    ~final_authors['cleaned_author'].astype(str).str.strip().str.match(pattern, case=False, na=False)
]

# Authors lookup table with 1-based ids in first-seen order.
authors = final_authors[['cleaned_author']].drop_duplicates().reset_index(drop=True)
authors = authors.rename(columns={'cleaned_author': 'author_name'})
authors['author_id'] = authors.index + 1

# `isbndb_book['train']` is a Hugging Face Dataset, which has no pandas-style
# `.drop(...)`. Columns are removed with `remove_columns`, which returns a new
# Dataset. The membership check keeps the original "ignore if absent" intent.
if 'authors' in isbndb_book['train'].column_names:
    isbndb_book['train'] = isbndb_book['train'].remove_columns(['authors'])

## book_authors

In [None]:
## book_authors relationship
# Only the ISBN and the cleaned name are needed for the link table. Projecting
# before the join avoids carrying every book column through the merge, and
# avoids suffix clashes if the books frame has its own `author_id` or
# `author_name` column.
book_authors = (
    final_authors[['isbn13', 'cleaned_author']]
    .merge(
        authors,
        left_on='cleaned_author',
        right_on='author_name',
        how='inner',
    )
    .loc[:, ['isbn13', 'author_id']]
    .rename(columns={'isbn13': 'isbn'})
    .drop_duplicates()
    .reset_index(drop=True)
)
book_authors