In [None]:
## Called the PRH API again to find the categories labelled 'classics'.
## Exported the classics titles based on these categories.
## Then worked with CGPT to 1) clean these results (.json to DataFrame, keeping only the relevant columns) and 2) enrich and deduplicate them against the existing book list.

In [None]:
import pandas as pd

# Category export files, listed in the order that maps them onto df1..df12.
category_files = [
    'Categories-1765200986129.json',
    'Categories-1765201667362.json',
    'Categories-1765201975361.json',
    'Categories-1765202008959.json',
    'Categories-1765202091477.json',
    'Categories-1765202115878.json',
    'Categories-1765202145247.json',
    'Categories-1765202167643.json',
    'Categories-1765202243894.json',
    'Categories-1765202265960.json',
    'Categories-1765202289743.json',
    'Categories-1765202313896.json',
]

# Parse each export into its own frame; the individual names are kept because
# later cells refer to df1..df12 directly.
(df1, df2, df3, df4, df5, df6,
 df7, df8, df9, df10, df11, df12) = [pd.read_json(path) for path in category_files]

In [None]:
# For every raw export, take the `titles` entry of its `data` column and build
# a frame from it. The df_clean1..df_clean12 names are kept for later cells.
(df_clean1, df_clean2, df_clean3, df_clean4, df_clean5, df_clean6,
 df_clean7, df_clean8, df_clean9, df_clean10, df_clean11, df_clean12) = [
    pd.DataFrame(raw.data.titles)
    for raw in (df1, df2, df3, df4, df5, df6,
                df7, df8, df9, df10, df11, df12)
]

In [None]:
# Stack the per-file title frames into a single table.
# ignore_index=True gives the result a fresh, unique 0..n-1 index. Without it
# every file's own row labels are kept, so the same label occurs once per file
# and any later label-based operation (e.g. drop(index=...)) hits rows from
# several files at once.
concatenated = pd.concat(
    [
        df_clean1, df_clean2, df_clean3, df_clean4, df_clean5, df_clean6,
        df_clean7, df_clean8, df_clean9, df_clean10, df_clean11, df_clean12,
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# Columns needed from the raw title records.
cols_to_keep = ['title', 'author', 'price', 'pages']
# Take an explicit copy: the next cell adds columns to df_new, and assigning
# into a bare column subset of `concatenated` triggers SettingWithCopyWarning.
df_new = concatenated[cols_to_keep].copy()


In [None]:
import numpy as np

# Bucket each title by page count: more than 350 pages is 'long', fewer than
# 250 is 'short', anything else is 'average'. A missing (NaN) page count fails
# both comparisons and therefore also ends up as 'average'.
pages = df_new['pages']
short_or_average = np.where(pages < 250, 'short', 'average')
df_new['length'] = np.where(pages > 350, 'long', short_or_average)

# Every title in this export is flagged as a classic (stored as int64).
df_new['classics'] = 1
df_new['classics'] = df_new['classics'].astype('int64')


In [None]:
# Assumes each `price` cell is a list whose first element is a dict: take that
# first element per row, then spread its keys into separate columns.
first_price_entry = df_new['price'].apply(lambda entries: entries[0])
price_expanded = first_price_entry.apply(pd.Series)

# Put the expanded price columns next to the original ones (same row index).
df = pd.concat([df_new, price_expanded], axis=1)


In [None]:
# Multipliers that convert an amount in the given currency into EUR.
# NOTE(review): these are hard-coded values — confirm they are the intended rates.
rates = {
    'USD': 1/1.165,  # ~0.859
    'CAD': 1/1.63,   # ~0.613
    'EUR': 1.0       # if some prices already in EUR
}

def price_to_eur(price_entry):
    """Convert the first price of a title into EUR.

    Parameters
    ----------
    price_entry : sequence of dict
        Price records for one title. Only the first record is used; its
        'amount' and 'currencyCode' keys are read.

    Returns
    -------
    float or None
        ``amount * rates[currencyCode]``, or None when there is no usable
        price: the entry is missing or empty, the amount is absent, or the
        currency has no entry in ``rates``.
    """
    try:
        first = price_entry[0]
    except (TypeError, IndexError, KeyError):
        # None/NaN are not subscriptable and an empty list has no first item;
        # treat both as "no price" instead of aborting the whole column.
        return None

    amount = first.get('amount')
    rate = rates.get(first.get('currencyCode'))
    if amount is None or rate is None:
        return None
    return amount * rate

# Convert every title's price record to EUR and store it as a new column.
price_in_eur = df_new['price'].apply(price_to_eur)
df['price_eur'] = price_in_eur


In [None]:
# Columns carried forward into the merge with the existing book list.
cols_to_keep = ['title', 'author', 'length', 'classics', 'price_eur']
df_final = df.loc[:, cols_to_keep]


In [None]:
# Load the existing book list that the new titles are merged into.
df_existing = pd.read_csv(filepath_or_buffer='deduplicated.csv')

In [None]:
!pip install rapidfuzz

In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz, process
import re

# Make copies so the frames produced by earlier cells are not modified.
df_existing = df_existing.copy()
# Give every incoming row a unique 0..n-1 label. Matched rows are collected by
# label and removed with drop(index=...), which removes *every* row carrying
# that label; with repeated labels, unmatched rows would be dropped as well.
df_final = df_final.copy().reset_index(drop=True)

# Ensure 'classics' exists; titles already in the list default to 0.
if 'classics' not in df_existing.columns:
    df_existing['classics'] = 0

# Make sure all rows in df_final have classics = 1
df_final['classics'] = 1

# Normalization function (lowercase, strip punctuation, normalize spaces)
def normalize_text(s):
    """Return a comparison-friendly form of ``s``.

    The value is converted with ``str()`` (so None becomes 'none' and NaN
    becomes 'nan'), lowercased, stripped of every character that is neither a
    word character nor whitespace, and has whitespace runs collapsed to one
    space with leading/trailing whitespace removed.

    Parameters
    ----------
    s : object
        Value to normalise; typically an author or title string.

    Returns
    -------
    str
        The normalised text. Underscores survive because ``\\w`` matches them.
    """
    s = str(s).lower()
    # Remove punctuation first: deleting a symbol that stands between two
    # spaces ("a & b") would otherwise leave a double space behind.
    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

# Add the helper columns to both frames in place:
#   author_norm / title_norm - normalised text used for fuzzy comparison
#   block_key                - first character of each, so that only rows
#                              sharing both initials are compared.
# An empty normalised string has no first character, so its block_key is NaN.
for frame in (df_existing, df_final):
    frame['author_norm'] = frame['author'].apply(normalize_text)
    frame['title_norm'] = frame['title'].apply(normalize_text)

for frame in (df_existing, df_final):
    frame['block_key'] = frame['author_norm'].str[0] + frame['title_norm'].str[0]

matched_indices_final = []  # labels of df_final rows that found a counterpart
threshold = 70  # minimum token_set_ratio score to accept a match

# Compare each incoming row only against existing rows with the same block key.
for block in df_final['block_key'].unique():
    block_existing = df_existing.loc[df_existing['block_key'] == block]
    if block_existing.empty:
        # No existing title shares this key, so nothing can match.
        continue

    # Candidate strings, position-aligned with block_existing.index.
    existing_strings = (block_existing['author_norm'] + ' ' + block_existing['title_norm']).tolist()
    block_final = df_final.loc[df_final['block_key'] == block]

    for label, row in block_final.iterrows():
        query = f"{row['author_norm']} {row['title_norm']}"
        result = process.extractOne(
            query,
            existing_strings,
            scorer=fuzz.token_set_ratio,
            score_cutoff=threshold,
        )
        if result is None:
            continue  # best candidate scored below the threshold

        # The third element is the position of the best candidate in existing_strings.
        existing_idx = block_existing.index[result[2]]

        # Replace the stored price with the NaN-ignoring mean of the stored and
        # incoming price. Repeated matches on the same existing row are
        # averaged step by step, not over all prices at once.
        current_price = df_existing.at[existing_idx, 'price_eur']
        df_existing.at[existing_idx, 'price_eur'] = np.nanmean([current_price, row['price_eur']])

        # Flag the existing title as a classic and remember the incoming row.
        df_existing.at[existing_idx, 'classics'] = 1
        matched_indices_final.append(label)

# Incoming rows without a counterpart in df_existing are the ones to add.
df_to_append = df_final.drop(index=matched_indices_final)
df_to_append['classics'] = 1  # already 1 for every row; kept as a safeguard

# Give df_to_append every column of df_existing, filling absent ones with NaN.
for col in df_existing.columns:
    if col in df_to_append.columns:
        continue
    df_to_append[col] = np.nan

# Same columns in the same order as df_existing.
df_to_append = df_to_append.loc[:, df_existing.columns]

# Stack existing and new rows, then remove the matching helper columns.
df_merged = (
    pd.concat([df_existing, df_to_append], ignore_index=True)
    .drop(columns=['author_norm', 'title_norm', 'block_key'])
)

# Sanity-check counts
print(f"Number of 1s in df_existing: {df_existing['classics'].sum()}")
print(f"Number of 1s in df_merged: {df_merged['classics'].sum()}")
print(f"Number of df_final rows appended: {len(df_to_append)}")
print(f"Number of df_final rows matched: {len(matched_indices_final)}")


In [None]:
# Write the merged list to disk. The row index is written as the first
# (unnamed) column, which is the to_csv default.
df_merged.to_csv('added_classics.csv', index=True)