In [95]:
import re
import time
import random
from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Notebook display configuration: never truncate the column list when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

In [None]:
# Load the cleaned, merged book catalogue.
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# a backslash is an ordinary filename character on POSIX systems.
df = pd.read_csv("../data/clean/books_merged_clean.csv")

In [96]:
# Preview the loaded catalogue; as the cell's last expression it is rendered as the cell output.
df

Unnamed: 0,title,author,published_year,language,subjects,cover,soup
0,21st century houses: riba award-winning homes,dominic bradbury,2022,english,"domestic architecture, architecture, awards, m...",https://openlibrary.org/images/icons/avatar_bo...,dominic bradbury dominic bradbury dominic brad...
1,architecture china 2020 building with nature j...,l. xiangning,2022,english,"architecture, awards, sustainable architecture...",https://openlibrary.org/images/icons/avatar_bo...,l. xiangning l. xiangning l. xiangning archite...
2,cyberarts 2021: international compendium prix ...,markus jandl,2022,english,"computer art, awards, computer animation, prix...",https://covers.openlibrary.org/b/id/13794706-M...,markus jandl markus jandl markus jandl cyberar...
3,"deutsche bank ""artists of the year"" 2021: maxw...",maxwell alexandre,2022,english,"modern art, exhibitions, art, awards, maxwell ...",https://openlibrary.org/images/icons/avatar_bo...,maxwell alexandre maxwell alexandre maxwell al...
4,in the shadow of trees,belgium) photobrussels festival (6th 202...,2022,french,"artistic photography, exhibitions, awards, pho...",https://openlibrary.org/images/icons/avatar_bo...,belgium) photobrussels festival (6th 202... be...
...,...,...,...,...,...,...,...
1033,premiums offered by the society instituted at ...,"society for the encouragement of arts, manufac...",1772,english,"plants, commerce, art, fisheries, awards, inve...",https://covers.openlibrary.org/b/id/10444430-L...,"society for the encouragement of arts, manufac..."
1034,i pregj delle belle arti,"onofrio alfani, accademia di san luca",1762,italian,"accademia di san luca., art -- awards -- italy...",https://covers.openlibrary.org/b/id/6517671-L.jpg,"onofrio alfani, accademia di san luca onofrio ..."
1035,per la solenne distribuzione del premio alle t...,celestino petracchi,1728,italian,"awards, art, early works to 1800, accademia cl...",https://covers.openlibrary.org/b/id/11908831-L...,celestino petracchi celestino petracchi celest...
1036,il teatro d'onore,ducale collegio de' nobili di parma.,1694,italian,ducale collegio de' nobili di parma -- awards....,https://covers.openlibrary.org/b/id/6074353-L.jpg,ducale collegio de' nobili di parma. ducale co...


In [None]:
# --- Feature Engineering: Weighted Features ---

# Replace missing values in every text field with an empty string so the
# string joins below never encounter NaN.
for column in ('title', 'author', 'subjects', 'language'):
    df[column] = df[column].fillna('')

# Build the weighted 'soup': repeating a field raises its term frequency.
# Layout per row: author x3, title x2, language x2, subjects x1, space-separated.
df['soup'] = [
    ' '.join([author] * 3 + [title] * 2 + [language] * 2 + [subjects])
    for author, title, language, subjects in zip(
        df['author'], df['title'], df['language'], df['subjects']
    )
]


In [None]:
# --- TF-IDF and Cosine Similarity ---
# Turns each book's 'soup' text into a TF-IDF vector and compares every book
# with every other book. Requires the 'soup' column built in the cell above.

# Initialize TF-IDF Vectorizer (stop_words='english' applies scikit-learn's
# built-in English stop-word list)
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'soup' column (one row of the matrix per book)
tfidf_matrix = tfidf.fit_transform(df['soup'])

# Compute the cosine similarity matrix: the matrix is compared with itself,
# so entry [i, j] is the similarity between book i and book j (by row position)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Sanity check: expected to be square, (number of books, number of books)
print("Shape of Cosine Similarity Matrix:", cosine_sim.shape)


In [None]:
# --- Book Search Function ---

def find_books_by_keyword(keyword):
    """
    Search the book catalogue for a keyword and return up to five matches.

    The keyword is matched case-insensitively as a literal substring of each
    row's 'soup' text. It is deliberately NOT interpreted as a regular
    expression, so input such as "c++", "(" or "a." is searched for verbatim
    instead of raising an error or matching unintended text.

    Args:
        keyword (str): Text to look for. An empty string matches every row
            that has soup text.

    Returns:
        pandas.DataFrame: The first five matching rows with the columns
            title, author, published_year, language and subjects.
        str: A "No books found ..." message when nothing matches.
    """
    # regex=False -> plain substring test; na=False counts a missing soup as "no match".
    matches = df['soup'].str.contains(keyword, case=False, na=False, regex=False)
    results = df[matches]

    if results.empty:
        return f"No books found with the keyword: '{keyword}'"

    # Return the first 5 results with the descriptive columns only
    return results[['title', 'author', 'published_year', 'language', 'subjects']].head(5)


In [None]:
# --- Recommendation Function ---

# Reverse lookup: book title -> row position in `df`.
# Positions (not index labels) are stored because the consumers index the
# similarity matrix and use `df.iloc`, both of which are positional.
indices = pd.Series(range(len(df)), index=df['title'])
# Keep only the first row for each title. Calling Series.drop_duplicates() here
# would compare the VALUES (already unique), leaving repeated titles in place,
# and a repeated title makes `indices[title]` return a Series instead of one number.
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim):
    """
    Return the 10 books most similar to the given title.

    The title is looked up exactly first; if that fails, the first catalogue
    title that contains `title` as a (case-sensitive) substring is used and a
    notice is printed.

    Args:
        title (str): Full or partial book title.
        cosine_sim: Square pairwise similarity matrix whose rows/columns follow
            the row order of `df`. Defaults to the notebook-level matrix.

    Returns:
        pandas.DataFrame: Up to 10 rows (title, author, published_year,
            language, subjects, similarity) ordered from most to least similar.
            The queried book itself is never included.
        str: A "Book with title ... not found." message when no title matches.
    """
    # Resolve the title: exact match first, then first substring match.
    if title not in indices:
        possible_titles = [t for t in indices.index if title in t]
        if not possible_titles:
            return f"Book with title '{title}' not found."
        title = possible_titles[0]
        print(f"Found a partial match, showing recommendations for: '{title}'")

    idx = indices[title]
    # If the lookup still holds repeated titles, indexing yields a Series;
    # fall back to the first occurrence so `idx` is always a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other book with its similarity to the query book. The query
    # book is excluded explicitly: dropping "the first sorted entry" removes the
    # wrong row whenever another book ties with it (e.g. an identical soup).
    scored = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; keep the top 10.
    top_matches = sorted(scored, key=lambda pair: pair[1], reverse=True)[:10]
    book_indices = [position for position, score in top_matches]
    similarity_scores = [score for position, score in top_matches]

    # .copy() makes the result an independent frame, so adding a column below
    # cannot raise SettingWithCopyWarning or write back into `df`.
    recommendations_df = df.iloc[book_indices][
        ['title', 'author', 'published_year', 'language', 'subjects']
    ].copy()

    # Add the similarity scores to the DataFrame
    recommendations_df['similarity'] = similarity_scores

    return recommendations_df


In [103]:
# --- Example Usage ---

# 1. Search for a book by keyword
print("--- Step 1: Search for a book ---")
search_results = find_books_by_keyword("cartoons") # Swap "cartoons" for any keyword
display(search_results)


--- Step 1: Search for a book ---


Unnamed: 0,title,author,published_year,language,subjects
106,acclaimed press coverage of latin american cou...,heinz-dietrich fischer,2019,english,"press coverage, caricatures and cartoons, pict..."
131,eyüpsultan belediyesi ortaokul ve liseler aras...,eyüpsultan belediyesi,2019,turkish,"caricatures and cartoons, exhibitions, awards,..."
746,"editorial cartoon awards, 1922-1997","heinz dietrich fischer, erika fischer",1999,english,editorial cartoons -- awards -- united states....


In [102]:
# 2. Get recommendations for a book from the search results
# Run the Step 1 cell first: this cell reads `search_results` from it.
print("\n--- Step 2: Get recommendations for a book from the search ---")
if isinstance(search_results, pd.DataFrame) and not search_results.empty:
    # Pick the first book from the search results to get recommendations.
    # Position 0 always exists for a non-empty frame; position 1 raised
    # IndexError whenever the search returned exactly one book.
    example_title = search_results['title'].iloc[0]
    print(f"\nRecommendations for: '{example_title}'\n")
    recommendations = get_recommendations(example_title)
    display(recommendations) # Use display() for nice table formatting in notebooks
else:
    # find_books_by_keyword returns a message string when nothing matched.
    print("No search results to get recommendations from.")



--- Step 2: Get recommendations for a book from the search ---
No search results to get recommendations from.
