In [189]:
import pandas as pd
import requests
import json
import re
import ast
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Make sure the NLTK resources used for tokenising, stopword filtering and
# lemmatising are available locally.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Sample book data (replace with your dataset)
df = pd.read_csv("goodreads_data.csv")
df.drop(columns = ["Unnamed: 0", "URL"], inplace = True)
df.dropna(inplace = True)
# dropna() leaves gaps in the index labels. Renumber the rows so that a label
# equals the row position, because the similarity matrices computed from df
# are indexed by position.
df.reset_index(drop = True, inplace = True)

# Genres looks like the text of a Python list ("['A', 'B']"); reduce it to a
# plain "A, B" string by removing the brackets and the quote characters.
df["Genres"] = df["Genres"].str.split(", ").apply(lambda x: [genre.strip("[]") for genre in x])
df["Genres"] = df["Genres"].apply(lambda x: ', '.join(x))
df["Genres"] = df["Genres"].apply(lambda x: x.replace("'", ""))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [190]:
# Display the cleaned DataFrame as this cell's output.
df

Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings
0,To Kill a Mockingbird,Harper Lee,The unforgettable novel of a childhood in a sl...,"Classics, Fiction, Historical Fiction, School,...",4.27,5691311
1,Harry Potter and the Philosopher’s Stone (Harr...,J.K. Rowling,Harry Potter thinks he is an ordinary boy - un...,"Fantasy, Fiction, Young Adult, Magic, Children...",4.47,9278135
2,Pride and Prejudice,Jane Austen,"Since its immediate success in 1813, Pride and...","Classics, Fiction, Romance, Historical Fiction...",4.28,3944155
3,The Diary of a Young Girl,Anne Frank,Discovered in the attic in which she spent the...,"Classics, Nonfiction, History, Biography, Memo...",4.18,3488438
4,Animal Farm,George Orwell,Librarian's note: There is an Alternate Cover ...,"Classics, Fiction, Dystopia, Fantasy, Politics...",3.98,3575172
...,...,...,...,...,...,...
9995,"Breeders (Breeders Trilogy, #1)",Ashley Quigley,How far would you go? If human society was gen...,"Dystopia, Science Fiction, Post Apocalyptic, P...",3.44,276
9996,Dynamo,Eleanor Gustafson,Jeth Cavanaugh is searching for a new life alo...,,4.23,60
9997,The Republic of Trees,Sam Taylor,This dark fable tells the story of four Englis...,"Fiction, Horror, Dystopia, Coming Of Age",3.29,383
9998,"Waking Up (Healing Hearts, #1)",Renee Dyer,For Adriana Monroe life couldn’t get any bette...,"New Adult, Romance, Contemporary Romance, Cont...",4.13,263


In [191]:
# Genres string of the row labelled 0, to check the cleaned format.
df.loc[0, "Genres"]

'Classics, Fiction, Historical Fiction, School, Literature, Young Adult, Historical'

In [192]:
def fetch_book_details(volume_id, timeout=10):
    """Fetch one volume's metadata from the Google Books API.

    Args:
        volume_id: Google Books volume ID (e.g. the value returned by
            extract_volume_id). A falsy value short-circuits to None so a
            failed ID extraction never requests ".../volumes/None".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body (a dict), or None when the ID is missing, the
        request fails or times out, the status is not 200, or the body is
        not valid JSON.
    """
    if not volume_id:
        return None

    from urllib.parse import quote  # stdlib; imported locally so the cell stays self-contained

    # Percent-encode the ID so stray characters cannot alter the request path.
    url = f"https://www.googleapis.com/books/v1/volumes/{quote(str(volume_id), safe='')}"
    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network failure or timeout: report "no data", like a non-200 status.
        return None

    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not JSON.
        return None

In [193]:
def extract_volume_id(url):
    """Extract the Google Books volume ID from a book URL.

    Supported forms:
      * ".../books/edition/<title-slug>/<volume_id>[?query][#fragment]"
      * any URL carrying the ID as an "id" query parameter, e.g.
        ".../books?id=<volume_id>&..."

    Args:
        url: URL string pasted by the user.

    Returns:
        The volume ID string, or None if url is not a string or no ID is found.
    """
    if not isinstance(url, str):
        return None
    url = url.strip()

    # Keep the match inside the URL path: excluding "?" and "#" stops a "/" in
    # the query string from being taken for a path separator and keeps a
    # "#fragment" out of the captured ID.
    match = re.search(r'/books/edition/[^?#]+/([^/?#]+)', url)
    if match is None:
        # Fall back to the "?id=<volume_id>" form.
        match = re.search(r'[?&]id=([^&#]+)', url)
    return match.group(1) if match else None

In [194]:
def format_book_data(book_data):
    """Flatten a Google Books volume record into the dataset's column layout.

    Args:
        book_data: Decoded volume JSON (as returned by fetch_book_details),
            or None when the lookup failed.

    Returns:
        dict with the keys "Book", "Author", "Description", "Genres" and
        "Avg_Rating". A field that is absent from the record is "N/A";
        multiple authors / categories are joined with ", ".
    """
    # A failed fetch yields None; treat it like an empty record instead of
    # raising AttributeError on .get().
    volume_info = (book_data or {}).get("volumeInfo") or {}

    def _joined(key):
        # Missing, null or empty lists fall back to the "N/A" placeholder.
        values = volume_info.get(key) or ["N/A"]
        return ", ".join(str(value) for value in values)

    return {
        "Book": volume_info.get("title", "N/A"),
        "Author": _joined("authors"),
        "Description": volume_info.get("description", "N/A"),
        "Genres": _joined("categories"),
        "Avg_Rating": volume_info.get("averageRating", "N/A"),
    }

In [195]:
def preprocess_text(text):
    """Normalise text into a space-separated string of lowercase lemmas.

    Args:
        text: A description or a genre list string; None / NaN is allowed.

    Returns:
        The alphanumeric, non-stopword tokens of text, lowercased and
        lemmatized, joined by single spaces. "" for missing input.
    """
    if pd.isnull(text):  # Check if the text is NaN
        return ""        # If NaN, return an empty string

    # Remove the list-literal punctuation (surrounding brackets, quotes).
    cleaned_text = text.strip("[]").replace("'", "")

    # Tokenize into individual words. Splitting only on ", " kept multi-word
    # chunks such as "Historical Fiction" (or whole clauses of a description)
    # in one piece, and the isalnum() filter below then threw them away
    # because they contain spaces.
    tokens = word_tokenize(cleaned_text)

    # Lowercase, lemmatize and remove stopwords/punctuation
    clean_tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in tokens
        if token.isalnum() and token.lower() not in stop_words
    ]
    return " ".join(clean_tokens)

In [196]:
def compute_similarity(descriptions, genres):
    """Compute pairwise cosine similarities for descriptions and for genres.

    Args:
        descriptions: Iterable of description strings, one per book.
        genres: Iterable of genre strings, one per book, in the same order.

    Returns:
        (description_similarity, genre_similarity): the two matrices returned
        by cosine_similarity, where entry [i, j] compares book i with book j.
    """
    # Vectorize and compare the descriptions
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions)
    description_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Vectorize and compare the genres with their own vectorizer. The original
    # created this vectorizer but then refit the description one instead.
    tfidf_vectorizer_genres = TfidfVectorizer()
    tfidf_matrix_genres = tfidf_vectorizer_genres.fit_transform(genres)
    genre_similarity = cosine_similarity(tfidf_matrix_genres, tfidf_matrix_genres)

    return description_similarity, genre_similarity

In [197]:
def find_recommendations(input_book, book_data, description_similarity, genre_similarity, top_n=3):
    """Return the books most similar to input_book.

    Args:
        input_book: Title to look up in book_data["Book"] (first match wins).
        book_data: DataFrame whose row order matches the similarity matrices.
        description_similarity: Square array of description similarities.
        genre_similarity: Square array of genre similarities.
        top_n: Number of recommendations to return (default 3).

    Returns:
        List of (title, combined_similarity) tuples, best match first. The
        input book itself is never included.

    Raises:
        IndexError: if input_book is not present in book_data.
    """
    # The similarity matrices are positional, so locate the row by position.
    # Index labels may have gaps (e.g. after dropna()) and cannot be used as
    # matrix row numbers.
    positions = (book_data["Book"] == input_book).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Book {input_book!r} not found in book_data")
    input_pos = positions[0]

    # Average the two similarity rows of the input book.
    combined_similarity = (description_similarity[input_pos] + genre_similarity[input_pos]) / 2

    # Rank best-first and drop the input book explicitly, rather than assuming
    # it always occupies the top slot.
    ranked = combined_similarity.argsort()[::-1]
    ranked = ranked[ranked != input_pos][:top_n]

    return [(book_data.iloc[idx]["Book"], combined_similarity[idx]) for idx in ranked]

In [198]:
def main():
    """Prompt for a Google Books URL and print similar books from the dataset.

    Reads the URL from stdin, fetches the volume's metadata, adds the book to
    a copy of the dataset and prints the recommendations returned by
    find_recommendations. Prints a message and returns early when the URL or
    the API lookup fails.
    """
    url = input("Paste your Google Books URL here: ")
    volume_id = extract_volume_id(url)
    if volume_id is None:
        print("Could not find a Google Books volume ID in that URL.")
        return

    book_data = fetch_book_details(volume_id)
    if book_data is None:
        print(f"Could not fetch details for volume '{volume_id}' from Google Books.")
        return
    formatted_data = format_book_data(book_data)

    input_book = formatted_data["Book"]
    input_description = preprocess_text(formatted_data["Description"])
    input_genres = preprocess_text(formatted_data["Genres"])

    # The pasted book is normally not in the dataset, so looking its title up
    # in df raised IndexError and the preprocessed text above was never used.
    # Put it in front of the dataset: with ignore_index=True labels equal
    # positions, and the first title match is always the input row.
    input_row = pd.DataFrame([{
        "Book": input_book,
        "Description": input_description,
        "Genres": input_genres
    }])
    candidates = pd.concat([input_row, df], ignore_index=True)

    # Compute similarity
    description_similarity, genre_similarity = compute_similarity(candidates["Description"], candidates["Genres"])

    # Find recommendations
    recommendations = find_recommendations(input_book, candidates, description_similarity, genre_similarity)
    print(f"Recommendations for '{input_book}':")
    for book, similarity in recommendations:
        print(f"Book: {book}, Similarity: {similarity}")

In [199]:
# Entry point: start the interactive prompt when this code is run directly.
if __name__ == "__main__":
    main()

Paste your Google Books URL here:  https://www.google.de/books/edition/Industrial_Society_and_Its_Future/9ja1zwEACAAJ?hl=en


KeyboardInterrupt: 

In [200]:
import pandas as pd
import requests
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NLTK setup: tokenizer, stopword list and WordNet data
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Load the dataset, dropping the unused columns and every incomplete row
df = pd.read_csv("goodreads_data.csv")
df.drop(columns=["Unnamed: 0", "URL"], inplace=True)
df.dropna(inplace=True)

# Reduce each Genres value to a plain comma-separated string: strip the
# brackets from every ", "-separated part, re-join, then remove the quotes.
df["Genres"] = (
    df["Genres"]
    .str.split(", ")
    .apply(lambda parts: ', '.join(part.strip("[]") for part in parts))
    .apply(lambda joined: joined.replace("'", ""))
)

def fetch_book_details(volume_id, timeout=10):
    """Fetch one volume's metadata from the Google Books API.

    Args:
        volume_id: Google Books volume ID (e.g. the value returned by
            extract_volume_id). A falsy value short-circuits to None so a
            failed ID extraction never requests ".../volumes/None".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body (a dict), or None when the ID is missing, the
        request fails or times out, the status is not 200, or the body is
        not valid JSON.
    """
    if not volume_id:
        return None

    from urllib.parse import quote  # stdlib; imported locally so the cell stays self-contained

    # Percent-encode the ID so stray characters cannot alter the request path.
    url = f"https://www.googleapis.com/books/v1/volumes/{quote(str(volume_id), safe='')}"
    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network failure or timeout: report "no data", like a non-200 status.
        return None

    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not JSON.
        return None

def extract_volume_id(url):
    """Extract the Google Books volume ID from a book URL.

    Supported forms:
      * ".../books/edition/<title-slug>/<volume_id>[?query][#fragment]"
      * any URL carrying the ID as an "id" query parameter, e.g.
        ".../books?id=<volume_id>&..."

    Args:
        url: URL string pasted by the user.

    Returns:
        The volume ID string, or None if url is not a string or no ID is found.
    """
    if not isinstance(url, str):
        return None
    url = url.strip()

    # Keep the match inside the URL path: excluding "?" and "#" stops a "/" in
    # the query string from being taken for a path separator and keeps a
    # "#fragment" out of the captured ID.
    match = re.search(r'/books/edition/[^?#]+/([^/?#]+)', url)
    if match is None:
        # Fall back to the "?id=<volume_id>" form.
        match = re.search(r'[?&]id=([^&#]+)', url)
    return match.group(1) if match else None

def format_book_data(book_data):
    """Flatten a Google Books volume record into the dataset's column layout.

    Args:
        book_data: Decoded volume JSON (as returned by fetch_book_details),
            or None when the lookup failed.

    Returns:
        dict with the keys "Book", "Author", "Description", "Genres" and
        "Avg_Rating". A field that is absent from the record is "N/A";
        multiple authors / categories are joined with ", ".
    """
    # A failed fetch yields None; treat it like an empty record instead of
    # raising AttributeError on .get().
    volume_info = (book_data or {}).get("volumeInfo") or {}

    def _joined(key):
        # Missing, null or empty lists fall back to the "N/A" placeholder.
        values = volume_info.get(key) or ["N/A"]
        return ", ".join(str(value) for value in values)

    return {
        "Book": volume_info.get("title", "N/A"),
        "Author": _joined("authors"),
        "Description": volume_info.get("description", "N/A"),
        "Genres": _joined("categories"),
        "Avg_Rating": volume_info.get("averageRating", "N/A"),
    }

def preprocess_text(text):
    """Turn raw text into a space-separated string of lowercase lemmas.

    Missing input (None / NaN) yields "". Otherwise the text is tokenized
    with word_tokenize; tokens that are not purely alphanumeric or that are
    English stopwords are dropped, and the rest are lowercased and lemmatized.
    """
    # Missing values contribute no tokens.
    if pd.isnull(text):
        return ""

    lemmas = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        # Skip punctuation / mixed-symbol tokens and stopwords.
        if not raw_token.isalnum() or lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(lemmas)

def compute_similarity(descriptions, genres):
    """Return (description_similarity, genre_similarity).

    Each collection of strings is TF-IDF vectorized on its own and compared
    with itself using cosine similarity, so entry [i, j] of a result compares
    item i with item j of that collection.
    """
    def _pairwise_tfidf_similarity(texts):
        # Fit a fresh TF-IDF model on this corpus and compare every pair of rows.
        return cosine_similarity(TfidfVectorizer().fit_transform(texts))

    by_description = _pairwise_tfidf_similarity(descriptions)
    by_genre = _pairwise_tfidf_similarity(genres)
    return by_description, by_genre

def find_recommendations(input_book_idx, book_data, description_similarity, genre_similarity, top_n=3):
    """Return the books most similar to the book at input_book_idx.

    Args:
        input_book_idx: Row position of the input book in book_data and in
            both similarity matrices.
        book_data: DataFrame whose row order matches the similarity matrices.
        description_similarity: Square array of description similarities.
        genre_similarity: Square array of genre similarities.
        top_n: Number of recommendations to return (default 3).

    Returns:
        List of (title, combined_similarity) tuples, best match first. The
        input book itself is never included.
    """
    # Average the two similarity rows of the input book.
    combined_similarity = (description_similarity[input_book_idx] + genre_similarity[input_book_idx]) / 2

    # Rank best-first and remove the input book explicitly. Slicing off the
    # top-ranked entry assumed the input always ranks first, which fails when
    # its text is empty (self-similarity 0) or a duplicate row ties with it.
    ranked = combined_similarity.argsort()[::-1]
    ranked = ranked[ranked != input_book_idx][:top_n]

    return [(book_data.iloc[idx]["Book"], combined_similarity[idx]) for idx in ranked]

def main():
    """Prompt for a Google Books URL and print similar books from the dataset.

    Reads the URL from stdin, fetches the volume's metadata, appends the book
    to a copy of the dataset and prints the recommendations returned by
    find_recommendations. Prints a message and returns early when the URL or
    the API lookup fails.
    """
    url = input("Paste your Google Books URL here: ")
    volume_id = extract_volume_id(url)
    if volume_id is None:
        print("Could not find a Google Books volume ID in that URL.")
        return

    book_data = fetch_book_details(volume_id)
    if book_data is None:
        print(f"Could not fetch details for volume '{volume_id}' from Google Books.")
        return
    formatted_data = format_book_data(book_data)

    input_book = formatted_data["Book"]
    input_description = preprocess_text(formatted_data["Description"])
    input_genres = preprocess_text(formatted_data["Genres"])

    # Add input book's data to the dataframe using pd.concat
    input_book_df = pd.DataFrame([{
        "Book": input_book,
        "Description": input_description,
        "Genres": input_genres
    }])

    df_extended = pd.concat([df, input_book_df], ignore_index=True)

    # Compute similarity
    description_similarity, genre_similarity = compute_similarity(df_extended["Description"], df_extended["Genres"])

    # The input book was appended last, so it is the final row. Looking it up
    # by title could instead hit a dataset book that has the same title.
    input_book_idx = len(df_extended) - 1

    # Find recommendations
    recommendations = find_recommendations(input_book_idx, df_extended, description_similarity, genre_similarity)
    print(f"Recommendations for '{input_book}':")
    for book, similarity in recommendations:
        print(f"Book: {book}, Similarity: {similarity}")

# Entry point: start the interactive prompt when this code is run directly.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Paste your Google Books URL here:  https://www.google.de/books/edition/Industrial_Society_and_Its_Future/9ja1zwEACAAJ?hl=en


Recommendations for 'Industrial Society and Its Future':
Book: The Social Contract, Similarity: 0.15005458661282167
Book: Leviathan, Similarity: 0.14608071181760068
Book: The Prince, Similarity: 0.14595345873246726


In [201]:
import pandas as pd
import requests
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NLTK setup: tokenizer, stopword list and WordNet data
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Load the dataset, dropping the unused columns and every incomplete row
df = pd.read_csv("goodreads_data.csv")
df.drop(columns=["Unnamed: 0", "URL"], inplace=True)
df.dropna(inplace=True)

# Reduce each Genres value to a plain comma-separated string: strip the
# brackets from every ", "-separated part, re-join, then remove the quotes.
df["Genres"] = (
    df["Genres"]
    .str.split(", ")
    .apply(lambda parts: ', '.join(part.strip("[]") for part in parts))
    .apply(lambda joined: joined.replace("'", ""))
)

def fetch_book_details(volume_id, timeout=10):
    """Fetch one volume's metadata from the Google Books API.

    Args:
        volume_id: Google Books volume ID (e.g. the value returned by
            extract_volume_id). A falsy value short-circuits to None so a
            failed ID extraction never requests ".../volumes/None".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body (a dict), or None when the ID is missing, the
        request fails or times out, the status is not 200, or the body is
        not valid JSON.
    """
    if not volume_id:
        return None

    from urllib.parse import quote  # stdlib; imported locally so the cell stays self-contained

    # Percent-encode the ID so stray characters cannot alter the request path.
    url = f"https://www.googleapis.com/books/v1/volumes/{quote(str(volume_id), safe='')}"
    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network failure or timeout: report "no data", like a non-200 status.
        return None

    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not JSON.
        return None

def extract_volume_id(url):
    """Extract the Google Books volume ID from a book URL.

    Supported forms:
      * ".../books/edition/<title-slug>/<volume_id>[?query][#fragment]"
      * any URL carrying the ID as an "id" query parameter, e.g.
        ".../books?id=<volume_id>&..."

    Args:
        url: URL string pasted by the user.

    Returns:
        The volume ID string, or None if url is not a string or no ID is found.
    """
    if not isinstance(url, str):
        return None
    url = url.strip()

    # Keep the match inside the URL path: excluding "?" and "#" stops a "/" in
    # the query string from being taken for a path separator and keeps a
    # "#fragment" out of the captured ID.
    match = re.search(r'/books/edition/[^?#]+/([^/?#]+)', url)
    if match is None:
        # Fall back to the "?id=<volume_id>" form.
        match = re.search(r'[?&]id=([^&#]+)', url)
    return match.group(1) if match else None

def format_book_data(book_data):
    """Flatten a Google Books volume record into the dataset's column layout.

    Args:
        book_data: Decoded volume JSON (as returned by fetch_book_details),
            or None when the lookup failed.

    Returns:
        dict with the keys "Book", "Author", "Description", "Genres" and
        "Avg_Rating". A field that is absent from the record is "N/A";
        multiple authors / categories are joined with ", ".
    """
    # A failed fetch yields None; treat it like an empty record instead of
    # raising AttributeError on .get().
    volume_info = (book_data or {}).get("volumeInfo") or {}

    def _joined(key):
        # Missing, null or empty lists fall back to the "N/A" placeholder.
        values = volume_info.get(key) or ["N/A"]
        return ", ".join(str(value) for value in values)

    return {
        "Book": volume_info.get("title", "N/A"),
        "Author": _joined("authors"),
        "Description": volume_info.get("description", "N/A"),
        "Genres": _joined("categories"),
        "Avg_Rating": volume_info.get("averageRating", "N/A"),
    }

def preprocess_text(text):
    """Turn raw text into a space-separated string of lowercase lemmas.

    Missing input (None / NaN) yields "". Otherwise the text is tokenized
    with word_tokenize; tokens that are not purely alphanumeric or that are
    English stopwords are dropped, and the rest are lowercased and lemmatized.
    """
    # Missing values contribute no tokens.
    if pd.isnull(text):
        return ""

    lemmas = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        # Skip punctuation / mixed-symbol tokens and stopwords.
        if not raw_token.isalnum() or lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(lemmas)

def compute_similarity(descriptions, genres):
    """Return (description_similarity, genre_similarity).

    Each collection of strings is TF-IDF vectorized on its own and compared
    with itself using cosine similarity, so entry [i, j] of a result compares
    item i with item j of that collection.
    """
    def _pairwise_tfidf_similarity(texts):
        # Fit a fresh TF-IDF model on this corpus and compare every pair of rows.
        return cosine_similarity(TfidfVectorizer().fit_transform(texts))

    by_description = _pairwise_tfidf_similarity(descriptions)
    by_genre = _pairwise_tfidf_similarity(genres)
    return by_description, by_genre

def find_recommendations(input_book_idx, book_data, description_similarity, genre_similarity, top_n=3):
    """Return the books most similar to the book at input_book_idx.

    Args:
        input_book_idx: Row position of the input book in book_data and in
            both similarity matrices.
        book_data: DataFrame with "Book" and "Author" columns whose row order
            matches the similarity matrices.
        description_similarity: Square array of description similarities.
        genre_similarity: Square array of genre similarities.
        top_n: Number of recommendations to return (default 3).

    Returns:
        List of (title, author) tuples, best match first. The input book
        itself is never included.
    """
    # Average the two similarity rows of the input book.
    combined_similarity = (description_similarity[input_book_idx] + genre_similarity[input_book_idx]) / 2

    # Rank best-first and remove the input book explicitly. Slicing off the
    # top-ranked entry assumed the input always ranks first, which fails when
    # its text is empty (self-similarity 0) or a duplicate row ties with it.
    ranked = combined_similarity.argsort()[::-1]
    ranked = ranked[ranked != input_book_idx][:top_n]

    return [(book_data.iloc[idx]["Book"], book_data.iloc[idx]["Author"]) for idx in ranked]

def main():
    """Prompt for a Google Books URL and print similar books from the dataset.

    Reads the URL from stdin, fetches the volume's metadata, appends the book
    to a copy of the dataset and prints the title and author of each
    recommendation returned by find_recommendations. Prints a message and
    returns early when the URL or the API lookup fails.
    """
    url = input("Paste your Google Books URL here: ")
    volume_id = extract_volume_id(url)
    if volume_id is None:
        print("Could not find a Google Books volume ID in that URL.")
        return

    book_data = fetch_book_details(volume_id)
    if book_data is None:
        print(f"Could not fetch details for volume '{volume_id}' from Google Books.")
        return
    formatted_data = format_book_data(book_data)

    input_book = formatted_data["Book"]
    input_description = preprocess_text(formatted_data["Description"])
    input_genres = preprocess_text(formatted_data["Genres"])

    # Add input book's data to the dataframe using pd.concat
    input_book_df = pd.DataFrame([{
        "Book": input_book,
        "Description": input_description,
        "Genres": input_genres,
        "Author": formatted_data["Author"]  # Add author information
    }])

    df_extended = pd.concat([df, input_book_df], ignore_index=True)

    # Compute similarity
    description_similarity, genre_similarity = compute_similarity(df_extended["Description"], df_extended["Genres"])

    # The input book was appended last, so it is the final row. Looking it up
    # by title could instead hit a dataset book that has the same title.
    input_book_idx = len(df_extended) - 1

    # Find recommendations
    recommendations = find_recommendations(input_book_idx, df_extended, description_similarity, genre_similarity)
    print(f"Recommendations for '{input_book}':")
    for book, author in recommendations:
        print(f"Book: {book}, Author: {author}")

# Entry point: start the interactive prompt when this code is run directly.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Paste your Google Books URL here:  https://www.google.de/books/edition/Industrial_Society_and_Its_Future/9ja1zwEACAAJ?hl=en


Recommendations for 'Industrial Society and Its Future':
Book: The Social Contract, Author: Jean-Jacques Rousseau
Book: Leviathan, Author: Thomas Hobbes
Book: The Prince, Author: Niccolò Machiavelli


In [202]:
import pandas as pd
import requests
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NLTK setup: tokenizer, stopword list and WordNet data
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Load the dataset, dropping the unused columns and every incomplete row
df = pd.read_csv("goodreads_data.csv")
df.drop(columns=["Unnamed: 0", "URL"], inplace=True)
df.dropna(inplace=True)

# Reduce each Genres value to a plain comma-separated string: strip the
# brackets from every ", "-separated part, re-join, then remove the quotes.
df["Genres"] = (
    df["Genres"]
    .str.split(", ")
    .apply(lambda parts: ', '.join(part.strip("[]") for part in parts))
    .apply(lambda joined: joined.replace("'", ""))
)

def fetch_book_details(volume_id, timeout=10):
    """Fetch one volume's metadata from the Google Books API.

    Args:
        volume_id: Google Books volume ID (e.g. the value returned by
            extract_volume_id). A falsy value short-circuits to None so a
            failed ID extraction never requests ".../volumes/None".
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body (a dict), or None when the ID is missing, the
        request fails or times out, the status is not 200, or the body is
        not valid JSON.
    """
    if not volume_id:
        return None

    from urllib.parse import quote  # stdlib; imported locally so the cell stays self-contained

    # Percent-encode the ID so stray characters cannot alter the request path.
    url = f"https://www.googleapis.com/books/v1/volumes/{quote(str(volume_id), safe='')}"
    try:
        # Without a timeout, requests can block indefinitely on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network failure or timeout: report "no data", like a non-200 status.
        return None

    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not JSON.
        return None

def extract_volume_id(url):
    """Extract the Google Books volume ID from a book URL.

    Supported forms:
      * ".../books/edition/<title-slug>/<volume_id>[?query][#fragment]"
      * any URL carrying the ID as an "id" query parameter, e.g.
        ".../books?id=<volume_id>&..."

    Args:
        url: URL string pasted by the user.

    Returns:
        The volume ID string, or None if url is not a string or no ID is found.
    """
    if not isinstance(url, str):
        return None
    url = url.strip()

    # Keep the match inside the URL path: excluding "?" and "#" stops a "/" in
    # the query string from being taken for a path separator and keeps a
    # "#fragment" out of the captured ID.
    match = re.search(r'/books/edition/[^?#]+/([^/?#]+)', url)
    if match is None:
        # Fall back to the "?id=<volume_id>" form.
        match = re.search(r'[?&]id=([^&#]+)', url)
    return match.group(1) if match else None

def format_book_data(book_data):
    """Flatten a Google Books volume record into the dataset's column layout.

    Args:
        book_data: Decoded volume JSON (as returned by fetch_book_details),
            or None when the lookup failed.

    Returns:
        dict with the keys "Book", "Author", "Description", "Genres" and
        "Avg_Rating". A field that is absent from the record is "N/A";
        multiple authors / categories are joined with ", ".
    """
    # A failed fetch yields None; treat it like an empty record instead of
    # raising AttributeError on .get().
    volume_info = (book_data or {}).get("volumeInfo") or {}

    def _joined(key):
        # Missing, null or empty lists fall back to the "N/A" placeholder.
        values = volume_info.get(key) or ["N/A"]
        return ", ".join(str(value) for value in values)

    return {
        "Book": volume_info.get("title", "N/A"),
        "Author": _joined("authors"),
        "Description": volume_info.get("description", "N/A"),
        "Genres": _joined("categories"),
        "Avg_Rating": volume_info.get("averageRating", "N/A"),
    }

def search_google_books(book_title, author, timeout=10):
    """Look up a book on Google Books and return the first result's info link.

    Args:
        book_title: Title to search for.
        author: Author name, appended to the title in the search query.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The "infoLink" of the first search result, or "N/A" when the query is
        empty, the request fails, or no usable result is returned.
    """
    query = f"{book_title} {author}".strip()
    if not query:
        return "N/A"

    try:
        # Pass the query through params so requests percent-encodes it.
        # Titles such as "Breeders (Breeders Trilogy, #1)" contain "#" or "&",
        # which would otherwise truncate or split a hand-built query string.
        response = requests.get(
            "https://www.googleapis.com/books/v1/volumes",
            params={"q": query},
            timeout=timeout,
        )
    except requests.RequestException:
        # Network failure or timeout: one missing link should not abort the run.
        return "N/A"

    if response.status_code != 200:
        return "N/A"
    try:
        results = response.json().get("items", [])
    except ValueError:
        # The body was not JSON.
        return "N/A"

    if results:
        return results[0].get("volumeInfo", {}).get("infoLink", "N/A")
    return "N/A"

def preprocess_text(text):
    """Turn raw text into a space-separated string of lowercase lemmas.

    Missing input (None / NaN) yields "". Otherwise the text is tokenized
    with word_tokenize; tokens that are not purely alphanumeric or that are
    English stopwords are dropped, and the rest are lowercased and lemmatized.
    """
    # Missing values contribute no tokens.
    if pd.isnull(text):
        return ""

    lemmas = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        # Skip punctuation / mixed-symbol tokens and stopwords.
        if not raw_token.isalnum() or lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(lemmas)

def compute_similarity(descriptions, genres):
    """Return (description_similarity, genre_similarity).

    Each collection of strings is TF-IDF vectorized on its own and compared
    with itself using cosine similarity, so entry [i, j] of a result compares
    item i with item j of that collection.
    """
    def _pairwise_tfidf_similarity(texts):
        # Fit a fresh TF-IDF model on this corpus and compare every pair of rows.
        return cosine_similarity(TfidfVectorizer().fit_transform(texts))

    by_description = _pairwise_tfidf_similarity(descriptions)
    by_genre = _pairwise_tfidf_similarity(genres)
    return by_description, by_genre

def find_recommendations(input_book_idx, book_data, description_similarity, genre_similarity, top_n=3):
    """Return the books most similar to the book at input_book_idx.

    Args:
        input_book_idx: Row position of the input book in book_data and in
            both similarity matrices.
        book_data: DataFrame with "Book" and "Author" columns whose row order
            matches the similarity matrices.
        description_similarity: Square array of description similarities.
        genre_similarity: Square array of genre similarities.
        top_n: Number of recommendations to return (default 3).

    Returns:
        List of (title, author) tuples, best match first. The input book
        itself is never included.
    """
    # Average the two similarity rows of the input book.
    combined_similarity = (description_similarity[input_book_idx] + genre_similarity[input_book_idx]) / 2

    # Rank best-first and remove the input book explicitly. Slicing off the
    # top-ranked entry assumed the input always ranks first, which fails when
    # its text is empty (self-similarity 0) or a duplicate row ties with it.
    ranked = combined_similarity.argsort()[::-1]
    ranked = ranked[ranked != input_book_idx][:top_n]

    return [(book_data.iloc[idx]["Book"], book_data.iloc[idx]["Author"]) for idx in ranked]

def main():
    """Prompt for a Google Books URL and print similar books with links.

    Reads the URL from stdin, fetches the volume's metadata, appends the book
    to a copy of the dataset and prints the title, author and Google Books
    link of each recommendation returned by find_recommendations. Prints a
    message and returns early when the URL or the API lookup fails.
    """
    url = input("Paste your Google Books URL here: ")
    volume_id = extract_volume_id(url)
    if volume_id is None:
        print("Could not find a Google Books volume ID in that URL.")
        return

    book_data = fetch_book_details(volume_id)
    if book_data is None:
        print(f"Could not fetch details for volume '{volume_id}' from Google Books.")
        return
    formatted_data = format_book_data(book_data)

    input_book = formatted_data["Book"]
    input_description = preprocess_text(formatted_data["Description"])
    input_genres = preprocess_text(formatted_data["Genres"])

    # Add input book's data to the dataframe using pd.concat
    input_book_df = pd.DataFrame([{
        "Book": input_book,
        "Description": input_description,
        "Genres": input_genres,
        "Author": formatted_data["Author"]
    }])

    df_extended = pd.concat([df, input_book_df], ignore_index=True)

    # Compute similarity
    description_similarity, genre_similarity = compute_similarity(df_extended["Description"], df_extended["Genres"])

    # The input book was appended last, so it is the final row. Looking it up
    # by title could instead hit a dataset book that has the same title.
    input_book_idx = len(df_extended) - 1

    # Find recommendations
    recommendations = find_recommendations(input_book_idx, df_extended, description_similarity, genre_similarity)

    print(f"Recommendations for '{input_book}':")
    for book, author in recommendations:
        # One extra API call per recommendation to resolve its link.
        link = search_google_books(book, author)
        print(f"Book: {book}, Author: {author}, Link: {link}")

# Entry point: start the interactive prompt when this code is run directly.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akirichenko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Paste your Google Books URL here:  https://www.google.de/books/edition/Industrial_Society_and_Its_Future/9ja1zwEACAAJ?hl=en


Recommendations for 'Industrial Society and Its Future':
Book: The Social Contract, Author: Jean-Jacques Rousseau, Link: https://play.google.com/store/books/details?id=GK1GAQAAMAAJ&source=gbs_api
Book: Leviathan, Author: Thomas Hobbes, Link: https://play.google.com/store/books/details?id=RI9qEAAAQBAJ&source=gbs_api
Book: The Prince, Author: Niccolò Machiavelli, Link: http://books.google.de/books?id=bRdLCgAAQBAJ&dq=The+Prince+Niccol%C3%B2+Machiavelli&hl=&source=gbs_api
