In [2]:
import pandas as pd

# Example: Load dataset
books = pd.read_csv('books.csv')  # Book metadata
reviews = pd.read_csv('ratings.csv')  # User reviews

In [4]:
print(books.columns)

Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')


In [5]:
books['combined_features'] = books['title'] + ' ' + books['authors']
books['processed_features'] = books['combined_features'].str.lower()

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenize
    return ' '.join([word for word in words if word not in stop_words])  # Remove stopwords

# Combine 'title' and 'authors' columns into a single feature
books['combined_features'] = books['title'] + ' ' + books['authors']

# Apply the preprocessing function to the combined text
books['processed_features'] = books['combined_features'].apply(preprocess_text)

# Optional: Check the result
print(books[['title', 'authors', 'processed_features']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(books['processed_features'])

# Compute similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [11]:
def get_recommendations(title, cosine_sim=cosine_sim):
    indices = pd.Series(books.index, index=books['title']).drop_duplicates()
    if title not in indices:
        return "Book not found in the dataset."

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Top 10 recommendations

    book_indices = [i[0] for i in sim_scores]
    return books.iloc[book_indices][['title', 'authors']]

In [12]:
recommendations = get_recommendations('Harry Potter and the Sorcerer\'s Stone')
print(recommendations)

Book not found in the dataset.


In [14]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357278 sha256=3799c3973a4087a5a1d8d2c7d09473567f5280c408be9d2a85d9d0567d0b07da
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [25]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Prepare the ratings data
ratings_data = books[['goodreads_book_id', 'average_rating']]

# Add a 'user_id' column. Here, we just use a dummy user_id for simplicity
ratings_data['user_id'] = 1  # or you could create a series of unique user IDs if needed

# Rename columns for surprise compatibility
ratings_data = ratings_data.rename(columns={'goodreads_book_id': 'item_id', 'average_rating': 'rating'})

# Set up the Reader
reader = Reader(rating_scale=(1, 5))

# Load the dataset using surprise's load_from_df
data = Dataset.load_from_df(ratings_data[['user_id', 'item_id', 'rating']], reader)

# Train a model (SVD as an example)
model = SVD()
model.fit(data.build_full_trainset())

# Cross-validation to evaluate the model
cross_validate(model, data, cv=3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_data['user_id'] = 1  # or you could create a series of unique user IDs if needed


{'test_rmse': array([0.25385435, 0.25365695, 0.25732319]),
 'test_mae': array([0.19963105, 0.19633613, 0.2025452 ]),
 'fit_time': (0.1221320629119873, 0.12379312515258789, 0.12244749069213867),
 'test_time': (0.016458511352539062,
  0.014517784118652344,
  0.014871597290039062)}

In [26]:
# Function to get top N book recommendations for a user
def get_top_n_recommendations(user_id, n=10):
    # Generate a list of all books (item_ids)
    all_books = books['goodreads_book_id'].unique()

    # List to store predictions
    predictions = []

    # Predict ratings for all books
    for book_id in all_books:
        pred = model.predict(user_id, book_id)
        predictions.append((book_id, pred.est))

    # Sort predictions by predicted rating (highest first)
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Get the top N book recommendations
    top_n_books = predictions[:n]

    # Return the top N books (book_id and predicted rating)
    return top_n_books

# Get top 10 book recommendations for the user
user_id = 1  # Replace with a unique user ID if you have one
top_books = get_top_n_recommendations(user_id, n=10)

In [27]:
# Convert top N book recommendations to titles
top_books_df = books[books['goodreads_book_id'].isin([book[0] for book in top_books])]
top_books_df['predicted_rating'] = [book[1] for book in top_books]

# Display the top N books with their predicted ratings
print(top_books_df[['title', 'predicted_rating']])

                                                  title  predicted_rating
26    Harry Potter and the Half-Blood Prince (Harry ...          4.077297
3240                 Crooked Kingdom (Six of Crows, #2)          4.074241
3274  Harry Potter Boxed Set, Books 1-5 (Harry Potte...          4.072156
3735  Harry Potter Page to Screen: The Complete Film...          4.070909
4482  It's a Magical World: A Calvin and Hobbes Coll...          4.070443
4777           The Holy Bible: English Standard Version          4.066517
5918                  Life Application Study Bible: NIV          4.061294
8853                           Mark of the Lion Trilogy          4.061122
9359         The Green Mile, Part 6: Coffey on the Mile          4.061098
9565  Attack of the Deranged Mutant Killer Monster S...          4.061064


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_books_df['predicted_rating'] = [book[1] for book in top_books]


In [40]:
# Get the user input for the author name
author_name = input("Enter the author you're interested in: ").lower()

# Filter books that match the author
books_by_author = books[books['authors'].str.lower().str.contains(author_name, na=False)].copy()

# Calculate weighted rating for filtered books
books_by_author.loc[:, 'weighted_rating'] = books_by_author['average_rating'] * books_by_author['ratings_count']

# Sort the filtered books by average rating in descending order (highest rating first)
recommended_books_by_author_weighted = books_by_author.sort_values(by='average_rating', ascending=False)

# Display the top recommended books by the selected author with average rating
print(f"\nTop recommended books by {author_name.title()} (based on average ratings):")
print(recommended_books_by_author_weighted[['title', 'average_rating', 'ratings_count', 'weighted_rating']].head())


Enter the author you're interested in: Stephen King

Top recommended books by Stephen King (based on average ratings):
                                                  title  average_rating  \
9359         The Green Mile, Part 6: Coffey on the Mile            4.55   
2092                          The Stand: Soul Survivors            4.52   
9138                Carrie / 'Salem's Lot / The Shining            4.52   
8975  Rita Hayworth and Shawshank Redemption: A Stor...            4.52   
8370  The Green Mile, Part 4: The Bad Death of Eduar...            4.52   

      ratings_count  weighted_rating  
9359          11936         54308.80  
2092          40626        183629.52  
9138          11063         50004.76  
8975          11499         51975.48  
8370          12958         58570.16  


In [41]:
# Display the top recommended books by the selected author, excluding the 'authors' column
recommended_books_by_author = books_by_top_author.sort_values(by='average_rating', ascending=False)
print(f"\nTop recommended books by {top_author}:")
print(recommended_books_by_author[['title', 'average_rating', 'ratings_count']].head())



Top recommended books by Stephen King:
                                                  title  average_rating  \
9359         The Green Mile, Part 6: Coffey on the Mile            4.55   
2092                          The Stand: Soul Survivors            4.52   
9138                Carrie / 'Salem's Lot / The Shining            4.52   
8975  Rita Hayworth and Shawshank Redemption: A Stor...            4.52   
8370  The Green Mile, Part 4: The Bad Death of Eduar...            4.52   

      ratings_count  
9359          11936  
2092          40626  
9138          11063  
8975          11499  
8370          12958  
