Importing the necessary libraries

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

trying to get the shape of data and removing unncessary columns

In [4]:
df = pd.read_csv("books.csv")
df_book_tags = pd.read_csv("book_tags.csv")
df_tags = pd.read_csv("tags.csv")
# 
df = df.drop([
    'work_id', 'isbn', 'isbn13', 'language_code', 'image_url', 'small_image_url', 'best_book_id', 'books_count','ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5'
],axis = 1)
df.dtypes



book_id                        int64
goodreads_book_id              int64
authors                       object
original_publication_year    float64
original_title                object
title                         object
average_rating               float64
ratings_count                  int64
work_ratings_count             int64
work_text_reviews_count        int64
dtype: object

In [5]:
df_tags_named = pd.merge(
    df_book_tags,
    df_tags,
    on='tag_id',
    how='left' # Keep all tag records, even if a tag_name is missing (rare)
)
df_tags_named.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,1,11305,37174,fantasy
2,1,11557,34173,favorites
3,1,8717,12986,currently-reading
4,1,33114,12716,young-adult


In [6]:
# List of generic tags to exclude (these describe user intent, not book content)
GENERIC_TAGS_TO_REMOVE = [
    'to-read',
    'currently-reading',
    'owned',
    'favorites',
    'default',
    'ebook',
    'kindle',
    'library',
    'owned-books',
    'my-books',
    'books'
]

# 1. Filter out generic tags
df_content_tags = df_tags_named[~df_tags_named['tag_name'].isin(GENERIC_TAGS_TO_REMOVE)].copy()

# Sort by count (descending) to prioritize the most important tags for each book
df_content_tags = df_content_tags.sort_values(
    ['goodreads_book_id', 'count'],
    ascending=[True, False]
)

# 2. Keep only the top 10 most relevant content tags per book (N=10)
N_TAGS = 10
df_top_tags_per_book = (
    df_content_tags.groupby('goodreads_book_id')
    .head(N_TAGS)
    .reset_index(drop=True)
)

# Aggregate the top 10 tag names into a single string ('tag_soup')
df_tags_grouped = (
    df_top_tags_per_book.groupby('goodreads_book_id')['tag_name']
    .apply(lambda x: ' '.join(x.astype(str).str.replace(' ', '-'))) # Join, replacing spaces with hyphens for better NLP
    .reset_index(name='tag_soup')
)

print("Aggregated Tags Sample:")
print(df_tags_grouped.head())

Aggregated Tags Sample:
   goodreads_book_id                                           tag_soup
0                  1  fantasy young-adult fiction harry-potter books...
1                  2  fantasy children children-s all-time-favorites...
2                  3  fantasy young-adult fiction harry-potter books...
3                  5  fantasy young-adult fiction harry-potter books...
4                  6  fantasy young-adult fiction harry-potter ya se...


Now merging this modified df_tags_grouped table with our main table

In [7]:




# 3. Final Merge with Main Books Table (using the goodreads_book_id)
df = pd.merge(
    df,
    df_tags_grouped,
    on='goodreads_book_id',
    how='left' 
)

df['tag_soup'] = df['tag_soup'].fillna('')

print("--- Final Books Content Data Sample ---")
print(df[['book_id', 'goodreads_book_id', 'title', 'authors', 'tag_soup']].head())

print(f"\nTotal rows in final dataframe: {len(df)}")

--- Final Books Content Data Sample ---
   book_id  goodreads_book_id  \
0        1            2767052   
1        2                  3   
2        3              41865   
3        4               2657   
4        5               4671   

                                               title  \
0            The Hunger Games (The Hunger Games, #1)   
1  Harry Potter and the Sorcerer's Stone (Harry P...   
2                            Twilight (Twilight, #1)   
3                              To Kill a Mockingbird   
4                                   The Great Gatsby   

                       authors  \
0              Suzanne Collins   
1  J.K. Rowling, Mary GrandPrÃ©   
2              Stephenie Meyer   
3                   Harper Lee   
4          F. Scott Fitzgerald   

                                            tag_soup  
0  young-adult fiction dystopian dystopia fantasy...  
1  fantasy young-adult fiction harry-potter books...  
2  young-adult fantasy vampires ya fiction parano... 

In [8]:
# Assuming df_books_content is your current DataFrame

# Function to clean and combine features
def clean_and_combine_features(row):
    # Ensure authors and title are clean and concatenated
    authors = str(row['authors']).replace(' ', '').replace(',', ' ')
    title = str(row['title']).replace(' ', '').replace(',', ' ')
    
    # The tag_soup already has spaces replaced by hyphens, which is good
    tags = str(row['tag_soup'])
    
    # Combine the key features into one string
    return authors + ' ' + title + ' ' + tags

# Create the new 'content_soup' column
df['content_soup'] = df.apply(clean_and_combine_features, axis=1)

print("\nContent Soup Sample (Cleaned and Combined Features):")
print(df[['title', 'content_soup']].head(2).to_string())


Content Soup Sample (Cleaned and Combined Features):
                                                      title                                                                                                                                                         content_soup
0                   The Hunger Games (The Hunger Games, #1)                         SuzanneCollins TheHungerGames(TheHungerGames #1) young-adult fiction dystopian dystopia fantasy ya science-fiction books-i-own sci-fi series
1  Harry Potter and the Sorcerer's Stone (Harry Potter, #1)  J.K.Rowling MaryGrandPrÃ© HarryPotterandtheSorcerer'sStone(HarryPotter #1) fantasy young-adult fiction harry-potter books-i-own ya series favourites magic childrens


Preparing the ratings.csv data to be used for neural collaborative filtering

In [12]:
df_cf_input = pd.read_csv("ratings.csv")
# Assuming df_cf_input is your filtered ratings DataFrame

# 1a. Create dense ID mapping
user_to_index = {original_id: index for index, original_id in enumerate(df_cf_input['user_id'].unique())}
book_to_index = {original_id: index for index, original_id in enumerate(df_cf_input['book_id'].unique())}

df_cf_input['user_index'] = df_cf_input['user_id'].map(user_to_index)
df_cf_input['book_index'] = df_cf_input['book_id'].map(book_to_index)

n_users = len(user_to_index)
n_books = len(book_to_index)

# 1b. Create binary target (only using high ratings as positive)
POSITIVE_RATING_THRESHOLD = 4
df_positive = df_cf_input[df_cf_input['rating'] >= POSITIVE_RATING_THRESHOLD].copy()
df_positive['target'] = 1

print(f"Total Unique Users: {n_users}, Total Unique Books: {n_books}")
print(f"Number of Positive Interactions: {len(df_positive)}")



# --- Assumptions ---
# df_positive, n_users, n_books are defined
# NEGATIVE_SAMPLE_RATIO = 1 (or your desired ratio)

# 1. Get the set of all known interactions (CRITICAL for fast lookup)
all_interactions = set(zip(df_cf_input['user_index'], df_cf_input['book_index']))
positive_samples_count = len(df_positive)

# 2. Determine the target number of negative samples
target_neg_count = positive_samples_count * NEGATIVE_SAMPLE_RATIO

# --- OPTIMIZATION STARTS HERE ---

# 3. Generate a large pool of random (user, book) candidates
# We generate 1.5x the needed amount to account for overlaps with positive interactions
OVERSAMPLE_FACTOR = 1.5
total_candidates_to_generate = int(target_neg_count * OVERSAMPLE_FACTOR)

print(f"Generating {total_candidates_to_generate} candidate samples...")

# Vectorized generation of random user and book indices
random_user_indices = np.random.randint(0, n_users, total_candidates_to_generate)
random_book_indices = np.random.randint(0, n_books, total_candidates_to_generate)

# Create a temporary DataFrame of candidate negative samples
df_candidates = pd.DataFrame({
    'user_index': random_user_indices,
    'book_index': random_book_indices,
})

# 4. Filter out any samples that are already in the 'all_interactions' set

# Convert the candidates into tuples for fast set lookup
candidate_interactions = set(zip(df_candidates['user_index'], df_candidates['book_index']))

# Identify which candidates are actual interactions (collisions)
# This is a highly optimized set operation
valid_neg_interactions = list(candidate_interactions - all_interactions)

# 5. Select the required number of valid negative samples
# We must ensure we only take the exact number needed (target_neg_count)
df_valid_neg = pd.DataFrame(valid_neg_interactions, columns=['user_index', 'book_index'])

# Ensure we have enough and slice to the target count
if len(df_valid_neg) < target_neg_count:
    print("\nWARNING: Not enough unique negative samples generated. Increase OVERSAMPLE_FACTOR.")
    df_negative = df_valid_neg
else:
    df_negative = df_valid_neg.head(target_neg_count)

# Add the target column
df_negative['target'] = 0

# --- Final Consolidation ---

# 6. Combine positive and negative samples
df_training_final = pd.concat([df_positive[['user_index', 'book_index', 'target']], df_negative], ignore_index=True)

# 7. Shuffle the data
df_training_final = df_training_final.sample(frac=1).reset_index(drop=True)

# 8. Print Results
print(f"\nOptimization Complete.")
print(f"Total final training samples: {len(df_training_final)}")
print(f"Positive samples: {len(df_positive)}, Negative samples: {len(df_negative)}")
print("Final Training Data Sample:")
print(df_training_final.head())


Total Unique Users: 53424, Total Unique Books: 10000
Number of Positive Interactions: 4122111
Generating 6183166 candidate samples...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_negative['target'] = 0



Optimization Complete.
Total final training samples: 8244222
Positive samples: 4122111, Negative samples: 4122111
Final Training Data Sample:
   user_index  book_index  target
0        9487        4438       0
1       18721        1009       0
2       18649        7712       0
3        5011        9169       1
4       51440        7302       0


Now our nerual collaborative filtering step begins
Step 1 is to split our data into training and testing

In [13]:
from sklearn.model_selection import train_test_split

# Separate the features (inputs) from the target (output)
users = df_training_final['user_index'].values
books = df_training_final['book_index'].values
targets = df_training_final['target'].values

# Split into 80% training and 20% validation sets
X_train_u, X_val_u, X_train_i, X_val_i, y_train, y_val = train_test_split(
    users, books, targets, test_size=0.2, random_state=42, stratify=targets
)

print(f"\nTraining Samples: {len(X_train_u)}")
print(f"Validation Samples: {len(X_val_u)}")


Training Samples: 6595377
Validation Samples: 1648845


Step 2 is defining the base ncf model architecture

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Flatten, Concatenate, Dense, Multiply
from tensorflow.keras.models import Model

# Define the model function again (ensure you have n_users and n_books in scope)
def build_ncf_model(num_users, num_items, embedding_dim=10):
    # --- Input Layers ---
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')

    # --- Embedding Layers (Shared for GMF and MLP paths) ---
    # GMF Path Embeddings
    user_embedding_gmf = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_gmf_embedding')(user_input)
    item_embedding_gmf = Embedding(input_dim=num_items, output_dim=embedding_dim, name='item_gmf_embedding')(item_input)
    # MLP Path Embeddings
    user_embedding_mlp = Embedding(input_dim=num_users, output_dim=embedding_dim, name='user_mlp_embedding')(user_input)
    item_embedding_mlp = Embedding(input_dim=num_items, output_dim=embedding_dim, name='item_mlp_embedding')(item_input)

    # --- GMF Path (Linear Interaction) ---
    gmf_user_flat = Flatten()(user_embedding_gmf)
    gmf_item_flat = Flatten()(item_embedding_gmf)
    gmf_interaction = Multiply()([gmf_user_flat, gmf_item_flat])

    # --- MLP Path (Non-linear Interaction) ---
    mlp_user_flat = Flatten()(user_embedding_mlp)
    mlp_item_flat = Flatten()(item_embedding_mlp)
    mlp_interaction = Concatenate()([mlp_user_flat, mlp_item_flat])
    
    # Simple DNN layers for non-linear learning
    mlp_layer = Dense(embedding_dim * 2, activation='relu')(mlp_interaction)
    mlp_layer = Dense(embedding_dim, activation='relu')(mlp_layer)

    # --- Fusion Layer (Combine GMF and MLP) ---
    fusion = Concatenate()([gmf_interaction, mlp_layer])
    
    # --- Prediction Layer ---
    # Output uses sigmoid for binary classification (predicting P(interaction))
    output = Dense(1, activation='sigmoid', name='output')(fusion)

    model = Model(inputs=[user_input, item_input], outputs=output)
    return model

# Initialize and compile the model
EMBEDDING_DIM = 10 # Hyperparameter: determines the size of the latent factors
ncf_model = build_ncf_model(n_users, n_books, embedding_dim=EMBEDDING_DIM)
ncf_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
                  loss='binary_crossentropy') # Binary Cross-Entropy for binary classification (0 or 1)

print("\nNCF Model Architecture Summary:")
ncf_model.summary()

# Image of Neural Collaborative Filtering Architecture

Step 3 is training the model

In [None]:
# Train the model
BATCH_SIZE = 256
EPOCHS = 10 # Start with 10 epochs, adjust based on validation loss

history = ncf_model.fit(
    [X_train_u, X_train_i], y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_data=([X_val_u, X_val_i], y_val)
)

print("\nModel Training Complete!")

now predicting

In [None]:
import numpy as np

def recommend_ncf_for_user(ncf_model, user_id, df_cf_input, book_to_index, n_books, top_k=10):
    
    # 1. Map the original user_id to the internal index
    user_index = user_to_index.get(user_id)
    
    if user_index is None:
        print(f"Error: User ID {user_id} not found in the trained dataset.")
        return []

    # 2. Identify books the user HAS interacted with (exclude these)
    user_interactions = df_cf_input[df_cf_input['user_id'] == user_id]['book_index'].unique()
    
    # 3. Identify all book indices the user HAS NOT interacted with (candidates)
    all_book_indices = np.arange(n_books)
    candidate_book_indices = np.setdiff1d(all_book_indices, user_interactions)
    
    # 4. Prepare input for the model
    # The model expects two arrays: (user_indices, book_indices)
    user_indices_array = np.full(len(candidate_book_indices), user_index)
    
    # 5. Predict scores (inference)
    predictions = ncf_model.predict([user_indices_array, candidate_book_indices], verbose=0).flatten()
    
    # 6. Rank the candidates
    # Get the indices that would sort the predictions in descending order
    top_k_indices = predictions.argsort()[-top_k:][::-1]
    
    # Get the corresponding book indices (the *internal* IDs)
    top_book_indices = candidate_book_indices[top_k_indices]
    top_scores = predictions[top_k_indices]
    
    # 7. Map back to original book_id (the key to your main books table)
    
    # Create the inverse map (index_to_book)
    index_to_book = {v: k for k, v in book_to_index.items()}
    
    # Map the internal index back to the external book_id
    top_book_ids = [index_to_book[idx] for idx in top_book_indices]

    # 8. Return results
    results = pd.DataFrame({
        'original_book_id': top_book_ids,
        'ncf_score': top_scores
    })
    
    return results

# Example Usage: Assuming you want recommendations for a reliable user with original user_id=2
# Replace 2 with a real user_id from your dataset
# user_id_to_test = 2 
# recommendations = recommend_ncf_for_user(ncf_model, user_id_to_test, df_cf_input, book_to_index, n_books)
# print(f"\nRecommendations for User {user_id_to_test}:\n", recommendations)

using the model to get for some user id, the top k books it would highly rate

In [None]:
# --- Find a Valid User ID to Test ---
# Get the first user_id from the set of users the model was trained on
user_id_to_test = df_cf_input['user_id'].iloc[0]

print(f"Generating NCF recommendations for User ID: {user_id_to_test}\n")

# Call your defined recommendation function
recommendations_ncf = recommend_ncf_for_user(
    ncf_model=ncf_model,
    user_id=user_id_to_test,
    df_cf_input=df_cf_input,
    book_to_index=book_to_index,
    n_books=n_books,
    top_k=10
)

print("--- Raw NCF Output (Original Book IDs and Scores) ---")
print(recommendations_ncf)

# Assuming your main book content table is named df_books_content
# Ensure you have loaded or kept df_books_content from earlier steps!

# Perform a merge to join the predicted scores with the book details
final_recommendations_ncf = pd.merge(
    recommendations_ncf,
    df_books_content[['book_id', 'title', 'authors', 'average_rating']], # Select key columns
    left_on='original_book_id',
    right_on='book_id',
    how='left'
)

# Clean up and display the final result
final_recommendations_ncf = final_recommendations_ncf.sort_values(
    'ncf_score', 
    ascending=False
).reset_index(drop=True)

# Rename for clarity
final_recommendations_ncf = final_recommendations_ncf.rename(
    columns={'ncf_score': 'Predicted Score (NCF)'}
)

print("\n--- Final Personalized Recommendations (NCF) ---")
print(final_recommendations_ncf[[
    'Predicted Score (NCF)', 
    'title', 
    'authors', 
    'average_rating'
]].head(10).to_string())

# Content-Based Filtering (CBF) Model Implementation

The Content-Based Filtering component solves the critical **New User/New Item Cold Start Problem** by generating recommendations based purely on descriptive features of the books. This ensures that even users with no prior rating history receive relevant suggestions instantly, and new books (with no ratings) can still be recommended.

## ðŸŽ¯ Core Mechanism: Item Similarity

Instead of relying on user behavior, the CBF model generates **book-to-book similarity scores** using their shared content.

### 1. Feature Consolidation (`tag_soup`)
Features like **title, authors, and descriptive tags** were aggregated into a single text block (`content_soup` or `tag_soup`) for each book.

### 2. Vectorization (TF-IDF)
The text features were converted into a **sparse numerical matrix** using the **Term Frequency-Inverse Document Frequency (TF-IDF)** scheme.  
- TF-IDF weights **rare, distinguishing tags** (e.g., 'Austen', 'dystopian') higher than **common terms** (e.g., 'book', 'fiction').

### 3. Similarity Matrix Generation
The final step involves calculating the **Cosine Similarity** between every book's TF-IDF vector:

\[
\text{Similarity}(B_i, B_j) = \text{CosineSimilarity}(\text{Vector}_{B_i}, \text{Vector}_{B_j})
\]

The result is a comprehensive **Item Similarity Matrix**, where we can instantly look up the content similarity score between any two books in the dataset.


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Assume df is loaded and preprocessed with the 'tag_soup' column

print("Starting TF-IDF Vectorization...")

# 1. Initialize the TF-IDF Vectorizer
# max_df=0.9: Ignore terms that appear in more than 90% of documents (too common)
# min_df=5: Ignore terms that appear in fewer than 5 documents (too rare)
tfidf = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=5)

# Fit the vectorizer to the 'tag_soup' and transform the text into a sparse matrix
tfidf_matrix = tfidf.fit_transform(df['tag_soup'])

print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
print("Calculating Cosine Similarity Matrix (This may take a moment)...")

# 2. Compute the Cosine Similarity Matrix
# The resulting matrix (N x N) holds the similarity score between every pair of books.
# This matrix will be the core of your CBF model.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("Cosine Similarity Matrix Calculation Complete.")

Starting TF-IDF Vectorization...
TF-IDF Matrix Shape: (10000, 1164)
Calculating Cosine Similarity Matrix (This may take a moment)...
Cosine Similarity Matrix Calculation Complete.


In [None]:
# Create a mapping series to quickly lookup the internal index from the book_id
book_to_index_series = pd.Series(df_books_content.index, index=df_books_content['book_id']).drop_duplicates()

def recommend_cbf_by_book_id(book_id, cosine_sim_matrix, df_content, top_k=10):
    
    # 1. Get the internal index corresponding to the input book_id
    try:
        idx = book_to_index_series[book_id]
    except KeyError:
        return f"Error: Book ID {book_id} not found in content data."

    # 2. Get the similarity scores for this book against all other books
    # The scores are the row/column corresponding to 'idx' in the matrix
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    # 3. Sort the books based on the similarity scores (descending)
    # We ignore the first element (index 0) because it's the book itself (similarity = 1.0)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_k + 1] 

    # 4. Get the book indices and similarity scores
    book_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    
    # 5. Get the original book IDs and titles for the top recommendations
    recommended_books = df_content.iloc[book_indices].copy()
    
    # 6. Format and return
    results = pd.DataFrame({
        'original_book_id': recommended_books['book_id'],
        'cbf_score': scores,
        'title': recommended_books['title'],
        'authors': recommended_books['authors']
    })
    
    return results.reset_index(drop=True)

# Example Usage: Find books similar to the book with original book_id=1
# You need to ensure book_id=1 is in your df_books_content
# recommendation_cbf = recommend_cbf_by_book_id(1, cosine_sim, df_books_content)
# print("\n--- Content-Based Recommendations for Book ID 1 ---")
# print(recommendation_cbf.to_string())