In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('/kaggle/working/data.csv')
df = df[:30000]
df.info()

In [None]:
# Calculate the average rating for each book
top_books = df.groupby('title')['rating'].mean().reset_index()
top_books.columns = ['title', 'avg_rating']
top_books = top_books.sort_values(by='avg_rating', ascending=False).head(10)

# Plot the top books
plt.figure(figsize=(12, 6))
sns.barplot(x='avg_rating', y='title', data=top_books, palette='viridis')
plt.title('Top 10 Books by Average Rating')
plt.xlabel('Average Rating')
plt.ylabel('Book Title')
plt.show()

In [48]:
from wordcloud import WordCloud
from sklearn.cluster import KMeans

In [None]:
# Use TF-IDF to vectorize book descriptions
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description'])

# Perform KMeans clustering
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)

# Add cluster labels to the dataset
df['cluster'] = clusters

# Visualize the clusters using t-SNE
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(tfidf_matrix.toarray())

df['tsne_x'] = tsne_results[:, 0]
df['tsne_y'] = tsne_results[:, 1]

# Plot the clusters
plt.figure(figsize=(12, 8))
sns.scatterplot(x='tsne_x', y='tsne_y', hue='cluster', data=df, palette='viridis', legend='full')
plt.title('Book Clusters (t-SNE)')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()



In [None]:
# Generate a word cloud from book titles
all_titles = ' '.join(data['title'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_titles)

# Display the word cloud
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Book Titles')
plt.show()

In [46]:
# Fill missing values in the 'description' and 'genres' columns
df['description'] = df['description'].fillna('')
df['genres'] = df['genres'].fillna('')

In [22]:

# Combine relevant features for content-based filtering
df['content'] = df['title'] + ' ' + df['author'] + ' ' + df['genres'] + ' ' + df['description']

In [23]:
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['content'])

In [24]:
# Compute cosine similarity between books
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [35]:
# Function to get content-based recommendations
def recommend_books_content(title, num_recommendations=5):
    # Find the index of the book with the given title
    idx = df[df['title'] == title].index[0]
    
    # Get similarity scores for all books with the selected book
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort books by similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the most similar books
    sim_indices = [i[0] for i in sim_scores[1:num_recommendations + 1]]
    
    # Return the titles of the recommended books
    return df.iloc[sim_indices][['title', 'author', 'genres']]

In [27]:
# ==================== Collaborative Filtering ====================
# Create a mock dataset for user interactions (for demonstration purposes)
# Replace this with actual user-item interaction data if available
user_data = pd.DataFrame({
    'userId': np.random.randint(1, 100, 1000),
    'bookId': np.random.choice(df['bookId'], 1000),
    'rating': np.random.randint(1, 6, 1000)
})


In [30]:
# Prepare the dataset for the Surprise library
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(user_data[['userId', 'bookId', 'rating']], reader)

In [31]:
# Build a collaborative filtering model using SVD
trainset = data_surprise.build_full_trainset()
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x781cb9774d30>

In [36]:
# Function to get collaborative filtering recommendations
def recommend_books_collaborative(user_id, num_recommendations=5):
    # Get a list of all book IDs
    all_books = df['bookId'].unique()
    
    # Predict ratings for all books the user has not interacted with
    predictions = [(book, model.predict(user_id, book).est) for book in all_books]
    
    # Sort by predicted rating
    predictions = sorted(predictions, key=lambda x: x[1], reverse=True)
    
    # Get the top book IDs
    top_books = [pred[0] for pred in predictions[:num_recommendations]]
    
    # Return the titles of the recommended books
    return df[df['bookId'].isin(top_books)][['title', 'author', 'genres']]

In [37]:
# ==================== Hybrid Recommendation ====================
def recommend_books_hybrid(title, user_id, num_recommendations=5):
    # Get content-based recommendations
    content_recommendations = recommend_books_content(title, num_recommendations * 2)
    
    # Get collaborative filtering recommendations
    collaborative_recommendations = recommend_books_collaborative(user_id, num_recommendations * 2)
    
    # Combine and rank the recommendations
    combined = pd.concat([content_recommendations, collaborative_recommendations]).drop_duplicates()
    combined['score'] = combined.index  # Dummy score for sorting
    combined = combined.sort_values('score', ascending=True)
    
    return combined.head(num_recommendations)

In [38]:
# ==================== Example Usage ====================
# Content-based filtering example
book_title = "The Hunger Games"  # Replace with a title from your dataset
content_recommendations = recommend_books_content(book_title, 5)
print("Content-Based Recommendations for:", book_title)
print(content_recommendations)

Content-Based Recommendations for: The Hunger Games
                                                   title  \
5240   SAMPLER ONLY: Catching Fire (The Hunger Games,...   
326                                           Mockingjay   
221                                        Catching Fire   
29464                     The Hunger Games Tribute Guide   
184                      The Hunger Games Trilogy Boxset   

                                                  author  \
5240                                     Suzanne Collins   
326                                      Suzanne Collins   
221                                      Suzanne Collins   
29464                     Emily Seife (Goodreads Author)   
184    Suzanne Collins, Guillaume Fournier (Translato...   

                                                  genres  
5240   ['Dystopia', 'Young Adult', 'Science Fiction',...  
326    ['Young Adult', 'Dystopia', 'Fiction', 'Fantas...  
221    ['Young Adult', 'Dystopia', 'Fiction', 'Fa

In [39]:

# Collaborative filtering example
user_id = 1  # Replace with a user ID from your dataset
collaborative_recommendations = recommend_books_collaborative(user_id, 5)
print("Collaborative Recommendations for User:", user_id)
print(collaborative_recommendations)

Collaborative Recommendations for User: 1
                                           title  \
9554                                        Cage   
10473                           The Suitcase Kid   
15279                      Murder Must Advertise   
16871  The Endless Steppe: Growing Up in Siberia   
17820                              Word of Honor   

                                                 author  \
9554                    Harper Sloan (Goodreads Author)   
10473  Jacqueline Wilson, Nick Sharratt (Illustrations)   
15279                                 Dorothy L. Sayers   
16871                                    Esther Hautzig   
17820                                     Tom Kirkbride   

                                                  genres  
9554   ['Romance', 'Contemporary Romance', 'Military ...  
10473  ['Childrens', 'Fiction', 'Middle Grade', 'Cont...  
15279  ['Mystery', 'Fiction', 'Crime', 'Classics', 'B...  
16871  ['Historical Fiction', 'Young Adult', 'Fiction.

In [40]:
# Hybrid recommendation example
hybrid_recommendations = recommend_books_hybrid(book_title, user_id, 5)
print("Hybrid Recommendations for User and Book:", book_title, user_id)
print(hybrid_recommendations)

Hybrid Recommendations for User and Book: The Hunger Games 1
                                                  title  \
184                     The Hunger Games Trilogy Boxset   
221                                       Catching Fire   
326                                          Mockingjay   
2047                                      Tower of Dawn   
4713  The Hunger Games: Official Illustrated Movie C...   

                                                 author  \
184   Suzanne Collins, Guillaume Fournier (Translato...   
221                                     Suzanne Collins   
326                                     Suzanne Collins   
2047                   Sarah J. Maas (Goodreads Author)   
4713                                          Kate Egan   

                                                 genres  score  
184   ['Young Adult', 'Fiction', 'Fantasy', 'Dystopi...    184  
221   ['Young Adult', 'Dystopia', 'Fiction', 'Fantas...    221  
326   ['Young Adult', 'Dystopia', 