# AI-Book-Recommender: Main Pipeline
This notebook integrates the entire AI-Book-Recommender pipeline, including data loading, preprocessing, model development, evaluation, visualization, and a Streamlit-based web interface for interactive book recommendations.

## Overview
- **Data Loading and Cleaning**: Load and preprocess books, users, and ratings datasets.
- **Exploratory Data Analysis (EDA)**: Visualize rating distributions, top books, and user activity.
- **Recommendation Models**: Implement popularity-based, content-based (TF-IDF over book titles), and collaborative-filtering (SVD) recommenders, plus a hybrid view that prints the results of all three side by side.
- **Evaluation**: Assess the SVD model's rating predictions with RMSE on a small sampled train/test split.
- **Visualization**: Generate heatmaps, PCA plots, and other insights.
- **Streamlit Interface**: Define an interactive web app for exploring recommendations; to launch it, the code must be saved as a `.py` file and started with `streamlit run`.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.decomposition import PCA
import streamlit as st

# Set seaborn style
# The 'whitegrid' theme is set once here, before any of the plots below are drawn.
sns.set(style='whitegrid')

# Suppress warnings
# NOTE: no category or module is given, so this filter hides every warning,
# including deprecation notices raised by the libraries imported above.
# Narrow or remove it when debugging unexpected behaviour.
import warnings
warnings.filterwarnings('ignore')

# Ensure plots appear inline in Jupyter
%matplotlib inline

In [None]:
# Load and preprocess datasets
def load_and_preprocess_data():
    """Load the books, users and ratings CSV files and return cleaned frames.

    Reads ``books.csv``, ``users.csv`` and ``ratings.csv`` from the current
    working directory.

    Cleaning applied:
      * books   - one row per ISBN; rows missing ISBN, title or author are
                  dropped; missing publishers become ``'Unknown'``; the index
                  is reset to 0..n-1.
      * users   - one row per User-ID; ages below 5 or above 100 are treated
                  as missing, then missing ages are replaced by the median.
      * ratings - only ratings greater than 0 are kept; exact duplicate rows
                  are dropped.

    Returns:
        tuple: ``(books, users, ratings)`` as pandas DataFrames.
    """
    # Load datasets
    books = pd.read_csv('books.csv', low_memory=False)
    users = pd.read_csv('users.csv')
    ratings = pd.read_csv('ratings.csv')

    # Clean Books Dataset
    books = books.drop_duplicates(subset='ISBN')
    books = books.dropna(subset=['ISBN', 'Book-Title', 'Book-Author'])
    # Reset the index so that row labels and row positions agree again after
    # rows were removed; code that mixes label and positional lookups on
    # ``books`` would otherwise pick the wrong rows.
    books = books.reset_index(drop=True)
    # Assign the filled column back instead of calling
    # ``fillna(inplace=True)`` on a selected column: that form is chained
    # assignment, which pandas deprecates and which no longer updates the
    # parent frame when Copy-on-Write is enabled.
    books['Publisher'] = books['Publisher'].fillna('Unknown')

    # Clean Users Dataset
    users = users.drop_duplicates(subset='User-ID').copy()
    implausible_age = (users['Age'] < 5) | (users['Age'] > 100)
    # ``mask`` replaces the flagged ages with NaN and upcasts an integer
    # column cleanly when needed.
    users['Age'] = users['Age'].mask(implausible_age)
    users['Age'] = users['Age'].fillna(users['Age'].median())

    # Clean Ratings Dataset
    # Build a new frame rather than dropping duplicates in place on a
    # filtered slice of the original.
    ratings = ratings[ratings['Book-Rating'] > 0].drop_duplicates()

    return books, users, ratings

# Run the loading/cleaning step once and keep the three resulting tables.
books, users, ratings = load_and_preprocess_data()

# Preview datasets: print a heading followed by the first rows of each table.
for heading, frame in (
    ('Books Dataset:', books),
    ('\nUsers Dataset:', users),
    ('\nRatings Dataset:', ratings),
):
    print(heading)
    print(frame.head())

In [None]:
# Popularity-Based Recommender
def popularity_recommender(ratings, books, top_n=10):
    """Return the most frequently rated books, most popular first.

    Args:
        ratings: DataFrame with an ``ISBN`` column, one row per rating.
        books: DataFrame with ``ISBN``, ``Book-Title``, ``Book-Author`` and
            ``Publisher`` columns.
        top_n: Number of top-rated ISBNs to consider.

    Returns:
        DataFrame with the columns ``ISBN``, ``Book-Title``, ``Book-Author``
        and ``Publisher`` for those of the ``top_n`` most-rated ISBNs that
        appear in ``books``, ordered by descending rating count. It may hold
        fewer than ``top_n`` rows when some ISBNs are missing from ``books``.
    """
    top_isbns = ratings['ISBN'].value_counts().head(top_n).index
    # Position of each ISBN in the popularity ranking (0 = most rated).
    rank_of = pd.Series(range(len(top_isbns)), index=top_isbns)

    columns = ['ISBN', 'Book-Title', 'Book-Author', 'Publisher']
    matched = books.loc[books['ISBN'].isin(top_isbns), columns]
    # ``isin`` keeps the catalogue's row order, so re-sort by popularity rank
    # to make the first row the most popular book.
    return matched.sort_values(
        'ISBN', key=lambda isbns: isbns.map(rank_of), kind='stable'
    )

# Content-Based Recommender
def content_based_recommender(book_title, books, top_n=10):
    """Return the books whose titles are most similar to ``book_title``.

    Titles are vectorised with TF-IDF (English stop words removed) and
    compared with cosine similarity.

    Args:
        book_title: Exact title to look up in ``books['Book-Title']``. When
            several rows share the title, the first one is used as the query.
        books: DataFrame with ``ISBN``, ``Book-Title``, ``Book-Author`` and
            ``Publisher`` columns.
        top_n: Maximum number of similar books to return.

    Returns:
        DataFrame with the columns ``ISBN``, ``Book-Title``, ``Book-Author``
        and ``Publisher``, ordered from most to least similar. The query row
        itself is never included.

    Raises:
        KeyError: If ``book_title`` does not occur in ``books``.
    """
    titles = books['Book-Title']
    # Work with row positions, not index labels: both the TF-IDF matrix and
    # ``iloc`` are positional, and the frame's labels need not be 0..n-1.
    matches = np.flatnonzero((titles == book_title).to_numpy())
    if matches.size == 0:
        raise KeyError(book_title)
    query_pos = int(matches[0])

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(titles)
    # Compare only the query row with the catalogue; a full N x N similarity
    # matrix grows quadratically with the number of books.
    scores = cosine_similarity(tfidf_matrix[query_pos], tfidf_matrix).ravel()

    # A stable sort keeps catalogue order among equally similar titles.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query explicitly instead of assuming it sorts first: another
    # row with an identical title vector ties with it at the top.
    ranked = ranked[ranked != query_pos][:top_n]
    return books.iloc[ranked][['ISBN', 'Book-Title', 'Book-Author', 'Publisher']]

# Collaborative Filtering with SVD
def svd_recommender(ratings, user_id, books, top_n=10, k=50):
    """Recommend unrated books for one user from a truncated SVD.

    A user x ISBN matrix is built from ``ratings`` (missing entries are 0),
    each row is centred on its mean, and the matrix is approximated with up
    to ``k`` singular factors. The user's reconstructed row gives the
    predicted scores.

    Args:
        ratings: DataFrame with ``User-ID``, ``ISBN`` and ``Book-Rating``.
        user_id: Value from ``ratings['User-ID']`` to recommend for.
        books: DataFrame with ``ISBN``, ``Book-Title``, ``Book-Author`` and
            ``Publisher`` columns.
        top_n: Number of highest-scored unrated ISBNs to consider.
        k: Requested number of latent factors. It is reduced automatically
            when the matrix is too small, because ``svds`` needs
            ``k < min(matrix.shape)``.

    Returns:
        DataFrame with the columns ``ISBN``, ``Book-Title``, ``Book-Author``
        and ``Publisher`` for the recommended ISBNs found in ``books``,
        ordered by descending predicted score.

    Raises:
        KeyError: If ``user_id`` has no ratings.
        ValueError: If the rating matrix has fewer than two users or fewer
            than two books.
    """
    # ``pivot_table`` averages repeated (user, ISBN) pairs; plain ``pivot``
    # raises as soon as one pair occurs twice.
    user_item_matrix = ratings.pivot_table(
        index='User-ID', columns='ISBN', values='Book-Rating', aggfunc='mean'
    ).fillna(0)
    if user_id not in user_item_matrix.index:
        raise KeyError(user_id)

    n_factors = min(k, min(user_item_matrix.shape) - 1)
    if n_factors < 1:
        raise ValueError(
            'Need ratings from at least 2 users on at least 2 books for SVD.'
        )

    # Hand ``svds`` a plain float array rather than a DataFrame.
    rating_values = user_item_matrix.to_numpy(dtype=float)
    user_means = rating_values.mean(axis=1)
    U, sigma, Vt = svds(rating_values - user_means[:, np.newaxis], k=n_factors)

    # Reconstruct only the requested user's row instead of the full
    # users x books prediction matrix.
    user_pos = user_item_matrix.index.get_loc(user_id)
    user_predictions = pd.Series(
        (U[user_pos] * sigma) @ Vt + user_means[user_pos],
        index=user_item_matrix.columns,
    )

    already_rated = ratings.loc[ratings['User-ID'] == user_id, 'ISBN'].values
    unseen = user_predictions[~user_predictions.index.isin(already_rated)]
    recommendations = unseen.sort_values(ascending=False).head(top_n)

    # ``isin`` keeps the catalogue's row order, so re-sort by predicted rank.
    rank_of = pd.Series(range(len(recommendations)), index=recommendations.index)
    columns = ['ISBN', 'Book-Title', 'Book-Author', 'Publisher']
    matched = books.loc[books['ISBN'].isin(recommendations.index), columns]
    return matched.sort_values(
        'ISBN', key=lambda isbns: isbns.map(rank_of), kind='stable'
    )

# Hybrid Recommender
def hybrid_recommender(user_id, book_title, ratings, books, top_n=10):
    """Print the output of all three recommenders, one section after another.

    The sections are printed in a fixed order: content-based results for
    ``book_title``, collaborative-filtering results for ``user_id``, then the
    popularity ranking. Each heading is printed before its recommender runs.
    Nothing is returned.
    """
    # Each entry pairs a heading with a deferred call, so a recommender only
    # runs after its own heading has been printed.
    sections = (
        ('Content-Based:',
         lambda: content_based_recommender(book_title, books, top_n)),
        ('\nCollaborative Filtering:',
         lambda: svd_recommender(ratings, user_id, books, top_n)),
        ('\nPopularity-Based:',
         lambda: popularity_recommender(ratings, books, top_n)),
    )
    for heading, build in sections:
        print(heading)
        print(build())

In [None]:
# Model Evaluation
def evaluate_split_small(ratings, sample_size=1000, k=50):
    """Estimate the SVD model's RMSE on a small random sample of ratings.

    The sample is split 80/20 into train and test, restricted to users that
    occur in both parts, and a truncated SVD of the row-centred train matrix
    is used to predict the test ratings. Only test ratings whose user and
    ISBN both occur in the train matrix are scored.

    Args:
        ratings: DataFrame with ``User-ID``, ``ISBN`` and ``Book-Rating``.
        sample_size: Number of rating rows to sample; capped at the number
            of rows available.
        k: Requested number of latent factors. It is reduced automatically
            because ``svds`` needs ``k < min(matrix.shape)`` and the sampled
            train matrix is usually small.

    Returns:
        The RMSE as a float, or ``None`` when it could not be computed. In
        both cases a one-line message is printed as well.
    """
    sample = ratings.sample(n=min(sample_size, len(ratings)), random_state=42)
    if len(sample) < 2:
        # Too few rows to form both a train and a test part.
        print('Not enough data for SVD.')
        return None

    train, test = train_test_split(sample, test_size=0.2, random_state=42)
    common_users = set(train['User-ID']) & set(test['User-ID'])
    if not common_users:
        print('No common users between train and test. Increase sample size.')
        return None

    train = train[train['User-ID'].isin(common_users)]
    test = test[test['User-ID'].isin(common_users)]

    # ``pivot_table`` averages repeated (user, ISBN) pairs; plain ``pivot``
    # raises as soon as one pair occurs twice.
    train_matrix = train.pivot_table(
        index='User-ID', columns='ISBN', values='Book-Rating', aggfunc='mean'
    ).fillna(0)
    n_factors = min(k, min(train_matrix.shape) - 1)
    if n_factors < 1:
        print('Not enough data for SVD.')
        return None

    # Hand ``svds`` a plain float array rather than a DataFrame.
    train_values = train_matrix.to_numpy(dtype=float)
    user_means = train_values.mean(axis=1, keepdims=True)
    U, sigma, Vt = svds(train_values - user_means, k=n_factors)
    predicted_ratings = (U * sigma) @ Vt + user_means

    # Map every test rating to its cell in the prediction matrix; -1 marks a
    # user or ISBN that the train matrix does not contain.
    row_pos = train_matrix.index.get_indexer(test['User-ID'])
    col_pos = train_matrix.columns.get_indexer(test['ISBN'])
    scorable = (row_pos >= 0) & (col_pos >= 0)
    if not scorable.any():
        print('No valid predictions to compute RMSE.')
        return None

    actual = test['Book-Rating'].to_numpy()[scorable]
    predicted = predicted_ratings[row_pos[scorable], col_pos[scorable]]
    rmse = sqrt(mean_squared_error(actual, predicted))
    print(f'Actual RMSE (small sample): {rmse}')
    return rmse

# Run the evaluation once on a 1000-row sample of the cleaned ratings; the
# function reports its outcome by printing a message.
evaluate_split_small(ratings, sample_size=1000)

In [None]:
# Visualizations
def visualize_data(ratings, books, users):
    """Draw the exploratory plots for the ratings, books and users tables.

    Five figures are produced in order: rating histogram, top 10 most rated
    books, user age boxplot, heatmap of the most active users against the
    most rated books, and a 2-D PCA view of SVD user factors. Every figure
    is displayed with ``plt.show()``; nothing is returned.

    Args:
        ratings: DataFrame with ``User-ID``, ``ISBN`` and ``Book-Rating``.
        books: DataFrame with ``ISBN`` and ``Book-Title`` columns.
        users: DataFrame with an ``Age`` column.
    """
    _plot_rating_distribution(ratings)
    _plot_top_rated_books(ratings, books)
    _plot_user_age_distribution(users)
    _plot_ratings_heatmap(ratings)
    _plot_user_embedding(ratings)


def _plot_rating_distribution(ratings):
    """Show a histogram of the ``Book-Rating`` column."""
    plt.figure(figsize=(8, 6))
    sns.histplot(ratings['Book-Rating'], bins=10, kde=False, color='skyblue')
    plt.title('Distribution of Book Ratings', fontsize=14)
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.xticks(range(0, 11))
    plt.tight_layout()
    plt.show()


def _plot_top_rated_books(ratings, books):
    """Show a bar chart of the 10 ISBNs with the most ratings."""
    popular_counts = ratings['ISBN'].value_counts().head(10).reset_index()
    popular_counts.columns = ['ISBN', 'Rating Count']
    popular_titles = books[['ISBN', 'Book-Title']]
    popular_books = pd.merge(popular_counts, popular_titles, on='ISBN', how='left')
    # The left merge leaves the title empty for ISBNs missing from ``books``;
    # label those bars with the ISBN so they still appear in the chart.
    popular_books['Book-Title'] = popular_books['Book-Title'].fillna(
        popular_books['ISBN']
    )
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Rating Count', y='Book-Title', data=popular_books, palette='viridis')
    plt.title('Top 10 Most Rated Books')
    plt.xlabel('Number of Ratings')
    plt.ylabel('Book Title')
    plt.tight_layout()
    plt.show()


def _plot_user_age_distribution(users):
    """Show a boxplot of the ``Age`` column."""
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=users['Age'], color='lightgreen')
    plt.title('User Age Distribution (Boxplot)')
    plt.xlabel('Age')
    plt.tight_layout()
    plt.show()


def _plot_ratings_heatmap(ratings):
    """Show a heatmap of the 50 most active users vs the 50 most rated books."""
    top_50_users = ratings['User-ID'].value_counts().head(50).index
    top_50_books = ratings['ISBN'].value_counts().head(50).index
    ratings_heatmap = ratings[
        ratings['User-ID'].isin(top_50_users) & ratings['ISBN'].isin(top_50_books)
    ]
    if ratings_heatmap.empty:
        print('No overlapping ratings for the heatmap.')
        return
    # ``pivot_table`` averages repeated (user, ISBN) pairs; plain ``pivot``
    # raises as soon as one pair occurs twice.
    heatmap_matrix = ratings_heatmap.pivot_table(
        index='User-ID', columns='ISBN', values='Book-Rating', aggfunc='mean'
    )
    plt.figure(figsize=(15, 10))
    sns.heatmap(heatmap_matrix, cmap='YlGnBu', cbar_kws={'label': 'Rating'})
    plt.title('Heatmap of Ratings: Top 50 Users vs Top 50 Books')
    plt.xlabel('Book ISBN')
    plt.ylabel('User ID')
    plt.tight_layout()
    plt.show()


def _plot_user_embedding(ratings, top=1000, k=50):
    """Show SVD user factors of the most active users, projected to 2-D by PCA."""
    top_users = ratings['User-ID'].value_counts().head(top).index
    top_books = ratings['ISBN'].value_counts().head(top).index
    ratings_small = ratings[
        ratings['User-ID'].isin(top_users) & ratings['ISBN'].isin(top_books)
    ]
    if ratings_small.empty:
        print('Not enough data for the SVD/PCA plot.')
        return
    user_item_matrix = ratings_small.pivot_table(
        index='User-ID', columns='ISBN', values='Book-Rating', aggfunc='mean'
    ).fillna(0)
    # ``svds`` needs k < min(shape); the 2-component PCA needs >= 2 factors.
    n_factors = min(k, min(user_item_matrix.shape) - 1)
    if n_factors < 2:
        print('Not enough data for the SVD/PCA plot.')
        return
    # Hand ``svds`` a plain float array rather than a DataFrame.
    rating_values = user_item_matrix.to_numpy(dtype=float)
    user_means = rating_values.mean(axis=1, keepdims=True)
    U, sigma, Vt = svds(rating_values - user_means, k=n_factors)
    pca = PCA(n_components=2)
    user_pca = pca.fit_transform(U)
    plt.figure(figsize=(8, 6))
    plt.scatter(user_pca[:, 0], user_pca[:, 1], alpha=0.5)
    plt.title('User Visualization in SVD-Reduced Space via PCA')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.grid(True)
    plt.show()

# Render all exploratory plots for the cleaned datasets loaded earlier in the notebook.
visualize_data(ratings, books, users)

In [None]:
# Streamlit Interface
def run_streamlit_app():
    """Render the Streamlit page for exploring book recommendations.

    Reads the module-level ``ratings`` and ``books`` DataFrames. The page
    offers a user selector and a title selector, a button that shows the
    content-based, collaborative-filtering and popularity-based results, and
    a button that runs the small-sample RMSE evaluation.
    """
    import contextlib
    import io

    st.title('AI-Book-Recommender')
    st.write('Discover personalized book recommendations using collaborative filtering, content-based filtering, and popularity-based methods.')

    # User ID selection
    user_id = st.selectbox('Select User ID:', ratings['User-ID'].unique())

    # Book title selection
    book_title = st.selectbox('Select Book Title:', books['Book-Title'].unique())

    # Show recommendations
    if st.button('Show Recommendations'):
        st.write('### Recommendations')
        st.write('#### Content-Based:')
        st.dataframe(content_based_recommender(book_title, books))
        st.write('#### Collaborative Filtering:')
        st.dataframe(svd_recommender(ratings, user_id, books))
        st.write('#### Popularity-Based:')
        st.dataframe(popularity_recommender(ratings, books))

    # Evaluate model
    if st.button('Evaluate RMSE'):
        st.write('### Model Evaluation')
        # The evaluation reports its outcome with print(), which goes to the
        # server console rather than the page; capture that output and
        # render it so the user actually sees the result.
        captured = io.StringIO()
        with contextlib.redirect_stdout(captured):
            evaluate_split_small(ratings, sample_size=1000)
        st.text(captured.getvalue())

# Note: Run `streamlit run main.py` in terminal to launch the app
# This cell is for demonstration; actual Streamlit app requires a .py file
# NOTE(review): nothing visible in this notebook calls run_streamlit_app(),
# and the `%matplotlib inline` line in the imports cell is IPython-only
# syntax. A main.py saved from this code would presumably need a
# run_streamlit_app() call added and that magic removed -- confirm before
# relying on the instruction printed below.
print('To run the Streamlit app, save this code as main.py and execute: streamlit run main.py')