In [None]:
# Import necessary libraries
import sys
import os

# Add the src directory to the path for importing custom modules.
# '../src' is resolved against the current working directory, so the result
# depends on where the kernel was started.
# The membership check makes this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
SRC_DIR = os.path.abspath('../src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from data.loader import load_books_data, clean_books_data, get_data_info
from analysis.visualizer import (
    plot_data_overview, 
    plot_rating_analysis, 
    plot_publication_analysis,
    plot_category_analysis,
    plot_correlation_matrix,
    create_summary_report
)
from utils.helpers import print_dataframe_info, DataFrameProfiler

# Global matplotlib defaults: figure size (inches) and base font size.
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Startup banner confirming the import cell ran to the end.
print(
    "📚 Semantic Book Recommender - EDA Notebook",
    "=" * 50,
    "All modules imported successfully!",
    sep="\n",
)


In [None]:
# Load the raw dataset
# The CSV path is relative, so it is resolved against the kernel's working
# directory. `df_raw` is not reassigned by any later cell in this notebook;
# the cleaned frame is stored under a separate name (`df_clean`).
print("🔄 Loading books dataset...")
df_raw = load_books_data("../books.csv")

# Display basic information about the raw dataset
# (print_dataframe_info comes from utils.helpers; its output format is defined there.)
print_dataframe_info(df_raw, "Raw Books Dataset")

# Show first few rows
# head() with no argument returns 5 rows, matching the message printed above.
print("\n📋 First 5 rows of the dataset:")
display(df_raw.head())


In [None]:
# Create a comprehensive data profile
# DataFrameProfiler is a project helper (utils.helpers). The code below reads
# the result by string key: 'shape', 'memory_usage'['total_mb'],
# 'missing_data'['complete_rows'], 'duplicates'['total_duplicates'] and
# 'numeric_summary'. A missing key would raise at the corresponding print.
print("🔍 Generating comprehensive data profile...")
profiler = DataFrameProfiler(df_raw)
profile = profiler.generate_profile()

print("\n📊 DATASET PROFILE SUMMARY")
print("="*50)
print(f"Shape: {profile['shape']}")
print(f"Total Memory Usage: {profile['memory_usage']['total_mb']:.2f} MB")
print(f"Complete Rows: {profile['missing_data']['complete_rows']}")
print(f"Total Duplicates: {profile['duplicates']['total_duplicates']}")

# Display data info using our custom function
# Only the 'columns' entry of the returned mapping is used here.
data_info = get_data_info(df_raw)
print(f"\nColumns: {data_info['columns']}")

# Show statistical summary for numeric columns
# The describe() table is shown only when the profile's 'numeric_summary'
# entry is truthy; the table itself is recomputed from df_raw rather than
# taken from the profile.
if profile['numeric_summary']:
    print("\n📈 NUMERIC COLUMNS SUMMARY:")
    numeric_df = df_raw.select_dtypes(include=[np.number])
    display(numeric_df.describe())


In [None]:
# Create data overview visualization
# Runs on the raw (uncleaned) frame. plot_data_overview is imported from
# analysis.visualizer; what it draws is defined there.
print("📊 Creating data overview visualization...")
plot_data_overview(df_raw)


In [None]:
# Clean the dataset using our custom function
# The result is bound to a new name so the raw frame stays available for the
# comparison below.
# NOTE(review): the before/after numbers are only meaningful if
# clean_books_data returns a new frame rather than mutating df_raw in place —
# confirm against data.loader.
print("🧹 Cleaning the dataset...")
df_clean = clean_books_data(df_raw)

# Compare before and after cleaning
# "Rows removed" is the difference in row counts between the two frames.
print("\n📋 BEFORE vs AFTER CLEANING:")
print("="*50)
print(f"Raw dataset shape: {df_raw.shape}")
print(f"Clean dataset shape: {df_clean.shape}")
print(f"Rows removed: {len(df_raw) - len(df_clean)}")

# Show information about the cleaned dataset
print_dataframe_info(df_clean, "Cleaned Books Dataset")

# Display sample of cleaned data
# Shows the first 3 rows (head), not a random sample.
print("\n📋 Sample of cleaned data:")
display(df_clean.head(3))


In [None]:
# Generate comprehensive summary report
# create_summary_report (analysis.visualizer) is called on the cleaned frame
# and its return value is printed as-is.
# NOTE(review): presumably it returns a ready-to-print string — confirm.
print("📝 Generating comprehensive analysis report...")
summary_report = create_summary_report(df_clean)
print(summary_report)

# Additional statistical analysis
# Each section below is skipped when its column is absent from df_clean.
print("\n📊 ADDITIONAL INSIGHTS:")
print("="*50)

# Most common publication decade
# Years outside [1800, 2024] are dropped before bucketing.
# NOTE(review): the 2024 upper bound is fixed, so later publication years are
# excluded — confirm whether it should follow the current year.
if 'published_year' in df_clean.columns:
    pub_years = df_clean['published_year'].dropna()
    pub_years = pub_years[(pub_years >= 1800) & (pub_years <= 2024)]
    if len(pub_years) > 0:
        # Floor each year to the start of its decade (1994 -> 1990).
        decades = (pub_years // 10) * 10
        # mode() of a non-empty, NaN-free Series always has at least one
        # entry, so no fallback is needed; on ties the first (smallest) mode
        # is used. int() keeps a float-typed year column from printing as
        # "1990.0s".
        most_common_decade = int(decades.mode().iloc[0])
        print(f"📅 Most productive decade: {most_common_decade}s")

# Rating insights
if 'average_rating' in df_clean.columns:
    ratings = df_clean['average_rating'].dropna()
    print(f"⭐ Average book rating: {ratings.mean():.2f}")
    print(f"⭐ Median book rating: {ratings.median():.2f}")

    # High-rated books threshold: the 90th percentile of the ratings.
    # The comparison is inclusive, so books tied at the cutoff are counted and
    # the total can exceed 10% of the rated books.
    high_rated_threshold = ratings.quantile(0.9)
    high_rated_count = len(ratings[ratings >= high_rated_threshold])
    print(f"⭐ Books in top 10% (≥{high_rated_threshold:.2f}): {high_rated_count}")

# Category insights
# A category counts as valid when it is non-null and not the 'Unknown' placeholder.
if 'categories' in df_clean.columns:
    categories = df_clean['categories'].dropna()
    books_with_categories = len(categories[categories != 'Unknown'])
    # The share is taken over all cleaned rows (including rows with no
    # category). Guard against an empty frame, which previously raised
    # ZeroDivisionError.
    total_books = len(df_clean)
    category_share = (books_with_categories / total_books * 100) if total_books else 0.0
    print(f"📚 Books with valid categories: {books_with_categories} ({category_share:.1f}%)")


In [None]:
# Rating Analysis Visualizations
# Runs on the cleaned frame; the plots are produced by the project helper
# plot_rating_analysis (analysis.visualizer).
print("📊 Creating rating analysis visualizations...")
plot_rating_analysis(df_clean)


In [None]:
# Publication Analysis Visualizations
# Runs on the cleaned frame; the plots are produced by the project helper
# plot_publication_analysis (analysis.visualizer).
print("📅 Creating publication trends visualizations...")
plot_publication_analysis(df_clean)


In [None]:
# Category and Genre Analysis
# Runs on the cleaned frame; the plots are produced by the project helper
# plot_category_analysis (analysis.visualizer).
print("📚 Creating category and genre analysis...")
plot_category_analysis(df_clean)


In [None]:
# Correlation Analysis
# Passes the whole cleaned frame to plot_correlation_matrix
# (analysis.visualizer).
# NOTE(review): presumably the helper selects the numeric columns itself — confirm.
print("🔗 Creating correlation matrix for numeric variables...")
plot_correlation_matrix(df_clean)


In [None]:
# Wrap-up cell: print the headline takeaways section by section.
# Only the book count, the unique-category count and the mean rating are
# computed from df_clean; every other bullet is fixed narrative text.
print("🎯 KEY INSIGHTS FOR SEMANTIC BOOK RECOMMENDER:")
print("=" * 60)
print()

# --- Data quality ---
print("📊 DATA QUALITY:")
print(f"   • Dataset contains {len(df_clean):,} books after cleaning")
print(
    "   • Data completeness varies by column",
    "   • Text fields (title, description, categories) are rich sources for semantic analysis",
    sep="\n",
)
print()

# --- Content ---
if 'categories' in df_clean.columns:
    categories = df_clean['categories'].dropna()
    # One entry may hold several labels separated by ',', ';' or '&'.
    # Normalise the separators to ';', split, and pool the trimmed, non-empty
    # labels; non-string entries and the 'Unknown' placeholder are skipped.
    unique_cats = {
        label.strip()
        for cat_string in categories
        if isinstance(cat_string, str) and cat_string != 'Unknown'
        for label in cat_string.replace(',', ';').replace('&', ';').split(';')
        if label.strip()
    }

    print("📚 CONTENT INSIGHTS:")
    print(f"   • {len(unique_cats)} unique categories identified")
    print(
        "   • Rich categorical information for semantic clustering",
        "   • Book descriptions provide detailed content for NLP analysis",
        sep="\n",
    )
    print()

# --- Ratings ---
# The section is printed only when both rating columns exist.
if all(col in df_clean.columns for col in ('average_rating', 'ratings_count')):
    ratings = df_clean['average_rating'].dropna()
    # Not read below; the assignment is kept so the notebook-level name exists.
    rating_counts = df_clean['ratings_count'].dropna()

    print("⭐ RATING INSIGHTS:")
    print(f"   • Average rating across all books: {ratings.mean():.2f}")
    # NOTE(review): the next two statements are fixed text, not derived from
    # the data in this cell — confirm they match the plots above.
    print(
        "   • Rating distribution is relatively normal",
        "   • Popular books tend to have more consistent ratings",
        "   • Can use ratings as quality indicators for recommendations",
        sep="\n",
    )
    print()

# --- Recommendations for the recommender design ---
print(
    "🤖 RECOMMENDATION SYSTEM RECOMMENDATIONS:",
    "   • Use book descriptions and categories for semantic similarity",
    "   • Implement hybrid approach: content-based + collaborative filtering",
    "   • Consider publication year trends for temporal recommendations",
    "   • Use author information for author-based similarity",
    "   • Leverage rating data for quality filtering",
    sep="\n",
)
print()

print("✅ Analysis Complete! Ready for model development.", "=" * 60, sep="\n")
