# Information Retrieval Project Demo

This notebook demonstrates the information retrieval project that involves web scraping, data preprocessing, clustering, classification, and visualization of data from two websites: books.toscrape.com and quotes.toscrape.com.

## 1. Setup and Imports

First, let's import the necessary modules and set up the environment.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any, Tuple
from sklearn.model_selection import train_test_split

# Import project modules
from src.scraper.books_scraper import BooksScraper
from src.scraper.quotes_scraper import QuotesScraper
from src.preprocessing.text_processor import TextProcessor
from src.analysis.clustering import ClusteringAnalyzer
from src.analysis.classification import ClassificationAnalyzer
from src.visualization.visualizer import Visualizer

# Render matplotlib figures inline in the notebook output.
# This is an IPython magic, so it is only valid inside a Jupyter/IPython kernel.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn below
plt.style.use('ggplot')

# Folders the rest of the notebook expects to exist (parents listed before
# their children)
directories = [
    "data", "data/books", "data/books/html",
    "data/quotes", "data/quotes/html",
    "results", "results/figures", "results/models",
]

# exist_ok=True makes this cell safe to re-run: existing folders are left alone
for folder in directories:
    os.makedirs(folder, exist_ok=True)

## 2. Web Scraping

Let's scrape data from the books and quotes websites. We'll scrape 10 pages from each website.

In [None]:
# Number of pages requested from each scraper
num_pages = 10

# --- Books -----------------------------------------------------------------
print("=== Scraping Books ===\n")
books_scraper = BooksScraper()
books_data = books_scraper.scrape_pages(num_pages)
print(f"Scraped {len(books_data)} books.\n")

# --- Quotes ----------------------------------------------------------------
print("\n=== Scraping Quotes ===\n")
quotes_scraper = QuotesScraper()
quotes_data = quotes_scraper.scrape_pages(num_pages)
print(f"Scraped {len(quotes_data)} quotes.\n")

# Show the first scraped record of each kind as a quick sanity check
for header, records in (
    ("\nSample book data:", books_data),
    ("\nSample quote data:", quotes_data),
):
    print(header)
    print(records[0])

## 3. Data Preprocessing

Now, let's preprocess the scraped data to clean and standardize the text.

In [None]:
# One TextProcessor instance is shared by this cell and the later
# vectorization steps
text_processor = TextProcessor()

# Books: the text to clean lives under the 'description' key
print("=== Preprocessing Books Data ===\n")
processed_books = text_processor.preprocess_data(books_data, 'description')
print(f"Preprocessed {len(processed_books)} books.\n")

# Quotes: the text to clean lives under the 'text' key
print("\n=== Preprocessing Quotes Data ===\n")
processed_quotes = text_processor.preprocess_data(quotes_data, 'text')
print(f"Preprocessed {len(processed_quotes)} quotes.\n")

# Show the first record of each kind before and after cleaning
print("\nSample preprocessed book description:")
first_book = processed_books[0]
print("Original:", first_book['description'])
print("Processed:", first_book['processed_description'])

print("\nSample preprocessed quote text:")
first_quote = processed_quotes[0]
print("Original:", first_quote['text'])
print("Processed:", first_quote['processed_text'])

## 4. Clustering Analysis

Let's perform clustering on the books and quotes data to group similar items together.

### 4.1 Clustering Books

In [None]:
# Cleaned description of every book, in the same order as processed_books
descriptions = [entry['processed_description'] for entry in processed_books]

# TF-IDF matrix for the descriptions, plus the vectorizer that produced it
X_books, vectorizer_books = text_processor.vectorize_tfidf(descriptions)

# Let a default analyzer suggest a cluster count (max_clusters=10)
clustering = ClusteringAnalyzer()
optimal_clusters, inertia_values = clustering.get_optimal_clusters(X_books, max_clusters=10)
print(f"Optimal number of clusters for books: {optimal_clusters}\n")

# Elbow curve. The x axis starts at 2, i.e. this assumes the first inertia
# value belongs to k=2 (TODO confirm against get_optimal_clusters).
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
candidate_ks = range(2, len(inertia_values) + 2)
elbow_ax.plot(candidate_ks, inertia_values, marker='o')
elbow_ax.set_title('Elbow Method for Optimal k (Books)')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')
# Mark the selected k with a dashed line and a label just to its right
elbow_ax.axvline(x=optimal_clusters, color='red', linestyle='--')
elbow_ax.text(optimal_clusters + 0.1, max(inertia_values) * 0.9, f'Optimal k = {optimal_clusters}')
plt.show()

# Cluster the books with a fresh analyzer configured for the selected k
book_clustering = ClusteringAnalyzer(n_clusters=optimal_clusters)
book_clusters = book_clustering.cluster_kmeans(X_books)
print(f"Clustered {len(book_clusters)} books into {len(set(book_clusters))} clusters.\n")

# Report how many books ended up in each cluster
distribution = book_clustering.get_cluster_distribution()
print("Cluster distribution:")
for cluster, count in distribution.items():
    print(f"Cluster {cluster}: {count} books")

# Plot of the clustered books produced by the analyzer
fig = book_clustering.visualize_clusters(X_books, title="Book Clusters")
plt.show()

### 4.2 Clustering Quotes

In [None]:
# Cleaned text of every quote, in the same order as processed_quotes
quotes_text = [entry['processed_text'] for entry in processed_quotes]

# TF-IDF matrix for the quotes, plus the vectorizer that produced it
X_quotes, vectorizer_quotes = text_processor.vectorize_tfidf(quotes_text)

# Let a default analyzer suggest a cluster count (max_clusters=10)
clustering = ClusteringAnalyzer()
optimal_clusters, inertia_values = clustering.get_optimal_clusters(X_quotes, max_clusters=10)
print(f"Optimal number of clusters for quotes: {optimal_clusters}\n")

# Elbow curve. The x axis starts at 2, i.e. this assumes the first inertia
# value belongs to k=2 (TODO confirm against get_optimal_clusters).
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
candidate_ks = range(2, len(inertia_values) + 2)
elbow_ax.plot(candidate_ks, inertia_values, marker='o')
elbow_ax.set_title('Elbow Method for Optimal k (Quotes)')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')
# Mark the selected k with a dashed line and a label just to its right
elbow_ax.axvline(x=optimal_clusters, color='red', linestyle='--')
elbow_ax.text(optimal_clusters + 0.1, max(inertia_values) * 0.9, f'Optimal k = {optimal_clusters}')
plt.show()

# Cluster the quotes with a fresh analyzer configured for the selected k
quote_clustering = ClusteringAnalyzer(n_clusters=optimal_clusters)
quote_clusters = quote_clustering.cluster_kmeans(X_quotes)
print(f"Clustered {len(quote_clusters)} quotes into {len(set(quote_clusters))} clusters.\n")

# Report how many quotes ended up in each cluster
distribution = quote_clustering.get_cluster_distribution()
print("Cluster distribution:")
for cluster, count in distribution.items():
    print(f"Cluster {cluster}: {count} quotes")

# Plot of the clustered quotes produced by the analyzer
fig = quote_clustering.visualize_clusters(X_quotes, title="Quote Clusters")
plt.show()

## 5. Classification Analysis

Now, let's train classifiers on the preprocessed text: books are classified by their category, and quotes by their first tag.

### 5.1 Classifying Books

In [None]:
# Features are the cleaned descriptions; the target is each book's category
descriptions = [book['processed_description'] for book in processed_books]
categories = [book['category'] for book in processed_books]

# Vectorize descriptions
X_books, vectorizer_books = text_processor.vectorize_tfidf(descriptions)

# A stratified split needs at least two samples of every class, and both
# partitions must be able to hold one sample per class; otherwise
# train_test_split raises ValueError. Scraped categories can easily contain
# singletons, so fall back to a plain random split when stratification is
# impossible instead of crashing.
book_test_size = 0.2
book_label_counts = pd.Series(categories).value_counts(dropna=False)
book_n_test = int(np.ceil(book_test_size * len(categories)))
book_n_train = len(categories) - book_n_test
book_can_stratify = (
    len(book_label_counts) > 0
    and book_label_counts.min() >= 2
    and min(book_n_train, book_n_test) >= len(book_label_counts)
)
if not book_can_stratify:
    print("Note: some book categories are too rare for a stratified split; using a random split.\n")

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_books,
    categories,
    test_size=book_test_size,
    random_state=42,
    stratify=categories if book_can_stratify else None,
)

# Create classification analyzer
book_classifier = ClassificationAnalyzer()

# Compare candidate models on the training portion only, so the held-out
# test rows play no part in choosing the model that is later scored on them.
book_metrics = book_classifier.compare_models(X_train, y_train)
print("Model comparison:")
for model_name, model_metrics in book_metrics.items():
    print(f"{model_name}:")
    for metric_name, metric_value in model_metrics.items():
        print(f"  {metric_name}: {metric_value:.3f}")

# Train the best model (highest F1 score in the comparison)
best_model = max(book_metrics, key=lambda name: book_metrics[name]['f1'])
print(f"\nBest model: {best_model}\n")
book_classifier.train(X_train, y_train, model_name=best_model)

# Evaluate on the held-out test set
test_metrics = book_classifier.evaluate(X_test, y_test)
print("Test set evaluation:")
for metric_name, metric_value in test_metrics.items():
    print(f"  {metric_name}: {metric_value:.3f}")

# Plot confusion matrix
fig = book_classifier.plot_confusion_matrix(X_test, y_test, title="Book Classification Confusion Matrix")
plt.show()

### 5.2 Classifying Quotes

In [None]:
# Features are the cleaned quote texts
quotes_text = [quote['processed_text'] for quote in processed_quotes]

# Use the first tag as the target class; quotes without tags get 'unknown'
primary_tags = [quote['tags'][0] if quote['tags'] else 'unknown' for quote in processed_quotes]

# Vectorize quotes
X_quotes, vectorizer_quotes = text_processor.vectorize_tfidf(quotes_text)

# A stratified split needs at least two samples of every class, and both
# partitions must be able to hold one sample per class; otherwise
# train_test_split raises ValueError. First tags are frequently unique to a
# single quote, so fall back to a plain random split when stratification is
# impossible instead of crashing.
quote_test_size = 0.2
quote_label_counts = pd.Series(primary_tags).value_counts(dropna=False)
quote_n_test = int(np.ceil(quote_test_size * len(primary_tags)))
quote_n_train = len(primary_tags) - quote_n_test
quote_can_stratify = (
    len(quote_label_counts) > 0
    and quote_label_counts.min() >= 2
    and min(quote_n_train, quote_n_test) >= len(quote_label_counts)
)
if not quote_can_stratify:
    print("Note: some quote tags are too rare for a stratified split; using a random split.\n")

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_quotes,
    primary_tags,
    test_size=quote_test_size,
    random_state=42,
    stratify=primary_tags if quote_can_stratify else None,
)

# Create classification analyzer
quote_classifier = ClassificationAnalyzer()

# Compare candidate models on the training portion only, so the held-out
# test rows play no part in choosing the model that is later scored on them.
quote_metrics = quote_classifier.compare_models(X_train, y_train)
print("Model comparison:")
for model_name, model_metrics in quote_metrics.items():
    print(f"{model_name}:")
    for metric_name, metric_value in model_metrics.items():
        print(f"  {metric_name}: {metric_value:.3f}")

# Train the best model (highest F1 score in the comparison)
best_model = max(quote_metrics, key=lambda name: quote_metrics[name]['f1'])
print(f"\nBest model: {best_model}\n")
quote_classifier.train(X_train, y_train, model_name=best_model)

# Evaluate on the held-out test set
test_metrics = quote_classifier.evaluate(X_test, y_test)
print("Test set evaluation:")
for metric_name, metric_value in test_metrics.items():
    print(f"  {metric_name}: {metric_value:.3f}")

# Plot confusion matrix
fig = quote_classifier.plot_confusion_matrix(X_test, y_test, title="Quote Classification Confusion Matrix")
plt.show()

## 6. Visualizations

Let's create visualizations to better understand the data and analysis results.

### 6.1 Word Clouds for Book Clusters

In [None]:
# Visualizer instance used for the word clouds
visualizer = Visualizer()

# The word clouds are built from the original (unprocessed) descriptions,
# grouped by the cluster label assigned to each book
book_descriptions = [entry['description'] for entry in processed_books]
book_wordclouds = visualizer.create_class_wordclouds(
    book_descriptions, book_clusters, title_prefix="Book Cluster"
)

# Re-draw every cloud on its own 12x8 figure. The pixel data is taken from the
# first image on the first axes of the figure returned for that cluster.
for cluster, cloud_fig in book_wordclouds.items():
    _, cloud_ax = plt.subplots(figsize=(12, 8))
    cloud_ax.imshow(cloud_fig.axes[0].images[0].get_array())
    cloud_ax.set_title(f"Word Cloud for Book Cluster {cluster}")
    cloud_ax.axis('off')
    plt.show()

### 6.2 Word Clouds for Quote Clusters

In [None]:
# The word clouds are built from the original (unprocessed) quote texts,
# grouped by the cluster label assigned to each quote
quote_texts = [entry['text'] for entry in processed_quotes]
quote_wordclouds = visualizer.create_class_wordclouds(
    quote_texts, quote_clusters, title_prefix="Quote Cluster"
)

# Re-draw every cloud on its own 12x8 figure. The pixel data is taken from the
# first image on the first axes of the figure returned for that cluster.
for cluster, cloud_fig in quote_wordclouds.items():
    _, cloud_ax = plt.subplots(figsize=(12, 8))
    cloud_ax.imshow(cloud_fig.axes[0].images[0].get_array())
    cloud_ax.set_title(f"Word Cloud for Quote Cluster {cluster}")
    cloud_ax.axis('off')
    plt.show()

### 6.3 Comparing Clustering and Classification Results

In [None]:
# Ground-truth categories used to judge the book clusters
book_categories = [book['category'] for book in processed_books]

# Calculate purity score for book clusters
book_purity = book_clustering.calculate_purity(book_categories)
print(f"Book clustering purity score: {book_purity:.3f}")

# Calculate Rand Index for book clusters
book_rand_index = book_clustering.calculate_rand_index(book_categories)
print(f"Book clustering Rand Index: {book_rand_index:.3f}")

# `test_metrics`, `X_test` and `y_test` were last assigned by the quote
# classification cell, so by now they describe the quote classifier, not the
# book classifier. Rebuild the book hold-out split from the same inputs,
# test_size and random_state used when the book classifier was trained (the
# split is deterministic, so this yields the same test rows) and evaluate the
# book model on it.
book_eval_test_size = 0.2
book_eval_label_counts = pd.Series(book_categories).value_counts(dropna=False)
book_eval_n_test = int(np.ceil(book_eval_test_size * len(book_categories)))
book_eval_n_train = len(book_categories) - book_eval_n_test
# Stratify only when train_test_split can honour it: every class needs at
# least two samples and each partition must fit one sample per class.
book_eval_can_stratify = (
    len(book_eval_label_counts) > 0
    and book_eval_label_counts.min() >= 2
    and min(book_eval_n_train, book_eval_n_test) >= len(book_eval_label_counts)
)
X_books_train, X_books_test, y_books_train, y_books_test = train_test_split(
    X_books,
    book_categories,
    test_size=book_eval_test_size,
    random_state=42,
    stratify=book_categories if book_eval_can_stratify else None,
)
book_test_metrics = book_classifier.evaluate(X_books_test, y_books_test)

# Compare with classification metrics
print("\nBook classification metrics:")
for metric_name, metric_value in book_test_metrics.items():
    print(f"  {metric_name}: {metric_value:.3f}")

# Pair every book's category with its cluster label
contingency_df = pd.DataFrame({
    'Category': book_categories,
    'Cluster': book_clusters
})

# Count books per (category, cluster) combination
cross_tab = pd.crosstab(contingency_df['Category'], contingency_df['Cluster'])

# Plot the counts as an annotated heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(cross_tab, annot=True, cmap='Blues', fmt='d')
plt.title('Relationship between Book Categories and Clusters')
plt.xlabel('Cluster')
plt.ylabel('Category')
plt.show()

## 7. Saving Results

Finally, let's save the results to CSV files for further analysis.

In [None]:
# Attach the cluster label (as a plain int) to every book record, then to
# every quote record; the records are updated in place
for records, labels in (
    (processed_books, book_clusters),
    (processed_quotes, quote_clusters),
):
    for position, record in enumerate(records):
        record['cluster'] = int(labels[position])

# One row per record, one column per key
books_df = pd.DataFrame(processed_books)
quotes_df = pd.DataFrame(processed_quotes)

# Write both tables to the results folder without the index column
books_df.to_csv("results/books_results.csv", index=False)
quotes_df.to_csv("results/quotes_results.csv", index=False)

print("Results saved successfully.")

# Preview the first rows of the key columns of each table
for heading, frame, columns in (
    ("\nSample of books results:", books_df, ['title', 'category', 'cluster']),
    ("\nSample of quotes results:", quotes_df, ['text', 'author', 'tags', 'cluster']),
):
    print(heading)
    print(frame[columns].head())

## 8. Conclusion

In this notebook, we demonstrated the information retrieval project that involved:

1. **Web Scraping**: We scraped data from books.toscrape.com and quotes.toscrape.com.
2. **Data Preprocessing**: We cleaned and standardized the text data.
3. **Clustering**: We grouped similar books and quotes together using K-means clustering.
4. **Classification**: We categorized books and quotes based on their content.
5. **Evaluation**: We evaluated the performance of our clustering and classification models.
6. **Visualization**: We created word clouds and other visualizations to better understand the data.

The project demonstrates how information retrieval techniques can be used to organize and analyze unstructured text data from the web.