In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import random
import re
import seaborn as sns
import nltk
import ipywidgets as widgets
import spacy
import string
import os
import math

from nltk import ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from scipy.optimize import curve_fit
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from IPython.display import display, clear_output
from textstat import flesch_reading_ease, gunning_fog, smog_index
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords

# Download the NLTK data used by the tokenizers, POS tagger, NE chunker,
# VADER sentiment analyzer and stop-word list in this notebook.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',                       # tokenizer tables looked up by newer NLTK releases
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',  # tagger model name used by newer NLTK releases
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',           # chunker model name used by newer NLTK releases
    'words',                           # word list required by ne_chunk; was missing
    'vader_lexicon',
    'stopwords',
)
for resource in NLTK_RESOURCES:
    # quiet=True keeps the cell output clean.
    # NOTE(review): an identifier unknown to the download index is presumably
    # skipped without raising - confirm against the installed NLTK version.
    nltk.download(resource, quiet=True)

#paulbunyan = 'https://www.gutenberg.org/cache/epub/70060/pg70060.txt'
#Americanfairytales = 'https://www.gutenberg.org/ebooks/4357.txt.utf-8'
#merryadventuresofRobinHood = 'https://www.gutenberg.org/ebooks/10148.txt.utf-8'
#legendsofkingarthurandhisknight = 'https://www.gutenberg.org/ebooks/12753.txt.utf-8'
#mythsandlegendsofancientgreece = 'https://www.gutenberg.org/ebooks/22381.txt.utf-8'


#shortlist = [paulbunyan,Americanfairytales,merryadventuresofRobinHood,legendsofkingarthurandhisknight,mythsandlegendsofancientgreece]


In [None]:
def fetch_book(url, timeout=30):
    """Download a plain-text book and return its decoded contents.

    Parameters
    ----------
    url : str
        Address of the text file to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without it a stalled connection blocks the
        notebook indefinitely.

    Returns
    -------
    str
        The response body decoded to text.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status. This keeps an HTML
        error page from being analysed as if it were a book.
    requests.exceptions.RequestException
        For timeouts and connection failures raised by ``requests``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # When a text response does not declare a charset, requests falls back to
    # ISO-8859-1, which garbles UTF-8 punctuation. Detect the encoding from
    # the payload in that case.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        response.encoding = response.apparent_encoding

    return response.text

def strip_gutenberg_text(text):
    """Return the body of a Project Gutenberg file without header and footer.

    The first start marker found (in the listed priority order) removes
    everything up to and including the line that contains it. The first end
    marker found removes everything from the marker onwards. Text without any
    marker is returned unchanged apart from surrounding whitespace.

    Parameters
    ----------
    text : str
        Full contents of a Gutenberg plain-text file.

    Returns
    -------
    str
        The book body, stripped of leading/trailing whitespace.
    """
    start_markers = (
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "*** START OF THE PROJECT GUTENBERG EBOOK",
        "*END*THE SMALL PRINT!",
    )
    end_markers = (
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "End of Project Gutenberg's",
    )

    for marker in start_markers:
        _, found, remainder = text.partition(marker)
        if found:
            # The markers are only prefixes of the marker line: the title and
            # closing "***" follow on the same line. Drop the rest of that
            # line so it does not end up as the first words of the book.
            _, _, text = remainder.partition("\n")
            break

    for marker in end_markers:
        body, found, _ = text.partition(marker)
        if found:
            text = body
            break

    return text.strip()

def extract_title_author(text):
    """Read the 'Title:' and 'Author:' header fields from a Gutenberg file.

    Each field is the remainder of the first line containing its label.
    A missing field yields "Unknown Title" / "Unknown Author".

    Returns
    -------
    tuple of (str, str)
        (title, author), each stripped of surrounding whitespace.
    """
    lookups = (
        (r"Title: (.+)", "Unknown Title"),
        (r"Author: (.+)", "Unknown Author"),
    )

    found = []
    for pattern, fallback in lookups:
        hit = re.search(pattern, text)
        value = fallback if hit is None else hit.group(1)
        found.append(value.strip())

    title, author = found
    return title, author

def process_book(url):
    """Download one book and bundle its metadata with the cleaned text.

    Returns
    -------
    dict
        Keys 'url', 'title', 'author' and 'text'.
    """
    downloaded = fetch_book(url)

    book = {'url': url}
    # Title and author are read from the full download, before the text is
    # passed through strip_gutenberg_text.
    book['title'], book['author'] = extract_title_author(downloaded)
    book['text'] = strip_gutenberg_text(downloaded)
    return book

# Plain-text sources for the first comparison set
book_urls = [
    'https://www.gutenberg.org/cache/epub/70060/pg70060.txt',
    'https://www.gutenberg.org/ebooks/4357.txt.utf-8',
    'https://www.gutenberg.org/ebooks/10148.txt.utf-8',
    'https://www.gutenberg.org/ebooks/12753.txt.utf-8',
    'https://www.gutenberg.org/ebooks/22381.txt.utf-8'
]

# Fetch and clean every book. A failure is reported and the loop moves on,
# so one bad download does not stop the rest.
processed_books = []
for url in book_urls:
    try:
        book_data = process_book(url)
        processed_books.append(book_data)
        print(f"Processed: {book_data['title']} by {book_data['author']}")
    except Exception as exc:
        print(f"Error processing {url}: {str(exc)}")

# Now 'processed_books' contains a list of dictionaries, each with 'url', 'title', 'author', and 'text' keys
# Each book's text is kept separate in its own dictionary

# Example of how to access the text of a specific book:
#if processed_books:
#    first_book = processed_books[0]
#    print(f"\nFirst few characters of '{first_book['title']}' by {first_book['author']}:")
#    print(first_book['text'][:200])  # Print first 200 characters

In [None]:
def generate_and_save_wordcloud(text, title, output_dir):
    """Build a word cloud for one book, save it as a PNG and show it inline.

    Parameters
    ----------
    text : str
        Book text to visualise.
    title : str
        Book title; used for the plot heading and the output file name.
    output_dir : str
        Directory for the PNG; created if it does not exist.

    Returns
    -------
    None
        The image is written to ``<output_dir>/<title>_wordcloud.png`` (spaces
        and characters that are unsafe in file names become underscores) and
        displayed with matplotlib.
    """
    # English stop words plus every single punctuation character
    stop_words = set(stopwords.words('english'))
    stop_words.update(string.punctuation)

    # Keep lower-cased tokens that are neither stop words nor made up purely
    # of punctuation (e.g. "--"). The previous `word not in string.punctuation`
    # was a substring test against the punctuation string and let such
    # multi-character tokens through.
    words = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        if all(ch in string.punctuation for ch in token):
            continue
        words.append(lowered)
    cleaned_text = ' '.join(words)

    # Create and generate a word cloud image
    wordcloud = WordCloud(width=800, height=400, background_color='white',
                          stopwords=stop_words, min_font_size=10).generate(cleaned_text)

    os.makedirs(output_dir, exist_ok=True)

    # Titles can contain path separators or characters some file systems
    # reject (':', '?', '"', ...); replace them so the save cannot fail or
    # land in an unintended directory.
    safe_title = re.sub(r'[<>:"/\\|?*\x00-\x1f]+', '_', title.replace(' ', '_'))
    filename = f"{safe_title}_wordcloud.png"
    filepath = os.path.join(output_dir, filename)
    wordcloud.to_file(filepath)
    print(f"Word cloud saved as: {filepath}")

    # Display the generated image in the notebook
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Word Cloud: {title}")
    plt.tight_layout(pad=0)
    plt.show()
    plt.close()


# Render one word cloud per processed book; errors are reported per book
print("\nGenerating word clouds:")
output_directory = "wordclouds"
for book in processed_books:
    try:
        print(f"Generating word cloud for: {book['title']}")
        generate_and_save_wordcloud(
            text=book['text'],
            title=book['title'],
            output_dir=output_directory,
        )
    except Exception as exc:
        print(f"Error generating word cloud for {book['title']}: {str(exc)}")

print("\nAll word clouds generated and saved.")

In [None]:
def analyze_book(book):
    """Compute lexical, readability, sentiment and entity statistics for a book.

    Parameters
    ----------
    book : dict
        Needs the keys 'title' and 'text'.

    Returns
    -------
    dict
        'title', 'flesch_score', 'gunning_fog_score', 'smog_score',
        'avg_sentence_length', 'sentence_length_variation',
        'type_token_ratio', 'pos_distribution', 'sentiment_compound',
        'dialogue_ratio', 'word_frequency' (top 20 content words) and
        'named_entities' (top 10). Ratios and averages are 0.0 for a book
        with no usable text instead of raising ZeroDivisionError.
    """
    raw_text = book['text']
    sentences = sent_tokenize(raw_text)

    # Tokenize every sentence once and reuse the result for sentence lengths,
    # the dialogue estimate, tagging and the content-word list.
    sentence_tokens = [word_tokenize(sent) for sent in sentences]
    sentence_lengths = [len(tokens) for tokens in sentence_tokens]
    all_tokens = [token for tokens in sentence_tokens for token in tokens]
    total_tokens = len(all_tokens)

    stop_words = set(stopwords.words('english'))

    def is_content_word(token):
        # Alphabetic and not an English stop word. isalpha() already rules
        # out punctuation tokens.
        return token.isalpha() and token.lower() not in stop_words

    words = [token.lower() for token in all_tokens if is_content_word(token)]

    # Word frequency
    freq_dist = FreqDist(words)
    top_words = freq_dist.most_common(20)

    # Readability is computed on the original text. These formulas depend on
    # sentence boundaries and function words; feeding them the stop-word-free,
    # punctuation-free string (as before) scores one giant sentence.
    if raw_text.strip():
        flesch_score = flesch_reading_ease(raw_text)
        gunning_fog_score = gunning_fog(raw_text)
        smog_score = smog_index(raw_text)
    else:
        flesch_score = gunning_fog_score = smog_score = 0.0

    # Average sentence length and its spread (in tokens)
    avg_sentence_length = np.mean(sentence_lengths) if sentence_lengths else 0.0
    sentence_length_variation = np.std(sentence_lengths) if sentence_lengths else 0.0

    # Type-Token Ratio over content words
    type_token_ratio = len(set(words)) / len(words) if words else 0.0

    # Tag the original token stream so the tagger sees real context and
    # capitalisation, then count tags for content words only.
    tagged_tokens = pos_tag(all_tokens) if all_tokens else []
    pos_counts = FreqDist(tag for token, tag in tagged_tokens if is_content_word(token))

    # Sentiment scores.
    # NOTE: SentimentIntensityAnalyzer is imported from both nltk.sentiment
    # and vaderSentiment at the top of the file; the later import
    # (vaderSentiment) is the one bound to this name.
    sia = SentimentIntensityAnalyzer()
    compound_scores = [sia.polarity_scores(sent)['compound'] for sent in sentences]
    avg_sentiment = np.mean(compound_scores) if compound_scores else 0.0

    # Dialogue ratio (rough estimate): share of tokens in sentences with at
    # least two quotation marks. Straight and curly double quotes both count.
    # Numerator and denominator now use the same token population, so the
    # value stays within [0, 1].
    quote_chars = '"\u201c\u201d'
    dialogue_tokens = sum(
        length
        for sent, length in zip(sentences, sentence_lengths)
        if sum(sent.count(quote) for quote in quote_chars) >= 2
    )
    dialogue_ratio = dialogue_tokens / total_tokens if total_tokens else 0.0

    # Named entities come from the cased, tagged original tokens; the
    # lower-cased content-word string used before hides capitalisation.
    chunk_tree = ne_chunk(tagged_tokens) if tagged_tokens else []
    named_entities = [" ".join(word for word, pos in subtree.leaves())
                      for subtree in chunk_tree
                      if isinstance(subtree, nltk.Tree)]
    top_named_entities = FreqDist(named_entities).most_common(10)

    return {
        'title': book['title'],
        'flesch_score': flesch_score,
        'gunning_fog_score': gunning_fog_score,
        'smog_score': smog_score,
        'avg_sentence_length': avg_sentence_length,
        'sentence_length_variation': sentence_length_variation,
        'type_token_ratio': type_token_ratio,
        'pos_distribution': dict(pos_counts),
        'sentiment_compound': avg_sentiment,
        'dialogue_ratio': dialogue_ratio,
        'word_frequency': dict(top_words),
        'named_entities': dict(top_named_entities)
    }

# Run the full analysis on every processed book
book_analyses = list(map(analyze_book, processed_books))

# Summary table: one row per book, one column per scalar metric.
# Maps each displayed column name to the analysis key it is read from.
summary_columns = {
    'Title': 'title',
    'Flesch Reading Ease': 'flesch_score',
    'Gunning Fog Index': 'gunning_fog_score',
    'SMOG Index': 'smog_score',
    'Avg Sentence Length': 'avg_sentence_length',
    'Sentence Length Variation': 'sentence_length_variation',
    'Type-Token Ratio': 'type_token_ratio',
    'Avg Sentiment': 'sentiment_compound',
    'Dialogue Ratio': 'dialogue_ratio',
}
df = pd.DataFrame([
    {column: analysis[key] for column, key in summary_columns.items()}
    for analysis in book_analyses
])

# Display the summary comparison table
print("Summary Comparison Table:")
display(df)

# Detailed per-book report: scalar metrics followed by the three count tables
def _print_counts(heading, counts):
    """Print a heading, then one indented 'label: count' line per entry."""
    print(heading)
    for label, count in counts.items():
        print(f"  {label}: {count}")


print("\nDetailed Metrics for Each Book:")
for analysis in book_analyses:
    print(f"\nTitle: {analysis['title']}")
    print(f"Flesch Reading Ease: {analysis['flesch_score']:.2f}")
    print(f"Gunning Fog Index: {analysis['gunning_fog_score']:.2f}")
    print(f"SMOG Index: {analysis['smog_score']:.2f}")
    print(f"Average Sentence Length: {analysis['avg_sentence_length']:.2f}")
    print(f"Sentence Length Variation: {analysis['sentence_length_variation']:.2f}")
    print(f"Type-Token Ratio: {analysis['type_token_ratio']:.4f}")
    print(f"Dialogue Ratio: {analysis['dialogue_ratio']:.4f}")
    print(f"Sentiment Compound Score: {analysis['sentiment_compound']:.4f}")
    _print_counts("\nTop 20 Most Frequent Words:", analysis['word_frequency'])
    _print_counts("\nPart-of-Speech Distribution:", analysis['pos_distribution'])
    _print_counts("\nTop 10 Named Entities:", analysis['named_entities'])
    print("-" * 50)

# Visualizations

# 1. Word Frequency
# One bar chart per book on a grid sized from the number of books. The grid
# used to be hard-coded as plt.subplot(2, 3, i), which ignored
# num_rows/num_cols and raised once more than six books were analysed.
# Three columns with 5 inches per row reproduces the old 2x3 / (15, 10)
# layout for up to six books.
num_books = len(book_analyses)
num_cols = 3
num_rows = max(1, math.ceil(num_books / num_cols))  # at least one row so the figure has a height

plt.figure(figsize=(15, 5 * num_rows))
for i, analysis in enumerate(book_analyses, 1):
    plt.subplot(num_rows, num_cols, i)
    # word_frequency holds the top 20 words in descending order; plot the first 10
    words = list(analysis['word_frequency'].keys())[:10]
    counts = list(analysis['word_frequency'].values())[:10]
    plt.bar(words, counts)
    plt.title(f"Top 10 Words in {analysis['title']}")
    plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 2. Readability Scores
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Flesch Reading Ease', y='Gunning Fog Index', hue='Title', ax=ax)
ax.set_title('Readability Scores Comparison')
plt.show()

# 3. Sentence Length and Variation
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Avg Sentence Length', y='Sentence Length Variation', hue='Title', ax=ax)
ax.set_title('Sentence Length and Variation')
plt.show()

# 4. POS Distribution
# One row per book, one column per tag; each row is divided by its own total
# so the heatmap shows shares rather than raw counts.
pos_df = pd.DataFrame(
    [analysis['pos_distribution'] for analysis in book_analyses],
    index=[analysis['title'] for analysis in book_analyses],
)
row_totals = pos_df.sum(axis=1)
pos_df = pos_df.div(row_totals, axis=0)
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pos_df, annot=True, cmap='YlGnBu', fmt='.2f', ax=ax)
ax.set_title('POS Distribution Comparison')
plt.show()

# 5. Sentiment and Dialogue Ratio
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Avg Sentiment', y='Dialogue Ratio', hue='Title', ax=ax)
ax.set_title('Sentiment vs Dialogue Ratio')
plt.show()

print("Analysis complete. Check the visualizations above for comparisons.")

In [None]:
#talltalesfromTexas = 'https://www.gutenberg.org/ebooks/71089.txt.utf-8'
#mythsandfolktalesoftheRussians = 'https://www.gutenberg.org/ebooks/50011.txt.utf-8'
#talesofCapeCod = 'https://www.gutenberg.org/ebooks/69718.txt.utf-8'
#surprisingadventuresofBaronMunchausen = 'https://www.gutenberg.org/ebooks/3154.txt.utf-8'
#ConnecticutyankeeinKingArthurscourt = 'https://www.gutenberg.org/ebooks/86.txt.utf-8'
#adventuresofHuckleberryFinn = 'https://www.gutenberg.org/ebooks/76.txt.utf-8'
#casebookofSherlockHolmes = 'https://www.gutenberg.org/ebooks/69700.txt.utf-8'
#lostworld = 'https://www.gutenberg.org/ebooks/139.txt.utf-8'
#wonderfulwiardofOz = 'https://www.gutenberg.org/ebooks/55.txt.utf-8'
#phantastesafaerieromanceformenandwomen = 'https://www.gutenberg.org/ebooks/325.txt.utf-8'
#aesopsfables = 'https://www.gutenberg.org/ebooks/11339.txt.utf-8'
#manwhowasthursdayanightmare = 'https://www.gutenberg.org/ebooks/1695.txt.utf-8'
#romeoandjuliet = 'https://www.gutenberg.org/ebooks/1513.txt.utf-8'
#twentythousandleaguesunderthesea = 'https://www.gutenberg.org/ebooks/164.txt.utf-8'
#aroundtheworldin80days = 'https://www.gutenberg.org/ebooks/103.txt.utf-8'
#journeytothecenteroftheearth = 'https://www.gutenberg.org/ebooks/18857.txt.utf-8'
#warandpeace = 'https://www.gutenberg.org/ebooks/2600.txt.utf-8'
#annakarenina = 'https://www.gutenberg.org/ebooks/1399.txt.utf-8'
#crimeandpunishment = 'https://www.gutenberg.org/ebooks/2554.txt.utf-8'
#brotherskaramazov = 'https://www.gutenberg.org/ebooks/28054.txt.utf-8'
#lesmiserables = 'https://www.gutenberg.org/ebooks/135.txt.utf-8'
#frankenstein = 'https://www.gutenberg.org/ebooks/84.txt.utf-8'
#dracula = 'https://www.gutenberg.org/ebooks/345.txt.utf-8'
#pilgrimsprogress = 'https://www.gutenberg.org/ebooks/131.txt.utf-8'

#longlist = [paulbunyan,talltalesfromTexas,mythsandfolktalesoftheRussians,talesofCapeCod,surprisingadventuresofBaronMunchausen,Americanfairytales,wonderfulwiardofOz,
#            merryadventuresofRobinHood,ConnecticutyankeeinKingArthurscourt,adventuresofHuckleberryFinn,casebookofSherlockHolmes,lostworld,legendsofkingarthurandhisknight,
#            mythsandfolktalesoftheRussians,phantastesafaerieromanceformenandwomen,aesopsfables,manwhowasthursdayanightmare,romeoandjuliet,twentythousandleaguesunderthesea,
#            aroundtheworldin80days,journeytothecenteroftheearth,warandpeace,annakarenina,crimeandpunishment,brotherskaramazov,lesmiserables,frankenstein,dracula,pilgrimsprogress]

In [None]:
# Extended run: apply the same pipeline to the long list of books and compare all books together

# Long list of book URLs. The first five entries are the same books as in
# `book_urls`; they are skipped below when they were already processed.
new_book_urls = [
    'https://www.gutenberg.org/cache/epub/70060/pg70060.txt',
    'https://www.gutenberg.org/ebooks/4357.txt.utf-8',
    'https://www.gutenberg.org/ebooks/10148.txt.utf-8',
    'https://www.gutenberg.org/ebooks/12753.txt.utf-8',
    'https://www.gutenberg.org/ebooks/22381.txt.utf-8',
    'https://www.gutenberg.org/ebooks/71089.txt.utf-8',
    'https://www.gutenberg.org/ebooks/50011.txt.utf-8',
    'https://www.gutenberg.org/ebooks/69718.txt.utf-8',
    'https://www.gutenberg.org/ebooks/3154.txt.utf-8',
    'https://www.gutenberg.org/ebooks/86.txt.utf-8',
    'https://www.gutenberg.org/ebooks/76.txt.utf-8',
    'https://www.gutenberg.org/ebooks/69700.txt.utf-8',
    'https://www.gutenberg.org/ebooks/139.txt.utf-8',
    'https://www.gutenberg.org/ebooks/55.txt.utf-8',
    'https://www.gutenberg.org/ebooks/325.txt.utf-8',
    'https://www.gutenberg.org/ebooks/11339.txt.utf-8',
    'https://www.gutenberg.org/ebooks/1695.txt.utf-8',
    'https://www.gutenberg.org/ebooks/1513.txt.utf-8',
    'https://www.gutenberg.org/ebooks/164.txt.utf-8',
    'https://www.gutenberg.org/ebooks/103.txt.utf-8',
    'https://www.gutenberg.org/ebooks/18857.txt.utf-8',
    'https://www.gutenberg.org/ebooks/2600.txt.utf-8',
    'https://www.gutenberg.org/ebooks/1399.txt.utf-8',
    'https://www.gutenberg.org/ebooks/2554.txt.utf-8',
    'https://www.gutenberg.org/ebooks/28054.txt.utf-8',
    'https://www.gutenberg.org/ebooks/135.txt.utf-8',
    'https://www.gutenberg.org/ebooks/84.txt.utf-8',
    'https://www.gutenberg.org/ebooks/345.txt.utf-8',
    'https://www.gutenberg.org/ebooks/131.txt.utf-8'
]

# Process new books.
# URLs already present in `processed_books` are skipped. Otherwise those books
# are fetched and analysed a second time and show up twice in every combined
# table and plot below. A URL whose earlier download failed is not in
# `processed_books`, so it is retried here.
seen_urls = {book['url'] for book in processed_books}
new_processed_books = []
for url in new_book_urls:
    if url in seen_urls:
        continue
    try:
        book_data = process_book(url)
        new_processed_books.append(book_data)
        seen_urls.add(url)  # also guards against repeats within the list
        print(f"Processed: {book_data['title']} by {book_data['author']}")
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")

# Render one word cloud per newly processed book; errors are reported per book
print("\nGenerating word clouds for new books:")
for book in new_processed_books:
    try:
        print(f"Generating word cloud for: {book['title']}")
        generate_and_save_wordcloud(
            text=book['text'],
            title=book['title'],
            output_dir=output_directory,
        )
    except Exception as exc:
        print(f"Error generating word cloud for {book['title']}: {str(exc)}")

# Analyze new books
new_book_analyses = [analyze_book(book) for book in new_processed_books]

# Summary table for the new books: one row per book, one column per scalar metric
new_df = pd.DataFrame([
    {
        'Title': analysis['title'],
        'Flesch Reading Ease': analysis['flesch_score'],
        'Gunning Fog Index': analysis['gunning_fog_score'],
        'SMOG Index': analysis['smog_score'],
        'Avg Sentence Length': analysis['avg_sentence_length'],
        'Sentence Length Variation': analysis['sentence_length_variation'],
        'Type-Token Ratio': analysis['type_token_ratio'],
        'Avg Sentiment': analysis['sentiment_compound'],
        'Dialogue Ratio': analysis['dialogue_ratio']
    }
    for analysis in new_book_analyses
])

# Display the summary comparison table for new books.
# display() renders the frame as a rich table, matching how the first
# summary table is shown; print() would wrap the columns as plain text.
print("Summary Comparison Table for New Books:")
display(new_df)

# Detailed per-book report for the new books: scalar metrics, then count tables
count_sections = (
    ("\nTop 20 Most Frequent Words:", 'word_frequency'),
    ("\nPart-of-Speech Distribution:", 'pos_distribution'),
    ("\nTop 10 Named Entities:", 'named_entities'),
)

print("\nDetailed Metrics for Each New Book:")
for analysis in new_book_analyses:
    print(f"\nTitle: {analysis['title']}")
    print(f"Flesch Reading Ease: {analysis['flesch_score']:.2f}")
    print(f"Gunning Fog Index: {analysis['gunning_fog_score']:.2f}")
    print(f"SMOG Index: {analysis['smog_score']:.2f}")
    print(f"Average Sentence Length: {analysis['avg_sentence_length']:.2f}")
    print(f"Sentence Length Variation: {analysis['sentence_length_variation']:.2f}")
    print(f"Type-Token Ratio: {analysis['type_token_ratio']:.4f}")
    print(f"Dialogue Ratio: {analysis['dialogue_ratio']:.4f}")
    print(f"Sentiment Compound Score: {analysis['sentiment_compound']:.4f}")
    for heading, key in count_sections:
        print(heading)
        for label, count in analysis[key].items():
            print(f"  {label}: {count}")
    print("-" * 50)

# Combine old and new dataframes.
# ignore_index=True renumbers the rows. Without it each frame keeps its own
# 0..n-1 labels, so the combined index contains duplicates.
all_df = pd.concat([df, new_df], ignore_index=True)

# Visualizations for all books

# 1. Word Frequency
all_analyses = book_analyses + new_book_analyses
num_books = len(all_analyses)
num_cols = 5
num_rows = max(1, math.ceil(num_books / num_cols))  # at least one row so the figure has a height

plt.figure(figsize=(20, 4 * num_rows))
for i, analysis in enumerate(all_analyses, 1):
    plt.subplot(num_rows, num_cols, i)
    # word_frequency holds the top 20 words in descending order; plot the first 10
    words = list(analysis['word_frequency'].keys())[:10]
    counts = list(analysis['word_frequency'].values())[:10]
    plt.bar(words, counts)
    # Shorten long titles so neighbouring headings do not overlap. The
    # ellipsis is only added when something was actually cut off.
    full_title = analysis['title']
    short_title = full_title if len(full_title) <= 20 else f"{full_title[:20]}..."
    plt.title(f"Top 10 Words in {short_title}")
    plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 2. Readability Scores
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(data=all_df, x='Flesch Reading Ease', y='Gunning Fog Index', hue='Title', ax=ax)
ax.set_title('Readability Scores Comparison (All Books)')
# Legend goes outside the axes on the right
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# 3. Sentence Length and Variation
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(data=all_df, x='Avg Sentence Length', y='Sentence Length Variation', hue='Title', ax=ax)
ax.set_title('Sentence Length and Variation (All Books)')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# 4. POS Distribution
# One row per book, one column per tag; each row is divided by its own total
# so the heatmap shows shares rather than raw counts.
all_pos_df = pd.DataFrame(
    [analysis['pos_distribution'] for analysis in book_analyses + new_book_analyses],
    index=[analysis['title'] for analysis in book_analyses + new_book_analyses],
)
row_totals = all_pos_df.sum(axis=1)
all_pos_df = all_pos_df.div(row_totals, axis=0)
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(all_pos_df, annot=True, cmap='YlGnBu', fmt='.2f', ax=ax)
ax.set_title('POS Distribution Comparison (All Books)')
fig.tight_layout()
plt.show()

# 5. Sentiment and Dialogue Ratio
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(data=all_df, x='Avg Sentiment', y='Dialogue Ratio', hue='Title', ax=ax)
ax.set_title('Sentiment vs Dialogue Ratio (All Books)')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

print("Analysis complete for all books. Check the visualizations for comparisons.")