In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import random
import re
import seaborn as sns
import nltk
import ipywidgets as widgets
import spacy

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from scipy.optimize import curve_fit
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from IPython.display import display, clear_output
from textstat import flesch_reading_ease, gunning_fog, smog_index
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download necessary NLTK data
# Tokenizer and tagger resources were renamed in NLTK 3.9: older releases look
# up 'punkt' / 'averaged_perceptron_tagger', newer ones 'punkt_tab' /
# 'averaged_perceptron_tagger_eng'. Both generations are requested so that
# word_tokenize, sent_tokenize and pos_tag work regardless of the installed
# version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('vader_lexicon', quiet=True)

# Project Gutenberg plain-text URLs, one module-level name per book.
paulbunyan = 'https://www.gutenberg.org/cache/epub/70060/pg70060.txt'
talltalesfromTexas = 'https://www.gutenberg.org/ebooks/71089.txt.utf-8'
mythsandfolktalesoftheRussians = 'https://www.gutenberg.org/ebooks/50011.txt.utf-8'
talesofCapeCod = 'https://www.gutenberg.org/ebooks/69718.txt.utf-8'
surprisingadventuresofBaronMunchausen = 'https://www.gutenberg.org/ebooks/3154.txt.utf-8'
Americanfairytales = 'https://www.gutenberg.org/ebooks/4357.txt.utf-8'
# (sic) "wiard": the misspelt name is kept so existing references keep working.
wonderfulwiardofOz = 'https://www.gutenberg.org/ebooks/55.txt.utf-8'
merryadventuresofRobinHood = 'https://www.gutenberg.org/ebooks/10148.txt.utf-8'
ConnecticutyankeeinKingArthurscourt = 'https://www.gutenberg.org/ebooks/86.txt.utf-8'
adventuresofHuckleberryFinn = 'https://www.gutenberg.org/ebooks/76.txt.utf-8'
casebookofSherlockHolmes = 'https://www.gutenberg.org/ebooks/69700.txt.utf-8'
lostworld = 'https://www.gutenberg.org/ebooks/139.txt.utf-8'
legendsofkingarthurandhisknight = 'https://www.gutenberg.org/ebooks/12753.txt.utf-8'
mythsandlegendsofancientgreece = 'https://www.gutenberg.org/ebooks/22381.txt.utf-8'
phantastesafaerieromanceformenandwomen = 'https://www.gutenberg.org/ebooks/325.txt.utf-8'
aesopsfables = 'https://www.gutenberg.org/ebooks/11339.txt.utf-8'
manwhowasthursdayanightmare = 'https://www.gutenberg.org/ebooks/1695.txt.utf-8'
romeoandjuliet = 'https://www.gutenberg.org/ebooks/1513.txt.utf-8'
twentythousandleaguesunderthesea = 'https://www.gutenberg.org/ebooks/164.txt.utf-8'
aroundtheworldin80days = 'https://www.gutenberg.org/ebooks/103.txt.utf-8'
journeytothecenteroftheearth = 'https://www.gutenberg.org/ebooks/18857.txt.utf-8'
warandpeace = 'https://www.gutenberg.org/ebooks/2600.txt.utf-8'
annakarenina = 'https://www.gutenberg.org/ebooks/1399.txt.utf-8'
crimeandpunishment = 'https://www.gutenberg.org/ebooks/2554.txt.utf-8'
brotherskaramazov = 'https://www.gutenberg.org/ebooks/28054.txt.utf-8'
lesmiserables = 'https://www.gutenberg.org/ebooks/135.txt.utf-8'
frankenstein = 'https://www.gutenberg.org/ebooks/84.txt.utf-8'
dracula = 'https://www.gutenberg.org/ebooks/345.txt.utf-8'
pilgrimsprogress = 'https://www.gutenberg.org/ebooks/131.txt.utf-8'

# Small sample used for quick runs.
shortlist = [
    paulbunyan,
    Americanfairytales,
    merryadventuresofRobinHood,
    legendsofkingarthurandhisknight,
    mythsandlegendsofancientgreece,
]

# Every book defined above, in definition order. The Russian folk-tale URL used
# to be listed twice here while the Greek myths URL was missing; each book now
# appears exactly once.
longlist = [
    paulbunyan,
    talltalesfromTexas,
    mythsandfolktalesoftheRussians,
    talesofCapeCod,
    surprisingadventuresofBaronMunchausen,
    Americanfairytales,
    wonderfulwiardofOz,
    merryadventuresofRobinHood,
    ConnecticutyankeeinKingArthurscourt,
    adventuresofHuckleberryFinn,
    casebookofSherlockHolmes,
    lostworld,
    legendsofkingarthurandhisknight,
    mythsandlegendsofancientgreece,
    phantastesafaerieromanceformenandwomen,
    aesopsfables,
    manwhowasthursdayanightmare,
    romeoandjuliet,
    twentythousandleaguesunderthesea,
    aroundtheworldin80days,
    journeytothecenteroftheearth,
    warandpeace,
    annakarenina,
    crimeandpunishment,
    brotherskaramazov,
    lesmiserables,
    frankenstein,
    dracula,
    pilgrimsprogress,
]

In [None]:
# Load spaCy model
# `nlp` is the shared pipeline that calculate_metrics calls for named entity
# recognition; it is loaded once here rather than once per book.
# NOTE(review): the 'en_core_web_sm' model package presumably has to be
# installed separately before this cell runs - confirm in the environment setup.
nlp = spacy.load('en_core_web_sm')

def fetch_book_content(url):
    """Download a plain-text book and return its contents as a string.

    Args:
        url: Address of the text file to download.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never analysed as if it were a book.
        requests.RequestException: On connection failures or when the request
            exceeds the timeout.
    """
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Force UTF-8 (as fetch_gutenberg_book does) instead of relying on the
    # charset guessed from the response headers.
    response.encoding = 'utf-8'
    return response.text

def _split_text_for_spacy(text, max_chars):
    """Yield consecutive pieces of ``text``, none longer than ``max_chars``.

    Pieces are cut at the last blank line inside the window when there is one,
    otherwise at the last space, otherwise exactly at ``max_chars``. A text that
    already fits is yielded unchanged as a single piece; an empty text yields
    nothing.
    """
    start, total = 0, len(text)
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            cut = text.rfind('\n\n', start, end)
            if cut <= start:
                cut = text.rfind(' ', start, end)
            # Only accept a cut that makes progress; otherwise hard-split.
            if cut > start:
                end = cut
        yield text[start:end]
        start = end


def calculate_metrics(text):
    """Compute a set of stylistic metrics for one text.

    Args:
        text: The full text to analyse.

    Returns:
        dict with the keys
        'word_frequency' (20 most common lower-cased tokens with counts),
        'readability' (dict of flesch_reading_ease, gunning_fog, smog_index),
        'avg_sentence_length' and 'sentence_length_variation' (mean and
        standard deviation of tokens per sentence),
        'type_token_ratio' (distinct tokens / total tokens),
        'pos_distribution' (POS tag -> count),
        'named_entities' (spaCy entity label -> count),
        'sentiment' (result of polarity_scores on the whole text) and
        'dialogue_ratio' (share of sentences containing a double quotation
        mark). Ratios and sentence statistics are NaN when the text has no
        tokens or sentences.
    """
    # Tokenize text
    words = word_tokenize(text.lower())
    sentences = sent_tokenize(text)

    # Word frequency
    freq_dist = FreqDist(words)
    most_common = freq_dist.most_common(20)

    # Readability scores
    readability = {
        'flesch_reading_ease': flesch_reading_ease(text),
        'gunning_fog': gunning_fog(text),
        'smog_index': smog_index(text)
    }

    # Sentence length
    sentence_lengths = [len(word_tokenize(sentence)) for sentence in sentences]
    if sentence_lengths:
        avg_sentence_length = np.mean(sentence_lengths)
        sentence_length_variation = np.std(sentence_lengths)
    else:
        # No sentences: report "undefined" explicitly instead of triggering
        # NumPy's empty-slice warnings.
        avg_sentence_length = np.nan
        sentence_length_variation = np.nan

    # Lexical diversity
    vocab = set(words)
    type_token_ratio = len(vocab) / len(words) if words else np.nan

    # Part-of-speech distribution
    pos_tags = pos_tag(words)
    pos_counts = Counter(tag for word, tag in pos_tags)

    # Named entity recognition
    # spaCy raises ValueError for inputs longer than nlp.max_length, which
    # full-length novels exceed, so longer texts are processed piece by piece.
    named_entities = Counter()
    for doc in nlp.pipe(_split_text_for_spacy(text, nlp.max_length)):
        named_entities.update(ent.label_ for ent in doc.ents)

    # Sentiment analysis
    # NOTE(review): this scores the whole text in a single call; VADER is
    # presumably tuned for short texts, so per-sentence averaging may be more
    # informative - confirm before interpreting the compound score.
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)

    # Dialogue-to-narrative ratio (simplified estimation)
    # Only double quotation marks (straight or curly) count as dialogue.
    # Apostrophes are deliberately ignored: they occur in contractions and
    # possessives, so counting them flags ordinary narrative as dialogue.
    dialogue_marks = ('"', '\u201c', '\u201d')
    dialogue_sentences = sum(
        1 for sentence in sentences
        if any(mark in sentence for mark in dialogue_marks)
    )
    dialogue_ratio = dialogue_sentences / len(sentences) if sentences else np.nan

    return {
        'word_frequency': most_common,
        'readability': readability,
        'avg_sentence_length': avg_sentence_length,
        'sentence_length_variation': sentence_length_variation,
        'type_token_ratio': type_token_ratio,
        'pos_distribution': dict(pos_counts),
        'named_entities': dict(named_entities),
        'sentiment': sentiment_scores,
        'dialogue_ratio': dialogue_ratio
    }

def analyze_book_list(book_list):
    """Download and measure every book in ``book_list``.

    Args:
        book_list: Iterable of book URLs.

    Returns:
        A list with one ``{'url': ..., 'metrics': ...}`` dict per URL, in
        input order, where 'metrics' is the result of calculate_metrics.
    """
    def _analyze_one(address):
        # Progress line first, so a slow download is visibly attributed.
        print(f"Analyzing: {address}")
        text = fetch_book_content(address)
        return {'url': address, 'metrics': calculate_metrics(text)}

    return [_analyze_one(address) for address in book_list]

def compare_results(results):
    """Tabulate, print and plot the per-book metrics from analyze_book_list.

    Args:
        results: List of ``{'url': ..., 'metrics': ...}`` dicts, where
            'metrics' is a dict as returned by calculate_metrics.

    Returns:
        A pandas DataFrame with one row per book: the 'url' and 'metrics'
        columns plus one flat column per scalar metric (NaN where a metric is
        missing). When ``results`` is empty an empty DataFrame is returned and
        nothing is printed per book or plotted.
    """
    df = pd.DataFrame(results)
    if df.empty:
        # There is no 'metrics' column to unpack and nothing to draw.
        print("No results to compare.")
        return df

    # Extract specific metrics for comparison
    for name in ('flesch_reading_ease', 'gunning_fog', 'smog_index'):
        # key=name binds the current name; a plain closure would see the last one.
        df[name] = df['metrics'].apply(
            lambda m, key=name: m.get('readability', {}).get(key, np.nan))
    for name in ('avg_sentence_length', 'sentence_length_variation',
                 'type_token_ratio', 'dialogue_ratio'):
        df[name] = df['metrics'].apply(lambda m, key=name: m.get(key, np.nan))
    df['sentiment_compound'] = df['metrics'].apply(
        lambda m: m.get('sentiment', {}).get('compound', np.nan))

    # Print all metrics for each book
    for index, row in df.iterrows():
        print(f"\nMetrics for {row['url']}:")
        print(f"Flesch Reading Ease: {row['flesch_reading_ease']:.2f}")
        print(f"Gunning Fog Index: {row['gunning_fog']:.2f}")
        print(f"SMOG Index: {row['smog_index']:.2f}")
        print(f"Average Sentence Length: {row['avg_sentence_length']:.2f}")
        print(f"Sentence Length Variation: {row['sentence_length_variation']:.2f}")
        print(f"Type-Token Ratio: {row['type_token_ratio']:.4f}")
        print(f"Dialogue Ratio: {row['dialogue_ratio']:.4f}")
        print(f"Sentiment Compound Score: {row['sentiment_compound']:.4f}")
        print("\nTop 20 Most Frequent Words:")
        print(row['metrics']['word_frequency'])
        print("\nPart-of-Speech Distribution:")
        print(row['metrics']['pos_distribution'])
        print("\nNamed Entities:")
        print(row['metrics']['named_entities'])

    # Visualizations
    # One list drives both the heatmap and the bar charts so they cannot drift.
    metrics_to_plot = ['flesch_reading_ease', 'gunning_fog', 'smog_index',
                       'avg_sentence_length', 'type_token_ratio',
                       'dialogue_ratio', 'sentiment_compound']

    plt.figure(figsize=(15, 10))
    sns.heatmap(df[metrics_to_plot].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap of Metrics')
    plt.show()

    fig, axes = plt.subplots(len(metrics_to_plot), 1,
                             figsize=(15, 5 * len(metrics_to_plot)))
    fig.suptitle('Comparison of Textual Metrics Across Books')

    for ax, metric in zip(axes, metrics_to_plot):
        sns.barplot(x='url', y=metric, data=df, ax=ax)
        # Rotate the existing tick labels without re-assigning their text.
        ax.tick_params(axis='x', labelrotation=90)
        ax.set_xlabel('')
        ax.set_title(metric)

    plt.tight_layout()
    plt.show()

    return df

# Analyze short list
# Runs the whole pipeline on `shortlist`: analyze_book_list downloads and
# measures each URL, then compare_results prints and plots the metrics and
# returns them as a DataFrame.
print("Starting analysis of books...")
short_list_results = analyze_book_list(shortlist)
short_list_comparison = compare_results(short_list_results)

print("Analysis complete. Results stored in short_list_comparison DataFrames.")
# Dumps the raw list of per-book {'url', 'metrics'} dicts.
print(short_list_results)

In [None]:
    # NOTE(review): this cell is indented and ends in `return df`, but no
    # enclosing `def` is visible in it, and `df` is not assigned here. It looks
    # like the tail of a function (possibly compare_results) - confirm where it
    # belongs before running it on its own.
    # All three charts below describe only the first row of `df` (iloc[0]).

    # Word frequency visualization
    plt.figure(figsize=(15, 10))
    word_freq = pd.DataFrame(df['metrics'].iloc[0]['word_frequency'], columns=['word', 'count'])
    sns.barplot(x='word', y='count', data=word_freq)
    plt.title(f'Top 20 Most Frequent Words - {df.iloc[0]["url"]}')
    plt.xticks(rotation=45)
    plt.show()
    
    # POS distribution visualization
    # The tag -> count dict becomes a one-column frame indexed by tag; only the
    # ten most frequent tags are drawn.
    plt.figure(figsize=(15, 10))
    pos_dist = pd.DataFrame.from_dict(df['metrics'].iloc[0]['pos_distribution'], orient='index', columns=['count'])
    pos_dist = pos_dist.sort_values('count', ascending=False).head(10)
    sns.barplot(x=pos_dist.index, y='count', data=pos_dist)
    plt.title(f'Top 10 Part-of-Speech Tags - {df.iloc[0]["url"]}')
    plt.xticks(rotation=45)
    plt.show()

    # Named entities visualization
    # Same shape as the POS chart, using the entity-label counts.
    plt.figure(figsize=(15, 10))
    ne_dist = pd.DataFrame.from_dict(df['metrics'].iloc[0]['named_entities'], orient='index', columns=['count'])
    ne_dist = ne_dist.sort_values('count', ascending=False).head(10)
    sns.barplot(x=ne_dist.index, y='count', data=ne_dist)
    plt.title(f'Top 10 Named Entity Types - {df.iloc[0]["url"]}')
    plt.xticks(rotation=45)
    plt.show()
    
    return df


In [None]:
def fetch_gutenberg_book(url):
    """Download a book's text file and return it decoded as UTF-8.

    Args:
        url: Address of the text file to download.

    Returns:
        The response body as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never analysed as if it were a book.
        requests.RequestException: On connection failures or when the request
            exceeds the timeout.
    """
    # A timeout keeps a stalled connection from hanging the notebook.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    r.encoding = 'utf-8'
    return r.text

def get_gutenberg_catalog(num_books=100):
    """Scrape book ids and titles from Project Gutenberg's "top" page.

    Args:
        num_books: Maximum number of books to return. ``None`` means no limit.

    Returns:
        A list of ``(book_id, title)`` tuples in page order, where ``book_id``
        is the numeric id as a string. Each id appears at most once.

    Raises:
        requests.HTTPError: If the page request returns a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # The limit used to be appended as a URL fragment ("#100"); fragments are
    # never sent to the server, so it had no effect. It is applied here instead.
    url = "https://www.gutenberg.org/browse/scores/top"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    books = []
    seen_ids = set()
    for li in soup.find_all('li'):
        if num_books is not None and len(books) >= num_books:
            break
        a = li.find('a')
        if not a:
            continue
        href = a.get('href', '')
        if not href.startswith('/ebooks/'):
            continue
        book_id = href.split('/')[-1]
        # Skip links under /ebooks/ whose last segment is not a numeric id, and
        # ids already collected from an earlier list item.
        # NOTE(review): the page appears to contain several overlapping
        # rankings, which is where repeats would come from - confirm.
        if not book_id.isdigit() or book_id in seen_ids:
            continue
        seen_ids.add(book_id)
        books.append((book_id, a.text.strip()))
    return books

def create_book_selection_ui(books):
    """Show a dropdown + button UI for picking books and return the picks.

    Args:
        books: Sequence of entries whose item 0 is the book id and item 1 the
            title.

    Returns:
        The list that the "Add Book" button appends ids to. It is returned
        immediately (empty) and filled in place as the user clicks; an id is
        added at most once.
    """
    # Dropdown label -> book id.
    label_to_id = {}
    for entry in books:
        label_to_id[f"{entry[1]} (ID: {entry[0]})"] = entry[0]

    picker = widgets.Dropdown(
        options=label_to_id.keys(),
        description='Select a book:',
        disabled=False,
    )
    add_button = widgets.Button(description="Add Book")
    # Read-only echo of what has been chosen so far.
    summary_box = widgets.Textarea(
        value='',
        placeholder='Selected books will appear here',
        description='Selected:',
        disabled=True
    )

    book_selections = []

    def on_add_button_click(b):
        chosen_id = label_to_id[picker.value]
        if chosen_id in book_selections:
            return
        book_selections.append(chosen_id)
        summary_box.value = ", ".join(f"ID: {picked}" for picked in book_selections)

    add_button.on_click(on_add_button_click)

    display(widgets.VBox([picker, add_button, summary_box]))

    return book_selections

def analyze_books(book_ids):
    """Download each book by id and compute basic vocabulary statistics.

    Args:
        book_ids: Iterable of Project Gutenberg book ids.

    Returns:
        dict mapping each book id to a dict with 'total_words',
        'unique_words', 'ttr' (unique / total), 'avg_word_length' and
        'top_words' (the 20 most common tokens with counts). For a text with
        no tokens, 'ttr' and 'avg_word_length' are 0.0.
    """
    results = {}
    for book_id in book_ids:
        url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
        book_text = fetch_gutenberg_book(url)
        # NOTE(review): clean_text is not defined in the code visible here;
        # presumably it comes from another notebook cell - confirm it exists
        # before running.
        cleaned_text = clean_text(book_text)

        # Perform analysis (reuse functions from the original code)
        words = word_tokenize(cleaned_text.lower())
        word_freq = Counter(words)
        total_words = len(words)
        unique_words = len(word_freq)
        if total_words:
            ttr = unique_words / total_words
            avg_word_length = sum(len(word) for word in words) / total_words
        else:
            # An empty text would otherwise raise ZeroDivisionError and abort
            # the whole batch.
            ttr = 0.0
            avg_word_length = 0.0

        results[book_id] = {
            'total_words': total_words,
            'unique_words': unique_words,
            'ttr': ttr,
            'avg_word_length': avg_word_length,
            'top_words': word_freq.most_common(20)
        }

    return results

def compare_books(results):
    """Draw comparison charts for the statistics returned by analyze_books.

    Args:
        results: dict mapping book id -> statistics dict with the keys
            'total_words', 'unique_words', 'ttr', 'avg_word_length' and
            'top_words'.

    Shows one bar chart per scalar statistic, then a single figure with one
    horizontal bar panel of top words per book. Returns nothing.
    """
    book_ids = list(results.keys())

    # (statistic key, chart title, y-axis label) for the four scalar charts.
    scalar_charts = (
        ('total_words', 'Total Words Comparison', 'Total Words'),
        ('unique_words', 'Unique Words Comparison', 'Unique Words'),
        ('ttr', 'Type-Token Ratio Comparison', 'TTR'),
        ('avg_word_length', 'Average Word Length Comparison', 'Average Word Length'),
    )
    for stat_key, chart_title, y_label in scalar_charts:
        plt.figure(figsize=(12, 6))
        heights = [results[book_id][stat_key] for book_id in book_ids]
        plt.bar(book_ids, heights)
        plt.title(chart_title)
        plt.xlabel('Book ID')
        plt.ylabel(y_label)
        plt.show()

    # Compare top words: one stacked panel per book.
    plt.figure(figsize=(15, 10))
    panel_count = len(book_ids)
    for panel, book_id in enumerate(book_ids, start=1):
        ranked = results[book_id]['top_words']
        plt.subplot(panel_count, 1, panel)
        labels = [pair[0] for pair in ranked]
        counts = [pair[1] for pair in ranked]
        plt.barh(labels, counts)
        plt.title(f'Top 20 Words - Book ID: {book_id}')
        plt.xlabel('Frequency')
    plt.tight_layout()
    plt.show()

# Main execution
books = get_gutenberg_catalog()
selected_book_ids = create_book_selection_ui(books)

# Holds the most recent analysis; filled in by the button callback below.
results = {}

# The analysis is started from a button rather than after an input() prompt.
# While a cell is blocked in input() the kernel cannot run widget callbacks,
# so "Add Book" clicks would not register and the selection would be empty.
analyze_button = widgets.Button(description="Analyze Selected Books")
analysis_output = widgets.Output()


def _run_selected_analysis(_button):
    """Analyze the currently selected books and show the comparison charts."""
    global results
    with analysis_output:
        # Replace the previous run's output instead of appending to it.
        clear_output(wait=True)
        if not selected_book_ids:
            print("No books selected yet - add at least one book first.")
            return
        print(selected_book_ids)
        results = analyze_books(selected_book_ids)
        compare_books(results)


analyze_button.on_click(_run_selected_analysis)
display(analyze_button, analysis_output)