In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import random
import re
import seaborn as sns
import nltk
import ipywidgets as widgets
import spacy

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from scipy.optimize import curve_fit
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from IPython.display import display, clear_output
from textstat import flesch_reading_ease, gunning_fog, smog_index
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download necessary NLTK data.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# newer NLTK releases look up for word_tokenize / pos_tag; the older names are
# kept so the notebook also runs on earlier releases.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
):
    nltk.download(nltk_resource, quiet=True)

# Get the text of 'Paul Bunyan' by James Stevens from Project Gutenberg.
# A timeout prevents the cell from hanging forever, and raise_for_status()
# stops the notebook early instead of analysing an HTTP error page.
r = requests.get('https://www.gutenberg.org/cache/epub/70060/pg70060.txt', timeout=30)
r.raise_for_status()
r.encoding = 'utf-8'
booktext = r.text

# Function to clean the text
def clean_text(text):
    """Strip Project Gutenberg boilerplate and normalise the book text.

    The text between the START and END markers is kept (the whole input is
    kept on whichever side a marker is missing). Illustration tags,
    punctuation, underscores and digits are then removed. Capitalisation is
    preserved so that proper nouns can still be recognised later.

    Args:
        text: Raw text of the e-book.

    Returns:
        The cleaned text as a single string.
    """
    start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK PAUL BUNYAN ***"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK PAUL BUNYAN ***"

    # Test the result of find() *before* adding the marker length; otherwise
    # the "not found" value (-1) is masked and the check can never trigger.
    marker_pos = text.find(start_marker)
    if marker_pos == -1:
        start_index = 0
    else:
        start_index = marker_pos + len(start_marker)
        # Skip the remainder of the marker line when there is one.
        newline_pos = text.find("\n", start_index)
        if newline_pos != -1:
            start_index = newline_pos + 1

    end_index = text.find(end_marker, start_index)
    if end_index == -1:
        end_index = len(text)
    text = text[start_index:end_index]

    # Remove the mentions of illustrations, with or without a caption
    text = re.sub(r'\[Illustration[^\]]*\]', '', text)
    # Remove punctuation but keep capitalization for name recognition.
    # '_' counts as a word character in regexes, so it is stripped explicitly
    # (Project Gutenberg uses _underscores_ to mark italics).
    text = re.sub(r'[^\w\s]|_', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

# Clean the raw download
cleaned_text = clean_text(booktext)

# Two views of the cleaned text: whitespace-split words and NLTK tokens
words = cleaned_text.split()
tokens = word_tokenize(cleaned_text)

# Part-of-speech tag every token
tagged_tokens = pos_tag(tokens)

# Stop words: WordCloud's defaults plus a few corpus-specific additions
additional_stopwords = ['said','now','one']
stopwords = set(STOPWORDS).union(additional_stopwords)

def show_wordcloud(data, title = None, stop_words = None, random_state = None):
    """Render a word cloud for ``data`` with matplotlib.

    Args:
        data: Text to visualise (converted with ``str``).
        title: Optional figure title.
        stop_words: Optional collection of words to exclude. When omitted,
            WordCloud's built-in ``STOPWORDS`` are used.
        random_state: Optional seed for the layout. When omitted, a seed is
            drawn at random, so each call may produce a different layout.
    """
    if stop_words is None:
        stop_words = STOPWORDS
    if random_state is None:
        random_state = random.randint(0,1000)

    wordcloud = WordCloud(
        background_color='white',
        stopwords=set(stop_words),
        max_words=100,
        max_font_size=88,
        scale=3,
        random_state=random_state
    ).generate(str(data))

    plt.figure(figsize=(12, 12))
    plt.axis('off')
    plt.imshow(wordcloud)
    if title:
        plt.title(title)
    plt.show()

# Pass the extended stop-word set so that the custom additions are applied;
# previously the function always fell back to the library defaults.
show_wordcloud(cleaned_text, 'Paul Bunyan', stop_words=stopwords)

In [None]:
# Word frequency analysis: lower-case every token and drop stop words
words = [lowered for lowered in (token.lower() for token in tokens)
         if lowered not in stopwords]

# Tally occurrences of each remaining word
word_freq = Counter(words)
total_words = len(words)

# One row per distinct word, most frequent first
df = (
    pd.DataFrame.from_dict(word_freq, orient='index', columns=['frequency'])
    .sort_values('frequency', ascending=False)
    .assign(word=lambda frame: frame.index)
    .reset_index(drop=True)
)

# Per-word statistics (frequencies are relative to the filtered token count)
df['length'] = df['word'].str.len()
df['frequency_percentage'] = df['frequency'] / total_words * 100
df['normalized_frequency_1000'] = df['frequency'] / total_words * 1000  # per 1000 words

# Show the 20 most frequent words
print(df.head(20).to_string(index=False))

# Overall statistics; note the mean length is taken over distinct words
unique_words = len(df)
average_word_length = df['length'].mean()

# Type-Token Ratio: distinct words divided by total words
ttr = unique_words / total_words

# Advanced Lexical Diversity Measures
def mtld(text, ttr_threshold=0.72):
    """Measure of Textual Lexical Diversity, averaged over both directions.

    Args:
        text: Sequence of tokens (must support ``len`` and slicing).
        ttr_threshold: Running type-token ratio at or below which a factor
            is counted and the running window is reset.

    Returns:
        Mean of the forward and backward scores; a direction scores 0 when
        its factor count is not positive.
    """
    def one_direction(sequence):
        factor_count = 0
        seen = set()
        run_length = 0
        for item in sequence:
            seen.add(item)
            run_length += 1
            running_ttr = len(seen) / run_length
            if running_ttr <= ttr_threshold:
                # Threshold reached: count a full factor and start over.
                factor_count += 1
                seen = set()
                run_length = 0
        if run_length > 0:
            # Credit the unfinished tail as a fraction of a factor.
            factor_count += (1 - running_ttr) / (1 - ttr_threshold)
        if factor_count > 0:
            return len(sequence) / factor_count
        return 0

    forward_score = one_direction(text)
    backward_score = one_direction(text[::-1])
    return (forward_score + backward_score) / 2

def mattr(text, window_size=1000):
    """Moving-Average Type-Token Ratio.

    Averages the type-token ratio of every window of ``window_size``
    consecutive tokens.

    Args:
        text: Sequence of tokens (must support ``len``, indexing and slicing).
        window_size: Number of tokens per window; must be at least 1.

    Returns:
        The mean window TTR as a float. When the text is not longer than the
        window, the plain TTR of the whole text is returned; an empty text
        yields 0.0 (previously these short inputs produced NaN).

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    n_tokens = len(text)
    if n_tokens == 0:
        return 0.0
    if n_tokens <= window_size:
        # Not enough tokens to slide a window: fall back to the plain TTR.
        return len(set(text)) / n_tokens

    # Slide the window one token at a time, updating the counts incrementally
    # instead of rebuilding a set for every window.
    window_counts = Counter(text[:window_size])
    ttrs = [len(window_counts) / window_size]
    for i in range(window_size, n_tokens):
        incoming = text[i]
        outgoing = text[i - window_size]
        window_counts[incoming] += 1
        window_counts[outgoing] -= 1
        if window_counts[outgoing] == 0:
            del window_counts[outgoing]
        ttrs.append(len(window_counts) / window_size)
    return float(np.mean(ttrs))

def mass_function_residual(text, num_points=10):
    """Mean relative deviation of vocabulary growth from a fitted power law.

    The number of distinct tokens is sampled at evenly spaced prefix lengths,
    a curve ``a * n ** b`` is fitted to those samples with
    ``scipy.optimize.curve_fit``, and the mean absolute residual (relative to
    the fitted value) is returned.

    Args:
        text: Sequence of tokens (must support ``len`` and slicing).
        num_points: Approximate number of prefix lengths to sample.

    Returns:
        Mean of ``|observed - expected| / expected`` over the sample points.

    Raises:
        ValueError: If ``num_points`` is smaller than 1, or the text yields
            fewer than two sample points (too few to fit two parameters).
    """
    def expected_types(n, a, b):
        return a * n ** b

    if num_points < 1:
        raise ValueError("num_points must be a positive integer")

    # A step of 0 (text shorter than num_points) would make range() fail, so
    # short texts are sampled at every token instead.
    step = max(1, len(text) // num_points)
    prefix_lengths = list(range(step, len(text) + 1, step))
    if len(prefix_lengths) < 2:
        raise ValueError(
            "text is too short: at least two sample points are needed to fit the curve"
        )

    # Use a single list of prefix lengths for both x and y so they cannot
    # drift apart.
    x = np.array(prefix_lengths, dtype=float)
    observed_types = np.array(
        [len(set(text[:length])) for length in prefix_lengths], dtype=float
    )

    # Fit the expected function to the observed data
    popt, _ = curve_fit(expected_types, x, observed_types)

    # Relative residuals between observation and fit
    expected = expected_types(x, *popt)
    normalized_residuals = (observed_types - expected) / expected

    # Return the mean of the absolute normalized residuals
    return float(np.mean(np.abs(normalized_residuals)))

# Lexical-diversity scores for the filtered word list
mtld_score = mtld(words)
mattr_score = mattr(words)
mfr_score = mass_function_residual(words)

# Assemble the summary and emit it in one go
summary_lines = [
    f"\nTotal words (tokens): {total_words}",
    f"Unique words (types): {unique_words}",
    f"Type-Token Ratio: {ttr:.4f}",
    f"MATTR: {mattr_score:.4f}",
    f"MTLD: {mtld_score:.4f}",
    f"Mass-Function Residual: {mfr_score:.4f}",
    f"Average word length: {df['length'].mean():.2f}",
]
print("\n".join(summary_lines))

In [None]:
# Visualization functions
def plot_top_words(df, n=20, normalized=False):
    """Horizontal bar chart of the first ``n`` rows of ``df``.

    Args:
        df: Frame with a 'word' column plus 'frequency' and
            'normalized_frequency_1000' columns.
        n: Number of leading rows to plot.
        normalized: Plot the per-1000-words frequency instead of raw counts.
    """
    if normalized:
        value_column = 'normalized_frequency_1000'
        chart_title = f'Top {n} Most Frequent Words (Normalized per 1000 words)'
        x_label = 'Normalized Frequency'
    else:
        value_column = 'frequency'
        chart_title = f'Top {n} Most Frequent Words'
        x_label = 'Frequency'

    plt.figure(figsize=(12, 8))
    sns.barplot(x=value_column, y='word', data=df.head(n))
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel('Word')
    plt.show()

def plot_word_length_distribution(df):
    """Histogram (20 bins, no KDE) of the 'length' column of ``df``."""
    word_lengths = df['length']
    plt.figure(figsize=(12, 6))
    sns.histplot(word_lengths, bins=20, kde=False)
    plt.title('Distribution of Word Lengths')
    plt.xlabel('Word Length')
    plt.ylabel('Count')
    plt.show()

def plot_frequency_vs_length(df):
    """Scatter plot of 'frequency' against 'length' for every row of ``df``."""
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='length', y='frequency')
    plt.title('Word Frequency vs Word Length')
    plt.xlabel('Word Length')
    plt.ylabel('Frequency')
    plt.show()

def plot_cumulative_frequency(df):
    """Line plot of the cumulative share of occurrences by word rank.

    Rows are ordered by descending 'frequency'; the y-axis shows the running
    total of 'frequency' as a percentage of its overall sum. ``df`` itself is
    not modified.
    """
    ranked = df.sort_values('frequency', ascending=False)
    ranked['cumulative_freq'] = ranked['frequency'].cumsum() / ranked['frequency'].sum() * 100
    ranks = range(1, len(ranked) + 1)

    plt.figure(figsize=(12, 6))
    plt.plot(ranks, ranked['cumulative_freq'])
    plt.title('Cumulative Word Frequency')
    plt.xlabel('Number of Unique Words')
    plt.ylabel('Cumulative Frequency (%)')
    plt.ylim(0, 100)
    plt.show()

# Create visualizations: raw and normalized top-word charts first,
# then the remaining single-argument plots in order
for use_normalized in (False, True):
    plot_top_words(df, normalized=use_normalized)

for draw_plot in (
    plot_word_length_distribution,
    plot_frequency_vs_length,
    plot_cumulative_frequency,
):
    draw_plot(df)


In [None]:
# Function to identify potential character names
def is_potential_name(word, tag):
    """Return True if a tagged token may refer to a character.

    A token qualifies when its tag starts with 'NNP', it is longer than one
    character and starts with an uppercase letter, or when it is one of a
    fixed set of capitalised pronouns.
    """
    capitalised_pronouns = ('I', 'Me', 'You', 'He', 'She', 'Him', 'Her')
    looks_like_proper_noun = (
        tag.startswith('NNP') and len(word) > 1 and word[0].isupper()
    )
    if looks_like_proper_noun:
        return True
    return word in capitalised_pronouns

# Extract potential character names together with their token positions, so
# that only names that are truly adjacent in the text get merged later.
name_hits = [(position, word)
             for position, (word, tag) in enumerate(tagged_tokens)
             if is_potential_name(word, tag)]
potential_names = [word for _, word in name_hits]
name_positions = [position for position, _ in name_hits]

# Function to combine first and last names
def combine_names(names, positions=None):
    """Merge consecutive capitalised names into two-word names.

    Args:
        names: Candidate names in text order.
        positions: Optional token index of each entry of ``names``. When
            given, two entries are merged only if their indices are
            consecutive, i.e. the words stood next to each other in the
            text. When omitted, any two neighbouring list entries may be
            merged (the original behaviour).

    Returns:
        A list in which merged pairs appear as "First Second" and every
        other name appears unchanged.

    Raises:
        ValueError: If ``positions`` is given with a different length
            than ``names``.
    """
    if positions is not None and len(positions) != len(names):
        raise ValueError("positions must have the same length as names")

    combined_names = []
    i = 0
    while i < len(names):
        has_next = i + 1 < len(names)
        adjacent = has_next and (
            positions is None or positions[i + 1] == positions[i] + 1
        )
        # [:1] instead of [0] so that an empty string cannot raise IndexError
        if adjacent and names[i][:1].isupper() and names[i + 1][:1].isupper():
            combined_names.append(f"{names[i]} {names[i+1]}")
            i += 2
        else:
            combined_names.append(names[i])
            i += 1
    return combined_names

# Combine names that are adjacent in the text
combined_names = combine_names(potential_names, name_positions)

# Count name occurrences
name_counts = Counter(combined_names)

# Function to normalize names
def normalize_name(name):
    """Return a canonical form of ``name``.

    A possessive "'s" suffix is dropped from each word, the words are sorted
    (so "Bunyan Paul" and "Paul Bunyan" normalise to the same string), and
    the result is joined with single spaces.
    """
    without_possessive = re.sub(r"'s\b", "", name)
    return " ".join(sorted(without_possessive.split()))

# Function to combine similar names
def combine_similar_names(name_counts):
    """Sum the counts of names that share the same normalised form.

    Args:
        name_counts: Mapping of name to occurrence count.

    Returns:
        A dict keyed by ``normalize_name(name)`` holding the summed counts,
        in order of first appearance.
    """
    merged_counts = {}
    for raw_name, occurrences in name_counts.items():
        key = normalize_name(raw_name)
        merged_counts[key] = merged_counts.get(key, 0) + occurrences
    return merged_counts

# Merge counts of names that normalise to the same string
name_counts = combine_similar_names(name_counts)

# Drop multi-word names whose first and last words are identical
kept_counts = {}
for name, count in name_counts.items():
    parts = name.split()
    is_repeated_word = parts[0] == parts[-1] and len(parts) > 1
    if not is_repeated_word:
        kept_counts[name] = count
name_counts = kept_counts

# One row per name, most frequent first
name_df = (
    pd.DataFrame.from_dict(name_counts, orient='index', columns=['frequency'])
    .sort_values('frequency', ascending=False)
    .assign(name=lambda frame: frame.index)
    .reset_index(drop=True)
)

# Share of each name among all counted name mentions
total_names = name_df['frequency'].sum()
name_df['percentage'] = name_df['frequency'] / total_names * 100

# Print top character names
print("Top 20 Potential Character Names:")
top_name_columns = ['name', 'frequency', 'percentage']
print(name_df[top_name_columns].head(20).to_string(index=False))

# Visualization for character names
def plot_top_names(df, n=20):
    """Horizontal bar chart of 'frequency' by 'name' for the first ``n`` rows."""
    leading_rows = df.head(n)
    plt.figure(figsize=(12, 8))
    sns.barplot(data=leading_rows, x='frequency', y='name')
    plt.title(f'Top {n} Most Frequent Character Names')
    plt.xlabel('Frequency')
    plt.ylabel('Name')
    plt.show()

plot_top_names(name_df)

In [None]:
# Sentiment analysis
def get_character_sentiment(character, text):
    """Average VADER polarity scores over the sentences mentioning a character.

    A sentence counts as mentioning the character when every word of
    ``character`` occurs in it as a whole word (case-insensitive, in any
    order). Whole-word matching stops short names such as "He" from matching
    inside "the", and order-insensitive matching lets multi-word names match
    however their parts are ordered in ``character``.

    Args:
        character: Name to look for; may consist of several words.
        text: Text to split into sentences. It needs sentence punctuation
            for the split to be meaningful.

    Returns:
        Dict with the mean 'compound', 'pos', 'neu' and 'neg' scores; all
        zeros when the name is empty or no sentence mentions the character.
    """
    no_sentiment = {'compound': 0, 'pos': 0, 'neu': 0, 'neg': 0}

    name_parts = character.split()
    if not name_parts:
        return no_sentiment
    part_patterns = [re.compile(rf"\b{re.escape(part)}\b", re.IGNORECASE)
                     for part in name_parts]

    sia = SentimentIntensityAnalyzer()
    sentences = sent_tokenize(text)
    character_sentences = [sent for sent in sentences
                           if all(pattern.search(sent) for pattern in part_patterns)]
    if not character_sentences:
        return no_sentiment

    sentiments = [sia.polarity_scores(sent) for sent in character_sentences]
    avg_sentiment = {key: sum(sent[key] for sent in sentiments) / len(sentiments)
                     for key in sentiments[0]}
    return avg_sentiment

# Sentence splitting relies on punctuation, which `cleaned_text` no longer
# has (it would be treated as one giant sentence and every character would
# receive the same whole-book score). Build a punctuation-preserving copy of
# the book body from the raw download instead.
_body_start = re.search(r"\*\*\* ?START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK[^\n]*\n", booktext)
_body_end = re.search(r"\*\*\* ?END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK", booktext)
sentiment_text = booktext[
    (_body_start.end() if _body_start else 0):
    (_body_end.start() if _body_end else len(booktext))
]
sentiment_text = re.sub(r'\[Illustration[^\]]*\]', '', sentiment_text)

# Get the top 20 characters
top_20_characters = name_df.head(20)

# Add sentiment analysis to the DataFrame (rows beyond the top 20 stay NaN)
for index, row in top_20_characters.iterrows():
    sentiment = get_character_sentiment(row['name'], sentiment_text)
    name_df.at[index, 'sentiment_compound'] = sentiment['compound']
    name_df.at[index, 'sentiment_positive'] = sentiment['pos']
    name_df.at[index, 'sentiment_neutral'] = sentiment['neu']
    name_df.at[index, 'sentiment_negative'] = sentiment['neg']

# Print top character names with sentiment
print("Top 20 Potential Character Names with Sentiment:")
print(name_df[['name', 'frequency', 'percentage', 'sentiment_compound']].head(20).to_string(index=False))

# Visualization for character names and sentiment
def plot_top_names_sentiment(df, n=20):
    """Scatter plot of compound sentiment against frequency for the first ``n`` rows.

    Point size follows 'frequency', colour follows 'sentiment_compound', and
    each point is annotated with the row's 'name'.
    """
    leading_rows = df.head(n)

    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=leading_rows,
        x='frequency',
        y='sentiment_compound',
        size='frequency',
        hue='sentiment_compound',
        legend=False,
    )
    plt.title(f'Top {n} Most Frequent Character Names with Sentiment')
    plt.xlabel('Frequency')
    plt.ylabel('Sentiment (Compound Score)')

    # Label every point with the character name
    for _, record in leading_rows.iterrows():
        point = (record['frequency'], record['sentiment_compound'])
        plt.annotate(record['name'], point)

    plt.show()

plot_top_names_sentiment(name_df)

# Additional visualization: box plot of the three component scores
sentiment_columns = ['sentiment_positive', 'sentiment_neutral', 'sentiment_negative']
plt.figure(figsize=(12, 6))
sns.boxplot(data=name_df[sentiment_columns])
plt.title('Distribution of Sentiment Scores Across Characters')
plt.ylabel('Score')
plt.show()