# Preprocessing

In [41]:
# Preprocessing cell: read the raw text and normalise it into `cleaned_text`,
# a single lowercase line containing only letters, digits, whitespace and
# the punctuation marks . , ! ? ' - "
import re

# Path of the text file to analyse
file_path = 'Ulysses.txt'

# Read the whole file; the encoding is explicit so the result does not
# depend on the platform default
with open(file_path, 'r', encoding='utf-8') as file:
    text_content = file.read()

# Collapse every run of whitespace (newlines included) into one space
cleaned_text = re.sub(r'\s+', ' ', text_content.strip())

# Normalise typographic quotes to their ASCII equivalents
cleaned_text = re.sub(r'[\“\”]', '"', cleaned_text)  # double curly quotes
cleaned_text = re.sub(r'[\‘\’]', "'", cleaned_text)  # single curly quotes

# Treat the single-character ellipsis (U+2026) like three dots, then reduce
# any run of two or more dots to one full stop
cleaned_text = cleaned_text.replace('…', '...')
cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)

# Remove formatting artefacts. `\w` is Unicode-aware for str patterns, so
# accented and other non-ASCII letters survive instead of being cut out of
# the middle of words (an ASCII-only whitelist turns "café" into "caf").
# `\w` also matches the underscore, which is still removed explicitly.
cleaned_text = re.sub(r'[^\w.,!?\'\-\—\"\s]|_', '', cleaned_text)

# Em dashes separate words, so they become spaces rather than vanishing
cleaned_text = re.sub(r'—', ' ', cleaned_text)

# Lowercase the text for case-insensitive analysis
cleaned_text = cleaned_text.lower()

# Optional: uncomment to save the cleaned text to disk
#output_file_path = 'Cleaned.txt'

#with open(output_file_path, 'w', encoding='utf-8') as output_file:
    #output_file.write(cleaned_text)

# Calculation of Tokens, Types, Hapax Legomena, TTR, STTR, Lexical Density 

In [None]:
# Lexical statistics cell: computes token/type counts, hapax legomena, TTR,
# STTR and lexical density for `cleaned_text` and prints a summary.
from nltk import word_tokenize, pos_tag, FreqDist

# Tokenize the text and keep only word tokens: purely alphabetic tokens and
# hyphenated tokens that contain at least one letter. Requiring a letter
# keeps stray "-" / "--" tokens and things like "3-4" out of the word counts.
tokens = word_tokenize(cleaned_text)
tokens = [
    token for token in tokens
    if token.isalpha() or ('-' in token and any(ch.isalpha() for ch in token))
]

# Total number of word tokens
word_count = len(tokens)

# Distinct word forms (types)
unique_words = set(tokens)
unique_word_count = len(unique_words)

# Hapax legomena: types that occur exactly once
freq_dist = FreqDist(tokens)
hapax_legomena = [word for word, count in freq_dist.items() if count == 1]

# Type-token ratio (TTR); 0 for an empty text
ttr = unique_word_count / word_count if word_count > 0 else 0

# Standardized TTR: mean TTR over consecutive chunks of `chunk_size` tokens
chunk_size = 1000

chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

# TTR depends on segment length, so only full-size chunks are comparable;
# a short trailing chunk would pull the mean upwards and is left out.
full_chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]

if full_chunks:
    ttrs = [len(set(chunk)) / chunk_size for chunk in full_chunks]
elif tokens:
    # Text shorter than one chunk: the plain TTR is the only value available
    ttrs = [ttr]
else:
    ttrs = []

sttr = sum(ttrs) / len(ttrs) if ttrs else 0

# Lexical density: percentage of tokens tagged as noun, verb, adjective or
# adverb (tags starting with NN, VB, JJ, RB)
pos_tags = pos_tag(tokens)
content_words = [word for word, tag in pos_tags if tag.startswith(('NN', 'VB', 'JJ', 'RB'))]
lexical_density = (len(content_words) / word_count) * 100 if word_count > 0 else 0

# Output results
print(f"Total Word Count: {word_count}")
print(f"Unique Word Count: {unique_word_count}")
print(f"Hapax Legomena Count: {len(hapax_legomena)}")
print(f"Type-Token Ratio (TTR): {ttr:.2f}")
print(f"Standardized Type-Token Ratio (STTR):{sttr:.2f}")
print(f"Lexical Density: {lexical_density:.2f}%")

# Top Frequent Words

In [None]:
# Top-frequency cell: downloads a stop-word list, removes those words from
# `tokens`, and prints the 100 most frequent remaining tokens.
import requests

# Stop-word list, one word per line, pinned to a specific gist revision
stopwords_url = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"

# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page being parsed as stop words.
response = requests.get(stopwords_url, timeout=30)
response.raise_for_status()

# Build a set for O(1) membership tests; blank lines and stray surrounding
# whitespace are dropped
stop_words = {
    line.strip()
    for line in response.content.decode('utf-8').splitlines()
    if line.strip()
}
filtered_tokens = [token for token in tokens if token not in stop_words]

# Count token frequencies (rebinds `freq_dist` to the filtered distribution)
freq_dist = FreqDist(filtered_tokens)

# Get the top 100 most frequent tokens
top_100 = freq_dist.most_common(100)

# Display the results
for token, count in top_100:
    print(f"{token}: {count}")

# Word Cloud Visualization

In [None]:
# Word-cloud cell: draws the entries of `top_100` as a word cloud, with each
# word sized by its frequency.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Frequencies arrive as (token, count) pairs; the generator wants a mapping
word_frequencies = dict(top_100)

cloud_builder = WordCloud(
    width=800,
    height=400,
    background_color='white',
)
wordcloud = cloud_builder.generate_from_frequencies(word_frequencies)

# Render the cloud as an image with the axes hidden
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title(f"Top 100 Words in {file_path}", fontsize=16)
plt.show()

# Calculation of Punctuation

In [None]:
# Punctuation cell: reports how many punctuation characters `cleaned_text`
# contains and how that compares with the total token count.
import string

# Total tokens, punctuation tokens included (tokenized again on purpose:
# this count must not have punctuation tokens filtered out)
tokens_with_punct_count = len(word_tokenize(cleaned_text))

print(f"Total Tokens (incl. punctuation): {tokens_with_punct_count}")

# ASCII punctuation characters
punctuations = string.punctuation

# Count punctuation characters. This is per character, so apostrophes and
# hyphens inside words are counted as well.
punctuation_count = sum(1 for char in cleaned_text if char in punctuations)

print(f"Punctuation Count: {punctuation_count}")

# Exclamation marks on their own
exclamation_count = cleaned_text.count('!')

print(f"Exclamation Mark Count: {exclamation_count}")

# Punctuation characters per 100 tokens; 0 for an empty text so the cell
# does not fail with ZeroDivisionError
percentage = (
    punctuation_count / tokens_with_punct_count * 100
    if tokens_with_punct_count > 0
    else 0
)

print(f"Punctuation Percentage: {percentage:.2f}%")