# Preprocessing

In [155]:
# Candidate corpus files; only the one passed to open() below is analysed.
file_path, file_path1, file_path3 = 'Portrait.txt', 'Dubliners.txt', 'Ulysses.txt'

# Read the selected file in one go, decoding it as UTF-8 text.
with open(file_path3, mode='r', encoding='utf-8') as file:
    text_content = file.read()

import re

# Build `cleaned_text` from the raw `text_content` in the steps below.
# Whitespace is normalised LAST: the character removal and em-dash replacement
# further down introduce new runs of spaces, so collapsing up front would
# leave double spaces and stray leading/trailing blanks in the result.

# Replace typographic (curly) quotes with their ASCII equivalents
cleaned_text = re.sub(r'[“”]', '"', text_content)
cleaned_text = re.sub(r'[‘’]', "'", cleaned_text)

# Collapse an ellipsis into a single period. Both the ASCII form ("..", "...")
# and the one-character form ("…") are handled; the latter would otherwise be
# deleted by the whitelist below and glue the neighbouring words together.
cleaned_text = re.sub(r'[.…]{2,}|…', '.', cleaned_text)

# Remove non-essential characters (everything outside the whitelist).
# NOTE(review): the whitelist is ASCII-only, so accented letters such as "é"
# are deleted as well - confirm this is intended for the corpus.
cleaned_text = re.sub(r'[^a-zA-Z0-9.,!?\'\-\—\"\s]', '', cleaned_text)

# Em dashes separate words/clauses, so turn them into a space instead of
# deleting them
cleaned_text = cleaned_text.replace('—', ' ')

# Collapse every run of whitespace (including newlines) into one space and
# trim the ends
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

# Lowercase the text for case-insensitive analysis
cleaned_text = cleaned_text.lower()


# Sentence Length and Its Average, Median, Min, Max

In [None]:
from nltk import word_tokenize, sent_tokenize, pos_tag, FreqDist
import pandas as pd
import statistics

# Split the cleaned text into sentences.
sentences = sent_tokenize(cleaned_text)

# Pair every sentence with its word count. A token counts as a word when it is
# purely alphabetic or contains a hyphen (so hyphenated compounds are kept).
sentence_lengths = [
    (sentence, sum(1 for word in word_tokenize(sentence) if word.isalpha() or '-' in word))
    for sentence in sentences
]

lengths = [length for _, length in sentence_lengths]

total_sentences = len(sentences)

# Guard ALL statistics against an empty corpus: statistics.median(), min() and
# max() raise on an empty sequence, so they get the same fallback as the mean.
if lengths:
    mean_sentence_length = sum(lengths) / len(lengths)
    median_sentence_length = statistics.median(lengths)
    min_sentence_length = min(lengths)
    max_sentence_length = max(lengths)
else:
    mean_sentence_length = 0
    median_sentence_length = 0
    min_sentence_length = 0
    max_sentence_length = 0

# Longest sentences first; sorted() is stable, so ties keep document order.
sorted_sentences = sorted(sentence_lengths, key=lambda x: x[1], reverse=True)
top_10_sentences = sorted_sentences[:10]

top_10_df = pd.DataFrame(top_10_sentences, columns=["Sentence", "Word Count"])

print(f"Total: {total_sentences}")
print(f"Mean: {mean_sentence_length}")
print(f"Median: {median_sentence_length}")
print(f"Min: {min_sentence_length}")
print(f"Max: {max_sentence_length}")

# Last expression of the cell: displays the table of the ten longest sentences.
top_10_df

# N-gram 

In [None]:
from nltk.util import ngrams
from collections import Counter

# Length of the n-grams to extract, and the minimum number of occurrences an
# n-gram needs in order to be kept. Change these instead of editing the code.
NGRAM_SIZE = 6
MIN_FREQUENCY = 5

# Tokenize the whole text and keep purely alphabetic tokens only.
tokens = word_tokenize(cleaned_text)
tokens = [token for token in tokens if token.isalpha()]

# Generate N-grams.
# NOTE: the variable names below say "trigram" but hold n-grams of length
# NGRAM_SIZE; the names are kept unchanged in case other cells refer to them.
trigrams_list = list(ngrams(tokens, NGRAM_SIZE))

# Calculate frequencies and keep only n-grams seen at least MIN_FREQUENCY times
freq_dict = Counter(trigrams_list)
filtered_freq_dict = {
    ngram: freq for ngram, freq in freq_dict.items() if freq >= MIN_FREQUENCY
}

# Types (distinct n-grams) and tokens (total occurrences) among the n-grams
# that passed the frequency filter
total_trigram_tokens = sum(filtered_freq_dict.values())
unique_trigram_types = len(filtered_freq_dict)

# Label the output with the actual n-gram size instead of a fixed "Trigram"
print(f"Total {NGRAM_SIZE}-gram Tokens:", total_trigram_tokens)
print(f"Unique {NGRAM_SIZE}-gram Types:", unique_trigram_types)

#import csv       
#output_file_path = "5grams_output.csv"
#with open(output_file_path, "w", newline="", encoding="utf-8") as csvfile:
    #writer = csv.writer(csvfile)
    #writer.writerow(["N-gram", "Frequency"])  # Write header
    #for ngram, freq in freq_dict.items():
        #writer.writerow([' '.join(ngram), freq]) 



# Sentence Pattern

In [None]:
def _pos_pattern(sentence):
    """Return the POS tags of the words in *sentence*, joined by hyphens."""
    # Keep purely alphabetic tokens and any token containing a hyphen.
    kept = [tok for tok in word_tokenize(sentence) if tok.isalpha() or '-' in tok]
    # Tag the kept words and string their tags together in sentence order.
    return '-'.join(tag for _, tag in pos_tag(kept))


# One POS pattern per sentence, in the same order as `sentences`.
patterns = [_pos_pattern(sentence) for sentence in sentences]

#output_file = 'patterns_output1.txt'
#with open(output_file, 'w', encoding='utf-8') as file:
    #for pattern, sentence in zip(patterns, sentences):
        #file.write(f"Pattern: {pattern}\nSentence: {sentence}\n\n")
        
    # Count frequencies of patterns
pattern_counts = Counter(patterns)

# Number of distinct POS patterns found
print(len(pattern_counts))

# The 30 most frequent patterns, one "pattern: count" line each
top_patterns = pattern_counts.most_common(30)
for pattern, count in top_patterns:
    print(pattern, count, sep=": ")

