# Preprocessing

In [12]:
import glob
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Plain-text sources to compare; the order here fixes the column order of
# every per-text result computed later in the notebook.
file_paths = [
    "Dubliners.txt",
    "Portrait.txt",
    "Ulysses.txt",
]

# Preprocessing function
def preprocess_text(text):
    """Normalize raw text and return its lowercase, stop-word-free word tokens.

    Steps, in order: collapse whitespace, straighten curly quotes, drop every
    character outside ASCII letters/digits, basic punctuation, hyphen, em dash
    and whitespace, turn em dashes into spaces, lowercase, tokenize with
    NLTK's ``word_tokenize``, keep only word-like tokens, and remove NLTK's
    English stop words.

    Args:
        text: The raw text of one document, as a string.

    Returns:
        A list of lowercase tokens. Each token is either purely alphabetic or
        alphabetic with hyphens (e.g. ``"well-known"``).
    """
    cleaned_text = re.sub(r'\s+', ' ', text.strip())
    cleaned_text = re.sub(r'[“”]', '"', cleaned_text)  # Replace double curly quotes
    cleaned_text = re.sub(r'[‘’]', "'", cleaned_text)  # Replace single curly quotes
    cleaned_text = re.sub(r'[^a-zA-Z0-9.,!?\'\-\—\"\s]', '',  cleaned_text)
    # Em dashes survive the filter above so they can become word separators
    # here instead of gluing the neighbouring words together.
    cleaned_text = re.sub(r'—', ' ', cleaned_text)
    cleaned_text = cleaned_text.lower()
    tokens = word_tokenize(cleaned_text)
    # Keep alphabetic words and hyphenated compounds. Testing the token with
    # its hyphens removed rejects bare "-" / "--" tokens and numeric ranges
    # such as "1904-1914", which a plain `'-' in word` check would let through
    # and which would then be counted as words.
    tokens = [word for word in tokens if word.replace('-', '').isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

# Load each source file as UTF-8 and tokenize it; tokenized_texts ends up
# with one token list per entry of file_paths, in the same order.
tokenized_texts = []
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # The whole file is already in memory, so tokenize after the handle closes.
    tokenized_texts.append(preprocess_text(text))


In [13]:
# Pool the tokens of all texts into a single flat list (text order preserved)
all_tokens = [
    token
    for text_tokens in tokenized_texts
    for token in text_tokens
]

# Corpus-wide counts; keep the 200 highest-ranked words and drop their counts
freq_dist = Counter(all_tokens)
most_frequent_words = [entry[0] for entry in freq_dist.most_common(200)]

print(f"Most Frequent Words: {most_frequent_words}")

Most Frequent Words: ['said', 'mr', 'one', 'like', 'stephen', 'bloom', 'would', 'man', 'old', 'could', 'little', 'time', 'eyes', 'see', 'two', 'back', 'know', 'father', 'good', 'says', 'god', 'yes', 'face', 'first', 'hand', 'well', 'come', 'asked', 'day', 'go', 'life', 'street', 'way', 'say', 'us', 'long', 'came', 'head', 'night', 'made', 'never', 'went', 'right', 'round', 'voice', 'young', 'get', 'mrs', 'still', 'must', 'away', 'name', 'upon', 'let', 'put', 'tell', 'last', 'dedalus', 'house', 'woman', 'sir', 'think', 'every', 'look', 'saw', 'going', 'hands', 'ever', 'heard', 'always', 'world', 'might', 'make', 'turned', 'men', 'towards', 'dark', 'told', 'thing', 'something', 'poor', 'soul', 'thought', 'door', 'mother', 'three', 'heart', 'great', 'got', 'another', 'love', 'white', 'stood', 'mind', 'light', 'air', 'left', 'give', 'course', 'fellow', 'much', 'though', 'miss', 'took', 'john', 'looked', 'even', 'began', 'behind', 'along', 'hat', 'felt', 'home', 'knew', 'passed', 'words', '

In [14]:
# Function to calculate frequencies
def calculate_frequencies(tokens, mfw_list):
    """Return the relative frequency, in percent, of each listed word.

    Args:
        tokens: Iterable of tokens making up one text.
        mfw_list: Words whose frequencies should be reported.

    Returns:
        A dict mapping every word in ``mfw_list`` to
        ``count(word) / len(tokens) * 100``. Words absent from ``tokens`` map
        to 0.0, and so does every word when ``tokens`` is empty.
    """
    word_counts = Counter(tokens)
    total_words = sum(word_counts.values())
    if total_words == 0:
        # An empty text has no meaningful proportions; report zeros rather
        # than raising ZeroDivisionError.
        return {word: 0.0 for word in mfw_list}
    return {word: (word_counts[word] / total_words) * 100 for word in mfw_list}

# Build one frequency profile per text, restricted to the shared MFW list
frequencies = [
    calculate_frequencies(text_tokens, most_frequent_words)
    for text_tokens in tokenized_texts
]

print(f"Frequencies: {frequencies}")

Frequencies: [{'said': 2.280622262498112, 'mr': 1.7399184413230628, 'one': 0.6343452650657001, 'like': 0.3957106177314605, 'stephen': 0.012082766953632381, 'bloom': 0.0, 'would': 0.7551729346020238, 'man': 0.7008004833106781, 'old': 0.43800030206917384, 'could': 0.46216583597643857, 'little': 0.6343452650657001, 'time': 0.3443588581785229, 'eyes': 0.2899864068871772, 'see': 0.24769672254946382, 'two': 0.35644162513215527, 'back': 0.2779036399335448, 'know': 0.3413381664401148, 'father': 0.16915873735085335, 'good': 0.30508986557921763, 'says': 0.045310376076121435, 'god': 0.10572421084428334, 'yes': 0.16915873735085335, 'face': 0.3292553994864824, 'first': 0.19030357951971003, 'hand': 0.2054070382117505, 'well': 0.3443588581785229, 'come': 0.23561395559583143, 'asked': 0.3836278507778281, 'day': 0.14801389518199667, 'go': 0.28696571514876906, 'life': 0.2446760308110557, 'street': 0.2597794895030962, 'way': 0.18426219604289382, 'say': 0.22051049690379096, 'us': 0.23259326385742335, 'lon

In [15]:
# Create a frequency matrix (rows: words, columns: texts)
freq_matrix = np.array([[freq.get(word, 0) for freq in frequencies] for word in most_frequent_words])

# Compute mean and standard deviation for each word (sample std, ddof=1)
means = freq_matrix.mean(axis=1)
std_devs = freq_matrix.std(axis=1, ddof=1)

# A word with the same frequency in every text has std 0, and 0/0 would give
# NaN z-scores that turn every downstream mean into NaN. Divide by 1 for such
# rows instead: their numerator is already 0, so their z-scores become 0.
# Rows with a positive std are divided by exactly the same value as before.
safe_std_devs = np.where(std_devs > 0, std_devs, 1.0)

# Calculate z-scores
z_scores = (freq_matrix - means[:, None]) / safe_std_devs[:, None]

print(f"Z-Scores: {z_scores}")


Z-Scores: [[ 1.03517127 -0.07451077 -0.9606605 ]
 [ 1.14735288 -0.68630874 -0.46104414]
 [ 1.11073247 -0.28204403 -0.82868845]
 [-1.14554571  0.6984461   0.44709962]
 [-0.90676932  1.07251878 -0.16574946]
 [-0.58034829 -0.57434705  1.15469534]
 [ 0.72059091  0.42108892 -1.14167984]
 [ 1.14671591 -0.69075461 -0.4559613 ]
 [ 1.03174263 -0.964904   -0.06683863]
 [ 0.97114496  0.05540636 -1.02655132]
 [ 1.11892388 -0.31246595 -0.80645793]
 [ 1.12301203 -0.32884104 -0.79417099]
 [-0.22930613  1.09473675 -0.86543061]
 [-0.03553513 -0.98175879  1.01729392]
 [ 1.00907184 -0.99067429 -0.01839754]
 [ 0.45612134  0.690615   -1.14673634]
 [ 1.1386334  -0.40307744 -0.73555596]
 [-0.70447075  1.14456783 -0.44009708]
 [ 1.15341245 -0.62392688 -0.52948557]
 [-0.59340149 -0.56114889  1.15455038]
 [-0.71435987  1.14284366 -0.42848379]
 [-0.70245418 -0.44244702  1.1449012 ]
 [ 0.69053507  0.45621296 -1.14674803]
 [-0.60588084  1.15422257 -0.54834174]
 [ 0.21190871 -1.08897064  0.87706194]
 [ 1.13532214 -

In [16]:
# Compute Delta values
# Delta between two texts = mean absolute difference of their z-scores over
# all rows (words) of z_scores, as in Burrows's Delta.
num_texts = len(file_paths)
delta_matrix = np.zeros((num_texts, num_texts))

# |a - b| == |b - a|, so each pair is computed once and mirrored; the
# diagonal keeps its initial 0 (a text's distance to itself).
for i in range(num_texts):
    for j in range(i + 1, num_texts):
        delta_matrix[i, j] = np.mean(np.abs(z_scores[:, i] - z_scores[:, j]))
        delta_matrix[j, i] = delta_matrix[i, j]

print(f"Delta Matrix:\n{delta_matrix}")

# Labelled view of the same matrix; rows/columns follow the order of file_paths.
# pandas (pd) is imported with the other dependencies in the first code cell.
labels = [f"Text {i+1}" for i in range(num_texts)]
delta_df = pd.DataFrame(delta_matrix, index=labels, columns=labels)

delta_df


Delta Matrix:
[[0.         1.239246   1.24938388]
 [1.239246   0.         1.31900829]
 [1.24938388 1.31900829 0.        ]]


Unnamed: 0,Text 1,Text 2,Text 3
Text 1,0.0,1.239246,1.249384
Text 2,1.239246,0.0,1.319008
Text 3,1.249384,1.319008,0.0
