In [1]:
import os
import string

def read_poems(file_path, label):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
    remove_punct = str.maketrans('', '', string.punctuation)
        
    lines = [line.translate(remove_punct).strip().lower() for line in lines if line.strip()]
    return [(line, label) for line in lines]

poe_lines = read_poems("./../data/edgar_allan_poe.txt", 0)
frost_lines = read_poems("./../data/robert_frost.txt", 1)

In [2]:
print(poe_lines[0])
print(frost_lines[1])

('lo death hath reard himself a throne', 0)
('and sorry i could not travel both', 1)


In [3]:
data_poe, labels_poe = zip(*poe_lines)
data_frost, labels_frost = zip(*frost_lines)

data = data_poe + data_frost
labels = labels_poe + labels_frost

In [4]:
# Split
from sklearn.model_selection import train_test_split

train_text, test_text, Ytrain, Ytest = train_test_split(data, labels)


In [5]:
from collections import defaultdict

def create_mapping(data):
    word_to_index = defaultdict(lambda: len(word_to_index))
    word_to_index["<UNK>"] = 0 # special token for unknown words
    
    for line in data:
        words = line.split()
        [word_to_index[word] for word in words]
        
    return dict(word_to_index)

word_to_index = create_mapping(train_text)

In [6]:
word_to_index

{'<UNK>': 0,
 'with': 1,
 'hope': 2,
 'and': 3,
 'in': 4,
 'beauty': 5,
 'tonight': 6,
 'i': 7,
 'had': 8,
 'such': 9,
 'company': 10,
 'outward': 11,
 'bound': 12,
 'front': 13,
 'of': 14,
 'one': 15,
 'town': 16,
 'hall': 17,
 'id': 18,
 'left': 19,
 'him': 20,
 'hitched': 21,
 'im': 22,
 'getting': 23,
 'too': 24,
 'old': 25,
 'for': 26,
 'my': 27,
 'size': 28,
 'tell': 29,
 'them': 30,
 'to': 31,
 'wonder': 32,
 'where': 33,
 'the': 34,
 'cold': 35,
 'was': 36,
 'coming': 37,
 'from': 38,
 'went': 39,
 'till': 40,
 'there': 41,
 'were': 42,
 'no': 43,
 'cottages': 44,
 'found': 45,
 'this': 46,
 'all': 47,
 'olden': 48,
 'jupiter': 49,
 'several': 50,
 'times': 51,
 'over': 52,
 'by': 53,
 'marrying': 54,
 'father': 55,
 'when': 56,
 'he': 57,
 'made': 58,
 'at': 59,
 'tutelar': 60,
 'shrine': 61,
 'among': 62,
 'unearthed': 63,
 'potatoes': 64,
 'standing': 65,
 'still': 66,
 'seraphic': 67,
 'glancing': 68,
 'thine': 69,
 'eyes': 70,
 'board': 71,
 'we': 72,
 'laid': 73,
 'down':

In [7]:
def tokenize_and_convert(data, word_to_index):
    tokenized_data = []
    for line in data:
        words = line.split()
        tokenized_line = [word_to_index[word] if word in word_to_index else word_to_index["<UNK>"] for word in words]
        tokenized_data.append(tokenized_line)
    return tokenized_data


data_train_tokenized = tokenize_and_convert(train_text, word_to_index)
data_test_tokenized = tokenize_and_convert(test_text, word_to_index)

In [8]:
print(train_text[0])
print(data_train_tokenized[0])

with hope and in beauty tonight
[1, 2, 3, 4, 5, 6]


In [9]:
def calculate_initial_state_distribution(word, distribution):
    count_word = distribution.get(word, 0)
    distribution[word] = count_word + 1
    
def get_initial_state_distribution(word, distribution, N, vocabulary_size):
    count_word = distribution.get(word, 0)
    return (count_word + 1) / (N + vocabulary_size)

In [30]:
# Define state transition matrix, calculate transition_counts, total_counts, vocab size and initial_state
# distribution and pi
def calculate_transitions_and_initial_distr(data, initial_state_distrib, transitions_counts_A, total_counts_A, vocab_size):
    for line in data:
        calculate_initial_state_distribution(line[0], initial_state_distrib)
        for i in range(len(line) - 1):
            total_counts_A[line[i]] =  total_counts_A.get(line[i], 0) + 1
            transitions_counts_A[(line[i], line[i+1])] = transitions_counts_A.get((line[i], line[i+1]), 0) + 1
            
transitions_counts_poe = {}
total_counts_poe = {}
vocab_size = len(word_to_index)
initial_state_distrib_poe = {}

data_train_tokenized_poe = [t for t, y in zip(data_train_tokenized, Ytrain) if y == 0]
calculate_transitions_and_initial_distr(data_train_tokenized_poe, initial_state_distrib_poe, transitions_counts_poe,
                                        total_counts_poe, vocab_size)

transitions_counts_frost = {}
total_counts_frost = {}
vocab_size = len(word_to_index)
initial_state_distrib_frost = {}

data_train_tokenized_frost = [t for t, y in zip(data_train_tokenized, Ytrain) if y == 1]
calculate_transitions_and_initial_distr([t for t, y in zip(data_train_tokenized, Ytrain) if y == 1], initial_state_distrib_frost, transitions_counts_frost,
                                        total_counts_frost, vocab_size)

In [31]:
print(transitions_counts_poe)

{(1, 2): 2, (2, 3): 2, (3, 4): 5, (4, 5): 6, (5, 6): 2, (46, 47): 1, (47, 46): 1, (46, 36): 1, (36, 4): 1, (4, 34): 31, (34, 48): 1, (56, 57): 1, (57, 58): 1, (58, 59): 1, (59, 34): 4, (34, 60): 1, (60, 61): 1, (34, 67): 2, (67, 68): 2, (68, 14): 2, (14, 69): 2, (69, 70): 5, (4, 113): 1, (113, 4): 1, (4, 114): 1, (114, 4): 1, (4, 115): 1, (3, 34): 10, (34, 120): 1, (120, 121): 1, (121, 122): 1, (123, 34): 14, (34, 124): 6, (124, 125): 3, (125, 42): 4, (42, 126): 3, (126, 3): 3, (3, 127): 9, (14, 87): 5, (87, 4): 2, (4, 120): 2, (120, 134): 2, (134, 135): 2, (135, 136): 3, (136, 3): 2, (3, 137): 2, (117, 138): 1, (138, 7): 1, (7, 139): 1, (139, 140): 1, (123, 143): 1, (143, 14): 1, (14, 34): 26, (34, 144): 8, (144, 145): 1, (145, 146): 1, (146, 34): 1, (34, 147): 2, (147, 148): 1, (41, 149): 1, (149, 150): 1, (150, 149): 1, (149, 151): 1, (74, 74): 1, (74, 125): 1, (125, 16): 2, (16, 135): 1, (135, 152): 1, (152, 153): 1, (167, 168): 1, (168, 79): 1, (79, 169): 1, (169, 20): 1, (20, 4):

In [32]:
import numpy as np

# Adjusted normalization to directly create numpy arrays/matrices
def create_normalized_matrices(transition_counts, total_counts, initial_state_distrib, N, V):
    # Initialize with add-one smoothing
    A = np.ones((V, V))
    pi = np.ones(V)
    
    # Apply counts (add-one is already in initial values)
    for (prev_word, next_word), count in transition_counts.items():
        A[prev_word, next_word] += count  # Already 1, so just add count
    for word, count in initial_state_distrib.items():
        pi[word] += count
    
    # Normalize
    A /= A.sum(axis=1, keepdims=True)
    pi /= pi.sum()
    
    # Convert to log space
    logA = np.log(A)
    logpi = np.log(pi)
    
    return logA, logpi

N_poe = len(data_train_tokenized_poe)
N_frost = len(data_train_tokenized_frost)
logA_poe, logpi_poe = create_normalized_matrices(transitions_counts_poe, total_counts_poe, initial_state_distrib_poe, N_poe, vocab_size)
logA_frost, logpi_frost = create_normalized_matrices(transitions_counts_frost, total_counts_frost, initial_state_distrib_frost, N_frost, vocab_size)

In [33]:
lenPoe = len(labels_poe)
lenFrost = len(labels_frost)

total = lenPoe + lenFrost
p0 = lenPoe / total
p1 = lenFrost / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1

(0.3333333333333333, 0.6666666666666666)

In [34]:
def calculate_log_sequence_probability(test_line, logA, logpi):
    if not test_line:
        return float('-inf')  # log probability of an empty sequence is negative infinity
    
    # Assuming test_line is a sequence of indices
    log_prob = logpi[test_line[0]]  # Start with log probability of the initial state
    for i in range(1, len(test_line)):
        log_prob += logA[test_line[i-1], test_line[i]]  # Add log transition probabilities
    
    return log_prob

In [35]:
import random

indices_with_1 = np.where(np.array(Ytest) == 1)[0]

random_index = random.choice(indices_with_1)

test_data_ex = data_test_tokenized[random_index]

prob_poe = calculate_log_sequence_probability(test_data_ex, logA_poe, logpi_poe)
prob_frost = calculate_log_sequence_probability(test_data_ex, logA_frost, logpi_frost)

print(prob_poe)
print(prob_frost)

-60.32816776411826
-57.81091699488259


In [39]:
# Compute argmax of log of probabilities for each class
import math

def predict_author(test_line, logA_poe, logpi_poe, logp_poe, logA_frost, logpi_frost, logp_frost):
    # Calculate the total log likelihood + log prior for each class
    log_likelihood_poe = calculate_log_sequence_probability(test_line, logA_poe, logpi_poe) + logp_poe
    log_likelihood_frost = calculate_log_sequence_probability(test_line, logA_frost, logpi_frost) + logp_frost
    
    # Compare and predict
    if log_likelihood_poe > log_likelihood_frost:
        return 'Edgar Allan Poe', 0  # Returning label and class index
    else:
        return 'Robert Frost', 1

author, _ = predict_author(test_data_ex, logA_poe, logpi_poe, logp0, logA_frost, logpi_frost, logp1)
print(f"The class with the maximum log probability is: {author}")

The class with the maximum log probability is: Robert Frost


In [42]:
def calculate_accuracy(data_tokenized, true_labels):
    correct_predictions = 0
    for i, line in enumerate(data_tokenized):
        _, predicted_label = predict_author(line, logA_poe, logpi_poe, logp0, logA_frost, logpi_frost, logp1)
        if predicted_label == true_labels[i]:
            correct_predictions += 1
    accuracy = correct_predictions / len(data_tokenized)
    return accuracy

# Calculate and print accuracies
accuracy_train = calculate_accuracy(data_train_tokenized, Ytrain)
accuracy_test = calculate_accuracy(data_test_tokenized, Ytest)

print(f"Train accuracy: {accuracy_train}")
print(f"Test accuracy: {accuracy_test}")

Train accuracy: 0.9950464396284829
Test accuracy: 0.8126159554730983
