In [1]:
import os
import string

def read_poems(file_path, label):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
    remove_punct = str.maketrans('', '', string.punctuation)
        
    lines = [line.translate(remove_punct).strip().lower() for line in lines if line.strip()]
    return lines

data = read_poems("./../data/robert_frost.txt", 1)

In [2]:
from collections import defaultdict

def create_mapping(data):
    word_to_index = defaultdict(lambda: len(word_to_index))
    word_to_index["<UNK>"] = 0 # special token for unknown words
    
    for line in data:
        words = line.split()
        [word_to_index[word] for word in words]
        
    return dict(word_to_index)

word_to_index = create_mapping(data)

In [10]:
index_to_word = {index: word for word, index in word_to_index.items()}

In [3]:
word_to_index

{'<UNK>': 0,
 'two': 1,
 'roads': 2,
 'diverged': 3,
 'in': 4,
 'a': 5,
 'yellow': 6,
 'wood': 7,
 'and': 8,
 'sorry': 9,
 'i': 10,
 'could': 11,
 'not': 12,
 'travel': 13,
 'both': 14,
 'be': 15,
 'one': 16,
 'traveler': 17,
 'long': 18,
 'stood': 19,
 'looked': 20,
 'down': 21,
 'as': 22,
 'far': 23,
 'to': 24,
 'where': 25,
 'it': 26,
 'bent': 27,
 'the': 28,
 'undergrowth': 29,
 'then': 30,
 'took': 31,
 'other': 32,
 'just': 33,
 'fair': 34,
 'having': 35,
 'perhaps': 36,
 'better': 37,
 'claim': 38,
 'because': 39,
 'was': 40,
 'grassy': 41,
 'wanted': 42,
 'wear': 43,
 'though': 44,
 'for': 45,
 'that': 46,
 'passing': 47,
 'there': 48,
 'had': 49,
 'worn': 50,
 'them': 51,
 'really': 52,
 'about': 53,
 'same': 54,
 'morning': 55,
 'equally': 56,
 'lay': 57,
 'leaves': 58,
 'no': 59,
 'step': 60,
 'trodden': 61,
 'black': 62,
 'oh': 63,
 'kept': 64,
 'first': 65,
 'another': 66,
 'day': 67,
 'yet': 68,
 'knowing': 69,
 'how': 70,
 'way': 71,
 'leads': 72,
 'on': 73,
 'doubted': 

In [4]:
def tokenize_and_convert(data, word_to_index):
    tokenized_data = []
    for line in data:
        words = line.split()
        if len(words) > 2:
            tokenized_line = [word_to_index[word] if word in word_to_index else word_to_index["<UNK>"] for word in words]
            tokenized_data.append(tokenized_line)
    return tokenized_data


data_tokenized = tokenize_and_convert(data, word_to_index)

In [5]:
print(data[0])
print(data_tokenized[0])

two roads diverged in a yellow wood
[1, 2, 3, 4, 5, 6, 7]


In [6]:
transitions_counts = {}
initial_state_distrib = {}

for line in data_tokenized:
    # first register the initial distribution
    initial_state_distrib[line[0], line[1]] = initial_state_distrib.get((line[0], line[1]), 0) + 1
    
    for i in range(len(line) - 2):
        currentDict = transitions_counts.get((line[i], line[i+1]), {})
        currentDict[line[i+2]] = currentDict.get(line[i+2], 0) + 1
        transitions_counts[line[i], line[i+1]] = currentDict
        
print(transitions_counts) 

{(1, 2): {3: 2}, (2, 3): {4: 2}, (3, 4): {5: 2}, (4, 5): {6: 1, 7: 1, 986: 1, 1026: 1, 1424: 1, 1042: 1, 1410: 1, 1714: 1, 244: 1, 692: 1, 1001: 1, 1924: 1, 1019: 2}, (5, 6): {7: 1}, (8, 9): {10: 2}, (9, 10): {11: 1, 77: 1}, (10, 11): {12: 2, 149: 2, 107: 3, 671: 1}, (11, 12): {13: 1, 156: 1}, (12, 13): {14: 1}, (8, 15): {16: 1, 669: 1}, (15, 16): {17: 1}, (16, 17): {18: 1}, (17, 18): {10: 1}, (18, 10): {19: 1, 57: 1}, (8, 20): {21: 1}, (20, 21): {16: 1, 28: 1}, (21, 16): {22: 1, 231: 1}, (16, 22): {23: 1}, (22, 23): {22: 2}, (23, 22): {10: 1, 698: 1}, (22, 10): {11: 1, 905: 1, 671: 1, 156: 1}, (24, 25): {26: 2}, (25, 26): {27: 1, 980: 1, 574: 1, 1224: 1, 228: 1, 103: 1}, (26, 27): {4: 1}, (27, 4): {28: 1}, (4, 28): {29: 1, 104: 1, 301: 1, 353: 1, 356: 1, 538: 1, 589: 1, 637: 2, 692: 1, 695: 1, 783: 1, 738: 1, 340: 1, 146: 2, 399: 1, 1096: 1, 203: 1, 1231: 1, 1225: 7, 1310: 1, 210: 1, 1243: 1, 1015: 2, 1446: 1, 206: 1, 316: 1, 938: 1, 919: 1, 1230: 1, 421: 1, 1833: 1, 1852: 1, 1869: 1,

In [7]:
# transform to transition probabilities
def normalize_counts(transitions_counts):
    transition_probabilities = {}
    for word_pair, next_words_counts in transitions_counts.items():
        total_counts = sum(next_words_counts.values())
        probabilities = {word: count / total_counts for word, count in next_words_counts.items()}
        transition_probabilities[word_pair] = probabilities
        
    return transition_probabilities

transition_probs = normalize_counts(transitions_counts)

print(transition_probs)

{(1, 2): {3: 1.0}, (2, 3): {4: 1.0}, (3, 4): {5: 1.0}, (4, 5): {6: 0.07142857142857142, 7: 0.07142857142857142, 986: 0.07142857142857142, 1026: 0.07142857142857142, 1424: 0.07142857142857142, 1042: 0.07142857142857142, 1410: 0.07142857142857142, 1714: 0.07142857142857142, 244: 0.07142857142857142, 692: 0.07142857142857142, 1001: 0.07142857142857142, 1924: 0.07142857142857142, 1019: 0.14285714285714285}, (5, 6): {7: 1.0}, (8, 9): {10: 1.0}, (9, 10): {11: 0.5, 77: 0.5}, (10, 11): {12: 0.25, 149: 0.25, 107: 0.375, 671: 0.125}, (11, 12): {13: 0.5, 156: 0.5}, (12, 13): {14: 1.0}, (8, 15): {16: 0.5, 669: 0.5}, (15, 16): {17: 1.0}, (16, 17): {18: 1.0}, (17, 18): {10: 1.0}, (18, 10): {19: 0.5, 57: 0.5}, (8, 20): {21: 1.0}, (20, 21): {16: 0.5, 28: 0.5}, (21, 16): {22: 0.5, 231: 0.5}, (16, 22): {23: 1.0}, (22, 23): {22: 1.0}, (23, 22): {10: 0.5, 698: 0.5}, (22, 10): {11: 0.25, 905: 0.25, 671: 0.25, 156: 0.25}, (24, 25): {26: 1.0}, (25, 26): {27: 0.16666666666666666, 980: 0.16666666666666666, 574

In [8]:
def normalize_initial_distribution(initial_counts):
    total_counts = sum(initial_counts.values())
    initial_probabilities = {word_pair: count / total_counts for word_pair, count in initial_counts.items()}
    return initial_probabilities

initial_state_distrib_probs = normalize_initial_distribution(initial_state_distrib)
print(initial_state_distrib_probs)

{(1, 2): 0.00141643059490085, (8, 9): 0.00141643059490085, (8, 15): 0.000708215297450425, (8, 20): 0.000708215297450425, (24, 25): 0.000708215297450425, (30, 31): 0.000708215297450425, (8, 35): 0.000708215297450425, (39, 26): 0.000708215297450425, (44, 22): 0.000708215297450425, (49, 50): 0.000708215297450425, (8, 14): 0.00141643059490085, (4, 58): 0.000708215297450425, (63, 10): 0.000708215297450425, (68, 69): 0.000708215297450425, (10, 74): 0.000708215297450425, (10, 80): 0.00141643059490085, (85, 86): 0.000708215297450425, (10, 31): 0.000708215297450425, (8, 46): 0.00141643059490085, (95, 96): 0.000708215297450425, (101, 102): 0.000708215297450425, (105, 106): 0.000708215297450425, (24, 111): 0.00141643059490085, (115, 116): 0.000708215297450425, (24, 120): 0.000708215297450425, (124, 28): 0.00141643059490085, (28, 127): 0.000708215297450425, (105, 131): 0.000708215297450425, (24, 135): 0.000708215297450425, (28, 138): 0.000708215297450425, (129, 141): 0.000708215297450425, (28, 96)

In [14]:
import random

def generate_boundaries_and_pick_number(distr):
    boundaries = {}
    previous_boundaries = 0
    for keys, prob in distr.items():
        boundaries[keys] = previous_boundaries + prob
        previous_boundaries = boundaries[keys]
        
    # generate a random numbet between 0 and 1
    choosen = random.uniform(0, 1)
    answer = (0, 0)
    for keys, acc in boundaries.items():
        if choosen < acc:
            answer = keys
            break
    
    return answer

def pick_next(word_one, word_two, transition_probs, initial_state_distrib_probs):
    next_probs = 0
    while not next_probs:
        if transition_probs.get(word_one, word_two):
            next_word = transition_probs.get(word_one, word_two)
        else:
            (word_one, word_two) = generate_boundaries_for_tuple_and_pick_number(initial_state_distrib)
            
def detokenize(word_index, index_to_word):
    return index_to_word[word_index]

def generate_line(transition_probs, initial_state_distrib_probs, Nwords):
    # pick the initial start
    (word_one, previous_word) = generate_boundaries_and_pick_number(initial_state_distrib_probs)
    next_word = generate_boundaries_and_pick_number(transition_probs.get((word_one, previous_word)))
    sequence = [detokenize(word_one, index_to_word), detokenize(previous_word, index_to_word),
                detokenize(next_word, index_to_word)]

    while len(sequence) < Nwords:
        if transition_probs.get((previous_word, next_word)):
            next_word = generate_boundaries_and_pick_number(transition_probs.get((previous_word, next_word)))
            sequence.append(detokenize(next_word, index_to_word))
        else:
            (word_one, word_two) = generate_boundaries_and_pick_number(initial_state_distrib_probs)
            previous_word = word_one
            next_word = word_two
            sequence.append(detokenize(previous_word, index_to_word))
            sequence.append(detokenize(next_word, index_to_word))
            
    return sequence

nrOfLines = 5
poem = ''
for i in range(nrOfLines):
    lines = generate_line(transition_probs, initial_state_distrib_probs, 8)
    poem += " ".join(lines)
    poem += '\n'
    
print(poem)

he never said much by its own what are
he never let them then we asked brown makes
a featherhammer gives a thence they were i made
and i dread the out in rain perhaps you
on his feet against and if youre her

