In [2]:
import random
from google.colab import files

# -------------------------
# Upload and read file
# -------------------------
# Start from an empty corpus so the later sections still run when nothing
# could be read.
raw_lines = []

uploaded = files.upload()
if not uploaded:
    # The original indexed the first key unconditionally and crashed with
    # IndexError when the upload result was empty (for example when the
    # dialog is dismissed without choosing a file).
    print("Error: No file uploaded!")
else:
    # Only the first uploaded file is used.
    filename = next(iter(uploaded))
    try:
        # Explicit encoding so decoding does not depend on the runtime's
        # locale default.
        with open(filename, 'r', encoding='utf-8') as f:
            raw_lines = f.readlines()
    except FileNotFoundError:
        print("Error: File not found!")

# -------------------------
# Preprocess lines
# -------------------------
# Trim surrounding whitespace from every raw line and keep only the ones
# that still contain text afterwards.
lines = [stripped for stripped in (raw.strip() for raw in raw_lines) if stripped]

# Show a short preview of the cleaned corpus.
print("First 5 processed lines:")
for preview in lines[:5]:
    print(preview)


# -------------------------
# Build Markov chain counts
# -------------------------
init_probs = {}    # first word of a line -> count
first_order = {}   # word -> {next word -> count}
second_order = {}  # (word, next word) -> {word after that -> count}

for line in lines:
    # The trailing empty string marks the end of the line; generate_line
    # stops when it draws it.
    words = line.split()
    words.append("")

    # Tally the word that opens this line.
    opener = words[0]
    init_probs[opener] = init_probs.get(opener, 0) + 1

    # First-order transitions: each word paired with its successor.
    for current, following in zip(words, words[1:]):
        successors = first_order.setdefault(current, {})
        successors[following] = successors.get(following, 0) + 1

    # Second-order transitions: each adjacent pair with the word after it.
    for first, second, third in zip(words, words[1:], words[2:]):
        successors = second_order.setdefault((first, second), {})
        successors[third] = successors.get(third, 0) + 1


# -------------------------
# Normalize counts to probabilities
# -------------------------
def normalize(d):
    """Return a new dict with the values of ``d`` rescaled to sum to 1.

    Args:
        d: Mapping from keys to numeric counts.

    Returns:
        A dict with the same keys, in the same order, where each value is
        the original value divided by the sum of all values.  When that sum
        is zero (including for an empty mapping) every key maps to 0.
    """
    denominator = sum(d.values())
    if denominator != 0:
        return {key: count / denominator for key, count in d.items()}
    # Nothing to divide by: keep the keys and report zero for each.
    return dict.fromkeys(d, 0)

# Turn the start-word counts into probabilities.
init_probs = normalize(init_probs)

# Both transition tables share the same shape (context -> {word -> count}),
# so convert each inner count dict to probabilities in place.
for table in (first_order, second_order):
    for context in table:
        table[context] = normalize(table[context])


# -------------------------
# Helper: choose next word based on probabilities
# -------------------------
def pick_word(prob_dict):
    """Draw one key from ``prob_dict`` according to its probability values.

    A single uniform random number is compared against the running sum of
    the probabilities, in the dict's iteration order; the first key whose
    running sum reaches the number is returned.

    Args:
        prob_dict: Mapping from word to probability.

    Returns:
        The selected word.  If the running sum never reaches the random
        number (the values add up to less than it), a key is chosen
        uniformly at random instead.
    """
    threshold = random.random()
    running_total = 0.0
    for candidate in prob_dict:
        running_total += prob_dict[candidate]
        if threshold > running_total:
            continue
        return candidate

    # Fallback when the loop ran out of keys without selecting one.
    return random.choice(list(prob_dict))


# -------------------------
# Generate a line of text
# -------------------------
def generate_line(max_len=50):
    """Generate one line of text by walking the Markov chain tables.

    The first word is drawn from ``init_probs`` and the second from the
    first-order table.  Every later word is drawn from the second-order
    table keyed on the previous two words, falling back to the first-order
    table keyed on the last word when that pair is unknown.  Generation
    stops when the end-of-line marker (the empty string) is drawn, when no
    continuation exists, or when ``max_len`` words have been produced.

    Args:
        max_len: Maximum number of words in the returned line.

    Returns:
        The generated words joined by single spaces.  An empty string is
        returned when ``init_probs`` is empty or ``max_len`` is below 1.
    """
    # With no start words (e.g. the input file could not be read) there is
    # nothing to draw from, and pick_word would fail on the empty table.
    if not init_probs or max_len < 1:
        return ""

    w1 = pick_word(init_probs)
    # Stop after one word if that is all that was asked for, or if the
    # word has no known successor.
    if max_len == 1 or w1 not in first_order:
        return w1

    w2 = pick_word(first_order[w1])
    if w2 == "":
        # One-word line: the end marker is a stop signal, not a word, so it
        # must not be joined into the output (it produced a trailing space).
        return w1

    result = [w1, w2]

    for _ in range(max_len - 2):
        pair = (result[-2], result[-1])
        if pair in second_order:
            next_w = pick_word(second_order[pair])
        elif result[-1] in first_order:
            # Unknown pair: back off to the single-word context.
            next_w = pick_word(first_order[result[-1]])
        else:
            break

        if next_w == "":
            # End-of-line marker drawn.
            break

        result.append(next_w)

    return ' '.join(result)


# -------------------------
# Generate multiple lines
# -------------------------
# Print a heading followed by four independently generated lines.
print("\nGenerated text:")
remaining = 4
while remaining > 0:
    print(generate_line())
    remaining -= 1


Saving robert_frost.txt to robert_frost (1).txt
First 5 processed lines:
Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth;

Generated text:
Were not too much of having shaded out
Where driven rain had wet and swollen them.
They did all the flowers I am sure they weren't;
Here loveless birds now flock as winter friends,
