Markov-chain `generate` function that takes the chain length as an input.

In [31]:
import random

def tokenize(corpus: str) -> list:
    """Split the corpus on runs of whitespace and return the resulting words.

    Punctuation is not stripped, so a token such as ``"dog."`` keeps its
    trailing period.
    """
    words = corpus.split()
    return words

def build_transition_probs(words: list, chain_length: int) -> dict:
    """Map every ``chain_length``-word window to the words that follow it.

    Args:
        words: The tokenized corpus, in order.
        chain_length: Number of consecutive words that make up one state.

    Returns:
        A dict whose keys are tuples of ``chain_length`` words and whose
        values are lists of the words observed immediately after that tuple.
        Repeated followers are kept, so sampling uniformly from a list
        reflects how often each follower occurred.
    """
    table = {}
    window_count = len(words) - chain_length
    for start in range(window_count):
        stop = start + chain_length
        state = tuple(words[start:stop])
        follower = words[stop]
        # setdefault creates the follower list the first time a state is seen.
        table.setdefault(state, []).append(follower)
    return table

def generate_text(transition_probs: dict, start_words: tuple, output_length: int, chain_length: int) -> str:
    """Generate ``output_length`` words by walking a Markov chain.

    Args:
        transition_probs: Maps each tuple of ``chain_length`` words to the
            list of words observed after it (the structure produced by
            ``build_transition_probs``).
        start_words: Seed words that open the output.
        output_length: Number of words the result should contain.
        chain_length: How many trailing words form the lookup state.

    Returns:
        The generated words joined by single spaces. The result holds exactly
        ``output_length`` words, except when there is nothing at all to sample
        from (empty table and empty seed), in which case generation stops
        early.
    """
    # A seed longer than the requested output is truncated so the word-count
    # contract holds even when output_length < len(start_words).
    generated_text = list(start_words)[:max(output_length, 0)]

    # Dead-end fallback: recover the corpus vocabulary from the table itself
    # (states plus followers cover every corpus word), de-duplicated in
    # first-seen order. Falling back to the seed words alone would make a seed
    # that never occurs in the corpus repeat itself forever.
    vocabulary = list(dict.fromkeys(
        word
        for state, followers in transition_probs.items()
        for word in (*state, *followers)
    ))
    fallback_pool = vocabulary or list(start_words)

    # Loop on the actual length rather than ``output_length - chain_length`` so
    # a seed shorter or longer than chain_length still yields the right count.
    while len(generated_text) < output_length:
        # Explicit start index: a plain ``[-chain_length:]`` slice would return
        # the whole list when chain_length is 0.
        window_start = max(len(generated_text) - chain_length, 0)
        current_words = tuple(generated_text[window_start:])
        followers = transition_probs.get(current_words)
        if followers:
            next_word = random.choice(followers)
        elif fallback_pool:
            # Unknown state: restart from a random corpus word.
            next_word = random.choice(fallback_pool)
        else:
            # Nothing to sample from at all; return what we have.
            break
        generated_text.append(next_word)
    return ' '.join(generated_text)

def generate(corpus: str, start_words: list[str], chain_length: int, output_length: int) -> str:
    """Run the full pipeline: tokenize, build the successor table, generate.

    Args:
        corpus: Raw training text.
        start_words: Seed words for the output; converted to a tuple before
            being handed to ``generate_text``.
        chain_length: Number of words per Markov state.
        output_length: Requested number of words in the result.

    Returns:
        The string produced by ``generate_text``.
    """
    successor_table = build_transition_probs(tokenize(corpus), chain_length)
    seed = tuple(start_words)
    return generate_text(successor_table, seed, output_length, chain_length)

#

The lazy cat lazy lazy The cat The cat lazy The cat cat The cat The cat The The The The


In [42]:
def test_generate():
    """Smoke-test the chain-length-aware ``generate``.

    Each case checks three properties of the output:
      * it contains exactly ``output_length`` words,
      * it begins with the supplied start words,
      * every word comes from the corpus or from the start words.

    Raises:
        AssertionError: with the message ``"Test Case N Failed"`` for the
            first case that violates a property.
    """
    short_corpus = "The quick brown fox jumps over the lazy dog."
    long_corpus = "The quick brown fox jumps over the lazy dog. The lazy dog barks at the moon."

    # (corpus, start_words, chain_length, output_length)
    cases = [
        # Test Case 1: short corpus and chain length 1
        (short_corpus, ["The"], 1, 10),
        # Test Case 2: longer corpus and chain length 2
        (long_corpus, ["The", "quick"], 2, 20),
        # Test Case 3: start words not present in the corpus
        (long_corpus, ["A", "bird"], 2, 20),
        # Test Case 4: chain length = corpus length - 1 (9 words, chain of 8),
        # so the table holds a single state and the fallback path is exercised
        (short_corpus, ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy"], 8, 20),
    ]

    for case_number, (corpus, start_words, chain_length, output_length) in enumerate(cases, start=1):
        failure_message = f"Test Case {case_number} Failed"
        generated_words = generate(corpus, start_words, chain_length, output_length).split()

        assert len(generated_words) == output_length, failure_message
        assert generated_words[:len(start_words)] == start_words, failure_message
        allowed_words = set(corpus.split()) | set(start_words)
        assert set(generated_words) <= allowed_words, failure_message

    print("All test cases passed successfully!")

# Run the test cases now: a failing case raises AssertionError naming the
# case, otherwise the success message is printed.
test_generate()


All test cases passed successfully!


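Markov-chain `generate` function that only considers the previous word, i.e. the chain length is fixed at 1. Note that this cell redefines `generate` with a different signature (`corpus, start_word, output_length`), replacing the chain-length version defined above.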

In [11]:
import random

def generate(corpus: str, start_word: str, output_length: int) -> str:
    """Generate text with a first-order (single previous word) Markov chain.

    Args:
        corpus: Raw training text; split on whitespace into words.
        start_word: First word of the output.
        output_length: Number of words to produce.

    Returns:
        ``output_length`` words joined by single spaces, beginning with
        ``start_word``. An empty string is returned when ``output_length``
        is zero or negative.
    """
    # Nothing requested: do not emit the start word either.
    if output_length <= 0:
        return ''

    # Tokenize the corpus into words
    words = corpus.split()

    # Map each word to the list of words seen right after it. Duplicates are
    # kept so uniform sampling from the list follows observed frequencies.
    transition_probs = {}
    for current_word, next_word in zip(words, words[1:]):
        transition_probs.setdefault(current_word, []).append(next_word)

    # Dead-end fallback pool: any corpus word, or the start word itself when
    # the corpus is empty (random.choice raises on an empty sequence).
    fallback_pool = words or [start_word]

    # Generate text using Markov Chain
    current_word = start_word
    generated_text = [current_word]
    for _ in range(output_length - 1):
        followers = transition_probs.get(current_word)
        if followers:
            next_word = random.choice(followers)
        else:
            # Current word has no recorded follower: jump to a random word.
            next_word = random.choice(fallback_pool)
        generated_text.append(next_word)
        current_word = next_word

    return ' '.join(generated_text)

# Example usage: start the chain at "The" and request 50 words.
# No random seed is set in this cell, so each run prints a different sample.
corpus = "The quick brown fox jumps over the lazy dog. The lazy dog barks at the moon. Moonlight shines through the window. The window is open. Open the door slowly."
generated_text = generate(corpus, "The", 50)
print(generated_text)


The window is open. Open the lazy dog barks at the window. The quick brown fox jumps over the moon. Moonlight shines through the window. The lazy dog barks at the door slowly. window. The quick brown fox jumps over the lazy dog. The window is open. Open the door
