In [None]:
# pip install nltk

In [3]:
import nltk
# nltk.download()

# 1 Data collection and preprocessing

In [4]:
from nltk.corpus import gutenberg

text = gutenberg.raw()

## Lowercase

In [6]:
text = text.lower()

## Removing Punctuation

In [7]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
text = "".join([char for char in text if char not in string.punctuation])

## Tokenization

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize

#sent_tokens = sent_tokenize(text)
tokens = nltk.word_tokenize(text)

## Stopword Filtering

In [10]:
from nltk.corpus import stopwords

In [11]:
stop_words = stopwords.words('english')
filtered_tokens = [token for token in tokens if token not in stop_words] 

## lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer

In [13]:
wnl = WordNetLemmatizer()
lemmas = [wnl.lemmatize(token) for token in filtered_tokens]

# 2 Model implementation

In [14]:
# generate trigrams
trigrams = list(nltk.ngrams(filtered_tokens, 3))

In [15]:
# Calculate the frequency of each trigram
import collections
trigram_freqs = collections.Counter(trigrams)

In [16]:
bigrams = list(nltk.ngrams(lemmas, 2))

In [17]:
import collections
bigram_freqs = collections.Counter(bigrams)

## Probability Calculation

In [18]:
def calculate_probabilities(word):
    probabilities = {}
    for bigram in bigrams:
        # Form the trigram from the bigram and the word
        trigram = bigram + (word,)

        # Get the frequency of the trigram
        trigram_frequency = trigram_freqs.get(trigram, 0)

        # Get the frequency of the bigram
        bigram_frequency = bigram_freqs.get(bigram, 0)

        if bigram_frequency == 0:
            probabilities[bigram] = 0
        else:
            probability = trigram_frequency / bigram_frequency
            probabilities[bigram] = probability
    return probabilities

In [21]:
# test
result = calculate_probabilities('god')
result.get(('lord', 'thy'))

0.8689458689458689

## Next Word Prediction

In [22]:
def predict_next_word(sequence):
    # Tokenize the sequence to handle multiple words
    tokens = sequence.split()

    # Check if we have at least 2 tokens to work with
    if len(tokens) < 2:
        return "Error: Need at least two words for prediction"

    # Get the last two words
    last_two_words = tuple(tokens[-2:])

    # Find all possible next words
    possible_next_words = {trigram[2]: freq for trigram, freq in trigram_freqs.items() if trigram[:2] == last_two_words}

    # If no possible words are found
    if not possible_next_words:
        return "No prediction available"

    # Get the most frequent next word
    predicted_word = max(possible_next_words, key=possible_next_words.get)

    return predicted_word

# test
sequence = "lord thy"
predicted_word = predict_next_word(sequence)
print(f"The predicted next word is: {predicted_word}")

The predicted next word is: god


## Sentence Generation

In [23]:
def sentence_generation(prefix, sentence_length):
    if len(prefix.split()) != 2:
        return "Error: Prefix must contain exactly two words for a trigram model."

    else:
        sentence = prefix.split()
        while len(sentence) < sentence_length:
            last_two_words = tuple(sentence[-2:])
            possible_next_words = {trigram[2]: freq for trigram, freq in trigram_freqs.items() if trigram[:2] == last_two_words}

            if not possible_next_words:
                break
            next_word = max(possible_next_words, key=possible_next_words.get)
            sentence.append(next_word)

        return ' '.join(sentence)

In [24]:
# test
sentence_generation("lord thy", 20)

'lord thy god hath given us inheritance fields vineyards wilt thou go battle children benjamin came according families cities villages'

## Somoothing

In [25]:
def sentence_generation_with_smoothing(prefix, sentence_length):
    vocab_size = len(set(lemmas))
    
    if len(prefix.split()) != 2:
        return "Error: Prefix must contain exactly two words for a trigram model."

    sentence = prefix.split()
    while len(sentence) < sentence_length:
        last_two_words = tuple(sentence[-2:])

        # Initialize the best word and its max probability
        best_next_word = None
        max_probability = 0

        # Try every possible next word in the vocabulary
        for possible_next_word in lemmas:
            # Create a potential trigram with the last two words and the current possible next word
            trigram = last_two_words + (possible_next_word,)

            # Apply Laplace smoothing
            trigram_freq = trigram_freqs.get(trigram, 0) + 1
            bigram_freq = bigram_freqs.get(last_two_words, 0) + vocab_size
            probability = trigram_freq / bigram_freq

            # Update the best next word based on the highest probability
            if probability > max_probability:
                max_probability = probability
                best_next_word = possible_next_word

        # Break if no valid next word is found
        if not best_next_word:
            break

        sentence.append(best_next_word)

    return ' '.join(sentence)

In [26]:
# test
sentence_generation_with_smoothing("lord thy", 20)

'lord thy god hath given us spirit spirit lord came unto saying thus saith lord god israel behold bring evil'

# Testing and Evaluation

## Next word prediction

In [27]:
bigrams_for_test= bigrams[100:1000:50]

In [28]:
sequences_for_pred = [' '.join(bigram) for bigram in bigrams_for_test]
sequences_for_pred

['liked highly',
 'friend emma',
 'promoted match',
 'marriage left',
 'mile miss',
 'temper talent',
 'lawn shrubbery',
 'body used',
 'deal happier',
 'humour might',
 'james like',
 'obliged glad',
 'go see',
 'highbury frequent',
 'come late',
 'joy pretty',
 'troublesome creature',
 'knightley fact']

In [29]:
for sequence in sequences_for_pred:
    print(sentence_generation_with_smoothing(sequence, 3))

liked highly esteeming
friend emma first
promoted match black
marriage left yet
mile miss taylor
temper talent emma
lawn shrubbery emma
body used hating
deal happier mr
humour might emma
james like put
obliged glad think
go see house
highbury frequent visitor
come late year
joy pretty well
troublesome creature said
knightley fact one


## Evaluation

In [30]:
segments = text.split("\n")
segments = [segment for segment in segments if segment.strip()]

In [31]:
reference_sentences = segments[100:10000:500]
reference_sentences 

['happier if she had spent all the rest of her life at hartfield',
 'an egg boiled very soft is not unwholesome  serle understands boiling',
 'to a very good purpose for she found her decidedly more sensible',
 'of her with more voluntary praise than emma had ever heard before',
 'in creating  this is a connexion which offers nothing but good',
 'still however though every thing had not been accomplished',
 'were actually at hartfield he was not able to make more than',
 'putting an end to his extreme solicitude about her  she was vexed',
 'it was rather too late in the day to set about being simpleminded',
 'we will call in mr perry  the expense shall not be thought of',
 'i did not know what to do  i was sitting near the doorelizabeth saw',
 'if you were never particularly struck by her manners before',
 'to betray any imperfection which could be concealed',
 'presently mr knightley looked back and came and sat down by her',
 'new pianoforte  do put up your horse at the crown and com

In [32]:
first_two_words = [sentence.split()[:2] for sentence in reference_sentences]

In [33]:
sequence_for_test = [' '.join(words) for words in first_two_words]

In [34]:
generated_sentences = []
for sequence in sequence_for_test:
    generated_sentences.append(sentence_generation_with_smoothing(sequence, 15))

In [35]:
generated_sentences

['happier if emma emma could feel man may come unto thee thou shalt make unto',
 'an egg emma emma could feel man may come unto thee thou shalt make unto',
 'to a emma emma could feel man may come unto thee thou shalt make unto',
 'of her emma emma could feel man may come unto thee thou shalt make unto',
 'in creating emma emma could feel man may come unto thee thou shalt make unto',
 'still however happy fancy came christ child born unto thee thou shalt make unto people',
 'were actually emma emma could feel man may come unto thee thou shalt make unto',
 'putting an emma emma could feel man may come unto thee thou shalt make unto',
 'it was emma emma could feel man may come unto thee thou shalt make unto',
 'we will emma emma could feel man may come unto thee thou shalt make unto',
 'i did emma emma could feel man may come unto thee thou shalt make unto',
 'if you emma emma could feel man may come unto thee thou shalt make unto',
 'to betray emma emma could feel man may come unto thee

In [42]:
qgrams = list(nltk.ngrams(filtered_tokens, 4))
trigrams = list(nltk.ngrams(lemmas, 3))

qgram_freqs = collections.Counter(qgrams)
trigram_freqs = collections.Counter(trigrams)

def calculate_probabilities(word):
    probabilities = {}
    for trigram in trigrams:
        # Form the trigram from the bigram and the word
        qgram = trigram + (word,)

        # Get the frequency of the trigram
        qgram_frequency = qgram_freqs.get(qgram, 0)

        # Get the frequency of the bigram
        trigram_frequency = trigram_freqs.get(trigram, 0)

        if trigram_frequency == 0:
            probabilities[trigram] = 0
        else:
            probability = qgram_frequency / trigram_frequency
            probabilities[trigram] = probability
    return probabilities

def sentence_generation_with_smoothing_4(prefix, sentence_length):
    vocab_size = len(set(lemmas))
    
    if len(prefix.split()) != 3:
        return "Error: Prefix must contain exactly three words for a qgram model."

    sentence = prefix.split()
    while len(sentence) < sentence_length:
        last_three_words = tuple(sentence[-3:])

        # Initialize the best word and its max probability
        best_next_word = None
        max_probability = 0

        # Try every possible next word in the vocabulary
        for possible_next_word in lemmas:
            # Create a potential trigram with the last two words and the current possible next word
            qgram = last_three_words + (possible_next_word,)

            # Apply Laplace smoothing
            qgram_freq = qgram_freqs.get(qgram, 0) + 1
            trigram_freq = trigram_freqs.get(last_three_words, 0) + vocab_size
            probability = qgram_freq / trigram_freq

            # Update the best next word based on the highest probability
            if probability > max_probability:
                max_probability = probability
                best_next_word = possible_next_word

        # Break if no valid next word is found
        if not best_next_word:
            break

        sentence.append(best_next_word)

    return ' '.join(sentence)


segments = text.split("\n")
segments = [segment for segment in segments if segment.strip()]

reference_sentences = segments[100:1000:50]
reference_sentences 

first_three_words = [sentence.split()[:3] for sentence in reference_sentences]
sequence_for_test_4 = [' '.join(words) for words in first_three_words]

generated_sentences = []
for sequence in sequence_for_test_4:
    generated_sentences.append(sentence_generation_with_smoothing_4(sequence, 15))


In [60]:
first_three_words

[['happier', 'if', 'she'],
 ['he', 'lived', 'about'],
 ['behaved', 'charmingly', 'every'],
 ['ladys', 'mind', 'but'],
 ['indisposed', 'for', 'any'],
 ['as', 'miss', 'taylor'],
 ['and', 'a', 'most'],
 ['being', 'seen', 'with'],
 ['for', 'nothing', 'the'],
 ['with', 'a', 'fine'],
 ['an', 'egg', 'boiled'],
 ['every', 'thing', 'in'],
 ['to', 'be', 'a'],
 ['six', 'years', 'hence'],
 ['and', 'walking', 'a'],
 ['of', 'being', 'silent'],
 ['she', 'feared', 'it'],
 ['emma', 'has', 'been']]

In [43]:
generated_sentences

['happier if she emma emma emma emma emma emma emma emma emma emma emma emma',
 'he lived about emma emma emma emma emma emma emma emma emma emma emma emma',
 'behaved charmingly every body punctual every body best emma emma emma emma emma emma emma',
 'ladys mind but emma emma emma emma emma emma emma emma emma emma emma emma',
 'indisposed for any emma emma emma emma emma emma emma emma emma emma emma emma',
 'as miss taylor emma emma emma emma emma emma emma emma emma emma emma emma',
 'and a most emma emma emma emma emma emma emma emma emma emma emma emma',
 'being seen with emma emma emma emma emma emma emma emma emma emma emma emma',
 'for nothing the emma emma emma emma emma emma emma emma emma emma emma emma',
 'with a fine emma emma emma emma emma emma emma emma emma emma emma emma',
 'an egg boiled emma emma emma emma emma emma emma emma emma emma emma emma',
 'every thing in emma emma emma emma emma emma emma emma emma emma emma emma',
 'to be a emma emma emma emma emma emma

When n equals 3 or 4, the model failed to produce coherent sentences. Consider predicting the next word based on its distribution rather than adhering to a greedy approach.




In [44]:
from random import choices

def sentence_generation_with_smoothing(prefix, sentence_length):
    vocab_size = len(set(lemmas))
    
    if len(prefix.split()) != 2:
        return "Error: Prefix must contain exactly two words for a trigram model."

    sentence = prefix.split()
    while len(sentence) < sentence_length:
        last_two_words = tuple(sentence[-2:])

        # Collect all potential next words and their probabilities
        next_words_probs = []
        for possible_next_word in lemmas:
            trigram = last_two_words + (possible_next_word,)
            trigram_freq = trigram_freqs.get(trigram, 0) + 1
            bigram_freq = bigram_freqs.get(last_two_words, 0) + vocab_size
            probability = trigram_freq / bigram_freq
            next_words_probs.append((possible_next_word, probability))

        # Normalize the probabilities
        total_prob = sum(prob for _, prob in next_words_probs)
        normalized_probs = [(word, prob / total_prob) for word, prob in next_words_probs]

        # Choose the next word based on the distribution
        next_word = choices([word for word, _ in normalized_probs], 
                            [prob for _, prob in normalized_probs], k=1)[0]

        # Append the chosen word to the sentence
        if next_word:
            sentence.append(next_word)
        else:
            break

    return ' '.join(sentence)

In [48]:
generated_sentences = []
for sequence in sequence_for_test:
    generated_sentences.append(sentence_generation_with_smoothing(sequence, 15))

In [49]:
generated_sentences 

['happier if rock old avoided consequence 102 miss benjamin abstract scholar naples corp amazement brother',
 'an egg dislike let son strong margaret eye twenty beyond must longer quantity sure house',
 'to a would topic eye coil went mr ask people eat jesus daughter 2517 park',
 'of her wind though myriad spared unto however seven men keeper eternal pause jerusalem thou',
 'in creating thou faithfully time way thing mind made hovering admitted whenever one footstep visiting',
 'still however cometh ripe abruptly close one unto thine even tear skin ive never narrative',
 'were actually order sihon give hand maid egypt delight year fellow man terrible relate manner',
 'putting an 1928 goodbye done however earnestness egypt unto little two smith sword lady invitation',
 'it was nay mr thus forwardcame dare puzzled eye prayer piteous hand infirm watched overcome',
 'we will cast winter behold daniel scattered play produced poured tree unto work circlings unworthy',
 'i did knew find sanct

The results look a little better now, at least there's not too much repetition of the same word. But most of the sentences still don't make sense.

# UI

In [59]:
import tkinter as tk
from tkinter import ttk

def generate_sentence():
    prefix = prefix_entry.get()
    try:
        sentence_length = int(length_entry.get())
    except ValueError:
        result_label.config(text="Error: Sentence length must be a number.")
        return

    generated_sentence = sentence_generation_with_smoothing(prefix, sentence_length)
    result_label.config(text=generated_sentence)

# Create the main window
root = tk.Tk()
root.title("Trigram Sentence Generator")

# Create and place widgets
prefix_label = ttk.Label(root, text="Enter Prefix:")
prefix_label.pack()

prefix_entry = ttk.Entry(root)
prefix_entry.pack()

length_label = ttk.Label(root, text="Enter Sentence Length:")
length_label.pack()

length_entry = ttk.Entry(root)
length_entry.pack()

generate_button = ttk.Button(root, text="Generate Sentence", command=generate_sentence)
generate_button.pack()

result_label = ttk.Label(root, text="")
result_label.pack()

# Run the application
root.mainloop()
