# Download NLTK resources 

In [1]:
import nltk
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
from collections import defaultdict
from nltk.util import bigrams

# Download NLTK resources if not already downloaded
nltk.download('brown')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package brown to /Users/pawanbtw/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pawanbtw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pawanbtw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pawanbtw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Preprocess the text

In [2]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(table) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove empty strings
    tokens = [token for token in tokens if token.strip()]

    return tokens

# Preprocess Brown Corpus

In [3]:
# Load the Brown Corpus
brown_corpus = brown.sents()

# Preprocess the data
preprocessed_corpus = []
for sentence in brown_corpus:
    # Join the words into a single string
    text = ' '.join(sentence)
    # Apply preprocessing
    preprocessed_text = preprocess_text(text)
    # Add preprocessed text to corpus
    preprocessed_corpus.append(preprocessed_text)

# Calculate Bigram Frequencies and Probabilities
## Counter for bigram frequencies, precomputing sums for each unique word1, and implementing Good-Turing smoothing for probability calculation, improving efficiency and accuracy.

In [4]:
from collections import Counter, defaultdict

# Initialize a Counter to store bigram frequencies
bigram_freq = Counter()

# Iterate through each preprocessed sentence
for sentence in preprocessed_corpus:
    # Extract bigrams from the preprocessed sentence
    sentence_bigrams = list(bigrams(sentence))
    
    # Update bigram frequencies
    bigram_freq.update(sentence_bigrams)

# Initialize a dictionary to store the sum of the frequencies for each unique word1
word1_freq_sum = defaultdict(int)

# Calculate the sum of the frequencies for each unique word1
for word1, word2 in bigram_freq.keys():
    word1_freq_sum[word1] += bigram_freq[(word1, word2)]

# Initialize a dictionary to store bigram probabilities
bigram_prob = defaultdict(float)

# Calculate bigram probabilities
for bigram, freq in bigram_freq.items():
    word1, word2 = bigram
    # Calculate the probability of word2 given word1
    bigram_prob[bigram] = freq / word1_freq_sum[word1]

# Example: Predicting the Next Word

In [5]:
# Example: Predicting the next word given two previous words
previous_words = ("investig", "report")
next_word_candidates = [(word2, prob) for (word1, word2), prob in bigram_prob.items() if word1 == previous_words[-1] and word2 not in previous_words]
next_word_candidates.sort(key=lambda x: x[1], reverse=True)
# Print the next word candidates line by line
for i, (word, prob) in enumerate(next_word_candidates[:5], start=1):
    print(f"{i}. {word}: {prob}")

1. ask: 0.012594458438287154
2. presid: 0.012594458438287154
3. feder: 0.010075566750629723
4. refer: 0.010075566750629723
5. note: 0.010075566750629723


In [6]:
# Example: Predicting the next word given a previous word
previous_word = "investig"
next_word_candidates = [(word2, prob) for (word1, word2), prob in bigram_prob.items() if word1 == previous_word]
next_word_candidates.sort(key=lambda x: x[1], reverse=True)
# Print the next word candidates line by line
for i, (word, prob) in enumerate(next_word_candidates[:5], start=1):
    print(f"{i}. {word}: {prob}")

1. report: 0.036036036036036036
2. question: 0.02702702702702703
3. use: 0.018018018018018018
4. made: 0.018018018018018018
5. result: 0.018018018018018018


In [7]:
# Example: Predicting the next word given a previous word
previous_word = "rather"
next_word_candidates = [(word2, prob) for (word1, word2), prob in bigram_prob.items() if word1 == previous_word]
next_word_candidates.sort(key=lambda x: x[1], reverse=True)
# Print the next word candidates line by line
for i, (word, prob) in enumerate(next_word_candidates[:5], start=1):
    print(f"{i}. {word}: {prob}")

1. one: 0.01084010840108401
2. special: 0.01084010840108401
3. sharp: 0.008130081300813009
4. leav: 0.008130081300813009
5. specif: 0.008130081300813009


In [8]:
# Example: Predicting the next word given a previous word
previous_word = "name"
next_word_candidates = [(word2, prob) for (word1, word2), prob in bigram_prob.items() if word1 == previous_word]
next_word_candidates.sort(key=lambda x: x[1], reverse=True)
# Print the next word candidates line by line
for i, (word, prob) in enumerate(next_word_candidates[:5], start=1):
    print(f"{i}. {word}: {prob}")

1. two: 0.013729977116704805
2. index: 0.013729977116704805
3. one: 0.011441647597254004
4. address: 0.009153318077803204
5. never: 0.009153318077803204


In [9]:
# Example: Predicting the next word given a previous word
previous_word = "ultimately"
next_word_candidates = [(word2, prob) for (word1, word2), prob in bigram_prob.items() if word1 == previous_word]
next_word_candidates.sort(key=lambda x: x[1], reverse=True)
# Print the next word candidates line by line
for i, (word, prob) in enumerate(next_word_candidates[:5], start=1):
    print(f"{i}. {word}: {prob}")

In [10]:
# Example: Predicting the next word given a previous word
previous_word = "regardless"
next_word_candidates = [(word2, prob) for (word1, word2), prob in bigram_prob.items() if word1 == previous_word]
next_word_candidates.sort(key=lambda x: x[1], reverse=True)
# Print the next word candidates line by line
for i, (word, prob) in enumerate(next_word_candidates[:5], start=1):
    print(f"{i}. {word}: {prob}")

1. whether: 0.05263157894736842
2. much: 0.05263157894736842
3. decis: 0.02631578947368421
4. color: 0.02631578947368421
5. russia: 0.02631578947368421


In [11]:
# Example: Predicting the next word given a previous word
previous_word = "neither"
next_word_candidates = [(word2, prob) for (word1, word2), prob in bigram_prob.items() if word1 == previous_word]
next_word_candidates.sort(key=lambda x: x[1], reverse=True)
# Print the next word candidates line by line
for i, (word, prob) in enumerate(next_word_candidates[:5], start=1):
    print(f"{i}. {word}: {prob}")

1. mantl: 0.023255813953488372
2. wife: 0.023255813953488372
3. u: 0.023255813953488372
4. spirit: 0.015503875968992248
5. convent: 0.015503875968992248


## Pickle

In [12]:
import pickle

# Define the filename for storing the pickled model
filename = 'bigram_model.pkl'

# Pickle the model
with open(filename, 'wb') as f:
    pickle.dump(bigram_prob, f)


In [13]:
import pickle

# Define the filename from which to load the pickled model
filename = 'bigram_model.pkl'

# Load the pickled model
with open(filename, 'rb') as f:
    loaded_model = pickle.load(f)


In [None]:
import tkinter as tk
from tkinter import scrolledtext
import pickle

# Load the pickled model
with open('bigram_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

def predict_next_word():
    input_text = input_entry.get("1.0", "end-1c")
    previous_word = input_text.split()[-1]  # Get the last word as the previous word
    next_word_candidates = [(word2, prob) for (word1, word2), prob in loaded_model.items() if word1 == previous_word]
    next_word_candidates.sort(key=lambda x: x[1], reverse=True)
    output_text = "\n".join([f"{word}: {prob}" for word, prob in next_word_candidates[:5]])
    output_textbox.delete(1.0, tk.END)
    output_textbox.insert(tk.END, output_text)

# Create the GUI window
window = tk.Tk()
window.title("Next Word Prediction")

# Create input field
input_label = tk.Label(window, text="Enter text:")
input_label.pack()
input_entry = scrolledtext.ScrolledText(window, wrap=tk.WORD, width=40, height=5)
input_entry.pack()

# Create predict button
predict_button = tk.Button(window, text="Predict", command=predict_next_word)
predict_button.pack()

# Create output area
output_label = tk.Label(window, text="Next word candidates:")
output_label.pack()
output_textbox = scrolledtext.ScrolledText(window, wrap=tk.WORD, width=40, height=5)
output_textbox.pack()

# Run the GUI application
window.mainloop()


AttributeError: module 'os' has no attribute 'list_all'