In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
from collections import defaultdict, Counter
from nltk.util import bigrams

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/pawanbtw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pawanbtw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pawanbtw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Pre-processing

In [2]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(table) for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove empty strings
    tokens = [token for token in tokens if token.strip()]

    return tokens

def preprocess_text_file(file_path, encodings=('utf-16', 'latin1', 'windows-1252')):
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read()
            return preprocess_text(text)
        except UnicodeDecodeError:
            pass
    raise ValueError("Unable to decode the file using the specified encodings.")

# Text file Loading

In [3]:
file_path = "/Users/pawanbtw/Downloads/mytext.txt"
preprocessed_text = preprocess_text_file(file_path)

# Calculating bigram frequency

In [4]:
# Initialize a Counter to store bigram frequencies
bigram_freq = Counter()

# Extract bigrams from the preprocessed text
preprocessed_bigrams = list(bigrams(preprocessed_text))
bigram_freq.update(preprocessed_bigrams)

# Initialize a dictionary to store the sum of the frequencies for each unique word1
word1_freq_sum = defaultdict(int)

# Calculate the sum of the frequencies for each unique word1
for word1, word2 in bigram_freq:
    word1_freq_sum[word1] += bigram_freq[(word1, word2)]

# Initialize a dictionary to store bigram probabilities
bigram_prob = defaultdict(float)

# Calculate bigram probabilities
for bigram, freq in bigram_freq.items():
    word1, word2 = bigram
    # Calculate the probability of word2 given word1
    bigram_prob[bigram] = freq / word1_freq_sum[word1]


# checking the pre-processed file

In [5]:
print(preprocessed_text)

['alice', 'beginning', 'get', 'tired', 'sitting', 'sister', 'bank', 'nothing', 'twice', 'peeped', 'book', 'sister', 'reading', 'picture', 'conversation', 'use', 'book', 'thought', 'alice', 'without', 'picture', 'conversation', 'considering', 'mind', 'well', 'could', 'hot', 'day', 'made', 'feel', 'sleepy', 'stupid', 'whether', 'pleasure', 'making', 'daisychain', 'would', 'worth', 'trouble', 'getting', 'picking', 'daisy', 'suddenly', 'white', 'rabbit', 'pink', 'eye', 'ran', 'close', 'nothing', 'remarkable', 'alice', 'think', 'much', 'way', 'hear', 'rabbit', 'say', 'oh', 'dear', 'oh', 'dear', 'shall', 'late', 'thought', 'afterwards', 'occurred', 'ought', 'wondered', 'time', 'seemed', 'quite', 'natural', 'rabbit', 'actually', 'took', 'watch', 'waistcoatpocket', 'looked', 'hurried', 'alice', 'started', 'foot', 'flashed', 'across', 'mind', 'never', 'seen', 'rabbit', 'either', 'waistcoatpocket', 'watch', 'take', 'burning', 'curiosity', 'ran', 'across', 'field', 'time', 'see', 'pop', 'large', 

In [6]:
# Example: Predicting the next word given two previous words
previous_words = ("look", "sister")
next_word_candidates = [(word2, prob) for (word1, word2), prob in bigram_prob.items() if word1 == previous_words[-1] and word2 not in previous_words]
next_word_candidates.sort(key=lambda x: x[1], reverse=True)

# Print the next word candidates line by line
for i, (word, prob) in enumerate(next_word_candidates[:5], start=1):
    print(f"{i}. {word}: {prob}")


1. bank: 0.5
2. reading: 0.5
