# 9. Language Modeling



In [1]:
!pip install nltk




In [2]:
import nltk
from nltk.corpus import reuters
from nltk import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize


In [3]:
# Download and Prepare the Dataset
nltk.download('reuters')
nltk.download('punkt')


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Tokenize the Text

# Load and tokenize the Reuters corpus
reuters_words = reuters.words()
reuters_text = ' '.join(reuters_words)
tokens = word_tokenize(reuters_text)


In [5]:
# Create N-Grams

# N-grams are contiguous sequences of n items (words or characters) from a given text. We will create bigrams (2-grams) and trigrams (3-grams) for our model:


# Create bigrams and trigrams
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))



In [6]:
# Calculate Frequencies

# Frequency distribution of bigrams and trigrams
bigram_freq = FreqDist(bigrams)
trigram_freq = FreqDist(trigrams)


In [7]:
# Build the Language Model
# A simple way to build a language model is to use conditional frequency distributions to estimate probabilities. For example, you can estimate the probability of a word given the previous word (bigram model) or two previous words (trigram model):

# Create Conditional Frequency Distributions for bigrams and trigrams
bigram_cfd = ConditionalFreqDist(bigrams)
trigram_cfd = ConditionalFreqDist(((w1, w2), w3) for w1, w2, w3 in trigrams)


In [8]:
#  Define Functions for Probability Estimation

def bigram_probability(word1, word2):
    # Probability of word2 given word1
    return bigram_cfd[word1].freq(word2)

def trigram_probability(word1, word2, word3):
    # Probability of word3 given word1 and word2
    return trigram_cfd[(word1, word2)].freq(word3)


In [10]:
# Evaluate the Model
# You can evaluate your language model using held-out data or cross-validation.

# Test the model with some examples
print("P('stock', 'market') =", bigram_probability('stock', 'market'))
print("P('the', 'stock', 'market') =", trigram_probability('the', 'stock', 'market'))


P('stock', 'market') = 0.020886615515771527
P('the', 'stock', 'market') = 0.08290155440414508


In [11]:
# Generate Text (Optional)
import random

def generate_text(start_word, length=10):
    current_word = start_word
    generated_text = [current_word]

    for _ in range(length - 1):
        next_words = list(bigram_cfd[current_word].keys())
        if not next_words:
            break
        current_word = random.choice(next_words)
        generated_text.append(current_word)

    return ' '.join(generated_text)

# Generate a sequence of words
print(generate_text('the', length=10))


the outlook results `` breaking through borrowings Wednesday by Amoco


In [12]:
# Save and Load the Model (this is for practical purpouses)

import pickle

# Save the model
with open('language_model.pkl', 'wb') as f:
    pickle.dump((bigram_cfd, trigram_cfd), f)

# Load the model
with open('language_model.pkl', 'rb') as f:
    bigram_cfd, trigram_cfd = pickle.load(f)
