<a href="https://colab.research.google.com/github/anshupandey/Working_with_Large_Language_models/blob/main/WWL_C3_Bigram_Statistical_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training and Using a Bigram Statistical Language Model
This notebook demonstrates how to train and use a bigram statistical language model using a dummy dataset.

In [None]:
!pip install nltk



In [1]:
import nltk
from nltk.util import bigrams
from collections import Counter, defaultdict
import random

# Ensure the required NLTK tokenizer data is downloaded.
# nltk.word_tokenize loads 'punkt' on older NLTK releases and 'punkt_tab' on
# NLTK 3.9+, so fetch both to keep the notebook working with an unpinned install.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Dummy Dataset
Let's create a small dummy dataset of sentences.

In [2]:
# Toy training data: five short sentences, one string per sentence.
corpus = [
    "I love natural language processing",
    "language models are fascinating",
    "natural language processing is a complex field",
    "I love machine learning",
    "machine learning models can predict outcomes",
]

### Tokenization
Tokenize the sentences into words.

In [3]:
# Split every sentence of the corpus into a list of word tokens.
tokenized_sentences = list(map(nltk.word_tokenize, corpus))
print('Tokenized Sentences:', tokenized_sentences)

Tokenized Sentences: [['I', 'love', 'natural', 'language', 'processing'], ['language', 'models', 'are', 'fascinating'], ['natural', 'language', 'processing', 'is', 'a', 'complex', 'field'], ['I', 'love', 'machine', 'learning'], ['machine', 'learning', 'models', 'can', 'predict', 'outcomes']]


### Creating Bigram Model
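Create a bigram model: first count how often each word follows another, then convert the counts into conditional probabilities $P(w_2 \mid w_1)$. Each sentence is padded on both sides, so `None` in the first position marks the start of a sentence and `None` in the second position marks its end.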

In [5]:
# Nested mapping: bigram_model[previous_word][next_word] -> number of occurrences
bigram_model = defaultdict(lambda: defaultdict(int))

# Tally every adjacent word pair. Padding adds a None before the first word
# and after the last word of each sentence.
for tokens in tokenized_sentences:
    for prev_word, next_word in bigrams(tokens, pad_right=True, pad_left=True):
        bigram_model[prev_word][next_word] += 1

# Show the raw count of every observed bigram
for prev_word, followers in bigram_model.items():
    for next_word, count in followers.items():
        print(f'({prev_word}, {next_word}) = {count}')

(None, I) = 2
(None, language) = 1
(None, natural) = 1
(None, machine) = 1
(I, love) = 2
(love, natural) = 1
(love, machine) = 1
(natural, language) = 2
(language, processing) = 2
(language, models) = 1
(processing, None) = 1
(processing, is) = 1
(models, are) = 1
(models, can) = 1
(are, fascinating) = 1
(fascinating, None) = 1
(is, a) = 1
(a, complex) = 1
(complex, field) = 1
(field, None) = 1
(machine, learning) = 2
(learning, None) = 1
(learning, models) = 1
(can, predict) = 1
(predict, outcomes) = 1
(outcomes, None) = 1


In [6]:
# Normalise in place: divide each follower count by the total number of
# times its preceding word was seen, so every row sums to 1.
for followers in bigram_model.values():
    row_total = float(sum(followers.values()))
    for next_word in followers:
        followers[next_word] /= row_total

# Show the conditional probability of every observed bigram
for prev_word, followers in bigram_model.items():
    for next_word, probability in followers.items():
        print(f'P({next_word} | {prev_word}) = {probability:.4f}')

P(I | None) = 0.4000
P(language | None) = 0.2000
P(natural | None) = 0.2000
P(machine | None) = 0.2000
P(love | I) = 1.0000
P(natural | love) = 0.5000
P(machine | love) = 0.5000
P(language | natural) = 1.0000
P(processing | language) = 0.6667
P(models | language) = 0.3333
P(None | processing) = 0.5000
P(is | processing) = 0.5000
P(are | models) = 0.5000
P(can | models) = 0.5000
P(fascinating | are) = 1.0000
P(None | fascinating) = 1.0000
P(a | is) = 1.0000
P(complex | a) = 1.0000
P(field | complex) = 1.0000
P(None | field) = 1.0000
P(learning | machine) = 1.0000
P(None | learning) = 0.5000
P(models | learning) = 0.5000
P(predict | can) = 1.0000
P(outcomes | predict) = 1.0000
P(None | outcomes) = 1.0000


### Generating Text Using the Bigram Model
Now, let's use the trained bigram model to generate text.

In [14]:
def generate_sentence(bigram_model, start_word, length=10):
    """Sample a sentence from a bigram model by walking word-to-word.

    Args:
        bigram_model: Mapping ``previous_word -> {next_word: weight}``. The
            weights are passed to ``random.choices`` and therefore only need
            to be relative (probabilities or raw counts both work).
        start_word: First word of the sentence. Pass ``None`` (the padding
            symbol) to draw the first word from the sentence-start
            distribution instead.
        length: Maximum number of words in the generated sentence.

    Returns:
        The generated words joined by single spaces. Generation stops early
        when the current word has no known followers or when the end-of-
        sentence symbol (``None``) is drawn.
    """
    # None is the padding symbol, not a real word, so it is never emitted.
    sentence = [] if start_word is None else [start_word]
    current_word = start_word

    while len(sentence) < length:
        # Use .get() rather than indexing: indexing a defaultdict with an
        # unseen word would silently insert an empty entry into the model
        # (and would raise KeyError on a plain dict).
        transitions = bigram_model.get(current_word)
        if not transitions:
            break

        candidates = list(transitions.keys())
        weights = list(transitions.values())
        next_word = random.choices(candidates, weights=weights)[0]

        # Drawing the padding symbol means the sentence has ended.
        if next_word is None:
            break

        sentence.append(next_word)
        current_word = next_word

    return ' '.join(sentence)



In [15]:
# Sample one sentence from the model, seeded with the word 'I'
generated_sentence = generate_sentence(bigram_model, 'I')
print('Generated Sentence:', generated_sentence)

Generated Sentence: I love machine learning
