# DEFINING THE CORPUS AND IMPORTING THE LIBRARIES

In [None]:
from collections import Counter, defaultdict
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Define the corpus
corpus = [
    "Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics.",
    "It is primarily concerned with giving computers the ability to support and manipulate human language.",
    "It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic machine learning approaches.",
    "The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them.",
    "The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.",
    "Machine learning is a field of study in artificial intelligence concerned with the development of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
    "Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.",
    "Machine learning approaches have been applied to many fields including large language models, computer vision, and speech recognition.",
    "Machine learning is known in its application across business problems under the name predictive analytics.",
    "Although not all machine learning is statistically based, computational statistics is an important source of the field's methods."
]


# N-gram Model

In [None]:
# Preprocessing the corpus for the N-Gram model

def preprocess_corpus(corpus):
    return [['<s>'] + sentence.lower().split() + ['</s>'] for sentence in corpus]

In [None]:
# Building the Ngram model
def build_n_gram_model(corpus, n=2):
    model = defaultdict(Counter)
    for sentence in corpus:
        for i in range(len(sentence)-n+1):
            n_gram_sequence = tuple(sentence[i:i+n-1])
            next_word = sentence[i+n-1]
            model[n_gram_sequence][next_word] += 1
    return model

In [None]:
# Calculating the N-Gram probabilities

def calculate_n_gram_probabilities(model):
    probabilities = {}
    for n_gram_sequence, words in model.items():
        total_count = sum(words.values())
        probabilities[n_gram_sequence] = {word: count/total_count for word, count in words.items()}
    return probabilities

In [None]:
# Calculating the  perplexity for N-Gram model
def calculate_perplexity(model, corpus, n=2):
    N = sum(len(sentence) for sentence in corpus)
    logprob = 0
    for sentence in corpus:
        for i in range(n-1, len(sentence)):
            n_gram_sequence = tuple(sentence[i-n+1:i])
            word = sentence[i]
            probability = model.get(n_gram_sequence, {}).get(word, 1e-12)  # Smoothing for zero probabilities
            logprob += np.log2(probability)
    perplexity = 2 ** (-logprob / N)
    return perplexity

In [None]:
preprocessed_corpus = preprocess_corpus(corpus)

In [None]:
preprocessed_corpus[0]

['<s>',
 'natural',
 'language',
 'processing',
 '(nlp)',
 'is',
 'an',
 'interdisciplinary',
 'subfield',
 'of',
 'computer',
 'science',
 'and',
 'linguistics.',
 '</s>']

In [None]:
!pip install torch



In [None]:
n_gram_model = build_n_gram_model(preprocessed_corpus, 2)
n_gram_model

defaultdict(collections.Counter,
            {('<s>',): Counter({'natural': 1,
                      'it': 2,
                      'the': 2,
                      'machine': 3,
                      'recently,': 1,
                      'although': 1}),
             ('natural',): Counter({'language': 2}),
             ('language',): Counter({'processing': 1,
                      'datasets,': 1,
                      'within': 1,
                      'models,': 1}),
             ('processing',): Counter({'(nlp)': 1, 'natural': 1}),
             ('(nlp)',): Counter({'is': 1}),
             ('is',): Counter({'an': 2,
                      'primarily': 1,
                      'a': 2,
                      'known': 1,
                      'statistically': 1}),
             ('an',): Counter({'interdisciplinary': 1, 'important': 1}),
             ('interdisciplinary',): Counter({'subfield': 1}),
             ('subfield',): Counter({'of': 1}),
             ('of',): Counter({'computer': 1,

In [None]:
n_gram_probabilities = calculate_n_gram_probabilities(n_gram_model)

In [None]:
# Print example N-Gram probabilities (for brevity, print probabilities of the first few N-Grams)
print("Example N-Gram probabilities:")
for n_gram, probabilities in list(n_gram_probabilities.items())[:5]:
    print(f"{n_gram}: {probabilities}")

Example N-Gram probabilities:
('<s>',): {'natural': 0.1, 'it': 0.2, 'the': 0.2, 'machine': 0.3, 'recently,': 0.1, 'although': 0.1}
('natural',): {'language': 1.0}
('language',): {'processing': 0.25, 'datasets,': 0.25, 'within': 0.25, 'models,': 0.25}
('processing',): {'(nlp)': 0.5, 'natural': 0.5}
('(nlp)',): {'is': 1.0}


In [None]:
n_gram_perplexity = calculate_perplexity(n_gram_probabilities, preprocessed_corpus, 2)

In [None]:
# Print the perplexity of the N-Gram model
print(f"\nPerplexity of the N-Gram model: {n_gram_perplexity}")


Perplexity of the N-Gram model: 1.8121749932026143


# N-gram Neural Language Model (e.g., Trigram)

In [1]:
import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable

In [2]:
# Define the corpus
corpus = [
    "Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics.",
    "It is primarily concerned with giving computers the ability to support and manipulate human language.",
    "It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic machine learning approaches.",
    "The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them.",
    "The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.",
    "Machine learning is a field of study in artificial intelligence concerned with the development of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.",
    "Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.",
    "Machine learning approaches have been applied to many fields including large language models, computer vision, and speech recognition.",
    "Machine learning is known in its application across business problems under the name predictive analytics.",
    "Although not all machine learning is statistically based, computational statistics is an important source of the field's methods."
]

In [3]:
corpus = '\n'.join(corpus).split()
vocab = set(corpus)
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

In [4]:
trigram = [((corpus[i], corpus[i + 1]), corpus[i + 2])
           for i in range(len(corpus) - 2)]

In [5]:
trigram[:10]

[(('Natural', 'language'), 'processing'),
 (('language', 'processing'), '(NLP)'),
 (('processing', '(NLP)'), 'is'),
 (('(NLP)', 'is'), 'an'),
 (('is', 'an'), 'interdisciplinary'),
 (('an', 'interdisciplinary'), 'subfield'),
 (('interdisciplinary', 'subfield'), 'of'),
 (('subfield', 'of'), 'computer'),
 (('of', 'computer'), 'science'),
 (('computer', 'science'), 'and')]

In [6]:
class NgramModel(nn.Module):
    def __init__(self, vocb_size, context_size, n_dim):
        super(NgramModel, self).__init__()
        self.n_word = vocb_size
        self.embedding = nn.Embedding(self.n_word, n_dim)
        self.linear1 = nn.Linear(context_size * n_dim, 128)
        self.linear2 = nn.Linear(128, self.n_word)

    def forward(self, x):
        emb = self.embedding(x)
        emb = emb.view(1, -1)
        out = self.linear1(emb)
        out = F.relu(out)
        out = self.linear2(out)
        log_prob = F.log_softmax(out)
        return log_prob

In [7]:
# We are buidling a trigram neural langauge model
CONTEXT_SIZE = 2
ngrammodel = NgramModel(len(word_to_idx), CONTEXT_SIZE, 100)

In [8]:
# We are optimizing negtive log likelihood loss for this langauge modeling task.
criterion = nn.NLLLoss()
optimizer = optim.SGD(ngrammodel.parameters(), lr=1e-3)

In [9]:
# We are training by itrating the whole process for 100 times, each time will output the loss value
for epoch in range(100):
    print('epoch: {}'.format(epoch + 1))
    print('*' * 10)
    running_loss = 0
    for data in trigram:
        word, label = data     #E.g., word = ('Natural', 'language'); label = 'processing'
        word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
        label = Variable(torch.LongTensor([word_to_idx[label]]))

        # forward -- for prediction and calculating the loss for each instance
        out = ngrammodel(word)
        loss = criterion(out, label)
        running_loss += loss.item()

        # backward -- for gradiate update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # print the average loss
    print('Loss: {:.6f}'.format(running_loss / len(word_to_idx)))

epoch: 1
**********


  log_prob = F.log_softmax(out)


Loss: 7.191546
epoch: 2
**********
Loss: 7.107049
epoch: 3
**********
Loss: 7.023870
epoch: 4
**********
Loss: 6.941966
epoch: 5
**********
Loss: 6.860904
epoch: 6
**********
Loss: 6.780304
epoch: 7
**********
Loss: 6.699733
epoch: 8
**********
Loss: 6.619081
epoch: 9
**********
Loss: 6.538334
epoch: 10
**********
Loss: 6.457314
epoch: 11
**********
Loss: 6.375604
epoch: 12
**********
Loss: 6.293319
epoch: 13
**********
Loss: 6.210072
epoch: 14
**********
Loss: 6.126053
epoch: 15
**********
Loss: 6.041139
epoch: 16
**********
Loss: 5.955259
epoch: 17
**********
Loss: 5.868464
epoch: 18
**********
Loss: 5.780504
epoch: 19
**********
Loss: 5.691611
epoch: 20
**********
Loss: 5.602053
epoch: 21
**********
Loss: 5.511841
epoch: 22
**********
Loss: 5.421342
epoch: 23
**********
Loss: 5.330277
epoch: 24
**********
Loss: 5.238912
epoch: 25
**********
Loss: 5.147139
epoch: 26
**********
Loss: 5.054987
epoch: 27
**********
Loss: 4.962533
epoch: 28
**********
Loss: 4.870142
epoch: 29
**********


In [10]:
# Now let's testing
# Suppose our testing case is: given '(NLP) is', output 'an'

word, label = trigram[3]
print(word, '\t', label)

('(NLP)', 'is') 	 an


In [11]:
print(word_to_idx['(NLP)'], '\t', word_to_idx['is'])

61 	 89


In [12]:
word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
out = ngrammodel(word)

  log_prob = F.log_softmax(out)


In [13]:
# the intermediate representation of the predicted word
out

tensor([[-10.4696,  -8.1890,  -7.1613, -10.2673,  -7.7854,  -6.8352,  -7.6468,
          -9.4642,  -6.9517,  -9.3639,  -8.8288,  -6.1360,  -8.2456,  -5.1004,
          -7.8550,  -7.8797,  -7.1737,  -8.9314,  -7.5029,  -8.1449,  -0.1588,
          -8.4978,  -8.8739,  -8.4508,  -7.6528,  -8.1161,  -9.7926,  -3.0777,
          -6.8153,  -7.6515,  -8.5922,  -8.4576,  -8.1637,  -4.1588,  -7.8781,
          -8.9163,  -8.5277,  -8.1280,  -4.9791,  -8.1334,  -8.4802,  -7.2859,
          -7.6268,  -8.2424,  -7.8836,  -6.8924,  -8.0658,  -8.6845,  -4.8076,
          -7.6160,  -8.7249,  -6.7079,  -9.2927,  -9.9076,  -8.2134,  -8.7413,
          -8.5173,  -8.1099,  -8.7992,  -6.6223,  -7.7032,  -9.2457,  -8.1872,
          -8.6842,  -8.4832,  -7.3168,  -8.5292,  -8.2972,  -8.6396,  -9.3382,
          -8.0006,  -6.9437,  -8.7278,  -8.5051,  -7.1226,  -7.4214,  -7.7384,
          -6.3909,  -7.4097,  -7.6035,  -7.4354,  -8.1616,  -5.6515,  -7.9523,
          -8.3122,  -7.7721,  -8.4252,  -7.3988,  -6

In [14]:
_, predict_label = torch.max(out, 1)

In [15]:
predict_label

tensor([20])

In [16]:
predict_word = idx_to_word[predict_label.item()]
print('real word is: {},\npredict word is: {}'.format(label, predict_word))

real word is: an,
predict word is: an


Validation set can avoid overfitting
N-gram overlapping