In [None]:
import re
import nltk
import string
import numpy as np

**Ngrams**

An n-gram is a sequence of n words: a 2-gram (which we'll call a bigram) is a two-word sequence such as "please turn", "turn your", or "your homework", and a 3-gram (a trigram) is a three-word sequence such as "please turn your" or "turn your homework".

In [None]:
# word_tokenize has to be imported before its first use; without this import
# the cell raises NameError when the notebook is run top to bottom on a fresh kernel.
# NOTE(review): word_tokenize also needs NLTK's tokenizer data to be installed
# (e.g. via nltk.download) — confirm the resource name for your NLTK version.
from nltk.tokenize import word_tokenize

sentence = "An n-gram is a sequence of n adjacent symbols in particular order."

# Split the sentence into a list of token strings.
tokens = word_tokenize(sentence)
print(tokens)

In [None]:
# Bigrams: pair every token with the one that follows it

from nltk.util import bigrams

bgrams = [pair for pair in bigrams(tokens)]
print(bgrams)

In [None]:
# Mark the sentence boundaries: "<s>" on the left, "</s>" on the right

from nltk.util import pad_sequence

padded_tokens = [
    *pad_sequence(
        tokens,
        n=2,
        pad_left=True, left_pad_symbol="<s>",
        pad_right=True, right_pad_symbol="</s>",
    )
]
print(padded_tokens)

In [None]:
# Same padding step written with the pad_both_ends helper

from nltk.lm.preprocessing import pad_both_ends

padded_tokens = [*pad_both_ends(tokens, n=2)]
print(padded_tokens)

In [None]:
# Bigrams over the padded token list (boundary markers included)

padded_bgrams = [pair for pair in bigrams(padded_tokens)]
print(padded_bgrams)

In [None]:
# Trigrams: runs of three adjacent padded tokens

from nltk.util import trigrams

padded_tgrams = [*trigrams(padded_tokens)]
print(padded_tgrams)

In [None]:
# General n-grams: here every run of five adjacent padded tokens

from nltk.util import ngrams

padded_5grams = [*ngrams(padded_tokens, 5)]
print(padded_5grams)

In [None]:
# everygrams: all n-grams of every length up to max_len

from nltk.util import everygrams

everygrams3 = [*everygrams(padded_tokens, max_len=3)]
print(everygrams3)

**Ngrams for Language Modelling**

In [None]:
# Order of the n-gram model built in the following cells
N = 4

# Small multi-sentence corpus used to train the model
text = "An n-gram is a sequence of n adjacent symbols in particular order. The symbols may be n adjacent letters (including punctuation marks and blanks), syllables, or rarely whole words found in a language dataset; or adjacent phonemes extracted from a speech-recording dataset, or adjacent base pairs extracted from a genome. They are collected from a text corpus or speech corpus. If Latin numerical prefixes are used, then n-gram of size 1 is called a \"unigram\", size 2 a \"bigram\" (or, less commonly, a \"digram\") etc. If, instead of the Latin ones, the English cardinal numbers are furtherly used, then they are called \"four-gram\", \"five-gram\", etc."

In [None]:
# Lower-case the corpus, then split it into sentences

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(str.lower(text))
sentences

In [None]:
# Tokenize every sentence: one list of word tokens per sentence

tokens_list = list(map(word_tokenize, sentences))
print(tokens_list)

In [None]:
# Pad every sentence on both ends for an order-N model

padded_tokens_list = [
    list(pad_both_ends(sentence_tokens, n=N))
    for sentence_tokens in tokens_list
]
print(padded_tokens_list)

In [None]:
# Concatenate the padded sentences into one flat token list

from nltk.lm.preprocessing import flatten

padded_tokens = [*flatten(padded_tokens_list)]
print(padded_tokens)

In [None]:
# Build the order-N n-grams from the flat padded token list

grams = [*ngrams(padded_tokens, N)]
print(grams)

In [None]:
# finding the vocabulary of ngram model

# sorted() instead of list(): iteration order of a set of strings can change
# from one kernel session to the next, so a sorted vocabulary keeps the
# printed output stable between runs.
vocabulary = sorted(set(padded_tokens))
print(vocabulary)

In [None]:
# Language Model

from nltk.lm import MLE
from nltk.util import everygrams

lm = MLE(N)

# Train on every n-gram order from 1 to N, not on the N-grams alone.
# The model stores counts per order, so the bigram lookups in the following
# cells (lm.counts[['a']], lm.score('sequence', ['a'])) only find data when
# bigrams were part of the training input; with N-grams only they come back
# as 0 / empty and the log score is -inf.
lm.fit([everygrams(padded_tokens, max_len=N)], vocabulary)

In [None]:
# Vocabulary held by the model: its summary repr, then its entries

print(lm.vocab, list(lm.vocab), sep="\n")

In [None]:
# look up words in the model vocabulary
# NOTE(review): per the NLTK docs, lookup() returns the given words with any
# out-of-vocabulary word replaced by the unknown label ("<UNK>" by default).

lm.vocab.lookup(['text', 'and', 'sherlock'])

In [None]:
# ngram counts
# lm.counts is indexed here with a context: a list of preceding words.
# NOTE(review): a one-word context reads the bigram counts, so these lookups
# are only non-zero if bigrams were included when the model was fitted.

# count of the bigram ('a', 'sequence')
print(lm.counts[['a']]['sequence'])

# counts of every word recorded after the context 'a'
lm.counts[['a']]

In [None]:
# ngram probabilities

# score(word, context): the model's probability of 'sequence' given the
# preceding word 'a' (lm is an MLE model, so this is a relative frequency)
lm.score('sequence', ['a'])

In [None]:
# log probability to avoid very small values
# NOTE(review): a probability of 0 has no finite logarithm, so expect -inf
# for word/context pairs the model never counted.

lm.logscore('sequence', ['a'])

In [None]:
# text generation

# Seed with (order - 1) start symbols so the first word is sampled from a
# sentence-initial context. Taking the length from lm.order keeps the seed in
# step with the model instead of hard-coding three '<s>' tokens.
# random_seed fixes the sampling so the output is the same on every run;
# change the value to get a different sample.
generated_tokens = lm.generate(
    num_words=50,
    text_seed=['<s>'] * (lm.order - 1),
    random_seed=42,
)
generated_text = " ".join(generated_tokens)
print(generated_text)

**Exercise 11.1** Create a character-based n-gram model using any book. Use the language model to write a short story.

In [None]:
# Exercise 11.1: character-level n-gram model trained on a book

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.lm.preprocessing import pad_both_ends, flatten
from nltk.util import ngrams
from nltk.lm import MLE

N = 5

# Explicit encoding: without it open() falls back to the platform default,
# which differs between operating systems.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('assets/sherlock.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Each sentence becomes a list of single characters; characters are the
# "tokens" of this model.
sentences = sent_tokenize(raw_text)
tokens_list = [list(sentence) for sentence in sentences]

padded_tokens_list = [list(pad_both_ends(tokens, n=N)) for tokens in tokens_list]
padded_tokens = list(flatten(padded_tokens_list))

grams = list(ngrams(padded_tokens, n=N))

# sorted() keeps the vocabulary order stable between runs (set iteration
# order of strings can change from one kernel session to the next).
vocabulary = sorted(set(padded_tokens))

lm = MLE(N)
lm.fit([grams], vocabulary)

print(list(lm.vocab))

In [None]:
# Sample 500 characters, starting from a sentence-initial context.
# random_seed makes the generated story reproducible; change it for another one.
generated_tokens = lm.generate(
    num_words=500,
    text_seed=['<s>'] * (lm.order - 1),
    random_seed=42,
)

# Turn padding symbols into spaces at token level, before joining. Replacing
# "<s>" / "</s>" in the joined string could also strip literal '<', 's', '>'
# characters that the model happened to generate next to each other.
generated_text = ''.join(
    ' ' if token in ('<s>', '</s>') else token
    for token in generated_tokens
)

# Collapse runs of whitespace (including newlines) into single spaces.
generated_text = " ".join(generated_text.split())

print(generated_text)

**Ngram Probabilities Calculation**

In [None]:
# Example sentence for the hand-computed bigram counts and probabilities
text = "To be, or not to be, that is the question"
text

In [None]:
from nltk.tokenize import word_tokenize

# Split the example sentence into tokens
tokens = word_tokenize(text)
print(tokens)

In [None]:
# Normalise case so that "To" and "to" count as the same word
tokens = list(map(str.lower, tokens))
print(tokens)

In [None]:
# Drop tokens that consist only of punctuation characters.
# A plain `token not in string.punctuation` is a substring test against the
# whole punctuation string, so it lets multi-character punctuation tokens
# such as "...", "--" or "''" through; checking character by character
# removes those as well.
tokens = [
    token
    for token in tokens
    if not all(char in string.punctuation for char in token)
]
print(tokens)

In [None]:
# Unique words in alphabetical order
vocabulary = sorted({*tokens})
print(vocabulary)

In [None]:
# Map each vocabulary word to its position in the vocabulary list
word2index = dict(zip(vocabulary, range(len(vocabulary))))

In [None]:
from nltk.util import ngrams

# Bigrams of the cleaned example tokens
grams = [*ngrams(tokens, 2)]
print(grams)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

import matplotlib
import matplotlib as mpl


def plot_matrix_with_names(matrix, namesx, namesy):
    """Show ``matrix`` as an image with labelled axes and each value written in its cell.

    Args:
        matrix: 2-D array-like, read as ``matrix[row, col]`` for
            ``len(namesy)`` rows and ``len(namesx)`` columns.
        namesx: Tick labels for the x-axis (one per column).
        namesy: Tick labels for the y-axis (one per row).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Accept nested lists as well as arrays; the [i, j] indexing below needs an ndarray.
    matrix = np.asarray(matrix)

    fig, ax = plt.subplots()
    ax.imshow(matrix)

    # Show all ticks and label them with the respective list entries
    ax.set_xticks(np.arange(len(namesx)), labels=namesx)
    ax.set_yticks(np.arange(len(namesy)), labels=namesy)

    # Rotate the x tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
             rotation_mode="anchor")

    # Write every cell's value at its (column, row) position.
    for i in range(len(namesy)):
        for j in range(len(namesx)):
            ax.text(j, i, matrix[i, j],
                    ha="center", va="center", color="w")

    fig.tight_layout()
    plt.show()

In [None]:
# Bigram count matrix: row = previous word, column = next word
counts = np.zeros((len(vocabulary),) * 2, dtype=np.int32)

for previous_word, next_word in grams:
    counts[word2index[previous_word], word2index[next_word]] += 1

plot_matrix_with_names(counts, vocabulary, vocabulary)

In [None]:
# Row sums: how many bigrams start with each word (kept as a column vector)
total_word_counts = counts.sum(axis=1, keepdims=True)

# Rows with no outgoing bigrams get a divisor of 1, avoiding division by zero
np.putmask(total_word_counts, total_word_counts == 0, 1)

In [None]:
# Divide each row by its total to get next-word probabilities, rounded to two decimals
probabilities = (counts / total_word_counts).round(2)

plot_matrix_with_names(probabilities, vocabulary, vocabulary)

**Onehot Encoding**

In [None]:
# Tokens of the example sentence, as left by the earlier cleaning cells
tokens

In [None]:
# Vocabulary built from those tokens
vocabulary

In [None]:
# Word -> position in the vocabulary; that position is the word's one-hot index
word2index = dict(zip(vocabulary, range(len(vocabulary))))
word2index

In [None]:
# representing 'be'

# Look the position up in word2index instead of hard-coding it, so the
# vector stays correct if the vocabulary (and with it the ordering) changes.
onehot_vector = np.zeros(len(vocabulary))
onehot_vector[word2index['be']] = 1
onehot_vector

In [None]:
# representing 'to'

# Look the position up in word2index instead of hard-coding it, so the
# vector stays correct if the vocabulary (and with it the ordering) changes.
onehot_vector = np.zeros(len(vocabulary))
onehot_vector[word2index['to']] = 1
onehot_vector

In [None]:
# One-hot vector for every token, in sentence order

vectors_list = []
for word_index in map(word2index.__getitem__, tokens):
    onehot_vector = np.zeros(len(vocabulary))
    onehot_vector[word_index] = 1
    vectors_list.append(onehot_vector)

vectors_list

In [None]:
# Rows are the tokens in sentence order, columns are the vocabulary words;
# each row is that token's one-hot vector.
plot_matrix_with_names(np.array(vectors_list), vocabulary, tokens)