<a href="https://colab.research.google.com/github/alekhyabulusu/n-gram/blob/main/n_gram_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Standard library
import math
import random
import re
from collections import Counter

# Third-party
import nltk
from nltk.corpus import gutenberg
from nltk.util import ngrams
from sklearn.model_selection import train_test_split

# Download the NLTK resources this notebook asks for (same order as before).
for resource in ('gutenberg', 'punkt_tab', 'punkt'):
  nltk.download(resource)

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# `gutenberg` is already imported in the first cell; this re-import is
# redundant but harmless.
from nltk.corpus import gutenberg
# Show the file ids available in the Gutenberg corpus.
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [None]:
# Load the raw text of the chosen book.
# NOTE: despite its name, `df` is a plain string (it is sliced here and
# lower-cased in the next cell), not a DataFrame.
df = gutenberg.raw("austen-sense.txt")
# Preview the first 500 characters.
print(df[:500])

[Sense and Sensibility by Jane Austen 1811]

CHAPTER 1


The family of Dashwood had long been settled in Sussex.
Their estate was large, and their residence was at Norland Park,
in the centre of their property, where, for many generations,
they had lived in so respectable a manner as to engage
the general good opinion of their surrounding acquaintance.
The late owner of this estate was a single man, who lived
to a very advanced age, and who for many years of his life,
had a constant companion an


In [None]:
# Normalise the text: lower-case it, then replace every character that is not
# a-z or whitespace (digits, punctuation, apostrophes) with a space.
df = re.sub(r'[^a-z\s]', ' ', df.lower())
df[:500]

' sense and sensibility by jane austen      \n\nchapter  \n\n\nthe family of dashwood had long been settled in sussex \ntheir estate was large  and their residence was at norland park \nin the centre of their property  where  for many generations \nthey had lived in so respectable a manner as to engage\nthe general good opinion of their surrounding acquaintance \nthe late owner of this estate was a single man  who lived\nto a very advanced age  and who for many years of his life \nhad a constant companion an'

In [None]:
# Split the cleaned text into word tokens with NLTK's tokenizer.
tokens = nltk.word_tokenize(df)
# Sanity checks: first 50 tokens, corpus size, number of distinct tokens.
print(tokens[:50])
print("total tokens:", len(tokens))
print('unique tokens:', len(set(tokens)))

['sense', 'and', 'sensibility', 'by', 'jane', 'austen', 'chapter', 'the', 'family', 'of', 'dashwood', 'had', 'long', 'been', 'settled', 'in', 'sussex', 'their', 'estate', 'was', 'large', 'and', 'their', 'residence', 'was', 'at', 'norland', 'park', 'in', 'the', 'centre', 'of', 'their', 'property', 'where', 'for', 'many', 'generations', 'they', 'had', 'lived', 'in', 'so', 'respectable', 'a', 'manner', 'as', 'to', 'engage', 'the']
total tokens: 120823
unique tokens: 6282


In [None]:
def build_ngrams(tokens, n):
  """Return every contiguous window of `n` tokens as a tuple, in corpus order.

  A sequence shorter than `n` yields an empty list.
  """
  windows = []
  last_start = len(tokens) - n
  for start in range(last_start + 1):
    windows.append(tuple(tokens[start:start + n]))
  return windows

In [None]:
def ngram_counts(tokens, n):
  """Count the n-grams in `tokens` and the contexts that precede their last word.

  Returns a pair of Counters:
    * n-gram tuple -> number of occurrences
    * context tuple (the n-gram without its final token) -> number of occurrences
  """
  grams = build_ngrams(tokens, n)
  gram_totals = Counter(grams)
  prefix_totals = Counter(gram[:-1] for gram in grams)
  return gram_totals, prefix_totals

In [None]:
def word_prob(word, context, ngram_freq, context_freq, vocab, n):
    """Return the add-one (Laplace) smoothed probability of `word` after `context`.

    Computes (count(context + word) + 1) / (count(context) + len(vocab)).

    Args:
        word: candidate next token.
        context: iterable of preceding tokens; converted to a tuple for lookup.
        ngram_freq: mapping from n-gram tuple to its count.
        context_freq: mapping from context tuple to its count.
        vocab: sized collection of known tokens; only its length is used.
        n: model order. Not used by this function.

    Raises:
        ZeroDivisionError: if `vocab` is empty and `context` has no count.
    """
    context = tuple(context)
    # Use .get() rather than [] so unseen keys count as zero for any mapping:
    # a plain dict would raise KeyError and a defaultdict would grow on lookup.
    seen_ngram = ngram_freq.get(context + (word,), 0)
    seen_context = context_freq.get(context, 0)
    return (seen_ngram + 1) / (seen_context + len(vocab))

In [None]:
def predict_next_word(context, ngram_freq, context_freq, vocab, n):
  """Return the word in `vocab` with the highest smoothed probability after `context`.

  Each candidate is scored with `word_prob`. When several words share the top
  score (e.g. every word, if the context was never seen), the one that comes
  first in sorted order is returned, so the result is the same on every run.

  Raises:
    ValueError: if `vocab` is empty (raised by `max`).
  """
  # Iterating a set of strings directly gives an order that can change between
  # interpreter runs; sorting first makes tie-breaking deterministic because
  # max() keeps the first maximal item it encounters.
  return max(
      sorted(vocab),
      key=lambda w: word_prob(w, context, ngram_freq, context_freq, vocab, n),
  )

In [None]:
def sentence_generation(prefix, length, ngram_freq, context_freq, vocab, n):
  """Greedily extend `prefix` by `length` words using the n-gram model.

  Args:
    prefix: seed text; split on whitespace only, so it should already be
      normalised the same way as the training tokens.
    length: number of words to append.
    ngram_freq, context_freq, vocab: model counts and vocabulary, passed
      through to `predict_next_word`.
    n: model order; the last n-1 tokens are used as context.

  Returns:
    The prefix tokens plus the generated words, joined by single spaces.
  """
  tokens = prefix.split()
  context_size = n - 1
  for _ in range(length):
    # Take the last n-1 tokens, or all of them while fewer are available.
    # The start index is computed explicitly because a negative slice breaks
    # for n == 1: tokens[-0:] is the whole list, not an empty context.
    start = max(0, len(tokens) - context_size)
    context = tokens[start:]
    next_word = predict_next_word(context, ngram_freq, context_freq, vocab, n)
    tokens.append(next_word)
  return ' '.join(tokens)

In [None]:
# Model order: 3 = trigrams (each word is conditioned on the two before it).
n=3
# n-gram and context counts over the full token list.
ngram_freq, context_freq = ngram_counts(tokens, n)
# Vocabulary = distinct tokens; its size is the smoothing term in word_prob.
vocab = set(tokens)

In [None]:
# Preview only the first few trigrams; printing the whole list would dump
# roughly one tuple per corpus token into the cell output.
print(build_ngrams(tokens, 3)[:10])



In [None]:
# Ten most frequent n-grams in the corpus, with their counts.
print(ngram_freq.most_common(10))

[(('i', 'am', 'sure'), 72), (('as', 'soon', 'as'), 59), (('in', 'the', 'world'), 57), (('i', 'do', 'not'), 46), (('could', 'not', 'be'), 42), (('i', 'can', 'not'), 40), (('she', 'could', 'not'), 39), (('her', 'sister', 's'), 37), (('it', 'would', 'be'), 36), (('would', 'have', 'been'), 36)]


In [None]:
# Smoothed probability of "years" given the two preceding words "for many".
context = ("for", "many")
word = "years"
prob = word_prob(word, context, ngram_freq, context_freq, vocab, n)
print("Probability:", prob)

Probability: 0.000794912559618442


In [None]:
# Most probable continuation of the context "for many".
print("Next word:", predict_next_word(("for", "many"), ngram_freq, context_freq, vocab, n))

Next word: years


In [None]:
# Greedy generation: extend the seed "the family" by 20 words.
print("Generated sentence:", sentence_generation("the family", 20, ngram_freq, context_freq, vocab, n))

Generated sentence: the family to walk to the house and the two miss steeles as to the house and the two miss steeles as


In [None]:
# Same generation routine with a different two-word seed.
print("Prefix = 'good opinion'")
print("Generated sentence:", sentence_generation("good opinion", 20, ngram_freq, context_freq, vocab, n))

Prefix = 'good opinion'
Generated sentence: good opinion of other people the middletons and palmers how am i to tell you that mr willoughby s behaviour in asking


In [None]:
# Third seed; generation is greedy (always the top-scoring word), which is why
# the saved outputs fall into a repeating phrase.
print("Prefix = 'constant companion'")
print("Generated sentence:", sentence_generation("constant companion", 20, ngram_freq, context_freq, vocab, n))

Prefix = 'constant companion'
Generated sentence: constant companion and each for the sake of the house and the two miss steeles as to the house and the two


In [None]:
def perplexity(tokens, ngram_freq, context_freq, vocab, n):
  """Return the perplexity of the n-gram model on `tokens`.

  Every position from index n-1 onward is scored with `word_prob`, using the
  n-1 tokens before it as context. The result is exp(-mean log-probability)
  over those positions.

  Args:
    tokens: evaluation token sequence.
    ngram_freq, context_freq, vocab: model counts and vocabulary.
    n: model order (>= 1).

  Raises:
    ValueError: if `n` is below 1, or `tokens` has fewer than `n` items so
      there is nothing to score.
  """
  if n < 1:
    raise ValueError(f"n must be at least 1, got {n}")
  scored_positions = len(tokens) - (n - 1)
  # Without this guard an input of exactly n-1 tokens divides by zero and a
  # shorter one silently reports a perplexity of 1.0.
  if scored_positions <= 0:
    raise ValueError(
        f"need at least {n} tokens to compute {n}-gram perplexity, got {len(tokens)}"
    )

  log_prob_sum = 0
  for i in range(n - 1, len(tokens)):
    context = tuple(tokens[i - n + 1:i])
    prob = word_prob(tokens[i], context, ngram_freq, context_freq, vocab, n)
    log_prob_sum += math.log(prob)
  return math.exp(-log_prob_sum / scored_positions)

In [None]:
# Hold-out split by position: first half for training, second half for testing.
split = len(tokens) // 2
train_tokens, test_tokens = tokens[:split], tokens[split:]

def train_perplexity(train_tokens, test_tokens, n):
  """Fit n-gram counts on `train_tokens` and return the perplexity on `test_tokens`.

  The vocabulary is the set of distinct training tokens.
  """
  gram_totals, prefix_totals = ngram_counts(train_tokens, n)
  known_words = set(train_tokens)
  return perplexity(test_tokens, gram_totals, prefix_totals, known_words, n)

In [None]:
# Held-out perplexity for the current notebook-level order `n`.
# NOTE(review): the saved output of this cell does not match the n=3 row of the
# sweep in the next cell, so one of the two looks stale — confirm with a
# Restart & Run All.
model_pp = train_perplexity(train_tokens, test_tokens, n)
print(f"Model perplexity: {model_pp}")

Model perplexity: 4565.301614098664


In [None]:
# Compare held-out perplexity across model orders.
# The loop variable is deliberately not called `n`: reusing that name would
# overwrite the notebook-level `n` that `ngram_freq`/`context_freq` were built
# with, so re-running an earlier cell afterwards would pair trigram counts
# with the wrong order.
for order in [2, 3, 4, 5]:
  pp = train_perplexity(train_tokens, test_tokens, order)
  print(f"Perplexity for n={order}: {pp}")

Perplexity for n=2: 1532.4852718617046
Perplexity for n=3: 4538.121070944934
Perplexity for n=4: 5491.424305297188
Perplexity for n=5: 5628.285351101017
