In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import os
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] \t %(message)s",
    datefmt="%b %d %Y %I:%M%p",
)

os.chdir("../")

In [3]:
from src.utils import *
from src.etl import *
from src.rfm import *
import numpy as np
import matplotlib.pyplot as plt
import PyPDF2

In [4]:
logger.info("test")

Mar 09 2023 10:05PM [INFO] 	 test


In [5]:
# get current directory
os.getcwd()

'/home/arunavgupta/Documents/UCSD/ml-theory-capstone'

In [6]:
# get all urls to make all datasets
import re

def clean_url(x):
    search = re.findall(r'"(.*?)"', x)
    if search:
        return search[0]
    else:
        return ""

urls = list(filter(lambda x: len(x) > 0, map(clean_url, open("./src/url_1984.txt", "r").readlines())))


In [7]:

# Pdf dataset
pdf_fp = "/Users/rohitmishra/Desktop/1984.pdf"
#pdf_fp = "/Users/rohitmishra/Desktop/Interview Frameworks & Tips from a Google PM (Berkeley).pdf"
pdf_tokenized = pdf_tokenizer(pdf_fp,contextsize=256)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/rohitmishra/Desktop/1984.pdf'

In [None]:
pdf_tokenized.shape

In [None]:
make_all_datasets(urls)

In [8]:
corpus = generate_corpus()

Mar 09 2023 10:07PM [INFO] 	 Unknown characters: {34, 91, 93, 39}


In [9]:
vocab = build_vocab()

In [10]:
reverse_vocab = {i: v for v, i in vocab.items()}

In [11]:
def decode(X):
    """Decodes a (N, context_size, vocab_size) array of one-hot vectors into a list of words.)"""
    return [[reverse_vocab[np.argmax(x)] for x in sentence] for sentence in X]

In [12]:
decoded_corpus = decode(corpus)
# decoded_corpus = decode(pdf_tokenized)
#pdf_decoded_corpus = decode(pdf_tokenized)

In [13]:
corpus = corpus.reshape(-1, len(vocab)*64)
# corpus = pdf_tokenized.reshape(-1, len(vocab)*64)

In [14]:
corpus.shape

(7531, 3200)

## Baseline: Bigrams/Trigrams

In [15]:
from nltk.util import bigrams, trigrams, pad_sequence
from nltk.lm.preprocessing import padded_everygram_pipeline, flatten
from nltk.lm import MLE

In [16]:
CONTEXT_SIZE = 64
TEST_SPLIT = 0.2

In [17]:
train_corpus = np.array(decoded_corpus[: int(len(decoded_corpus) * (1 - TEST_SPLIT))])
test_corpus = np.array(decoded_corpus[int(len(decoded_corpus) * (1 - TEST_SPLIT)) :])

In [18]:
len(train_corpus), len(test_corpus)

(6024, 1507)

In [71]:
# shop the dataset in 3/4 - 1/4 split. The 3/4 will be used to conditionally generate the 1/4
CONTEXT_SPLIT_SIZE = int(3/4 * CONTEXT_SIZE)
X_train = train_corpus[:, :CONTEXT_SPLIT_SIZE]
y_train = train_corpus[:, CONTEXT_SPLIT_SIZE:]
X_test = test_corpus[:, :CONTEXT_SPLIT_SIZE]
y_test = test_corpus[:, CONTEXT_SPLIT_SIZE:]

In [20]:
np.where(X_train.flatten() == "<UNK>")[0].shape

(1287,)

In [21]:
# get ngrams
ngrams = []
for sent in X_train:
    ngrams.append(list(bigrams(sent)))


In [22]:
lm = MLE(3)
lm.fit(ngrams, vocab)

In [23]:
X_train[:10]

array([['p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n',
        'b', 'e', 'r', 'g', ' ', 'o', 'f', ' ', 'a', 'u', 's', 't', 'r',
        'a', 'l', 'i', 'a', ' ', 'e', 'b', 'o', 'o', 'k', 's', ' ', 'a',
        'r', 'e', ' ', 'c', 'r', 'e', 'a', 't', 'e'],
       ['w', 'h', 'i', 'c', 'h', ' ', 'a', 'r', 'e', ' ', 'i', 'n', ' ',
        't', 'h', 'e', ' ', 'p', 'u', 'b', 'l', 'i', 'c', ' ', 'd', 'o',
        'm', 'a', 'i', 'n', ' ', 'i', 'n', ' ', 'a', 'u', 's', 't', 'r',
        'a', 'l', 'i', 'a', ',', ' ', 'u', 'n', 'l'],
       ['i', 's', ' ', 'i', 'n', 'c', 'l', 'u', 'd', 'e', 'd', '.', ' ',
        'w', 'e', ' ', 'd', 'o', ' ', 'n', 'o', 't', ' ', 'k', 'e', 'e',
        'p', ' ', 'a', 'n', 'y', ' ', 'e', 'b', 'o', 'o', 'k', 's', ' ',
        'i', 'n', ' ', 'c', 'o', 'm', 'p', 'l', 'i'],
       ['c', 'o', 'p', 'y', 'r', 'i', 'g', 'h', 't', ' ', 'l', 'a', 'w',
        's', ' ', 'a', 'r', 'e', ' ', 'c', 'h', 'a', 'n', 'g', 'i', 'n',
        'g', ' ', 'a', 'l', 'l', ' 

In [24]:
def generate_text(lm, X_test, length=CONTEXT_SIZE-CONTEXT_SPLIT_SIZE):
    # generate text
    text = []
    for sent in X_test:
        text.append(lm.generate(length, text_seed=sent))
    return text

In [25]:
y_test.shape

(1507, 16)

In [26]:
y_test_pred = np.array(generate_text(lm, X_test))

In [27]:
"".join(y_test[0]), "".join(y_test_pred[0])

('and on the dial.', 'alsheath-bed<UNK>ras')

In [28]:
lm.vocab.unk_label

'<UNK>'

In [29]:
class Vocab():
    def __init__(self, vals):
        self._dict = {v: i for i, v in enumerate(vals)}
        self._dict['<UNK>'] = len(self._dict)
        self.rev = {i: v for v, i in self._dict.items()}
    
    def __getitem__(self, key):
        if key in self._dict:
            return self._dict[key]
        else:
            return self._dict['<UNK>']
    
    def __len__(self):
        return len(self._dict)
    
    def __iter__(self):
        return iter(self._dict)
    
    def __contains__(self, key):
        return key in self._dict
    
    def __repr__(self):
        return f"{self.__class__.__name__}({self._dict})"
    
    def decode(self, idx):
        return self.rev[idx]

In [30]:
train_vocab = Vocab(list(sorted(lm.vocab.counts.keys())))
train_vocab

Vocab({' ': 0, '!': 1, '&': 2, '(': 3, ')': 4, ',': 5, '-': 6, '.': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, ':': 18, ';': 19, '<UNK>': 50, '?': 21, '_': 22, 'a': 23, 'b': 24, 'c': 25, 'd': 26, 'e': 27, 'f': 28, 'g': 29, 'h': 30, 'i': 31, 'j': 32, 'k': 33, 'l': 34, 'm': 35, 'n': 36, 'o': 37, 'p': 38, 'q': 39, 'r': 40, 's': 41, 't': 42, 'u': 43, 'v': 44, 'w': 45, 'x': 46, 'y': 47, 'z': 48, 'â€™': 49})

In [78]:
def encode(sent, vocab=train_vocab):
    return np.array([vocab[w] for w in sent])

def evaluate(y_test, y_test_pred):
    # evaluate
    bleu = []
    perplexity = []
    for i in range(len(y_test)):
        bleu.append(utils.bleu_score(y_test, y_test_pred[i], n=2))
        perplexity.append(utils.perplexity(encode(y_test_pred[i])))
    return { "bleu2": np.mean(bleu), "perplexity": np.mean(perplexity) }

In [32]:
list(map(lambda x: "".join(x).replace("<UNK>", " "), y_test_pred))

['alsheath-bed ras',
 ' wiflonorof w. l',
 'panst p r cof on',
 'tealise ge ly. f',
 'ierer hangaliaf ',
 'e oocreme bjus, ',
 'se, mest, bexisu',
 'g rasesispicheal',
 ' si danobju s in',
 'ivewnomisixin f ',
 ' iowhave t her c',
 'the y. t stonthe',
 ' she waraind ser',
 'y, higextheete, ',
 ' tonimarsthitthe',
 'ttasas mid paner',
 'htoneretrcurimed',
 'iolwengutind ave',
 'rhes. eanacatour',
 'ldo bomared im h',
 't ndofthesty  wh',
 'nd s th of ndace',
 'klud theve he ug',
 'erorsie whe im d',
 'f tuetettittoute',
 'hithitoderotoule',
 'uline the d toc ',
 'po al hennere ha',
 'phacecitlouredec',
 'ty on orecer ctt',
 'risord itour f t',
 'and ledorive tly',
 'es oree intorend',
 'g col hesth foua',
 'at sit whre or s',
 'hasafonuiscore t',
 'qutthethirce iti',
 'y, ald yt g an n',
 'g brelubulen f t',
 'or wemmingif m w',
 'ththomexthevit t',
 'pny f a ane tond',
 'item ackng lalle',
 'eme bin thitican',
 'ut coivecenatofa',
 'elde thanth yore',
 's irtheryis dung',
 'e cathethef

In [33]:
np.where(y_test_pred == '<UNK>', " ", y_test_pred)

array([['a', 'l', 's', ..., 'r', 'a', 's'],
       [' ', 'w', 'i', ..., '.', ' ', 'l'],
       ['p', 'a', 'n', ..., ' ', 'o', 'n'],
       ...,
       [' ', 't', 'r', ..., 'd', ' ', 'a'],
       ['i', 't', 'o', ..., 'u', 'l', 'l'],
       ['g', ' ', 'd', ..., 'f', 'r', '?']], dtype='<U5')

In [117]:
evaluate(y_test, y_test_pred)

{'bleu2': 0.4499339863957525, 'perplexity': 15.368293881367265}

## RFM

In [60]:
SUBSET_SIZE = 3400
subset = np.random.choice(corpus.shape[0], SUBSET_SIZE, replace=False)

In [61]:
y_test_gt = y_test
y_test_gt

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [62]:
TEST_SPLIT = 0.2
N_TEST = int(corpus.shape[0] * TEST_SPLIT)
CONTEXT_SIZE = 48
X_SPLIT = int(len(vocab) * CONTEXT_SIZE)

sampled_corpus = corpus[subset]

X_test = sampled_corpus[:N_TEST, :X_SPLIT]
X_train = sampled_corpus[N_TEST:, :X_SPLIT]
y_test = sampled_corpus[:N_TEST, X_SPLIT:]
y_train = sampled_corpus[N_TEST:, X_SPLIT:]
nc_train = y_train[:, :len(vocab)]
nc_test = y_test[:, :len(vocab)]

X_train.shape, nc_train.shape

((1894, 2400), (1894, 50))

In [63]:
import torch
print("CUDA:", torch.cuda.is_available())
print("MAX VRAM (GB):", torch.cuda.get_device_properties("cuda").total_memory/1024**3 - 1)

CUDA: True
MAX VRAM (GB): 4.79241943359375


In [64]:
torch.cuda.empty_cache()

In [65]:
# move to gpu
X_train = torch.from_numpy(X_train).to("cuda").float()
X_test = torch.from_numpy(X_test).to("cuda").float()
y_train = torch.from_numpy(y_train).to("cuda").float()
y_test = torch.from_numpy(y_test).to("cuda").float()
nc_train = torch.from_numpy(nc_train).to("cuda").float()
nc_test = torch.from_numpy(nc_test).to("cuda").float()

In [66]:
model = RFM(backend="gpu", L=CONTEXT_SIZE)

In [67]:
model.fit(X_train, nc_train, val_split=0.2)

[0.15457700192928314,
 0.10675912350416183,
 0.02754102647304535,
 0.0234493687748909,
 0.022757848724722862,
 0.022372083738446236,
 0.022182442247867584,
 0.02207004465162754,
 0.022009680047631264,
 0.02198205329477787]

In [68]:
def generate_text_RFM(model, X_test, length=16):
    text = []
    for i in range(length):
        yhat = model.predict(X_test)
        # decode the text
        text.append([reverse_vocab[y] for y in np.argmax(yhat.cpu().numpy(), axis=1)])
        # move the window forward
        X_test = torch.concatenate([X_test[:, len(vocab):], yhat], dim=1)
    
    transpose = list(zip(*text))
    return transpose

In [69]:
gen_test = generate_text_RFM(model, X_test)

In [70]:
list(map(lambda x: "".join(x), gen_test))

['h tlt se teot t ',
 'trl ir lt th sa ',
 'ot lat hitnao  e',
 't the  tirtnth i',
 'boi y li  oeadio',
 'eeiat ee tteat u',
 'd se   th u th t',
 ' hit  se lti e  ',
 ' its  or aote   ',
 'dogutler si to u',
 'fe   te ahet  te',
 'tre tool suede r',
 ' fert edet l lia',
 ' to  ud  he   tt',
 'eds ttee ti afs ',
 'nn erintosr iaes',
 '  h t th   yr sa',
 'noan tndthe rlt ',
 ' rit s.mh ohesi ',
 'i  s so  io so  ',
 'edose os  albosa',
 'elarotes tt oita',
 'irn  tir too  ne',
 'cre s se ye oith',
 ' t ao hotd ai  t',
 ' tue   toescod t',
 'nst tee tte othi',
 'rifdls boe e ton',
 ' ren ssitheeboas',
 ' tieesiit tee tt',
 'lti  treneithes ',
 ' thi  toe to  te',
 'eod tidd  er th ',
 'antr   o toou ro',
 ' t  h ore toetoi',
 ' t iwd th  ardea',
 'ut toih  or  tot',
 'foesteoaircs ese',
 'sytife the rt  t',
 '  ttol oit t lat',
 ' in cele o aooc ',
 'ollssed f llt sa',
 'etlta  iho  we t',
 ' e rt es et uoty',
 'al ut dndod liar',
 'nd  nttlotrcitne',
 ' e eaho theidde ',
 'rteeesi a o

In [None]:
plt.plot(np.abs(model.alpha_.cpu().numpy()).sum(axis=0))

In [None]:
model.alpha_.cpu().numpy().shape

In [None]:
c = 48
v = 50
M_c = torch.einsum("vcVC -> cC", model.M_.reshape(v, c, v, c))

In [None]:
plt.plot(torch.diag(M_c.cpu()))

In [None]:
vec = torch.linalg.eigh(M_c).eigenvectors

In [None]:
plt.plot(vec[:, -1].cpu())

In [None]:
utils.mse(y_train_pred, next_char_train)


In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(16, 9))
ar = (y_train_pred.shape[1]/y_train_pred.shape[0])
ax[0].imshow(utils.softmax(y_train_pred, axis=1), aspect=ar)
ax[1].imshow(next_char_train, aspect=ar)

In [None]:
y_test_pred = utils.K_M(X_test_enc, X_train_enc, M, L=256) @ alpha

In [None]:
utils.mse(y_test_pred, next_char_test)

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(16, 9))
ar = (y_test_pred.shape[1]/y_test_pred.shape[0])
y_test_argmax_ohc = np.eye(len(train_vocab))[y_test_pred.argmax(axis=1)]
ax[0].imshow(y_test_argmax_ohc, aspect=ar)
ax[1].imshow(next_char_test, aspect=ar)

In [None]:
plt.imshow(y_test_argmax_ohc - next_char_test, aspect=ar)
plt.colorbar()

In [None]:
from functools import partial

In [None]:
kernel_rfm = partial(utils.K_M, M=M, L=256)

gen_test_rfm = generate_text_kernel(kernel_rfm, alpha, X_test_enc, X_train_enc, length=CONTEXT_SIZE-CONTEXT_SPLIT_SIZE)

In [None]:
gen_test

In [84]:
list(map(lambda x: "".join(x), gen_test))

['h tlt se teot t ',
 'trl ir lt th sa ',
 'ot lat hitnao  e',
 't the  tirtnth i',
 'boi y li  oeadio',
 'eeiat ee tteat u',
 'd se   th u th t',
 ' hit  se lti e  ',
 ' its  or aote   ',
 'dogutler si to u',
 'fe   te ahet  te',
 'tre tool suede r',
 ' fert edet l lia',
 ' to  ud  he   tt',
 'eds ttee ti afs ',
 'nn erintosr iaes',
 '  h t th   yr sa',
 'noan tndthe rlt ',
 ' rit s.mh ohesi ',
 'i  s so  io so  ',
 'edose os  albosa',
 'elarotes tt oita',
 'irn  tir too  ne',
 'cre s se ye oith',
 ' t ao hotd ai  t',
 ' tue   toescod t',
 'nst tee tte othi',
 'rifdls boe e ton',
 ' ren ssitheeboas',
 ' tieesiit tee tt',
 'lti  treneithes ',
 ' thi  toe to  te',
 'eod tidd  er th ',
 'antr   o toou ro',
 ' t  h ore toetoi',
 ' t iwd th  ardea',
 'ut toih  or  tot',
 'foesteoaircs ese',
 'sytife the rt  t',
 '  ttol oit t lat',
 ' in cele o aooc ',
 'ollssed f llt sa',
 'etlta  iho  we t',
 ' e rt es et uoty',
 'al ut dndod liar',
 'nd  nttlotrcitne',
 ' e eaho theidde ',
 'rteeesi a o

In [93]:
list(map(lambda x: "".join(x), gen_test))

['h tlt se teot t ',
 'trl ir lt th sa ',
 'ot lat hitnao  e',
 't the  tirtnth i',
 'boi y li  oeadio',
 'eeiat ee tteat u',
 'd se   th u th t',
 ' hit  se lti e  ',
 ' its  or aote   ',
 'dogutler si to u',
 'fe   te ahet  te',
 'tre tool suede r',
 ' fert edet l lia',
 ' to  ud  he   tt',
 'eds ttee ti afs ',
 'nn erintosr iaes',
 '  h t th   yr sa',
 'noan tndthe rlt ',
 ' rit s.mh ohesi ',
 'i  s so  io so  ',
 'edose os  albosa',
 'elarotes tt oita',
 'irn  tir too  ne',
 'cre s se ye oith',
 ' t ao hotd ai  t',
 ' tue   toescod t',
 'nst tee tte othi',
 'rifdls boe e ton',
 ' ren ssitheeboas',
 ' tieesiit tee tt',
 'lti  treneithes ',
 ' thi  toe to  te',
 'eod tidd  er th ',
 'antr   o toou ro',
 ' t  h ore toetoi',
 ' t iwd th  ardea',
 'ut toih  or  tot',
 'foesteoaircs ese',
 'sytife the rt  t',
 '  ttol oit t lat',
 ' in cele o aooc ',
 'ollssed f llt sa',
 'etlta  iho  we t',
 ' e rt es et uoty',
 'al ut dndod liar',
 'nd  nttlotrcitne',
 ' e eaho theidde ',
 'rteeesi a o

In [90]:
def cherrypick(y_test, y_test_pred, N=5):
    """Get the sentences with the highest bleu score."""
    bleu = []
    for i in range(len(y_test_pred)):
        bleu.append(utils.bleu_score(y_test, y_test_pred[i], n=2))
    best_idxs = np.argpartition(bleu, -N)[-N:]
    best_sentences = list(map(lambda x: "".join(x), y_test_pred))[best_idxs]
    print(f"Best sentences have BLEU scores of: {bleu[best_idxs]}, vs mean BLEU of {np.mean(bleu)}")
    return best_sentences

In [113]:
bleu = []
for i in range(len(gen_test)):
    bleu.append(utils.bleu_score(y_test, gen_test[i], n=3))

In [114]:
best_idxs = np.argpartition(bleu, -10)[-10:]
best_idxs

array([ 138,   11, 1196,  461, 1229, 1321, 1130, 1302, 1357, 1232])

In [115]:
np.array(list(map(lambda x: "".join(x), gen_test)))[best_idxs.tolist()]

array(['eat the i tor su', 'tre tool suede r', 'dancees ne te to',
       'to tie nocea ers', 'o hoon th ere to', ' bel hot thesatt',
       'at nea bet sirio', 'e thon ano ois t', 'thald tely da th',
       'es tonsitea ato '], dtype='<U20')

In [116]:
np.array(bleu)[best_idxs]

array([0.47236655, 0.47236655, 0.47236655, 0.47236655, 0.47236655,
       0.47236655, 0.47236655, 0.47236655, 0.47236655, 0.47236655])

In [77]:
utils.bleu_score(list(map(lambda x: "".join(x), y_test)), list(map(lambda x: "".join(x), gen_test)), n=1)

0

In [82]:
len(y_test), len(gen_test)

(1507, 1506)

In [83]:
evaluate(y_test[:-1], gen_test)

{'bleu2': 0.453748164950674, 'perplexity': 11.530809937126978}

In [None]:
import random


list(map(lambda x: "".join(x), gen_test_rfm))#[random.randint(0,len(gen_test_rfm))]

In [None]:
# get train and test accuracy
train_acc = (next_char_train.argmax(axis=1) == y_train_pred.argmax(axis=1)).mean()
test_acc = (next_char_test.argmax(axis=1) == y_test_pred.argmax(axis=1)).mean()

print(f"Train accuracy: {train_acc:.2f}")
print(f"Test accuracy: {test_acc:.2f}")

## Bigram Implementation

In [48]:
import random

class NgramModel:
    def __init__(self, n):
        self.n = n
        self.ngrams = {}
        self.total_count = 0

    def update(self, text):
        for i in range(len(text) - self.n):
            ngram = text[i:i+self.n]
            next_char = text[i+self.n]
            if ngram not in self.ngrams:
                self.ngrams[ngram] = {}
            if next_char not in self.ngrams[ngram]:
                self.ngrams[ngram][next_char] = 0
            self.ngrams[ngram][next_char] += 1
            self.total_count += 1

    def predict(self, prefix):
        if prefix not in self.ngrams:
            return random.choice(list(self.ngrams.keys()))[0]
        choices = list(self.ngrams[prefix].items())
        total = sum(count for _, count in choices)
        r = random.uniform(0, total)
        upto = 0
        for choice, count in choices:
            if upto + count >= r:
                return choice
            upto += count
        return choices[-1][0]

    def generate(self, length):
        start = random.choice(list(self.ngrams.keys()))
        text = start
        for i in range(length - self.n):
            prefix = text[-self.n:]
            text += self.predict(prefix)
        return text

In [51]:
lines = list(map(lambda x: "".join(x), y_test_gt))
lines

['and on the dial.',
 'lled it. that is',
 'failed in humili',
 'ubmission which ',
 'a minority of on',
 'elieve that real',
 'wn right. you al',
 ' you delude your',
 ' everyone else s',
 ' reality is not ',
 'se. not in the i',
 'oon perishes: on',
 'tal. whatever th',
 'e to see reality',
 'he fact that you',
 'truction, an eff',
 'what he had been',
 'diary, <UNK>freedom ',
 'winston, with th',
 'ive--then how ma',
 'the dial had sho',
 'nston<UNK>s body. th',
 'hich even by cle',
 'e four fingers s',
 'pain was only sl',
 ' look at it. the',
 'n. the fingers s',
 'and seeming to v',
 ' still think the',
 'it, stop the pai',
 'und his shoulder',
 'e bonds that had',
 'as shaking uncon',
 'g down his cheek',
 'y comforted by t',
 ' o<UNK>brien was his',
 'de, from some ot',
 'lp seeing what i',
 'metimes they are',
 ' try harder. it ',
 ' limbs tightened',
 'd stopped, leavi',
 'head to the man ',
 'e proceedings. t',
 ' winston<UNK>s eyes,',
 'e and there, the',
 'must be at seven

In [None]:
pdf_file = open(pdf_fp,"rb")

pdf_reader = PyPDF2.PdfReader(pdf_file)

# Extract text from each page in the PDF file
lines = []

for page in range(len(pdf_reader.pages)):
    page_obj = pdf_reader.pages[page]
    text = page_obj.extract_text()
    lines.extend(text.split('\n'))

# Close the PDF file
pdf_file.close()

In [57]:
# Initialize the model with an n-gram value
model = NgramModel(n=10)

# Update the model with a list of strings

for string in lines:
    model.update(string)

generated_text = []
for i in range(len(y_test_gt)):
# Generate text using the model
    generated_text.append(model.generate(16))
    


In [58]:
evaluate(y_test_gt, generated_text)

{'bleu2': 0.19017255185098864, 'perplexity': 12.746881355344126}

In [59]:
generated_text

['ing of <UNK>poli',
 ' assert his powe',
 'l. a great mkbnc',
 ' in almost the o',
 'NK>brien had not',
 'eresy more dnkoh',
 'ld have stretctb',
 'uflage would be ',
 'not<UNK>. theetg',
 '. winston comrd ',
 'ize. one wasgv s',
 ' taking on theny',
 'ating you firses',
 'h, as we shosaie',
 'ut, its meafeei ',
 'rest in doiU nen',
 'r. he had no ddo',
 'd as to giveeb n',
 'power in so,fmod',
 ' claimed tha u e',
 '--was that thame',
 'ly because -true',
 'ut of him uhat e',
 'uld have stretc ',
 'hat he knew whs>',
 't, welcome it,d ',
 'ood, gooderletzg',
 'rn, winstonroace',
 ' to the man  v a',
 'K> and perhaps  ',
 'ken, there wavlf',
 'was not with td ',
 'h, as one takesd',
 'ot make misttwti',
 ', <UNK>orthodoxy',
 ' bootlaces hadc ',
 'nd constantly so',
 'nd raise his ari',
 'been purged oa e',
 'must abandon the',
 'years. he rer,te',
 'mpty glass. nowh',
 'them. these wou ',
 'ox purposes byha',
 'xistence; but th',
 'age herself.ttwa',
 ' guardian of t t',
 ' tossed it 