# Assignment 2

In [1]:
import nltk
import re
import codecs
from collections import Counter,defaultdict
import nltk.data
from nltk.util import ngrams
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from functools import partial
import random
import numpy as np
import time

### Taking data and cleaning

In [2]:
raw_data = ""
with codecs.open("speeches.txt",'r','UTF-8') as file:
    raw_data = file.read()
raw_data = re.sub("[\s]+"," ",raw_data)
raw_data = raw_data.replace("SPEECH","")
raw_data = raw_data.replace("$","")

In [3]:
def remove_punc(text):
    text = text.replace("'","")
    rx = re.compile(r"[^a-z.?!’]+")
    return rx.sub(" ",text)
sent_tkn = PunktSentenceTokenizer()

In [4]:
sentences = sent_tkn.tokenize(raw_data)
sentences = [remove_punc(i.lower().strip()) for i in sentences]

### Test and Train split

In [5]:
total_sents = len(sentences)
random.seed(3000)
random.shuffle(sentences) # shuffle the order of sentences
train_num = 4*total_sents//5
train_sents = sentences[:train_num]
test_sents = sentences[train_num:]

# N-Gram Modelling

#### Creating Data Structures to ease N-Gram MLE Estimation and Perplexity Calculation

In [6]:
class partial_dd(defaultdict):
    def __getitem__(self,key):
        if(len(key) == len(list(self.keys())[0])):
            return super().__getitem__(key)
        else:
            dd = defaultdict(self.default_factory)
            for k,v in self.items():
                if(k[:-1]==key):
                    dd.update({k[-1]:v})
            return dd
def create_dd(dct,typ):
    dd = partial_dd(typ)
    dd.update(dct)
    return dd
float_dd = partial(create_dd,typ=float)

#### Creating Functions for N-Gram Counts and MLE estimation

In [7]:
def get_ngrams(sentences):
    unigram = []
    bigram = []
    trigram = []
    quadgram = []
    for sentence in sentences:
        tokens = ['<s>'] + word_tokenize(sentence) + ['</s>']
        unigram.extend(list(ngrams(tokens,1)))
        bigram.extend(list(ngrams(tokens,2)))
        trigram.extend(list(ngrams(tokens,3)))
        quadgram.extend(list(ngrams(tokens,4)))
    total = len(unigram)
    uni_count = Counter(unigram)
    bi_count = Counter(bigram)
    tri_count = Counter(trigram)
    quad_count = Counter(quadgram)
    return [total,uni_count,bi_count,tri_count,quad_count]

def get_ngram_mles(ngram):
    total = ngram[0] - ngram[1]['<s>']
    uni_count = Counter(ngram[1])
    bi_count = Counter(ngram[2])
    tri_count = Counter(ngram[3])
    quad_count = Counter(ngram[4])
    uni_mle = float_dd({key: value/total for key,value in uni_count.items() if key!='<s>'})
    bi_mle = float_dd({key: value/uni_count[key[:-1]] for key,value in bi_count.items()})
    tri_mle = float_dd({key: value/bi_count[key[:-1]] for key,value in tri_count.items()})
    quad_mle = float_dd({key: value/tri_count[key[:-1]] for key,value in quad_count.items()})
    return [total,uni_mle,bi_mle,tri_mle,quad_mle]

#### Generating MLE for Train Data

In [8]:
train_ngrams = get_ngrams(train_sents)
train_mles = get_ngram_mles(train_ngrams)
vocab = set([i[0] for i in train_ngrams[1]])
vocab_size = len(vocab)

#### Analysis of the N-Grams Possible Vs. Actually Present

In [9]:
print("Unigrams Present:",len(train_ngrams[1]))
print("Unigrams Possible:",vocab_size)
print("Bigrams Present:",len(train_ngrams[2]))
print("Bigrams Possible:",vocab_size**2)
print("Trigrams Present:",len(train_ngrams[3]))
print("Trigrams Possible:",vocab_size**3)
print("Quadgrams Present:",len(train_ngrams[4]))
print("Quadgrams Possible:",vocab_size**4)

Unigrams Present: 5271
Unigrams Possible: 5271
Bigrams Present: 41674
Bigrams Possible: 27783441
Trigrams Present: 83459
Trigrams Possible: 146446517511
Quadgrams Present: 104851
Quadgrams Possible: 771919593800481


#### Defining Funtions to Generate Random Sentences

In [10]:
def get_next(starts,mles):
    cond = tuple(starts)
    gram = len(cond)+1
#     print(gram,cond)
    candidates = mles[gram][cond]
    cand_repr = [i for i in candidates if i!='<s>']
    cand_prob = [candidates[i] for i in cand_repr]
    norm_probs = np.array(cand_prob)/sum(cand_prob)
    experiments = np.random.multinomial(3,norm_probs)
    return cand_repr[list(experiments).index(max(experiments))]
def generate_rand(model_gram,mles):
    rand_sent = ['<s>']
    prevs = lambda x: [] if x==1 else rand_sent[-(model_gram-1):]
    next_word = get_next(prevs(model_gram),mles)
    while(next_word!='</s>'):
        rand_sent.append(next_word)
        next_word = get_next(prevs(model_gram),mles)
    rand_sent.append(next_word)
    return ' '.join(rand_sent)
def pprint(sent):
    s = sent.replace('<s> ','').replace('</s>','').replace(' .','.').replace(' !','!').replace(' ?','?').replace(" ’ ","’")
    print(s,end = '\n\n')

### Unigram Sentences

In [11]:
np.random.seed(5003)
for i in range(5):
    pprint(generate_rand(1,train_mles))



many and i i 

of you about. so re and believe of 

a me about because and.. 

you dollars i. 



### Bigram Sentences

In [12]:
np.random.seed(1000)
for i in range(5):
    pprint(generate_rand(2,train_mles))

you have to know it was on. 

i’t talk about it’re going to make our jobs. 

i think that’re doing a lot more. 

i think i want to be a lot of hispanics. 

and the poll trump? 



### Trigram Sentences

In [13]:
np.random.seed(2000)
for i in range(5):
    pprint(generate_rand(3,train_mles))

we have a very nice people. 

but it’s why being a real border. 

but i think that’s been a little bit of an executive order that president obama’s not going to take away americans’guns then admit the very rich guy very very successful real estate broker. 

it’s going to get rid of common core. 

and we’re going to do terrific and i’m talking about. 



### Quadgram Sentences

In [14]:
np.random.seed(3000)
for i in range(5):
    pprint(generate_rand(4,train_mles))

i’m going to be a real wall. 

but i have to do it. 

you know we have a lot more. 

and i said i think i’d do and i feel terrible about the migration caused by hillary clinton and with all of her many problems and the tremendous mistakes that she’s being totally protected. 

you know i know you’re making less money. 



### Comments on readbility:
- The **unigram model** fares the worst among all since it predicts even fullstops in the middle of the sentences according to my implementation (sentence end marker is not taken as full-stop as sentence might end with any of '.', '!' and '?'). No coherence in the words is present. Some sentences even end in 0 words.
- The **bigram model** is better than unigram but it doesn't maintain grammar correctness for more than 2-3 words. Also there is the issue of the apostrophe appearing in wierd places. 
- The **trigram and quadgram models** get increasingly better and produce very readable and almost grammatically correct sentences. But as the sentence gets longer the context completely changes.

### Preparing Test Data

In [15]:
test_ngrams = get_ngrams(test_sents)
vocab_test = set([i for i in test_ngrams[1]])
vocab_tot_test = vocab | vocab_test
vocab_tot_test_len = len(vocab_tot_test)

### Funtions for Perplexity Calculation
Here I have used backoff with add-1 smoothing done for the unigrams.

In [16]:
def getprob(model,mles,seq):
    if(seq==('<s>',)):
        return 1
    if(model==1):
        if (seq[-1],) in mles[1]:
#             print("waha")
            return (mles[1][(seq[-1],)]*(mles[0])+1)/(mles[0] + vocab_tot_test_len)
        else:
#             print("yaha")
            return 1/(mles[0] + vocab_tot_test_len)
    elif seq[-model:] not in mles[model]:
        return getprob(model-1,mles,seq)
    else:
#         print("yahi hai")
        return mles[model][seq[-model:]]
def get_perplexity_help(sent,model,mles):
    toks =['<s>'] + word_tokenize(sent) + ['<\s>']
    ppl = 0
    for i in range(1,len(toks)+1):
#         print(tuple(toks[max(0,i-model):i]),":",getprob(model,mles,tuple(toks[max(0,i-model):i])))
        ppl -= np.log(getprob(model,mles,tuple(toks[max(0,i-model):i])))
    return ppl

def test_perplexity(sents,model,mles):
    tot_p = 0
    l_p = 0
    for s in sents:
        tot_p+=get_perplexity_help(s,model,mles)
        l_p += (len(word_tokenize(s))+1)
    return np.exp(tot_p/l_p)

### Perplexity of N-Gram Models

In [128]:
ppl_unigram = test_perplexity(test_sents,1,train_mles)
ppl_bigram = test_perplexity(test_sents,2,train_mles)
ppl_trigram = test_perplexity(test_sents,3,train_mles)
ppl_quadgram = test_perplexity(test_sents,4,train_mles)
print("Perplexity of unigram model =",ppl_unigram)
print("Perplexity of bigram model =",ppl_bigram)
print("Perplexity of trigram model =",ppl_trigram)
print("Perplexity of quadgram model =",ppl_quadgram)

Perplexity of unigram model = 608.6038028847778
Perplexity of bigram model = 102.38938649249474
Perplexity of trigram model = 66.05701130552193
Perplexity of quadgram model = 59.4118704289358


# Neural Language Models

In [38]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
keras.backend.clear_session()

#### Defining Vocabulary and Useful Mappings

In [39]:
tot_vocab =set([i for sen in sentences for i in word_tokenize(sen)])
tot_vocab = list(tot_vocab)
tot_vocab = ['<pad>','<s>','</s>'] + tot_vocab
tot_vocab_len = len(tot_vocab)

In [40]:
vecs = keras.utils.to_categorical([i for i in range(tot_vocab_len)],num_classes=tot_vocab_len)
vec_map = {k:v for k,v in zip(tot_vocab,vecs)}
word2int_map = {k:v for k,v in zip(tot_vocab,[i for i in range(tot_vocab_len)])}
int2word_map = {v:k for k,v in zip(tot_vocab,[i for i in range(tot_vocab_len)])}

#### Defining Functions for Preprocessing

In [49]:
max_len = 20
def prep_sent(sent):
    toks = word_tokenize(sent)
#     print(toks)
    return [word2int_map[i] for i in toks]

def trim_and_pad(x_inp,length):
    if(len(x_inp)>length):
        return np.array(x_inp[-length:])
    else:
        a = np.array(x_inp)
        a.resize((length,))
        return a

def preprocess_data(sent_list):
    x_data = []
    y_data = []
    for sent in sent_list:
        sent_encode = [word2int_map['<s>']]+ prep_sent(sent)
        x_data.extend([sent_encode[:i] for i in range(1,len(sent_encode)+1)])
        y_data.extend([word2int_map[i] for i in word_tokenize(sent)]+[word2int_map['</s>']])
    x_data = [trim_and_pad(x,max_len) for x in x_data]
    y_data = keras.utils.to_categorical(y_data,num_classes=tot_vocab_len)
    return np.array(x_data),y_data

#### Preprocessing Test and Train Data

In [51]:
predictors, labels = preprocess_data(train_sents)
test_input, test_label = preprocess_data(test_sents)

### Defining the RNN Model and Training

In [50]:
max_inp = 20
input_s = keras.Input(shape=(max_inp,),name='sentence')
word_features = keras.layers.Embedding(tot_vocab_len,128,input_length=max_inp,name='word_embedding')(input_s)
rnn_layer = keras.layers.SimpleRNN(256,name='RNN_layer')(word_features)
rnn_layer = keras.layers.Dropout(0.2)(rnn_layer)
output_layer = keras.layers.Dense(tot_vocab_len,activation = 'softmax', name='predicted_word')(rnn_layer)
model_rnn = keras.Model(inputs= input_s,outputs = output_layer, name='rnn_language_model')
checkpoint_gen = keras.callbacks.ModelCheckpoint(filepath='rnn_weights_3.hdf5', verbose=1)
model_rnn.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model_rnn.summary()

Model: "rnn_language_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sentence (InputLayer)        [(None, 20)]              0         
_________________________________________________________________
word_embedding (Embedding)   (None, 20, 128)           738432    
_________________________________________________________________
RNN_layer (SimpleRNN)        (None, 256)               98560     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
predicted_word (Dense)       (None, 5769)              1482633   
Total params: 2,319,625
Trainable params: 2,319,625
Non-trainable params: 0
_________________________________________________________________


In [52]:
model_rnn.fit(predictors,labels,batch_size=64 ,epochs=20,verbose=1,callbacks=[checkpoint_gen])

Train on 171875 samples
Epoch 1/20
Epoch 00001: saving model to rnn_weights_3.hdf5
Epoch 2/20
Epoch 00002: saving model to rnn_weights_3.hdf5
Epoch 3/20
Epoch 00003: saving model to rnn_weights_3.hdf5
Epoch 4/20
Epoch 00004: saving model to rnn_weights_3.hdf5
Epoch 5/20
Epoch 00005: saving model to rnn_weights_3.hdf5
Epoch 6/20
Epoch 00006: saving model to rnn_weights_3.hdf5
Epoch 7/20
Epoch 00007: saving model to rnn_weights_3.hdf5
Epoch 8/20
Epoch 00008: saving model to rnn_weights_3.hdf5
Epoch 9/20
Epoch 00009: saving model to rnn_weights_3.hdf5
Epoch 10/20
Epoch 00010: saving model to rnn_weights_3.hdf5
Epoch 11/20
Epoch 00011: saving model to rnn_weights_3.hdf5
Epoch 12/20
Epoch 00012: saving model to rnn_weights_3.hdf5
Epoch 13/20
Epoch 00013: saving model to rnn_weights_3.hdf5
Epoch 14/20
Epoch 00014: saving model to rnn_weights_3.hdf5
Epoch 15/20
Epoch 00015: saving model to rnn_weights_3.hdf5
Epoch 16/20
Epoch 00016: saving model to rnn_weights_3.hdf5
Epoch 17/20
Epoch 00017: 

<tensorflow.python.keras.callbacks.History at 0x1e9acadf208>

### Defining the LSTM Model and Training

In [55]:
max_inp = 20
input_s_lstm = keras.Input(shape=(max_inp,),name='sentence_lstm')
word_features_lstm = keras.layers.Embedding(tot_vocab_len,128,input_length=max_inp,name='word_embedding_lstm')(input_s_lstm)
lstm_layer = keras.layers.LSTM(256,name='LSTM_layer',return_sequences=False)(word_features_lstm)
lstm_layer = keras.layers.Dropout(0.2)(lstm_layer)
output_layer_lstm = keras.layers.Dense(tot_vocab_len,activation = 'softmax', name='predicted_word')(lstm_layer)
model_lstm = keras.Model(inputs= input_s_lstm,outputs = output_layer_lstm, name='lstm_language_model')
checkpoint_gen = keras.callbacks.ModelCheckpoint(filepath='lstm_weights_3.hdf5', verbose=1)
model_lstm.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model_lstm.summary()

Model: "lstm_language_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sentence_lstm (InputLayer)   [(None, 20)]              0         
_________________________________________________________________
word_embedding_lstm (Embeddi (None, 20, 128)           738432    
_________________________________________________________________
LSTM_layer (LSTM)            (None, 256)               394240    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
predicted_word (Dense)       (None, 5769)              1482633   
Total params: 2,615,305
Trainable params: 2,615,305
Non-trainable params: 0
_________________________________________________________________


In [56]:
model_lstm.fit(predictors,labels,batch_size=64 ,epochs=20,verbose=1,callbacks=[checkpoint_gen])

Train on 171875 samples
Epoch 1/20
Epoch 00001: saving model to lstm_weights_3.hdf5
Epoch 2/20
Epoch 00002: saving model to lstm_weights_3.hdf5
Epoch 3/20
Epoch 00003: saving model to lstm_weights_3.hdf5
Epoch 4/20
Epoch 00004: saving model to lstm_weights_3.hdf5
Epoch 5/20
Epoch 00005: saving model to lstm_weights_3.hdf5
Epoch 6/20
Epoch 00006: saving model to lstm_weights_3.hdf5
Epoch 7/20
Epoch 00007: saving model to lstm_weights_3.hdf5
Epoch 8/20
Epoch 00008: saving model to lstm_weights_3.hdf5
Epoch 9/20
Epoch 00009: saving model to lstm_weights_3.hdf5
Epoch 10/20
Epoch 00010: saving model to lstm_weights_3.hdf5
Epoch 11/20
Epoch 00011: saving model to lstm_weights_3.hdf5
Epoch 12/20
Epoch 00012: saving model to lstm_weights_3.hdf5
Epoch 13/20
Epoch 00013: saving model to lstm_weights_3.hdf5
Epoch 14/20
Epoch 00014: saving model to lstm_weights_3.hdf5
Epoch 15/20
Epoch 00015: saving model to lstm_weights_3.hdf5
Epoch 16/20
Epoch 00016: saving model to lstm_weights_3.hdf5
Epoch 17/

<tensorflow.python.keras.callbacks.History at 0x1ea90db43c8>

#### Defining Functions to get Perplexity and Random Sentences

In [57]:
def get_neural_perplexity(model,sents,labels):
    predictions = model.predict(sents)
    ppl = 0
    for i in range(len(predictions)):
        ppl -= np.log(predictions[i][labels[i].argmax(axis=-1)])
    return np.exp(ppl/(len(predictions)))

In [109]:
def get_next_random(prob_vec):
#     nrm = np.linalg.norm(prob_vec,2)
    nrm  = 1.001*np.sum(prob_vec)
    nrm_prob = prob_vec/nrm
#     print(np.sum(nrm_prob))
    ans = np.random.multinomial(3,nrm_prob)
    return ans.argmax(axis=-1)

def get_random_sents(model,max_len,vec_len):
    curr_inp = [word2int_map['<s>']]
    curr_inp = trim_and_pad(curr_inp,vec_len)
    sent = []
    sent.append(word2int_map['<s>'])
    curr_vec = model.predict(np.array([curr_inp]))[0]
    curr_op = get_next_random(curr_vec)
    iters = 1
    while (curr_op!=word2int_map['</s>'] and iters!=max_len):
        sent.append(curr_op)
        curr_inp = trim_and_pad(sent[-vec_len:],vec_len)
        curr_vec = model.predict(np.array([curr_inp]))[0]
#         print(*curr_vec)
        curr_op = get_next_random(curr_vec)
        iters+=1
    return ' '.join([int2word_map[i] for i in sent][1:]).replace(' .','.').replace(' !','!').replace(' ?','?').replace(" ’ ","’")

### Vanilla RNN Model

In [123]:
rnn_pplx = get_neural_perplexity(model_rnn,test_input[:],test_label[:])
print("Perplexity of vanilla RNN model=",rnn_pplx)

Perplexity of vanilla RNN model= 87.984368499613


In [126]:
np.random.seed(3001)
for i in range(5):
    print(get_random_sents(model_rnn,30,20),end='\n\n')

but what was soon no nice television just cuts.

i’ll be a scam more trait.

and i mean no a memo.

and more.

iran thank i’m they build repealing i again is a summit.



### LSTM Model

In [127]:
lstm_pplx = get_neural_perplexity(model_lstm,test_input[:],test_label[:])
print("Perplexity of LSTM model=",lstm_pplx)

Perplexity of LSTM model= 63.81389231418633


In [125]:
np.random.seed(3001)
for i in range(5):
    print(get_random_sents(model_lstm,30,20),end='\n\n')

but what i did hard is she was going to come in and it was so unfair really.

she can’t even hear the name of the bad party.

i was saying why ivanka is i won a couple of months ago.

san pacs are buying in many cases and keep education and saying and i see what this many is ridiculous.

and honestly i’m a rough place.



### Comment on readability:
Both RNN and LSTM produce results which are quite readable but LSTM fares much better in terms of readability as there are less grammatical mistakes and context is maintained for a longer time and sentences are less awkward.

## Comparing Classical and Neural Approaches:

In [131]:
print("-"*60)
print("Classical Approaches:")
print("-"*60)
print("Perplexity of unigram model =",ppl_unigram)
print("Perplexity of bigram model =",ppl_bigram)
print("Perplexity of trigram model =",ppl_trigram)
print("Perplexity of quadgram model =",ppl_quadgram)
print('')
print("-"*60)
print("Neural Approaches:")
print("-"*60)
print("Perplexity of vanilla RNN model=",rnn_pplx)
print("Perplexity of LSTM model=",lstm_pplx)

------------------------------------------------------------
Classical Approaches:
------------------------------------------------------------
Perplexity of unigram model = 608.6038028847778
Perplexity of bigram model = 102.38938649249474
Perplexity of trigram model = 66.05701130552193
Perplexity of quadgram model = 59.4118704289358

------------------------------------------------------------
Neural Approaches:
------------------------------------------------------------
Perplexity of vanilla RNN model= 87.984368499613
Perplexity of LSTM model= 63.81389231418633


- Here we see that the **RNN model** perform better than **bigram** but worse than **trigram** and **quadgram**.
- We also see that the **LSTM model** perform better better than even **trigram** but falls little short of the performace of **quadgram model**.

- The classical approach seems to work better on this dataset. 
- It might be due to the limited vocabulary and the shorter and simpler sentence construction in Donald Trump's speeches. 
- But Neural models, especially LSTM, might generalize better for this data if trained more on this dataset as they do have the potential to imitate the N-Gram models if they have to. 