In [0]:
import pandas as pd
import re
import math
import nltk
import random
from urllib.request import urlopen
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split

In [0]:
text_data = urlopen("http://www.gutenberg.org/files/31100/31100.txt").read()

In [0]:
string_text = str(text_data)
start_index = string_text.find('Chapter 1')+ len('Chapter 1')
data = string_text[start_index:]

In [0]:
#tokenizing into sentences 
sentence_list = sent_tokenize(data)

# Classical Approach

In [0]:
##removing special characters and spaces

spec_chars = re.compile('[`~!@#$%^&*()+={}|\[\]:";<>?,\./“”]')

for i in range(len(sentence_list)):
    line = sentence_list[i].lower()
    line = line.replace("\\r"," ")    
    line = line.replace("\\n"," ")
    line = "<s> " + spec_chars.sub("", line) + " </s>"
    sentence_list[i] = ' '.join(line.split())


In [0]:
#Splitting into train and test data [4:1]
test, train = [], []
for i in range(len(sentence_list)):
  train.append(sentence_list[i]) if (i%5==0) else test.append(sentence_list[i])

In [0]:
def n_grams(sentence_list, n):
    dic, total = {}, 0
    for sent in sentence_list:
        x = sent.split()
        for j in range(len(x)-n+1):
            lis = []
            for k in range(n):
                lis.append(x[j+k])
                s = " ".join(lis)
            dic[s] = dic.get(s, 0) + 1
            total += 1
    return dic, total

In [136]:
unigrams, total_unigrams = n_grams(train, 1)
bigrams, total_bigrams = n_grams(train, 2)
trigrams, total_trigrams = n_grams(train, 3)
quadgrams, total_quadgrams = n_grams(train, 4)
total_unigrams, total_bigrams, total_trigrams, total_quadgrams

(164806, 158932, 153058, 147184)

# Evaluating MLE for Unigram, Bigram, Trigram, Quadgram


In [0]:
MLEuni = {}
MLEbi = {}
MLEtri = {}
MLEquad = {}

for i in unigrams.keys():
    MLEuni[i] = unigrams[i]/total_unigrams
    
for i in bigrams.keys():
        lst = i.split()
        MLEbi[i] = bigrams[i]/unigrams[' '.join(lst[:-1])]
      
for i in trigrams.keys():
        lst = i.split()
        MLEtri[i] = trigrams[i]/bigrams[' '.join(lst[:-1])]
      
for i in quadgrams.keys():
        lst = i.split()
        MLEquad[i] = quadgrams[i]/trigrams[' '.join(lst[:-1])]

In [0]:
def parse(n_grams):
    parse_dict = {}
    for i in n_grams.keys():
        j = " ".join(i.split()[:-1])
        k = i.split()[-1]
        
        if (j in parse_dict):
            parse_dict[j][0].append(k)
            parse_dict[j][1].append(n_grams[i])
            
        else:
            parse_dict[j] = [[],[]]
            parse_dict[j][0].append(k)
            parse_dict[j][1].append(n_grams[i])
            
    return parse_dict

def predict_word(n, n_1_gram):
    l = [[],[]]
    if (n == 1):
        l[0] = list(MLE_unigrams.keys())
        l[1] = list(MLE_unigrams.values())
    if (n == 2):
        l = parse(MLEbi)[n_1_gram]
    if (n == 3):
        l = parse(MLEtri)[n_1_gram]
    if (n == 4):
        l = parse(MLEquad)[n_1_gram]
    
    probables, probabilities = np.array(l[0]), np.array(l[1])
    return np.random.choice(probables, p=probabilities)
   

def helper_Generator(n, line, n_1_gram):
    word = predict_word(n, n_1_gram)
    
    if (word != "<s>"):
        line = line + " " + word
        n_1_gram = n_1_gram.split()
        n_1_gram.append(word)
        n_1_gram = ' '.join(n_1_gram[1:])
        
    if (word == "</s>"):
        return line
      
    else:
        return helper_Generator(n, line, n_1_gram)


# Generator Function

In [0]:
def Generator(k):
    line = "<s>"
    
    if (k == 3):
        word = predict_word(2, line)
        line = line + " " + word
        if (word == "</s>"):
            return line
          
    elif (k == 4):
        word = predict_word(2, line)
        line = line + " " + word
        if (word == "</s>"):
            return line
          
        word = predict_word(3, line)
        line = line + " " + word
        if (word == "</s>"):
            return line

    string = helper_Generator(k, line, line)
    return string

In [171]:
#Generate Sentences
print(Generator(4))
print(Generator(2))
print(Generator(3))
print(Generator(2))
print(Generator(4))

<s> elinor tried to talk of again to mrs allen that she would and the argument ended only with the young and we were a thoughtless gay set without any strict rules of conduct </s>
<s> into the matter of jane fairfax </s>
<s> these letters were but they always looked another way a bolt flew and she came back not to mention her solemn protestations of innocence the weakness of the matter--ha snows a little at netherfield till the marriage of her uncle and aunt had already met him at pemberley with you exactly </s>
<s> well bred and though you so soon as for me it known </s>
<s> from the very circumstance of its being larger sir </s>


# Probability Function

In [0]:
def EvalProb(sent, n):
    sent,p = sent.split(),1
    if (n == 1):
        prob = MLEuni
        p = -1/(prob['<s>'])
    elif (n == 2):
        prob = MLEbi
    elif (n == 3):
        prob = MLEtri
        p *= (MLEbi.get(' '.join(sent[0:2]), 0))
    else:
        prob = MLEquad
        p *= (MLEbi.get(' '.join(sent[0:2]), 0))
        p *= (MLEtri.get(' '.join(sent[0:3]), 0))

    for i in range(len(sent)-n+1):
        n_gram = ' '.join(sent[i:i+n])
        p *= prob.get(n_gram, 0)
        
    return p   


# Perplexity Function

In [0]:

count = n_grams(train, 3)  ##trigram

def EvalPerp(sentence_list, N):
  perp = []

  for sent in test:
      ngrams = []
      tokens = sent.split()

      for i in range(len(tokens)-N+1):
          ngrams.append(tokens[i:i+N])

      for i in ngrams:
          seq  = ' '.join(i[:-1])
          last = str(i[-1])

          if seq not in count:
              perp.append(1)

          elif last not in count[seq]:
              perp.append(1)

          else:
              if N!=1:
                  x = count[seq].items()
              else:
                  x = count[''].items()
              total = sum(w for c, w in x)
              perp.append(count[seq][last]/total)
  return perp

## The generated text is human readable, but isn't grammatically perfect and hence require some cross-checking before publishing.

# Next Approach (Neural)

In [0]:
##Simple RNN

In [201]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout,SimpleRNN 
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as utils
import re
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [0]:
## Check File Assignment_Nueral_2.ipynb for remaining approaches

### Even in the nueral approach we might see grammatically incorrect sentences.