## **Reading a Raw Text Corpus**

Retrieve & save raw corpus

In [None]:

import math
import re
import urllib.request

from bs4 import BeautifulSoup
#from Ipython.display import HTML

# Download the plain-text edition of the book from Project Gutenberg.
# The response is used as a context manager so the connection is always closed.
with urllib.request.urlopen("https://www.gutenberg.org/files/730/730-0.txt") as response:
    r = response.read()
soup = BeautifulSoup(r, "lxml")
data = str(soup)

# Keep only the text between the first chapter heading and the Gutenberg end marker.
start = data.find('CHAPTER I')
end = data.find('*** END OF THE PROJECT GUTENBERG EBOOK OLIVER TWIST ***')
# str.find returns -1 when the marker is absent; slicing with -1 would silently
# yield the wrong span, so fail loudly instead.
if start == -1 or end == -1:
    raise ValueError("Could not locate the start/end markers in the downloaded text")
parsed = data[start:end]

# Persist the trimmed corpus; the with-block closes the file even if the write fails.
with open("rawCorpus.txt", 'w') as file:
    file.write(parsed)



Read the corpus

In [None]:
# Read the corpus from rawCorpus.txt, in a variable `rawReadCorpus`
# *** Write code ***
# The with-block guarantees the file is closed even if reading raises.
with open("rawCorpus.txt", 'r') as file:
    rawReadCorpus = file.read()

## **Preprocessing the corpus**

In [None]:
# NLTK setup for tokenization.
import nltk
nltk.download('punkt') # Downloads the 'punkt' package (tokenizer data) into the local nltk_data directory
# RegexpTokenizer is used by the preprocessing cell and word_tokenize by the prediction and
# perplexity cells; sent_tokenize and TweetTokenizer are not referenced in the visible cells.
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer,TweetTokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# *** Write code for preprocessing the corpus ***
# Step 1: start from the raw text exactly as it was read from disk.
lowercorp = rawReadCorpus
# Step 2: drop underscores so emphasis markup such as _will_ becomes plain "will".
lowernew = lowercorp.replace('_', '')
# Step 3: fold everything to lowercase before tokenizing.
lowerforword = lowernew.lower()
# Step 4: a token is any run of lowercase letters and apostrophes,
# which keeps contractions such as can't / won't together.
token = RegexpTokenizer(pattern="[a-z'a-z]+")
wordtok = token.tokenize(lowerforword)

# Print first 5 words/tokens of your preprocessed corpus *** Write code ***
print(wordtok[:5])

['chapter', 'i', 'treats', 'of', 'the']


In [None]:
# Download NLTK's stop-word corpus; its English list is appended to the
# unwanted-word list (`arti`) in the language-modeling cell.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **Language Modeling**

In [None]:
from nltk.util import ngrams
from nltk.probability import FreqDist

# ngrams() yields tuples lazily; materialise each order once so the lists can be
# reused by the later cells. Every n-gram is a tuple, including unigrams: ('word',).
agram = ngrams(wordtok, 1)  # unigram iterator
bgram = ngrams(wordtok, 2)  # bigram iterator
cgram = ngrams(wordtok, 3)  # trigram iterator
unigrams = list(agram)
bigrams = list(bgram)
trigrams = list(cgram)

# Unwanted words: hand-picked articles, prepositions and determiners plus NLTK's
# English stop-word list.
arti = [
    "a", "an", "the", "to", "in", "into", "on", "onto", "at", "inside", "over",
    "above", "below", "beneath", "under", "underneath", "by", "near", "between",
    "among", "opposite", "across", "along", "behind", "off", "toward", 'within',
    "this", "that", "these", "those", "my", "your", "his", "her", "its", "our",
    "their", "much", "many", "few", "little", "most", "some", "any", "enough",
    "either", "neither", "each", "every", "half", "such", "other", "another",
] + stopwords.words('english')

# A set gives constant-time membership tests; `arti` itself stays a list.
unwanted_words = frozenset(arti)

# Keep an n-gram only if none of its words is unwanted (same result as testing
# `any(stop in gram for stop in arti)`, without scanning the whole list per n-gram).
unigrams_Processed = [gram for gram in unigrams if unwanted_words.isdisjoint(gram)]
bigrams_Processed = [gram for gram in bigrams if unwanted_words.isdisjoint(gram)]
trigrams_Processed = [gram for gram in trigrams if unwanted_words.isdisjoint(gram)]

def get_ngrams_freqDist(n, ngramList):
    """Count how often each n-gram occurs in ``ngramList``.

    Args:
        n: Order of the n-grams (1 for unigrams, 2 for bigrams, ...). It is
            accepted for readability at the call site and is not used in the
            computation.
        ngramList: Iterable of n-grams (the unigram list when n=1, the bigram
            list when n=2, and so on).

    Returns:
        An nltk ``FreqDist`` mapping each n-gram to its frequency.
    """
    # Passing an iterator makes FreqDist tally the elements one by one,
    # exactly like incrementing a counter per n-gram.
    return FreqDist(iter(ngramList))
                                               
# Frequency tables over the full corpus n-grams ...
unigrams_freqDist = get_ngrams_freqDist(1, unigrams)
bigrams_freqDist = get_ngrams_freqDist(2, bigrams)
trigrams_freqDist = get_ngrams_freqDist(3, trigrams)

# ... and over the n-grams that survive the unwanted-word filter.
# (Previously the "raw" bigram/trigram tables were also built from the filtered
# lists, so the two sets of tables were identical.)
unigrams_Processed_freqDist = get_ngrams_freqDist(1, unigrams_Processed)
bigrams_Processed_freqDist = get_ngrams_freqDist(2, bigrams_Processed)
trigrams_Processed_freqDist = get_ngrams_freqDist(3, trigrams_Processed)

## **Next n words' Prediction using Smoothed Models**

For a bigram model, add-one (Laplace) smoothing is defined by $P_{Add-1}(w_i|w_{i-1})=\frac{count(w_{i-1},w_i)+1}{count(w_{i-1})+V}$, where $V$ is the vocabulary size (the number of distinct words in the corpus).
That is, pretend we saw each bigram one more time than we did.




In [None]:
# Sentence prefixes used to try out next-word prediction.
testSent1, testSent2, testSent3 = (
    "There was a sudden jerk, a terrific convulsion of the limbs; and there he",
    "They made room for the stranger, but he sat down",
    "The hungry and destitute situation of the infant orphan was duly reported by",
)

In [None]:
# *** Write code ***
# Count every n-gram in a single pass per order. Calling list.count() once per
# distinct n-gram rescans the whole corpus each time (quadratic in corpus size).
ngram_lists = {1: unigrams, 2: bigrams, 3: trigrams}  # key = n-gram order
ngram_counts = {order: FreqDist(grams) for order, grams in ngram_lists.items()}

# contains unique ngrams. key 1 is for unigrams, key 2 is for bigrams and so on
ngrams_vocab = {order: set(counts) for order, counts in ngram_counts.items()}

# total number of n-grams per order (same keys as ngrams_vocab)
total_ngrams = {order: len(grams) for order, grams in ngram_lists.items()}
# number of distinct n-grams per order
total_vocab = {order: len(vocab) for order, vocab in ngrams_vocab.items()}

# Each entry is [ngram, smoothed probability]; the lists are sorted in a later step.
# Smoothing adds one to every count: (count + 1) / (total n-grams + distinct n-grams).
# All n-grams of one order share the same denominator, so ranking by this value
# is the same as ranking by raw count.
# Entries are created in first-occurrence order (not set order), which makes the
# order of equally probable n-grams reproducible between runs.
ngrams_all_prob = {
    order: [
        [gram, (count + 1) / (total_ngrams[order] + total_vocab[order])]
        for gram, count in counts.items()
    ]
    for order, counts in ngram_counts.items()
}

def Sort_prob(lst):
    """Return a new list of ``[ngram, probability]`` entries, most probable first.

    The input is left untouched; entries with equal probability keep their
    original relative order.
    """
    ranked = list(lst)
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

# Rank the entries of every n-gram order from most to least probable.
for order in (1, 2, 3):
    ngrams_all_prob[order] = Sort_prob(ngrams_all_prob[order])


In [None]:
# Show the five most probable bigrams. Slicing (rather than indexing 0..4)
# also works when the model holds fewer than five bigrams.
for entry in ngrams_all_prob[2][:5]:
    print(entry)

[('of', 'the'), 0.0036570311479623007]
[('in', 'the'), 0.002557393901858429]
[('to', 'the'), 0.0019759765073667267]
[('said', 'the'), 0.001908565794961892]
[('mr', 'bumble'), 0.001470396164330464]


In [None]:
#predicting next word in given test sentences

def predict_by_bigram(t1, n):
    """Extend a tokenized sentence with up to ``n`` words predicted by the bigram model.

    Prediction is greedy: for the current last word, ``ngrams_all_prob[2]`` is
    scanned in order and the first bigram starting with that word supplies the
    next word. The list is sorted by descending probability, so the first match
    is the most probable continuation.

    Args:
        t1: Non-empty list of tokens. It is not modified.
        n: Maximum number of words to append.

    Returns:
        The original tokens followed by the predicted words, each followed by a
        single space (so the string ends with a trailing space). Fewer than
        ``n`` words are appended if the model has no continuation for a word.

    Raises:
        IndexError: If ``t1`` is empty.
    """
    predicted_tokens = list(t1)  # copy, so the caller's list is left untouched
    # The model is built from lowercased text, so match case-insensitively.
    context = t1[-1].lower()

    for _ in range(n):
        next_word = next(
            (gram[1] for gram, _prob in ngrams_all_prob[2] if gram[0] == context),
            None,
        )
        if next_word is None:
            # No bigram starts with this word: stop instead of failing later.
            break
        predicted_tokens.append(next_word)
        context = next_word

    return "".join(word + " " for word in predicted_tokens)

def predict_by_trigram(t2, n):
    """Extend a tokenized sentence with up to ``n`` words predicted by the trigram model.

    Prediction is greedy: for the current last two words, ``ngrams_all_prob[3]``
    is scanned in order and the first trigram starting with that pair supplies
    the next word. The list is sorted by descending probability, so the first
    match is the most probable continuation.

    Args:
        t2: List of at least two tokens. It is not modified.
        n: Maximum number of words to append.

    Returns:
        The original tokens followed by the predicted words, each followed by a
        single space (so the string ends with a trailing space). Fewer than
        ``n`` words are appended if the model has no continuation for a pair.

    Raises:
        IndexError: If ``t2`` has fewer than two tokens.
    """
    if len(t2) < 2:
        raise IndexError("trigram prediction needs at least two context tokens")

    predicted_tokens = list(t2)  # copy, so the caller's list is left untouched
    # The model is built from lowercased text, so match case-insensitively.
    context = (t2[-2].lower(), t2[-1].lower())

    for _ in range(n):
        next_word = next(
            (gram[2] for gram, _prob in ngrams_all_prob[3] if gram[:2] == context),
            None,
        )
        if next_word is None:
            # No trigram starts with this pair: stop instead of failing later.
            break
        predicted_tokens.append(next_word)
        context = (context[1], next_word)

    return "".join(word + " " for word in predicted_tokens)

def inputsent():
    """Prompt for a sentence and a word count, then print both models' predictions.

    Reads the sentence and the number of words to predict from standard input,
    tokenizes the sentence with ``word_tokenize`` and prints the sentence as
    extended by the bigram model and by the trigram model.

    Raises:
        ValueError: If the word count entered is not an integer.
    """
    senttence = input("enter the sentence\n")
    n = int(input("enter the number of words you want to predict\n"))

    tok_sent1 = word_tokenize(senttence)  # tokenizing the input string
    if not tok_sent1:
        # Neither model has any context to predict from.
        print("No words found in the given sentence")
        return

    print("The predicted words for the given sentence are as follows:")
    print("Bigram--->" + predict_by_bigram(tok_sent1.copy(), n))
    try:
        print("Trigram--->" + predict_by_trigram(tok_sent1.copy(), n) + '\n')
    except IndexError:
        # Only the "not enough context" failure is handled; anything else
        # (including KeyboardInterrupt) propagates instead of being swallowed.
        print("Trgiram model cannot predict on the sentence with words less than 2")

In [12]:
# perplexity
def bigram_perplexity(sentence):
    """Return the perplexity of ``sentence`` under the add-one smoothed bigram model.

    Each bigram (w1, w2) of the sentence contributes
    ``(count(w1, w2) + 1) / (count(w1) + V)``, with V the number of distinct
    unigrams; the perplexity is the inverse geometric mean of those values.

    Args:
        sentence: Text to score. It is lowercased before tokenizing because the
            model is built from lowercased text.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If the sentence contains fewer than two tokens.
    """
    tokens = word_tokenize(sentence.lower())
    vocab_size = len(unigrams_freqDist)

    # Sum log-probabilities instead of multiplying probabilities, so long
    # sentences cannot underflow to 0.0.
    log_prob = 0.0
    ct = 0
    for bigram in ngrams(tokens, 2):
        ct_bigram = bigrams_freqDist.get(bigram, 0)
        # Unigram keys are 1-tuples such as ('word',), not bare strings.
        ct_unigram = unigrams_freqDist.get((bigram[0],), 0)
        log_prob += math.log((ct_bigram + 1) / (ct_unigram + vocab_size))
        ct += 1

    if ct == 0:
        raise ValueError("bigram perplexity needs a sentence with at least two tokens")
    return math.exp(-log_prob / ct)

def trigram_perplexity(sentence):
    """Return the perplexity of ``sentence`` under the add-one smoothed trigram model.

    Each trigram (w1, w2, w3) of the sentence contributes
    ``(count(w1, w2, w3) + 1) / (count(w1, w2) + V)``, with V the number of
    distinct unigrams; the perplexity is the inverse geometric mean of those
    values, taken over the number of trigrams scored.

    Args:
        sentence: Text to score. It is lowercased before tokenizing because the
            model is built from lowercased text.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If the sentence contains fewer than three tokens.
    """
    tokens = word_tokenize(sentence.lower())
    vocab_size = len(unigrams_freqDist)

    # Sum log-probabilities instead of multiplying probabilities, so long
    # sentences cannot underflow to 0.0.
    log_prob = 0.0
    ct = 0
    for trigram in ngrams(tokens, 3):
        ct_trigram = trigrams_freqDist.get(trigram, 0)
        # The conditioning context is the first two words of the trigram.
        ct_bigram = bigrams_freqDist.get((trigram[0], trigram[1]), 0)
        log_prob += math.log((ct_trigram + 1) / (ct_bigram + vocab_size))
        # Normalise by the number of trigrams actually scored (not by the
        # number of bigrams in the sentence).
        ct += 1

    if ct == 0:
        raise ValueError("trigram perplexity needs a sentence with at least three tokens")
    return math.exp(-log_prob / ct)
def perplexity(sentence, n=0):
    """Print the bigram and trigram perplexity of ``sentence``.

    The sentence is tokenized, optionally extended with ``n`` predicted words
    by each model, and the resulting text is scored by the matching perplexity
    function.

    Args:
        sentence: Text to score.
        n: Number of words each model should predict and append before scoring.
            The default of 0 scores the sentence as given.
    """
    # Tokenize the argument itself; `tok_sent1` and `n` are local to inputsent()
    # and are not available here as globals.
    tokens = word_tokenize(sentence)
    print("the perplexity is as follows:\n")
    print("bigram perplexity =", bigram_perplexity(predict_by_bigram(tokens.copy(), int(n))))
    print("trigram perplexity =", trigram_perplexity(predict_by_trigram(tokens.copy(), int(n))))

In [17]:
inputsent()

KeyboardInterrupt: ignored