In [None]:
import evaluate as ev # importing 
import numpy as np     
from model_builder import * 
!pip install gdown  
import gdown # package that downloads files from a shared google drive to local system
#import nltk
#nltk.download('omw-1.4')

## Section 1 - Exercising all functionality with n=2

<h3 style="text-align:left; color: black;">Perplexity Of Held Out Set</h3>  


In [3]:

# Note - calculating the perplexity of the entire data yields an undefined perplexity measure (extremely high as the probability the model assigns the data is extremely small)

# calculates average sentence perplexity across data 
def average_perplexity(n:int, use_smoothing:bool, data:list):  
    perplexities = []
    for sentence in data:  
        # calculating the perplexity of each sentence
        perplexities.append(ev.calculate_perplexity_sentence(n=n, use_smoothing=use_smoothing, sentence=sentence, clean_sentence=True)) 
    return np.nanmean(perplexities)

In [11]:

test_data_url = "https://drive.google.com/uc?export=download&id=1QiTMvnjYu-e0BxkKOych9WhrV13x77fO" 


output = 'test_data.txt' 

# Download the file
#gdown.download("https://drive.google.com/uc?export=download&id=1QiTMvnjYu-e0BxkKOych9WhrV13x77fO" , output, quiet=False)

with open(output, 'r') as file: 
    test_data = file.read() 

# Note: We are calculating the average perplexity of the test data set sentence by sentence for 50 sentences in the test set. We observed that the perplexity of the entire test data is undefined as the probability the model assigns the test set data is effectively zero 
# There are also sentences in the test data set that have extremely high perplexities so 

sentences = test_data.split('+') 
sentences = sentences[:50]


print(f"Perplexity with n=2 using good turing smoothing is <{average_perplexity(n=2, use_smoothing=True, data=sentences)}>")

print(f"Perplexity with n=2 unsmoothed is <{average_perplexity(n=2, use_smoothing=True, data=sentences)}>")








Perplexity with n=2 using good turing smoothing is <43006148.60803921>
Perplexity with n=2 unsmoothed is <43006148.60803921>


<h3 style="text-align:left; color: black;">Unscrambling a sentence </h3> 

In [5]:
with open('eg_scrambled_sentence.txt', 'w') as file: 
    file.write("cat saw i a") 
    
ev.unscramble(n=2, use_smoothing=True, scrambled_file='eg_scrambled_sentence.txt')
ev.unscramble(n=2, use_smoothing=False, scrambled_file='eg_scrambled_sentence.txt')


Language Model: <N-gram with n = 2 i.e.  bigram .Uses Good-Turing Smoothing with a log-function>
Original Text:  cat saw i a
Unscrambled sentence:  i saw a cat
Original Perplexity:  907891.0752602484
Unscrambled Perplexity:  2561.874399156617
Language Model: <N-gram with n = 2 i.e.  bigram .No smoothing used>
Original Text:  cat saw i a
Unscrambled sentence:  cat i saw a
Original Perplexity:  Undefined
Unscrambled Perplexity:  155795.6131688053


## Section 2 - Exercising all functionality with n=2 on new data source


<h3 style="text-align:left; color: black;">Perplexity of new data</h3> 

In [7]:
# our new data
output = 'new_data.txt'  

# Download the file - can comment this out if the file has already been downloaded 
# This file is called new_data.txt and is available in the shared google drive linked in the README
gdown.download("https://drive.google.com/uc?export=download&id=146pd1hoMUdHjz-irf0yBy-YKuhg-FBHY" , output, quiet=False) 




# cleans text file at the specified path
def preprocess_data(filepath:str)->list: 
    # Reading contents from file 
    with open(filepath, 'r') as file: 
        new_data = file.read() 

    # calling the clean_corpus function to clean(remove special characters and convert to lower case) and lemmatize string
    cleaned_data = clean_corpus(new_data) 
    cleaned_data = cleaned_data.split('+') # get all the sentences in our novel corpus(each sentence ends with an '+' EOS character)
    cleaned_data = [sentence for sentence in cleaned_data if sentence.strip()]

    for i in range(len(cleaned_data)):
        cleaned_data[i] = '+'+ cleaned_data[i] # adding a special EOS character that represents the end of one sentence and the start of another
    return cleaned_data 

cleaned_data = preprocess_data('new_data.txt')  

# Using a smoothed bigram model to get the average sentence perplexity in the test set
print(f"Perplexity with n=2 using good turing smoothing is: <{average_perplexity(n=2, use_smoothing=True, data=cleaned_data)}>") 

# Using an unsmoothed bigram model to get the average sentence perplexity in the test 
print(f"Perplexity with n=2 without smoothing: <{average_perplexity(n=2, use_smoothing=False, data=cleaned_data)}>")

Downloading...
From: https://drive.google.com/uc?export=download&id=146pd1hoMUdHjz-irf0yBy-YKuhg-FBHY
To: C:\Users\rohit\PycharmProjects\CSDS497\new_data.txt
100%|█████████████████████████████████████████████████████████████████████████████| 2.21k/2.21k [00:00<00:00, 1.11MB/s]


Perplexity with n=2 using good turing smoothing is: <2442291.2992360266>
Perplexity with n=2 without smoothing: <7702569.208915002>


<h3 style="text-align:left; color: black;">Unscrambling a sentence </h3> 

In [6]:

with open('new_data_scrambled.txt', 'w') as file: 
    file.write('they ball red saw a') 
    
#using a smoothed bigram model to unscramble 
ev.unscramble(n=2, use_smoothing=True, scrambled_file='new_data_scrambled.txt') 

#using an unsmoothed bigram model to unscramble 
ev.unscramble(n=2, use_smoothing=False, scrambled_file='new_data_scrambled.txt')

Language Model: <N-gram with n = 2 i.e.  bigram .Uses Good-Turing Smoothing with a log-function>
Original Text:  they ball red saw a
Unscrambled sentence:  they saw a red ball
Original Perplexity:  31022963.59629315
Unscrambled Perplexity:  13131.422739635205
Language Model: <N-gram with n = 2 i.e.  bigram .No smoothing used>
Original Text:  they ball red saw a
Unscrambled sentence:  red ball a they saw
Original Perplexity:  Undefined
Unscrambled Perplexity:  944134.4671214414


## Section 3- Comparing performance of models for n=1,2,3 on the new data set

<h3 style="text-align:left; color: black;">Perplexity of new data</h3>

In [8]:
 
# calling preprocess_data method defined before
cleaned_data = preprocess_data('new_data.txt')

# calculating average sentence perplexity(in new data set)
print(f"Perplexity with n=1 using good turing smoothing is: <{average_perplexity(n=1, use_smoothing=True, data=cleaned_data)}>") 
print(f"Perplexity with n=2 using good turing smoothing is: <{average_perplexity(n=2, use_smoothing=True, data=cleaned_data)}>") 
print(f"Perplexity with n=3 using good turing smoothing is: <{average_perplexity(n=3, use_smoothing =True, data=cleaned_data)}>")


Perplexity with n=1 using good turing smoothing is: <964.3445896021693>
Perplexity with n=2 using good turing smoothing is: <2442291.2992360266>
Perplexity with n=3 using good turing smoothing is: <3365598214995.135>


<h3 style="text-align:left; color: black;">Unscrambling a sentence using n-gram models where n=1,2,3</h3> 

In [None]:
# create a file with scrambled text 

# here we assume that all the words in the scrambled sentence are from our lexicon 

with open('scrambled_sentence.txt', 'w') as file: 
    file.write('a is cat this') 

# demonstrating the unscrambler using n=1,2,3 smoothed n grams 
ev.unscramble(n=1, use_smoothing=True,scrambled_file = 'scrambled_sentence.txt') 
print()  
ev.unscramble(n=2, use_smoothing=True, scrambled_file='scrambled_sentence.txt')
print()





Language Model: <N-gram with n = 1 i.e.  unigram .Uses Good-Turing Smoothing with a log-function>
Original Text:  a is cat this
Unscrambled sentence:  a is cat this
Original Perplexity:  339.7988347502979
Unscrambled Perplexity:  339.7988347502979

Language Model: <N-gram with n = 2 i.e.  bigram .Uses Good-Turing Smoothing with a log-function>
Original Text:  a is cat this
Unscrambled sentence:  this is a cat
Original Perplexity:  40812917.61629939
Unscrambled Perplexity:  736.4214701181066

Language Model: <N-gram with n = 3 i.e.  trigram .Uses Good-Turing Smoothing with a log-function>


In [12]:
ev.unscramble(n=3, use_smoothing=True, scrambled_file = 'scrambled_sentence.txt')

Language Model: <N-gram with n = 3 i.e.  trigram .Uses Good-Turing Smoothing with a log-function>
Original Text:  a is cat this
Unscrambled sentence:  cat is a this
Original Perplexity:  4538365171.359575
Unscrambled Perplexity:  83943.18060161584


## Section 4- Comparing performance of smoothed and unsmoothed models where n=2 on data drawn from lexicon and data necessarily drawn from lexicon

### Data drawn from lexicon

In [8]:

# Creating a file containing text which is made of tokens present in our lexicon (but could include other tokens) 

with open('not_strictly_lexicon.txt', 'w') as file: 
    file.write('Susie played the guitar She did not see her mom') 
    
with open('scrambled_text_bigram_demo.txt', 'w') as file: 
    file.write('chair sat cat on the the')

# using an unsmoothed bigram model to get the perplexity of the text
ev.calculate_perplexity(n=2, use_smoothing=False, test_data_file='not_strictly_lexicon.txt') 

# demonstrating the usncrambler for an unsmoothed bigram model  
ev.unscramble(n=2, use_smoothing=False, scrambled_file='scrambled_text_bigram_demo.txt') 

# using a smoothed bigram model to get the perplexity of the text
ev.calculate_perplexity(n=2, use_smoothing=True, test_data_file='not_strictly_lexicon.txt') 

#demonstrating the unscrambler for a smoothed bigram model 
ev.unscramble(n=2, use_smoothing=True, scrambled_file='scrambled_text_bigram_demo.txt')



Language Model: <N-gram with n = 2. No smoothing used>
Perplexity = <Undefined>
Language Model: <N-gram with n = 2 i.e.  bigram .No smoothing used>
Original Text:  chair sat cat on the the
Unscrambled sentence:  sat on the chair the cat
Original Perplexity:  Undefined
Unscrambled Perplexity:  950192.501162325
Language Model: <N-gram with n = 2. Uses Good-Turing Smoothing with a log-function>
Perplexity = <2589391.154690356>
Language Model: <N-gram with n = 2 i.e.  bigram .Uses Good-Turing Smoothing with a log-function>
Original Text:  chair sat cat on the the
Unscrambled sentence:  sat on the chair the cat
Original Perplexity:  4312485.81614361
Unscrambled Perplexity:  5186.606104977492


### Data necessarily drawn from lexicon

In [10]:


# Creating a file containing text which is strictly made of tokens present in our lexicon 

with open('lexicon_only.txt', 'w') as file: 
    file.write('The cat sat on the chair.The man read the news on his phone.There was no music playing.')  
    
with open('scrambled_text_bigram_demo.txt', 'w') as file: 
    file.write('chair sat cat on the the')

# using an unsmoothed bigram model to get the perplexity of the text made of tokens strictly from our lexicon 
ev.calculate_perplexity(n=2, use_smoothing=False, test_data_file='lexicon_only.txt') 

# demonstrating the usncrambler for an unsmoothed bigram model  
ev.unscramble(n=2, use_smoothing=False, scrambled_file='scrambled_text_bigram_demo.txt') 

# using a smoothed bigram model to get the perplexity of the text made of tokens scrictly from our lexicon 
ev.calculate_perplexity(n=2, use_smoothing=True, test_data_file='lexicon_only.txt') 

#demonstrating the unscrambler for a smoothed bigram model 
ev.unscramble(n=2, use_smoothing=True, scrambled_file='scrambled_text_bigram_demo.txt')




Language Model: <N-gram with n = 2. No smoothing used>
Perplexity = <Undefined>
Language Model: <N-gram with n = 2 i.e.  bigram .No smoothing used>
Original Text:  chair sat cat on the the
Unscrambled sentence:  sat on the chair the cat
Original Perplexity:  Undefined
Unscrambled Perplexity:  950192.501162325
Language Model: <N-gram with n = 2. Uses Good-Turing Smoothing with a log-function>
Perplexity = <3346106.4254960623>
Language Model: <N-gram with n = 2 i.e.  bigram .Uses Good-Turing Smoothing with a log-function>
Original Text:  chair sat cat on the the
Unscrambled sentence:  sat on the chair the cat
Original Perplexity:  4312485.81614361
Unscrambled Perplexity:  5186.606104977492
