# Ex2

## Q1

TBD

## Q2

TBD

## Q3

TBD

## Q4

### Get Corpus and Dictionary

In [1]:
import urllib2
import zipfile
import os

data_folder = 'corpus_combined'
corpus_file = data_folder + '.zip'

if not os.path.exists(data_folder):
    os.makedirs(data_folder)
    
response = urllib2.urlopen('http://www2.mta.ac.il/~gideon/courses/nlp/data/' + corpus_file)
zipcontent = response.read()
with open(corpus_file, 'w') as f:
    f.write(zipcontent)
zip_ref = zipfile.ZipFile(corpus_file, 'r')
zip_ref.extractall(data_folder)
zip_ref.close()
os.remove(corpus_file)

response = urllib2.urlopen('http://www2.mta.ac.il/~gideon/courses/nlp/data/word_list_20k.txt')
dictionary = response.read().split('\n')
dictionary.remove('')

### Prepare training and test set

In [2]:
import urllib2

# training
response = urllib2.urlopen('http://www.gutenberg.org/files/421/421-0.txt')
kidnapped = response.read().decode('utf8')
response = urllib2.urlopen('http://www.gutenberg.org/ebooks/730.txt.utf-8')
oliver_twist = response.read().decode('utf8')
training_dataset = [kidnapped, oliver_twist]

# validation
response = urllib2.urlopen('http://www.gutenberg.org/files/98/98-0.txt')
a_tail_of_two_cities = response.read().decode('utf8')
validation_dataset = [a_tail_of_two_cities]

# testing
persuasion = open(data_folder + '/persuasion.txt', 'r').read()
testing_dataset = [persuasion]

### Hyperparameter Tuning

In [3]:
hyperparameters = {
    'lowercase': True,
    'stemming': True,
    'remove_stopwords': True,
    'smoothing': True,
    'lambda': 0.5
}

### Pre-Processing

In [None]:
from chapterize import Book
from nltk.tokenize import word_tokenize

OOV = 'OOV'
NUM = 'NUM'

# removing header, footer and chapters
training = ''
for book in training_dataset:
    training += Book(book).getContent() + ' '
validation = ''
for book in validation_dataset:
    validation += Book(book).getContent() + ' '
testing = ''
for book in testing_dataset:
    testing += Book(book).getContent() + ' '

# lowercasing
if hyperparameters['lowercase']:
    training = training.lower()
    validation = validation.lower()
    testing = testing.lower()
    dictionary = [word.lower() for word in dictionary]
    
# tokenizing
training = word_tokenize(training)
validation = word_tokenize(validation)
testing = word_tokenize(testing)

# removing stopwords
if hyperparameters['remove_stopwords']:
    from nltk.corpus import stopwords
    
    stopwords_set = set(stopwords.words('english'))
    training = [token for token in training if token not in stopwords_set]
    validation = [token for token in validation if token not in stopwords_set]
    testing = [token for token in testing if token not in stopwords_set]
    dictionary = [word for word in dictionary if word not in stopwords_set]
    
# stemming
if hyperparameters['stemming']:
    from nltk.stem import PorterStemmer
    
    ps = PorterStemmer()
    training = [ps.stem(token) for token in training]
    validation = [ps.stem(token) for token in validation]
    testing = [ps.stem(token) for token in testing]
    dictionary = [ps.stem(word) for word in dictionary]

dictionary.append(OOV)
dictionary.append(NUM)

# replacing OOV and NUM tokens
missing_tokens = set([token for token in training if token not in dictionary])
for (i, token) in enumerate(training):
    if token in missing_tokens:
        if token.isdigit():
            training[i] = NUM
        else:
            training[i] = OOV
            
missing_tokens = set([token for token in validation if token not in dictionary])
for (i, token) in enumerate(validation):
    if token in missing_tokens:
        if token.isdigit():
            validation[i] = NUM
        else:
            validation[i] = OOV

missing_tokens = set([token for token in testing if token not in dictionary])
for (i, token) in enumerate(testing):
    if token in missing_tokens:
        if token.isdigit():
            testing[i] = NUM
        else:
            testing[i] = OOV

### Training

In [None]:
def train_ngram_model(tokens):
    model = {}
    for token in tokens:
        model[token] = model.get(token, 0) + 1
    return model

In [None]:
from nltk import bigrams, trigrams

# unigram
unigram_model = train_ngram_model(training)

# bigram
bigram_model = train_ngram_model(list(bigrams(training)))

# unigram
trigram_model = train_ngram_model(list(trigrams(training)))

### Validation

In [None]:
from __future__ import division

def cross_entropy(tokens, model):
    import math
    
    if hyperparameters['smoothing']:
        total_sum = 0.0
        l = hyperparameters['lambda']
        v = len(set(tokens))
        for token in tokens:
            total_sum += -math.log((model.get(token, 0) + l) / (len(tokens) + l * v), 2)
        return 1 / len(tokens) * total_sum
    else:
        return 1 / len(tokens) * sum([-math.log(model[token] / len(tokens), 2) 
                                     for token in tokens
                                        if token in model and model[token] > 0])

In [None]:
from nltk import bigrams, trigrams

# unigram
result = cross_entropy(validation, unigram_model)
print 'Validation Unigram Cross-Entropy is:', '{0:.2f}'.format(result)

# bigram
result = cross_entropy(list(bigrams(validation)), bigram_model)
print 'Validation Bigram Cross-Entropy is:', '{0:.2f}'.format(result)

# trigram
result = cross_entropy(list(trigrams(validation)), trigram_model)
print 'Validation Trigram Cross-Entropy is:', '{0:.2f}'.format(result)

### Testing

In [None]:
from nltk import bigrams, trigrams

# unigram
result = cross_entropy(testing, unigram_model)
print 'Testing Unigram Cross-Entropy is:', '{0:.2f}'.format(result)

# bigram
result = cross_entropy(list(bigrams(testing)), bigram_model)
print 'Testing Bigram Cross-Entropy is:', '{0:.2f}'.format(result)

# trigram
result = cross_entropy(list(trigrams(testing)), trigram_model)
print 'Testing Trigram Cross-Entropy is:', '{0:.2f}'.format(result)

### Conclusion

A. For this exercise I used my own code written in Python using the nltk library. To remove chapter names, headers and footers I used [this](https://github.com/JonathanReeve/chapterize) open-source library.

B. For the training dataset I used **Kidnapped** by *Robert Louis Stevenson* and **Oliver Twist** by *Charles Dickens*. For validation purposes I used **A Tale of Two Cities** by *Charles Dickens*.

C. I used Additive/Lidstone Smoothing with &lambda; = 0.5

D. Yes, using nltk's word_tokenize I removed all "noisy" characters and left only words 

E. I used three language models - 
    1. Unigram
    2. Bigram
    3. Trigram
    
Results are as follows - 

<table>
    <tr><td>Lowercase</td><td>Stemming</td><td>No Stopwords</td><td>Unigram</td><td>Bigram</td><td>Trigram</td></tr>
    <tr><td>True</td><td>True</td><td>True</td><td>7.12</td><td>13.31</td><td>16.19</td></tr>
    <tr><td>True</td><td>True</td><td>False</td><td>6.91</td><td>13.20</td><td>16.63</td></tr>
    <tr><td>True</td><td>False</td><td>True</td><td>7.53</td><td>13.59</td><td>16.26</td></tr>
    <tr><td>True</td><td>False</td><td>False</td><td>7.17</td><td>13.35</td><td>16.67</td></tr>
    <tr><td>False</td><td>True</td><td>True</td><td>6.68</td><td>12.58</td><td>15.61</td></tr>
    <tr><td>False</td><td>True</td><td>False</td><td>6.78</td><td>13.02</td><td>16.47</td></tr>
    <tr><td><b>False</b></td><td><b>False</b></td><td><b>True</b></td><td><b>6.01</b></td><td><b>11.33</b></td><td><b>14.41</b></td></tr>
    <tr><td>False</td><td>False</td><td>False</td><td>6.36</td><td>12.18</td><td>15.61</td></tr>
</table>

F. I performed hyperparameter tuning on the smoothing lambda value and during the pre-processing (i.e. remove stopwords, run stemming etc.)

G. Yes

H. I used the provided dictionary

I. For missing words I used the token OOV and for missing numbers I used the token NUM

J. <b>???</b>