# Natural Language Processing


## The dataset

The Federalist Papers, downloaded from Project Gutenberg: http://www.gutenberg.org/ebooks/18

In [3]:
import re

def load_federalist_corpus(filename):
    """Load the Federalist Papers as tokenized strings, one per essay.

    The file is split on the literal "FEDERALIST"; within each chunk the essay
    body is the text between the last "of the State of New York" salutation
    and the last "PUBLIUS" signature.  Bodies are lower-cased, whitespace is
    collapsed to single spaces, and punctuation is surrounded by spaces so
    that splitting on " " yields punctuation as separate tokens.

    Args:
        filename: path to the plain-text file (read as UTF-8).

    Returns: tuple: (papers_content, authors, numbers)
        papers_content: list of strings, one space-separated token string per
                        chunk that produced a non-empty body
        authors: list of tuples with every "MADISON" / "JAY" / "HAMILTON"
                 match found in a chunk (chunks with no match are dropped)
        numbers: list of the first "No. <digits>" string of each chunk that
                 has one

    NOTE(review): the three lists are filtered independently, so index i of
    one list only refers to the same essay as index i of the others if every
    kept chunk has a body, an author and a number -- confirm for the input.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(filename, "rt", encoding="utf-8") as f:
        data = f.read()
    papers = data.split("FEDERALIST")

    # all start with "To the people of the State of New York:" (sometimes . instead of :)
    # all end with PUBLIUS (or no end at all)
    papers_content = []
    for p in papers:
        # "+ 1" skips the ':' / '.' that follows the salutation.  The -1
        # sentinel makes a chunk with no salutation slice to "" (dropped
        # below) and a chunk with no signature run to its second-to-last char.
        start = max([-1] + [m.end() + 1 for m in re.finditer(r"of the State of New York", p)])
        end = max([-1] + [m.start() for m in re.finditer(r"PUBLIUS", p)])
        papers_content.append(p[start:end])

    # discard entries that are not actually a paper
    papers_content = [p for p in papers_content if len(p) > 0]

    # replace all whitespace with a single space
    papers_content = [re.sub(r"\s+", " ", p).lower() for p in papers_content]

    # add spaces around all punctuation, so they are separate tokens
    # ("-" and "'" are kept attached so hyphenated words and contractions survive)
    punctuation = set(re.findall(r"[^\w\s]+", " ".join(papers_content))) - {"-", "'"}
    # Iterating the raw set would apply the replacements in hash order, which
    # changes between interpreter runs and can change the tokenization when one
    # punctuation string contains another (e.g. ";" and ";--").  Apply them
    # longest-first in a fixed order instead.
    for c in sorted(punctuation, key=lambda s: (-len(s), s)):
        papers_content = [p.replace(c, " " + c + " ") for p in papers_content]
    papers_content = [re.sub(r"\s+", " ", p).strip() for p in papers_content]

    authors = [tuple(re.findall("MADISON|JAY|HAMILTON", a)) for a in papers]
    authors = [a for a in authors if len(a) > 0]

    # search each chunk once instead of twice
    number_matches = (re.search(r"No\. \d+", p) for p in papers)
    numbers = [m.group(0) for m in number_matches if m]

    return papers_content, authors, numbers
    
    

In [13]:
# AUTOLAB_IGNORE_START
# Load the corpus from "pg18.txt" (path is relative to the working directory).
# `papers` and `authors` are reused by later cells in this notebook.
papers, authors, numbers = load_federalist_corpus("pg18.txt")

# AUTOLAB_IGNORE_STOP

## Bag of words, and TFIDF

In [14]:
import collections # optional, but we found the collections.Counter object useful
import scipy.sparse as sp
import numpy as np
import math
import random

def tfidf(docs):
    """
    Create TFIDF matrix.  This function creates a TFIDF matrix from the
    docs input.

    The score of term j in document i is
        (count of j in i) * log(len(docs) / (number of docs containing j)),
    using the natural logarithm.  Terms that occur in every document score 0
    and are simply not stored in the sparse matrix.

    Args:
        docs: list of strings, where each string represents a space-separated
              document

    Returns: tuple: (tfidf, all_words)
        tfidf: sparse CSR matrix of size (# docs) x (# total unique words),
               where i,j entry is TFIDF score for document i and term j
        all_words: list of strings, where the ith element indicates the word
                   that corresponds to the ith column in the TFIDF matrix
                   (words are ordered by first appearance)
    """
    n_docs = len(docs)

    # Per-document term counts, plus the number of documents containing each
    # term.  `doc_freq` is insertion-ordered, which fixes the column order.
    term_counts = []
    doc_freq = {}
    for doc in docs:
        # split on single spaces only; empty strings come from repeated,
        # leading or trailing spaces and are not words
        counts = collections.Counter(w for w in doc.split(" ") if w != "")
        term_counts.append(counts)
        for word in counts:
            doc_freq[word] = doc_freq.get(word, 0) + 1

    all_words = list(doc_freq)
    column_of = {word: j for j, word in enumerate(all_words)}

    # Visit only the (document, term) pairs that actually occur instead of
    # scanning every document for every vocabulary word.
    rows, cols, vals = [], [], []
    for i, counts in enumerate(term_counts):
        for word, tf in counts.items():
            score = tf * np.log(float(n_docs) / doc_freq[word])
            if score != 0.0:
                rows.append(i)
                cols.append(column_of[word])
                vals.append(score)

    # Pass the shape explicitly: inferring it from the stored indices would
    # drop trailing rows/columns whose scores are all zero (and fails outright
    # when nothing is stored).
    A = sp.coo_matrix(
        (np.array(vals, dtype=np.float64),
         (np.array(rows, dtype=np.int64), np.array(cols, dtype=np.int64))),
        shape=(n_docs, len(all_words)),
        dtype=np.float64,
    )
    return (A.tocsr(), all_words)


In [15]:
# TF-IDF matrix for the whole corpus; the vocabulary list is discarded here.
X_tfidf,_ = tfidf(papers)
# Bare expression as the last line so the notebook displays the matrix.
X_tfidf

In [16]:
# Three-document toy corpus for checking tfidf() by hand.
# Note: the first string ends with a trailing space.
data = [
    "the goal of this lecture is to explain the basics of free text processing ",
    "the bag of words model is one such approach",
    "text processing via bag of words" 
]

# NOTE: this rebinds X_tfidf, replacing the corpus matrix from the previous cell.
X_tfidf, words = tfidf(data)
print(X_tfidf.todense())
print(words)


[[ 0.81093022  1.09861229  0.          1.09861229  1.09861229  0.40546511
   1.09861229  1.09861229  1.09861229  1.09861229  0.40546511  0.40546511
   0.          0.          0.          0.          0.          0.          0.        ]
 [ 0.40546511  0.          0.          0.          0.          0.40546511
   0.          0.          0.          0.          0.          0.
   0.40546511  0.40546511  1.09861229  1.09861229  1.09861229  1.09861229
   0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.40546511  0.40546511  0.40546511
   0.40546511  0.          0.          0.          0.          1.09861229]]
['the', 'goal', 'of', 'this', 'lecture', 'is', 'to', 'explain', 'basics', 'free', 'text', 'processing', 'bag', 'words', 'model', 'one', 'such', 'approach', 'via']


In [17]:
def cosine_similarity(X):
    """
    Return a matrix of cosine similarities.

    Args:
        X: sparse matrix of TFIDF scores or term frequencies, one document
           per row (a dense 2-D array is accepted as well)

    Returns:
        M: dense numpy array of all pairwise cosine similarities.  That is, the
           entry M[i,j], should correspond to the cosine similarity between the
           ith and jth rows of X.  The diagonal is exactly 1.  A row with zero
           norm has similarity 0 with every other row (instead of NaN).
    """
    # Keep X two-dimensional: squeezing it would turn a single-row input into
    # a 1-D vector and produce an (n_cols x n_cols) result.
    X = sp.csr_matrix(X, dtype=np.float64)
    n_rows = X.shape[0]

    # All pairwise dot products at once; the result is only n_rows x n_rows,
    # so it is cheap to densify even when the vocabulary is large.
    gram = X.dot(X.T).toarray()

    # Row norms are computed once rather than inside the pairwise loop.
    norms = np.sqrt(np.asarray(X.multiply(X).sum(axis=1)).ravel())
    denom = np.outer(norms, norms)

    # Divide only where the denominator is non-zero; other entries stay 0.
    M = np.zeros((n_rows, n_rows), dtype=np.float64)
    np.divide(gram, denom, out=M, where=denom > 0)

    # Every row is treated as perfectly similar to itself.
    np.fill_diagonal(M, 1.0)
    return M

## N-gram language models

In [26]:
class LanguageModel:
    """N-gram language model with additive (Laplace) smoothing."""

    def __init__(self, docs, n):
        """
        Initialize an n-gram language model.

        Args:
            docs: list of strings, where each string represents a space-separated
                  document
            n: integer, degree of n-gram model (must be >= 1)

        Raises:
            ValueError: if n is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer, got %r" % (n,))

        self.n = n
        # context (the first n-1 tokens joined by single spaces; "" for a
        # unigram model) -> {next token: number of occurrences}
        self.counts = {}
        # context -> total number of n-grams that start with that context
        self.count_sums = {}

        vocabulary = set()
        for doc in docs:
            tokens = doc.split(" ")
            vocabulary.update(tokens)

            # slide a window of n tokens over the document
            for i in range(len(tokens) - n + 1):
                context = ' '.join(tokens[i:i + n - 1])
                token = tokens[i + n - 1]
                followers = self.counts.setdefault(context, {})
                followers[token] = followers.get(token, 0) + 1
                self.count_sums[context] = self.count_sums.get(context, 0) + 1

        # every distinct token seen while building the model
        self.dictionary = vocabulary

    def perplexity(self, text, alpha=1e-3):
        """
        Evaluate perplexity of model on some text.

        Each n-gram (context, w) of the text is assigned the smoothed
        probability
            (count(context, w) + alpha) / (count(context) + alpha * D)
        and the result is 2 ** (mean negative log2 probability).

        Args:
            text: string containing space-separated words, on which to compute
            alpha: constant to use in Laplace smoothing

        Note: for the purposes of smoothing, the dictionary size (i.e, the D term)
        should be equal to the total number of unique words used to build the model
        _and_ in the input text to this function.

        Returns: perplexity
            perplexity: floating point value, perplexity of the text as evaluated
                        under the model, or None when the text has fewer than
                        n words (it contains no n-gram to score).
        """
        n = self.n
        tokens = text.split(" ")

        if len(tokens) < n:
            return None

        D = len(self.dictionary.union(tokens))
        n_grams = len(tokens) - n + 1

        neg_log2_sum = 0
        for i in range(n_grams):
            context = ' '.join(tokens[i:i + n - 1])
            token = tokens[i + n - 1]
            # an unseen context or unseen continuation simply has count 0
            seen = self.counts.get(context, {}).get(token, 0)
            total = self.count_sums.get(context, 0)
            prob = float(seen + alpha) / (total + (alpha * D))
            neg_log2_sum += -1 * np.log2(prob)

        return 2 ** (neg_log2_sum / n_grams)

    def sample(self, k):
        """
        Generate a random sample of k words.

        The text starts from a context chosen uniformly at random among the
        contexts seen in training.  Each following word is drawn with
        probability proportional to how often it followed the current context.
        When the current context was never seen in training, a random seen
        context is used for that one draw.

        Args:
            k: integer, indicating the number of words to sample

        Returns: text
            text: string of exactly k space-separated words generated from the
                  model ("" when k <= 0).

        Raises:
            IndexError: if the model contains no n-grams (raised by
                        random.choice on the empty context list).
        """
        if k <= 0:
            return ""

        order = self.n - 1
        contexts = list(self.counts)

        # A unigram model has the single context "", which contributes no
        # words; otherwise start from the words of a random context, cut to k
        # in case the context alone is longer than the requested sample.
        if order > 0:
            words = random.choice(contexts).split(" ")[:k]
        else:
            words = []

        while len(words) < k:
            context = ' '.join(words[-order:]) if order > 0 else ''
            if context not in self.counts:
                context = random.choice(contexts)

            followers = self.counts[context]
            # weight each candidate by its n-gram count so the draw follows
            # the model's conditional distribution
            next_word = random.choices(list(followers),
                                       weights=list(followers.values()))[0]
            words.append(next_word)

        return ' '.join(words)

    
    

In [27]:
# AUTOLAB_IGNORE_START
# Small hand-made documents; they are defined here but not used in this cell.
documents = ['a b c d e f .',
            'b c e e f .']
unkno = ['b a b d e f .']

# Papers whose author tuple has exactly one name, grouped by that name.
# NOTE(review): papers[i] is indexed with positions from `authors`; this
# assumes the two lists line up one-to-one -- confirm against the loader.
l_hamilton = [papers[i] for i in range(len(authors)) if len(authors[i]) == 1 and authors[i][0] == 'HAMILTON']
l_jay = [papers[i] for i in range(len(authors)) if len(authors[i]) == 1 and authors[i][0] == 'JAY']
l_madison = [papers[i] for i in range(len(authors)) if len(authors[i]) == 1 and authors[i][0] == 'MADISON']

# One trigram model per author.
l_hamilton_model = LanguageModel(l_hamilton, 3)
l_jay_model = LanguageModel(l_jay, 3)
l_madison_model = LanguageModel(l_madison, 3)


print(l_hamilton_model.perplexity(papers[0]))
# NOTE: the return value of this call is neither assigned nor printed.
l_hamilton_model.sample(200)
# Papers whose author tuple holds two names are treated as "unknown" here.
unknown = [papers[i] for i in range(len(authors)) if len(authors[i]) == 2]
print(len(unknown))

# Total perplexity of the unknown papers under each author's model.
print(sum([l_hamilton_model.perplexity(thisDoc) for thisDoc in unknown]))
print(sum([l_jay_model.perplexity(thisDoc) for thisDoc in unknown]))
print(sum([l_madison_model.perplexity(thisDoc) for thisDoc in unknown]))

# AUTOLAB_IGNORE_STOP



12.5877243606


'what remedy can there be but one side , rests on mere general assertion , unsupported by any means be compared with the executive are comprehended in one way or other sinister motives , which signalizes the genius and fate of a right would ever be furnished by occasional assessments , at least you will , after all , as were before occasioned by obstructing the progress of things must rest on the evidence of this essential support , till the frail and tottering edifice seems ready to take effect , it be to enfeeble the union , an intimate intercourse between them a superiority of the next head of any candid and honest adversary of the assembly or senate , to sound the alarm when necessary ? what shall be passed . " humanity and good se se , we are even dissimilar views in the necessity of laying taxes ought to prize causes . notwithstanding , if both the dangers that may threaten their american dominions from the diffusive construction of ships is also an established fact , commanded t