# Week 2 Homework

Group *May oh ness*

Nick Halliwell, Aina Lopez, Yaroslav Marchuk

## 1. A skeleton class structure for documents

In [3]:
import numpy as np
import codecs
import nltk
import re
import math 
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from itertools import repeat


class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        

        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
            
    def document_term_matrix(self):
        """
        description:  returns a D by V array of frequency counts
        """  
        # subroutine: computes the counts of each vocabulary in the document
        def counts(doc):
            # initialize a matrix
            term_mat = [0]*len(self.token_set)
            for token in doc.tokens:
                term_mat[list(self.token_set).index(token)] = term_mat[list(self.token_set).index(token)] + 1
            return term_mat;
            
        self.doc_term_matrix = []
        
        for doc in self.docs:
            self.doc_term_matrix.append([doc.pres + " " + doc.year, counts(doc)])


      
    def tf_idf(self):
        """
        description:  returns a D by V array of tf-idf scores
        """
        # Compute inverse document frequency 
        idf = [0]*len(self.token_set)
        for token in self.token_set:
            ind = 0
            for doc in self.docs:
                if token in doc.tokens:
                    ind += 1 
            idf[list(self.token_set).index(token)] = math.log(self.N/ind)
        
        # Create a subroutine that computes tf_idf for one document
        def tfidf(doc):
            term_mat = [0]*len(self.token_set)
            for token in doc.tokens:
                term_mat[list(self.token_set).index(token)] = term_mat[list(self.token_set).index(token)] + 1 
        
            for i,term in enumerate(term_mat):
                if term != 0:
                    term_mat[i] = (1 + math.log(term)) * idf[i]
            return term_mat;
        
        #tf_idf
        self.tf_idf_matrix = []
        for doc in self.docs:
            self.tf_idf_matrix.append([doc.pres + " " + doc.year, tfidf(doc)])
            
            
        
    def dict_rank(self, n, dictionary, token_repr):
        """
        description:  returns the top n documents based on a given dictionary and represenation of tokens
        """
        if token_repr == "tf-idf":
            self.tf_idf()
            representation = self.tf_idf_matrix
            
        if token_repr == "doc-term":
            self.document_term_matrix()
            representation = self.doc_term_matrix
            
        # Return top n docs based on dictionary given
        score = []
        x=self.token_set
        x=list(x)
        for token in x: 
            try:
                score.append(dictionary[token])
            except: 
                score.append(0)

        # get a vector with all the scores in order
        score=[int(x) for x in score]
        rank = {}
        elements=range(len(representation))
   
        for i in elements:
            rank[representation[i][0]] = np.dot(score,representation[i][1])
            
        # Get sorted view of the keys.
        s = sorted(rank, key=rank.get, reverse=True)[0:(n-1)]
        
        ranking = {}
        for key in s:
            ranking[key] =  rank[key]
        
        return ranking 

In [5]:
class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])





Load the document and create the corpus


In [6]:
def parse_text(textraw, regex):
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)

In [7]:
text = open("/Users/ainalopez/Downloads/text_mining-master/data/pres_speech/sou_all copy.txt", 'r').read()
# text = open("/home/yaroslav/Projects/text_mining/data/pres_speech/sou_all.txt", 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)

#Instantite the corpus class
# corpus = Corpus(pres_speech_list, '/home/yaroslav/Projects/text_mining/data/stopwords/stopwords.txt', 2)
corpus = Corpus(pres_speech_list, '/Users/ainalopez/Downloads/text_mining-master-2/data/stopwords/stopwords.txt', 2)

## 2. tf_idf matrix and SVD

In [9]:
# Compute tf_idf matrix 
corpus.tf_idf()
matrix = corpus.tf_idf_matrix

X =[] 
presidents = []

for row in matrix:
    X.append(row[1])  
    presidents.append(row[0].split()[0])

X = np.array(X)


In [94]:
# Compute svd 
sing_values_nb = 70
 
U, s, V = np.linalg.svd(X)
X_hat = np.dot(U[:,0:(sing_values_nb-1)] * s[0:(sing_values_nb-1)], V[0:(sing_values_nb-1),:])

# 3. Cosine Similarity of documents

In [95]:
def cosine_similarity(doc1, doc2):
    return np.dot(doc1, doc2) / ( math.sqrt(np.dot(doc1, doc1))* math.sqrt(np.dot(doc2, doc2)) )

similarity_X = np.zeros((len(X), len(X)))
similarity_X_hat = np.zeros((len(X_hat), len(X_hat)))

for i in range(len(X)):
    for j in range(len(X)):
        similarity_X[i][j] = cosine_similarity(X[i], X[j])
        
for i in range(len(X_hat)):
    for j in range(len(X_hat)):
        similarity_X_hat[i][j] = cosine_similarity(X_hat[i], X_hat[j])
  

Create labels for the speeches made by Republicans and Democrats.

In [96]:
# Add labels per president 
Republicans = ["Lincoln", "Grant", "Hayes", "Eisenhower", "Arthur", "Harrison", "McKinley", 
              "Taft", "Harding", "Coolidge", "Hoover", "Nixon", "Ford", "Reagan", "Bush", "Tyler", "Taylor", "Fillmore"]

Democrats = ["Jackson", "Buren", "Polk", "Pierce", "Buchanan", "Johnson", "Cleveland", "Wilson", 
             "Truman", "Kennedy", "Carter", "Clinton", "Obama"]

Others = ["Jefferson", "Madison", "Monroe", "Adams" , "Washington"]


ideology = []
R = []
D = []

for i in range(len(presidents)):
    if presidents[i] in Republican:
        ideology.append('Republicans')
        R.append(i)
    elif presidents[i] in Democratic:
        ideology.append('Democrats')
        D.append(i)
    elif presidents[i] in Others:
        ideology.append('Others')
    else:
        ideology.append('NA')
        


Since similarity matrices are very large, we cannot inspect them directly. 

We are going to compute the mean of the similarity of Democrat speeches, the mean of similarity of Republican speeches and the mean of similarity between Republican and democrat speeches.

In [97]:
rep_rep = []
dem_dem = []
rep_dem = []

rep_rep_h = []
rep_dem_h = []
dem_dem_h = []


for r in range(len(R)):
    rep_rep.extend(similarity_X[r, R])
    rep_rep_h.extend(similarity_X_hat[r, R])
    rep_dem.extend(similarity_X[r, D])
    rep_dem_h.extend(similarity_X_hat[r, D])
    
for d in range(len(D)):
    dem_dem.extend(similarity_X[d, D])
    dem_dem_h.extend(similarity_X_hat[d, D])
    rep_dem.extend(similarity_X[d, R])
    rep_dem_h.extend(similarity_X_hat[d, R])


print "Republican speeches "
print "Without SVD: ", np.mean(rep_rep)
print "With SVD: ", np.mean(rep_rep_h)

print(" ")

print "Democrat speeches "
print "Without SVD: ",np.mean(dem_dem) 
print "With SVD: ",np.mean(dem_dem_h)

print(" ")

print "Others speeches "
print "Without SVD: ", np.mean(rep_dem)
print "With SVD: ", np.mean(rep_dem_h)




Republican speeches 
Without SVD:  0.0838146250777
With SVD:  0.279057416427
 
Democrat speeches 
Without SVD:  0.0893912241585
With SVD:  0.30031307907
 
Others speeches 
Without SVD:  0.0849360505636
With SVD:  0.285312456328


Results:

Republican speeches similarity
Without SVD:  0.0838146250777
With SVD:  0.279057416427
 
Democrat speeches similarity
Without SVD:  0.0893912241585
With SVD:  0.30031307907
 
Democrat vs Republican speeches similarity
Without SVD:  0.0849360505636
With SVD:  0.285312456328

We can see that the average cosine similarity within speeches of the same ideology increased when we used SVD. However, surprisingly, the average cosine similarity within speeches of different ideology also increased.
