# Week 2 Homework

Group *May oh ness*

Nick Halliwell, Aina Lopez, Yaroslav Marchuk

## 1. A skeleton class structure for documents

In [1]:
import numpy as np
import codecs
import nltk
import re
import math 
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from itertools import repeat


class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        

        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
            
    def document_term_matrix(self):
        """
        description:  returns a D by V array of frequency counts
        """  
        # subroutine: computes the counts of each vocabulary in the document
        def counts(doc):
            # initialize a matrix
            term_mat = [0]*len(self.token_set)
            for token in doc.tokens:
                term_mat[list(self.token_set).index(token)] = term_mat[list(self.token_set).index(token)] + 1
            return term_mat;
            
        self.doc_term_matrix = []
        
        for doc in self.docs:
            self.doc_term_matrix.append([doc.pres + " " + doc.year, counts(doc)])


      
    def tf_idf(self):
        """
        description:  returns a D by V array of tf-idf scores
        """
        # Compute inverse document frequency 
        idf = [0]*len(self.token_set)
        for token in self.token_set:
            ind = 0
            for doc in self.docs:
                if token in doc.tokens:
                    ind += 1 
            idf[list(self.token_set).index(token)] = math.log(self.N/ind)
        
        # Create a subroutine that computes tf_idf for one document
        def tfidf(doc):
            term_mat = [0]*len(self.token_set)
            for token in doc.tokens:
                term_mat[list(self.token_set).index(token)] = term_mat[list(self.token_set).index(token)] + 1 
        
            for i,term in enumerate(term_mat):
                if term != 0:
                    term_mat[i] = (1 + math.log(term)) * idf[i]
            return term_mat;
        
        #tf_idf
        self.tf_idf_matrix = []
        for doc in self.docs:
            self.tf_idf_matrix.append([doc.pres + " " + doc.year, tfidf(doc)])
            
            
        
    def dict_rank(self, n, dictionary, token_repr):
        """
        description:  returns the top n documents based on a given dictionary and represenation of tokens
        """
        if token_repr == "tf-idf":
            self.tf_idf()
            representation = self.tf_idf_matrix
            
        if token_repr == "doc-term":
            self.document_term_matrix()
            representation = self.doc_term_matrix
            
        # Return top n docs based on dictionary given
        score = []
        x=self.token_set
        x=list(x)
        for token in x: 
            try:
                score.append(dictionary[token])
            except: 
                score.append(0)

        # get a vector with all the scores in order
        score=[int(x) for x in score]
        rank = {}
        elements=range(len(representation))
   
        for i in elements:
            rank[representation[i][0]] = np.dot(score,representation[i][1])
            
        # Get sorted view of the keys.
        s = sorted(rank, key=rank.get, reverse=True)[0:(n-1)]
        
        ranking = {}
        for key in s:
            ranking[key] =  rank[key]
        
        return ranking 

In [2]:
class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])





Load the document and create the corpus


In [3]:
def parse_text(textraw, regex):
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)

In [4]:
#text = open("/Users/ainalopez/Downloads/text_mining-master/data/pres_speech/sou_all copy.txt", 'r').read()
text = open("/home/yaroslav/Projects/text_mining/data/pres_speech/sou_all.txt", 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)

#Instantite the corpus class
corpus = Corpus(pres_speech_list, '/home/yaroslav/Projects/text_mining/data/stopwords/stopwords.txt', 2)
#corpus = Corpus(pres_speech_list, '/Users/ainalopez/Downloads/text_mining-master-2/data/stopwords/stopwords.txt', 2)

########################## HW3 ##########################
###################################################



#######################################


Define functions to get m_kv and n_dk

In [148]:
import numpy as np

# the number of words in document d that have topic allocation k
def n_k(z, K):      
    n = [sum(np.equal(z ,k)) for k in range(K)]
    return n


# for each word, the number of times each topic appears
def m_k(Z, K): 
    m=[]
    for k in range(K):
        m.append([0]*N)
    
    for d in range(len(Z)):
        print "doc" + str(d)
        words = corpus.docs[d].tokens
        for w in range(len(words)):
            v=unique_words.index(words[w])
            k=Z[d][w]
            m[k][v]=m[k][v] + 1
    return m


Start Algorithm:


In [162]:


N = len(corpus.token_set) # number of unique words
unique_words=list(corpus.token_set)
D = corpus.N                  # number of documents
K = 3                         # number of topics
alpha = [1]*K             # hyperparameter 1
eta = [1]*N                 # hyperparameter 2


S = 5                         # number of iterations
burn_in = 2                   # burn in period


# num words in each document
V =[len(corpus.docs[d].tokens) for d in range(D)]

# Randomly allocate the topics list inside a list 
Z=[]
for d in range(D):
    new = [np.random.randint(K-1) for i in range(V[d])]
    Z.append(new)
    
    
# beta = matrix # % of topic in a word (given a document)
random.seed(111)
beta = np.random.dirichlet(eta, K)

# theta = matrix col = topics # % of topic in a document 
random.seed(111)
theta = np.random.dirichlet(alpha, D)


# Gibbs Sampling algorithm

for s in range(S):
    # Allocation of topics to words
    for d in range(10): 

        # Sample the assingnment of words
        z = []
        V = len(corpus.docs[d].tokens)


        words = corpus.docs[d].tokens
        V=[unique_words.index(x) for x in words]

        for v in V:
            # Compute probabilities
            all_prob = [[theta[d,k] * beta[k,v] / np.dot(theta[d,],beta[:,v])] for k in (range(K))]
            prob=[]
            for x in range(K):
                prob.append(all_prob[x][0])
            prob=[x/3 for x in prob]

            #Sample topics of words
            z.append(np.argmax( np.random.multinomial(1, prob, 1) ))

        Z[d]=z
        print d
    # Update Theta
    for z in Z:
        n_dk = n_k(z, K)
        alpha_new=[(alpha[k] + n_dk[k]) for k in range(K)]
        theta_cprobs = np.random.dirichlet(alpha_new, 1)[0]
        theta[d] = np.random.dirichlet(theta_cprobs, 1)

    print "Theta updated!"

    # Update Betas
    mk=m_k(Z,K)

    for k in range(K):
        eta_new=[(eta[n] + mk[k][n]) for n in range(N)]
        beta_cprobs = np.random.dirichlet(eta_new, 1)[0]
        beta[k] = np.random.dirichlet(beta_cprobs, 1)

    print "Betas updated!"

0
1
2
3
4
5
6
7
8
9
Theta updated!
doc0
doc1
doc2
doc3
doc4
doc5
doc6
doc7
doc8
doc9
doc10
doc11
doc12
doc13
doc14
doc15
doc16
doc17
doc18
doc19
doc20
doc21
doc22
doc23
doc24
doc25
doc26
doc27
doc28
doc29
doc30
doc31
doc32
doc33
doc34
doc35
doc36
doc37
doc38
doc39
doc40
doc41
doc42
doc43
doc44
doc45
doc46
doc47
doc48
doc49
doc50
doc51
doc52
doc53
doc54
doc55
doc56
doc57
doc58
doc59
doc60
doc61
doc62
doc63
doc64
doc65
doc66
doc67
doc68
doc69
doc70
doc71
doc72
doc73
doc74
doc75
doc76
doc77
doc78
doc79
doc80
doc81
doc82
doc83
doc84
doc85
doc86
doc87
doc88
doc89
doc90
doc91
doc92
doc93
doc94
doc95
doc96
doc97
doc98
doc99
doc100
doc101
doc102
doc103
doc104
doc105
doc106
doc107
doc108
doc109
doc110
doc111
doc112
doc113
doc114
doc115
doc116
doc117
doc118
doc119
doc120
doc121
doc122
doc123
doc124
doc125
doc126
doc127
doc128
doc129
doc130
doc131
doc132
doc133
doc134
doc135
doc136
doc137
doc138
doc139
doc140
doc141
doc142
doc143
doc144
doc145
doc146
doc147
doc148
doc149
doc150
doc151
doc152
doc1

ValueError: object too deep for desired array

[  2.48210053e-06   1.43348322e-04   7.66228158e-06 ...,   3.14866592e-06
   4.24934456e-05   8.61481981e-07]
[  4.54462298e-06   1.61632525e-04   1.77190105e-05 ...,   8.49334695e-07
   1.54036222e-05   2.56438766e-06]
[  1.40336203e-05   1.77720286e-06   1.24278422e-05 ...,   6.15729584e-05
   1.31087434e-04   1.44956312e-05]
Betas updated!
