In [5]:
import re

from nltk.tokenize import RegexpTokenizer
from nltk.stem import RSLPStemmer
from nltk.corpus import stopwords as StopWords

In [6]:
REGEX_TO_SPLIT_DOCUMENTS = "(O|A)+\s+Sr(\.|\.º|\.ª)\s+([A-zÀ-ú]|\s*)+(\(.*\))?: —"


deputies_docs_unprocessed = {}
documents_unprocessed_idx = {}
documents_to_deputies = {}

doc_idx = 0
did_first_match = False

with open("resources/example_of_parlamentar_discussion/darl14sl02n014.txt") as file:
    
    first_line = next(file)
    DATE_SECTION_REGEX = "(?i)\d+ de (\w+) de \d{4}"
    romanic_number = "(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})"
    SERIES_SECTION_REGEX = romanic_number + " (Série|SÉRIE) — (Número|NÚMERO) \d{1,3}"
    
    
    pattern = re.compile(REGEX_TO_SPLIT_DOCUMENTS)
    date_section_pattern = re.compile(DATE_SECTION_REGEX)
    series_section_pattern = re.compile(SERIES_SECTION_REGEX)
    numberic_pattern = re.compile("\d+")
    
    current_docs = ""
    current_deputy = None
    
    for line in file:
        date_section_match = date_section_pattern.search(line)
        series_section_match = series_section_pattern.search(line)
        if date_section_match is not None or series_section_match is not None:
            #we are in a section, let's consume until a number appear
            line_is_page_number = False
            while not line_is_page_number:
                #check if line is number
                #if it is, then line_is_page_number = True
                line = next(file)
                numeric_match = numberic_pattern.search(line)
                if numeric_match is not None:
                    line_is_page_number = True
                    line = next(file)
        match = pattern.search(line)
        if match is not None:
            #a new document
            #is this the first one? if it is, then we already consumed the summary section
            if current_deputy is not None:
                #save current document
                documents_unprocessed_idx[doc_idx] = current_docs
                if current_deputy not in deputies_docs_unprocessed:
                    deputies_docs_unprocessed[current_deputy] = []
                deputies_docs_unprocessed[current_deputy].append(doc_idx)
                documents_to_deputies[doc_idx] = current_deputy
                doc_idx += 1
            #docs stored. start processing new one
            current_deputy = match.group()[0:-3]
            current_docs = line.replace(current_deputy, '')
        else:
            current_docs += line
    if current_deputy is not None:
                #save current document
                documents_unprocessed_idx[doc_idx] = current_docs
                if current_deputy not in deputies_docs_unprocessed:
                    deputies_docs_unprocessed[current_deputy] = []
                deputies_docs_unprocessed[current_deputy].append(doc_idx)
                documents_to_deputies[doc_idx] = current_deputy
                doc_idx += 1

                

In [7]:
# Tokenize docs
tokenizer = RegexpTokenizer(r'\w+')
ps = RSLPStemmer()
pt_stop_words = StopWords.words('portuguese')

def mytokeniser(s):
    aux = filter(lambda x: x not in pt_stop_words , [w.lower() for w in tokenizer.tokenize(s)])
    return list(map(ps.stem, aux))

documents_tokenized = {}

for idx in documents_unprocessed_idx:
    documents_tokenized[idx] = mytokeniser(documents_unprocessed_idx[idx])

tokenized_corpus = list(documents_tokenized.values())

In [8]:
docs_to_be_removed = []
for idx in range(0, len(tokenized_corpus)):
    if len(tokenized_corpus[idx]) == 0:
        deputies_docs_unprocessed[documents_to_deputies[idx]].remove(idx)
        docs_to_be_removed.append(idx)
        
for el in sorted(docs_to_be_removed, reverse=True):
    del tokenized_corpus[el]

In [9]:
tokenized_corpus_idf_search = list(map(set, tokenized_corpus))

vocab = set()

for doc in tokenized_corpus:
    vocab = vocab.union(set(doc))  

print(f"My vocabolary size is {len(vocab)}")

My vocabolary size is 2460


In [10]:
import math

idfvocab = {}

def idf(term, corpus):
    cnt =  sum([1 if term in doc else 0 for doc in corpus])
    return math.log10( len(corpus) / cnt )

for term in vocab:
    term_idf = idf(term, tokenized_corpus_idf_search)
    idfvocab[term] = term_idf

In [11]:
import numpy as np

idfvocab_it = [(el[0],el[1]) for el in idfvocab.items()]

aux = np.array( idfvocab_it )
low = float( min( aux[:,1] ) )
high = float( max( aux[:,1] ) )

print(f"Min is {low} and max is {high}")

Min is 0.1595031217250559 and max is 2.3873898263387296


In [12]:
def keep_terms( lower, upper, threshold, step, idf_vocabulary ):
    low = lower
    up = upper
    candidates = idf_vocabulary
    while len(candidates) > threshold:
        #print(f"current vocabolary size is {len(candidates)}")
        low = low + step
        up = up - step
        candidates = [  term for term in idf_vocabulary if term[1] >= low and term[1] <= up  ]
    return candidates


#cnd = keep_terms(low, high, int(len(idfvocab_it)*0.2), 0.005, idfvocab_it)
cnd = keep_terms(low, high, len(idfvocab_it), 0.005, idfvocab_it)
len(cnd)

vc = np.array(cnd) #a matrix, with column 0 being terms and column 1 being idf
vc_terms = vc[:,0]

In [13]:
def normTFx(term,doc):
    return doc.count(term)/len(doc)

def tfidfmat(corpus, tl, idfvocab) :
    mat =[]
    for term in tl :
        idft = idfvocab[term]
        row = []
        for doc in corpus:
            tft = normTFx(term,doc)
            tf_idf_term_document = tft*idft
            row.append(tf_idf_term_document)
        mat.append(row)
    return mat    
            
    

tfidf_matrix = tfidfmat(tokenized_corpus, vc_terms, idfvocab) 
tfidf_matrix_np = np.array(tfidf_matrix)

In [14]:
# Here is an optimized implementation for te creation of the TF.IDF Matrix, that takes less than half the time than the implementation above

def normTFx_optimized(term, docMapCount, docLength):
    '''
        If you compare this implementation with the above one, you can notice that:
           - docMapCount is a dictionary where the key is the term, and the value is the count of the term in that document
           - the length of the document is received as a parameter
        This leads that, in this implementation, for each time we need to count a term in a document, we get that in constant time
    '''
    return (docMapCount.get(term) or 0)/docLength


from functools import reduce


def reduce_doc_map_count(reduced, el):
    if el in reduced:
        reduced[el] += 1
    else:
        reduced[el] = 1
    return reduced


'''
    the bellow code is doing the same as the following:
    for doc in tokenized_corpus_sampled:
        docMapCount = {}
        for term in doc:
            if term in docMapCount:
                docMapCount[term] += 1
            else:
                docMapCount[term] = 1
'''
tokenized_corpus_map_count = list(map( lambda doc: reduce( reduce_doc_map_count , doc, {}  ) , tokenized_corpus  ))

def tfidfmat(corpusMapCount, corpus, tl,idfvocab) :
    mat =[]
    for term in tl :
        idft = idfvocab[term]
        row = []
        for i in range(len(corpus)):
            tft = normTFx_optimized(term,corpusMapCount[i],len(corpus[i]))
            tf_idf_term_document = tft*idft
            row.append(tf_idf_term_document)
        mat.append(row)
    return mat    
            
    

tfidf_matrix = tfidfmat(tokenized_corpus_map_count, tokenized_corpus, vc_terms, idfvocab) 
tfidf_matrix_np = np.array(tfidf_matrix)

In [15]:
tfidf_matrix_np.shape

(488, 244)

In [16]:
print(f"The document with index 0 contains {len(tokenized_corpus[0])} words")
print(f"The term with index 0 is `{vc_terms[0]}`")

print(f"The importance of the term `{vc_terms[0]}` in the document with idx = 0 is {tfidf_matrix_np[0,0]}")

The document with index 0 contains 50 words
The term with index 0 is `patrã`
The importance of the term `patrã` in the document with idx = 0 is 0.0


In [17]:
from sklearn.decomposition import NMF
model = NMF(n_components=4, init='random', random_state=0)
W = model.fit_transform(tfidf_matrix_np) # loadings
H = model.components_ #scores

In [18]:
def get_top_N_terms(matrix_slice, N):
    return matrix_slice.argsort()[-N:]

def get_terms_from_slice(loadings_matrix, idx, topN, bag_of_words, orientation="col"):
    '''
        the parameter `orientation` can either be "col" or "row", so we can process a loadings matrix being it transposed or not
    '''
    k = None
    if orientation == "col":
        k = loadings_matrix[:,idx]
    elif orientation == "row":
        k = loadings_matrix[idx,:]
    else:
        raise Exception("Orientation not recognized")
    k_top5terms_idx = get_top_N_terms(k,topN)
    return bag_of_words[k_top5terms_idx]



In [19]:
top_words = 10
for k in range(0,W.shape[1]):
    # Get terms for the k-th characteristic / topic
    print(f"The terms with more weight in the component {k} are: {get_terms_from_slice(W, k, top_words, vc_terms)}")

# here we are printing the top 7, but the this choise is arbitrary - we are going to analyze as much as we need to understand the topics

The terms with more weight in the component 0 are: ['mal' 'senh' 'ness' 'assin' 'fic' 'ouv' 'prim' 'ide' 'ministr' 'verdad']
The terms with more weight in the component 1 are: ['falt' 'próx' 'vou' 'ferr' 'faç' 'favor' 'peç' 'que' 'conclu' 'centr']
The terms with more weight in the component 2 are: ['pens' 'gost' 'falt' 'vou' 'próx' 'segund' 'aind' 'fal' 'nad' 'diss']
The terms with more weight in the component 3 are: ['mesm' '2' 'utiliz' 'inform' 'desperdíci' 'temp' 'segund' 'encerr'
 'proced' 'poi']


In [20]:
sorted(list(enumerate(H[:,deputies_docs_unprocessed["O Sr. André Ventura (CH)"][4]])), key=lambda x: x[1])

[(3, 6.459847355601038e-06),
 (0, 4.6499527237510584e-05),
 (2, 0.00011765550185564994),
 (1, 0.0008421342184620323)]