## Matrix and Vocabulary Construction

In [1]:
from collections import Counter
from itertools import islice

import nltk
import numpy as np
import pandas as pd
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from scipy import sparse

In [2]:
# Load the news articles dataset (UTF-8 encoded CSV) into a DataFrame.
news = pd.read_csv(
    filepath_or_buffer="../data/estadao_noticias_eleicao.csv",
    encoding="utf-8",
)

In [3]:
# Build one text per article from title, subtitle and body.
# Missing fields are filled *before* concatenating: str + NaN is NaN, so a
# single missing title/subtitle/body would otherwise blank out the whole
# article once the result is filled with "".
content = (
    news.titulo.fillna("")
    + " "
    + news.subTitulo.fillna("")
    + " "
    + news.conteudo.fillna("")
)

In [4]:
def co_occurrence_matrix(corpus):
    """Build a sparse matrix of adjacent-token (bigram) counts.

    Args:
        corpus: Iterable of hashable tokens in reading order. One-shot
            iterators are materialised first, because the tokens are
            traversed more than once.

    Returns:
        A ``(matrix, vocab_to_index)`` tuple. ``matrix`` is a
        ``scipy.sparse.coo_matrix`` of shape ``(n, n)``, ``n`` being the
        number of distinct tokens; entry ``(i, j)`` holds how many times
        token ``j`` immediately follows token ``i``. ``vocab_to_index``
        maps each token to its row/column index.
    """
    tokens = corpus if isinstance(corpus, (list, tuple)) else list(corpus)

    # dict.fromkeys keeps first-seen order, so the indices are reproducible
    # from run to run (set iteration order is not).
    vocab_to_index = {word: i for i, word in enumerate(dict.fromkeys(tokens))}
    n = len(vocab_to_index)

    # Count (previous, current) pairs lazily: there is no need to hold every
    # bigram in memory or to sort the counts by frequency.
    bigram_counts = Counter(zip(tokens, islice(tokens, 1, None)))

    rows = [vocab_to_index[previous] for previous, _ in bigram_counts]
    cols = [vocab_to_index[current] for _, current in bigram_counts]
    counts = list(bigram_counts.values())

    matrix = sparse.coo_matrix((counts, (rows, cols)), shape=(n, n))
    return matrix, vocab_to_index

#### Tokenizing, lowercasing and removing punctuation

In [5]:
# Keep only runs of word characters, which drops punctuation and whitespace.
tokenizer = RegexpTokenizer(r'\w+')
# Lowercase every article, then split it into its list of word tokens.
lowercased_content = content.str.lower()
tokens_lists = lowercased_content.apply(tokenizer.tokenize)

#### Removing stopwords

In [6]:
stopword_ = stopwords.words('portuguese')
# Testing membership against a list scans it for every token; a frozenset
# makes each lookup constant-time while `stopword_` itself stays a list.
stopword_lookup = frozenset(stopword_)
filtered_tokens = tokens_lists.apply(
    lambda tokens: [token for token in tokens if token not in stopword_lookup]
)

#### Transforming list of lists into one list

In [7]:
# Flatten the per-article token lists into one corpus-wide token list.
tokens = []
for article_tokens in filtered_tokens:
    tokens.extend(article_tokens)

In [8]:
# Build the bigram count matrix and the token -> index mapping for the corpus.
matrix, vocab = co_occurrence_matrix(corpus=tokens)

## Consult Bigram Frequency

In [9]:
# Convert to CSR format so single entries can be read with [row, col] indexing.
consultable_matrix = sparse.csr_matrix(matrix)

In [10]:
def consult_frequency(w1, w2):
    """Return the stored count of the bigram ``(w1, w2)``.

    Both words are looked up in the module-level ``vocab`` mapping and the
    matching entry of ``consultable_matrix`` is read.

    Args:
        w1: First (preceding) word of the bigram.
        w2: Second (following) word of the bigram.

    Returns:
        The count stored at ``(vocab[w1], vocab[w2])``, or ``0`` when either
        word is not in the vocabulary, since such a bigram was never counted.
    """
    row = vocab.get(w1)
    col = vocab.get(w2)
    if row is None or col is None:
        return 0
    return consultable_matrix[row, col]

#### Example

In [11]:
# Example lookup: how often 'recursos' directly follows 'poucos'.
w1, w2 = 'poucos', 'recursos'
consult_frequency(w1, w2)

3