## Matrix and Vocabulary Construction

In [54]:
import pandas as pd
import numpy as np
from scipy import sparse
import nltk
from nltk import bigrams    
import scipy.sparse as sps

In [55]:
news = pd.read_csv("./estadao_noticias_eleicao.csv", encoding="utf-8")

In [56]:
content = news.titulo + " " + news.subTitulo + " " + news.conteudo
content = content.fillna("")

In [57]:
def co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    n = len(vocab)
   
    vocab_to_index = {word:i for i, word in enumerate(vocab)}
    
    bi_grams = list(bigrams(corpus))

    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))

    I=list()
    J=list()
    V=list()
    
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]

        I.append(vocab_to_index[previous])
        J.append(vocab_to_index[current])
        V.append(count)
        
    co_occurrence_matrix = sparse.coo_matrix((V,(I,J)), shape=(n,n))

    return co_occurrence_matrix, vocab_to_index

In [58]:
tokens_lists = content.apply(lambda text: text.lower().split())

In [59]:
tokens = [token for tokens_list in tokens_lists for token in tokens_list]

In [60]:
matrix, vocab = co_occurrence_matrix(tokens)

## Consult Bigram Frequency

In [61]:
consultable_matrix = matrix.tocsr()

In [62]:
def consult_frequency(w1, w2):
    return(consultable_matrix[vocab[w1],vocab[w2]])

### Example

In [63]:
w1 = 'poucos'
w2 = 'recursos'
consult_frequency(w1, w2)

3

## Top-3 em ordem decrescente

In [101]:
def top3(w1):
    top3 = {}
    for w in consultable_matrix[vocab[w1]].indices:
        if(len(top3) < 3):
            top3[w] = consultable_matrix[vocab[w1], w]
        else:
            if(consultable_matrix[vocab[w1], w] > consultable_matrix[vocab[w1], min(top3, key=top3.get)]):
                del top3[min(top3, key=top3.get)]
                top3[w] = consultable_matrix[vocab[w1], w]
                
    words = [str(list(vocab.keys())[list(vocab.values()).index(list(top3.keys())[0])]), str(list(vocab.keys())[list(vocab.values()).index(list(top3.keys())[1])]), str(list(vocab.keys())[list(vocab.values()).index(list(top3.keys())[2])])]
    return words
                

In [102]:
print(top3("presidente"))

['do', 'dilma', 'da']


## Expandir consulta original

In [103]:
def getIDs(search):
    searchWords = search.split(" ")
    wordIDs = []
    if(len(searchWords) >=2):
        booleanOperator = searchWords[1]
        wordIDs1 = []
        wordIDs2 = []
        wordIDs3 = []
        wordIDs4 = []
        for i in range(0, len(tokens_lists)):
            for w in tokens_lists[i]:
                if(searchWords[0].lower() == w.lower()):
                    wordIDs1.append(i)
                if(searchWords[2].lower() == w.lower()):
                    wordIDs2.append(i)
                if(searchWords[3].lower() == w.lower()):
                    wordIDs3.append(i)
                if(searchWords[4].lower() == w.lower()):
                    wordIDs4.append(i)
                if((i in wordIDs1) and (i in wordIDs2) and (i in wordIDs3) and (i in wordIDs4)):
                    break
        if(booleanOperator == "AND"):
            wordIDs = set(wordIDs1).intersection(wordIDs2)
        else:
            wordIDs = set(wordIDs1).union(wordIDs2)
    else:
        for i in range(0, len(tokens_lists)):
            for w in range(0, len(token_lists)):
                if(searchWords[0].lower() == token_lists[w].lower()):
                    wordIDs.append(news.values[i][2])  
                    break
    return wordIDs


In [104]:
def expandQuery(word, top3):
    query = word + " OR " + top3[0] + " OR " + top3[1] + " OR " + top3[2]
    print(query)
    print(getIDs(query))

In [None]:
top3 = top3("dilma")
print("As 3 palavras que mais co-ocorrem com a palavra 'dilma' são: " + top3[0] + ", " + top3[1] + ", " + top3[2])
expandQuery("dilma", top3)

As 3 palavras que mais co-ocorrem com a palavra 'dilma' são: rousseff,, e, rousseff
dilma OR rousseff, OR e OR rousseff
