## Matrix and Vocabulary Construction

In [147]:
import pandas as pd

import numpy as np

from scipy import sparse

import nltk
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [148]:
news = pd.read_csv("./estadao_noticias_eleicao.csv", encoding="utf-8")

In [149]:
content = news.titulo + " " + news.subTitulo + " " + news.conteudo
content = content.fillna("")

In [150]:
def co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    n = len(vocab)
   
    vocab_to_index = {word:i for i, word in enumerate(vocab)}
    
    bi_grams = list(bigrams(corpus))

    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))

    I=list()
    J=list()
    V=list()
    
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]

        I.append(vocab_to_index[previous])
        J.append(vocab_to_index[current])
        V.append(count)
        
    co_occurrence_matrix = sparse.coo_matrix((V,(I,J)), shape=(n,n))

    return co_occurrence_matrix, vocab_to_index

#### Removing punctuation

In [151]:
tokenizer = RegexpTokenizer(r'\w+')
tokens_lists = content.apply(lambda text: tokenizer.tokenize(text.lower()))

#### Removing stopwords

In [152]:
stopword_ = stopwords.words('portuguese')
filtered_tokens = tokens_lists.apply(lambda tokens: [token for token in tokens if token not in stopword_])

#### Transforming list of lists into one list

In [153]:
tokens = [token for tokens_list in filtered_tokens for token in tokens_list]

In [154]:
matrix, vocab = co_occurrence_matrix(tokens)

## Consult Bigram Frequency

In [155]:
consultable_matrix = matrix.tocsr()

In [156]:
def consult_frequency(w1, w2):
    return(consultable_matrix[vocab[w1],vocab[w2]])

### Example

In [157]:
w1 = 'poucos'
w2 = 'recursos'
consult_frequency(w1, w2)

3

## Top-3 em ordem decrescente

In [158]:
def top3(w1):
    top3 = {}
    for w in consultable_matrix[vocab[w1]].indices:
        if(len(top3) < 3):
            top3[w] = consultable_matrix[vocab[w1], w]
        else:
            if(consultable_matrix[vocab[w1], w] > consultable_matrix[vocab[w1], min(top3, key=top3.get)]):
                del top3[min(top3, key=top3.get)]
                top3[w] = consultable_matrix[vocab[w1], w]            
    words = [list(vocab.keys())[list(vocab.values()).index(list(top3.keys())[0])], list(vocab.keys())[list(vocab.values()).index(list(top3.keys())[1])], list(vocab.keys())[list(vocab.values()).index(list(top3.keys())[2])]]
    print(str(words[0]) + ": " + str(list(top3.keys())[0]) + ", " + str(words[1]) + ": " + str(list(top3.keys())[1]) + ", " + str(words[2]) + ": " + str(list(top3.keys())[2]))
    return words
                

In [159]:
top3("presidente")

UnicodeEncodeError: 'ascii' codec can't encode character u'\xfa' in position 3: ordinal not in range(128)

## Expandir consulta original

In [None]:
def getIDs(search):
    searchWords = search.split(" ")
    wordIDs = []
    if(len(searchWords) >=2):
        booleanOperator = searchWords[1]
        wordIDs1 = []
        wordIDs2 = []
        wordIDs3 = []
        wordIDs4 = []
        for i in range(0, len(tokens_lists)):
            for w in tokens_lists[i]:
                if((searchWords[0].lower() == w.lower()) and (i not in wordIDs1)):
                    wordIDs1.append(i)
                if((searchWords[2].lower() == w.lower()) and (i not in wordIDs2)):
                    wordIDs2.append(i)
                if((searchWords[3].lower() == w.lower()) and (i not in wordIDs3)):
                    wordIDs3.append(i)
                if((searchWords[4].lower() == w.lower()) and (i not in wordIDs4)):
                    wordIDs4.append(i)
                if((i in wordIDs1) and (i in wordIDs2) and (i in wordIDs3) and (i in wordIDs4)):
                    break
        if(booleanOperator == "AND"):
            wordIDs = set(wordIDs1).intersection(wordIDs2)
            wordIDs = set(wordIDs).intersection(wordIDs3)
            wordIDs = set(wordIDs).intersection(wordIDs4)
        else:
            wordIDs = set(wordIDs1).union(wordIDs2)
            wordIDs = set(wordIDs).union(wordIDs3)
            wordIDs = set(wordIDs).union(wordIDs4)
    else:
        for i in range(0, len(tokens_lists)):
            for w in range(0, len(token_lists)):
                if(searchWords[0].lower() == token_lists[w].lower()):
                    wordIDs.append(news.values[i][2])  
                    break
    return wordIDs


In [None]:
def expandQuery(word, top3):
    query = word + " OR " + top3[0] + " OR " + top3[1] + " OR " + top3[2]
    print(query)
    return len(getIDs(query))

## Consulta 1: dilma

In [None]:
expandQuery("dilma", top3("dilma"))

## Consulta 2: presidente

In [None]:
expandQuery("presidente", top3("presidente"))

## Consulta 3: votos

In [137]:
expandQuery("votos", top3("votos"))

votos OR válidos OR contra OR candidato


2207

#### Quais os termos retornados para a expansão de cada consulta?
Para consulta "dilma" foram retornados: "rousseff", "disse" e "é".
Para consulta "presidente" foram retornados: "república", "dilma" e "luiz".
Para consulta "votos" foram retornados: "válidos", "contra" e "candidato".

#### Você acha que esses termos são de fato relacionados com a consulta original? Justifique.
Sim, pois podemos ver claramente que temos uma ligação entre as consultas e seus termos retornados. Por exemplo, na consulta "presidente" foram retornados os dois ultimos presidentes do Brasil (Dilma e Luiz Inácio).

#### Compare os documentos retornados para a consulta original com a consulta expandida. Quais resultados você acha que melhor capturam a necessidade de informação do usuário? Por que?
