In [16]:
import re
import sys
from collections import Counter

import pandas as pd
from nltk import word_tokenize

In [17]:
# Load the news articles CSV into a DataFrame (path is relative to the notebook's working directory).
data = pd.read_csv("noticias_estadao.csv")

In [18]:
# Preview the first rows; the columns used later are "conteudo" (article text) and "idNoticia".
data.head()

Unnamed: 0,titulo,conteudo,idNoticia
0,PT espera 30 mil pessoas em festa na Esplanada,BRASÍLIA - Após o desgaste provocado com o lan...,1
1,Alckmin toma posse de olho no Planalto,"Reeleito em outubro, o governador tucano Geral...",2
2,Seis obstáculos e desafios do segundo mandato ...,1. Rearranjo das contas A nova equipe econôm...,3
3,Veja os desafios dos governadores que assumem ...,"No Acre, governador reeleito quer erradicar an...",4
4,PT impulsiona cerimônia de posse da Dilma nas ...,"Os perfis da presidente Dilma Rousseff, nas re...",5


In [19]:
# Distinct lower-cased tokens of each article body: one set per row, same index as `data`.
# Mapping over the single "conteudo" column avoids a row-wise DataFrame.apply, and a
# missing body (NaN) is treated as empty text instead of failing on `.lower()`.
tokens_by_rows = (
    data["conteudo"]
    .fillna("")
    .str.lower()
    .map(lambda text: set(word_tokenize(text)))
)

In [20]:
def create_vocab(data, tokens_by_rows):
    """Build an inverted index mapping each token to the documents containing it.

    Parameters
    ----------
    data : pandas.DataFrame
        Corpus table; only its ``idNoticia`` column is read.
    tokens_by_rows : iterable of iterables of str
        One collection of tokens per row of ``data``, in the same row order.

    Returns
    -------
    dict
        ``{token: [idNoticia, ...]}`` with the ids listed in row order.
    """
    vocab_index = {}

    # Pair ids and token collections positionally instead of looking both up by
    # the label ``i``: label lookups only work with a default 0..n-1 index and
    # raise KeyError once ``data`` has been filtered or re-indexed.
    for doc_id, tokens in zip(data['idNoticia'], tokens_by_rows):
        for token in tokens:
            vocab_index.setdefault(token, []).append(doc_id)
    return vocab_index

In [21]:
# Inverted index for the whole corpus: token -> list of idNoticia values of the rows containing it.
vocab_index = create_vocab(data, tokens_by_rows)

In [22]:
def get_token_less_docs(tokens, vocab_index):
    """Return the token from ``tokens`` that occurs in the fewest documents.

    Parameters
    ----------
    tokens : iterable of str
        Candidate query tokens.
    vocab_index : dict
        Inverted index mapping a token to the list of documents containing it.

    Returns
    -------
    str or None
        The token with the shortest posting list (the first one on ties), or
        ``None`` when ``tokens`` is empty. A token that is missing from
        ``vocab_index`` counts as occurring in zero documents rather than
        raising ``KeyError``.
    """
    return min(
        tokens,
        key=lambda token: len(vocab_index.get(token, ())),
        default=None,
    )

In [54]:
def get_documents_by_tokens(text, operator,vocab_index):
    """Return the documents matching every ("and") or any ("or") token of ``text``.

    Parameters
    ----------
    text : str
        Free-text query; it is lower-cased and tokenized with ``word_tokenize``.
    operator : str
        ``"and"`` to intersect the tokens' posting lists, ``"or"`` to unite them.
        Any other value yields an empty result.
    vocab_index : dict
        Inverted index mapping a token to the list of documents containing it.

    Returns
    -------
    list
        Matching document ids. For ``"and"`` they follow the order of the
        shortest posting list; for ``"or"`` the order is unspecified.
    """
    # The index is built from lower-cased text, so fold the query the same way;
    # otherwise a capitalised term could never match.
    tokens = word_tokenize(text.lower())

    # An empty query matches nothing (and must not reach the helpers below).
    if not tokens:
        return []

    if operator == "and":
        # A conjunction containing a term that is not indexed cannot match.
        if any(token not in vocab_index for token in tokens):
            return []

        # Start from the rarest term so the fewest candidates are checked.
        rarest_token = get_token_less_docs(tokens, vocab_index)
        candidates = get_documents_by_token(rarest_token, vocab_index)

        # Sets give constant-time membership tests against the longer lists.
        other_postings = [
            set(get_documents_by_token(token, vocab_index))
            for token in set(tokens)
            if token != rarest_token
        ]
        return [
            doc for doc in candidates
            if all(doc in posting for posting in other_postings)
        ]

    if operator == "or":
        matches = set()
        for token in tokens:
            matches.update(get_documents_by_token(token, vocab_index))
        return list(matches)

    return []

In [24]:
def get_documents_by_token(token, vocab_index):
    """Return the posting list of ``token``, or an empty list if it is not indexed.

    Parameters
    ----------
    token : str
        Token to look up.
    vocab_index : dict
        Inverted index mapping a token to the list of documents containing it.

    Returns
    -------
    list
        The list stored in ``vocab_index`` for ``token`` (the same object, not
        a copy), or a new empty list when the token is absent.
    """
    try:
        return vocab_index[token]
    except KeyError:
        # Only a missing key means "no documents"; any other error is a real
        # problem and is allowed to propagate instead of being swallowed.
        return []

## passando o operador como parâmetro

In [26]:
# AND query: ids of the documents whose token set contains all three terms.
get_documents_by_tokens("dois brasil farinha", "and", vocab_index)

farinha
[1, 2181, 2513, 3990, 4550, 5559, 7091, 7111, 7115, 7215]
['dois', 'brasil']
[[2, 4, 5, 9, 11, 13, 16, 19, 20, 21, 24, 25, 29, 31, 32, 33, 36, 43, 45, 46, 49, 52, 61, 65, 69, 73, 78, 79, 85, 93, 104, 107, 110, 112, 113, 115, 123, 127, 129, 131, 135, 136, 138, 142, 151, 154, 155, 158, 162, 169, 186, 194, 195, 199, 201, 205, 208, 213, 216, 217, 219, 220, 221, 225, 228, 233, 236, 240, 241, 242, 243, 246, 247, 248, 249, 250, 254, 257, 259, 265, 268, 273, 275, 281, 283, 284, 285, 286, 287, 289, 293, 297, 305, 310, 313, 314, 316, 317, 321, 322, 324, 325, 326, 331, 332, 336, 338, 341, 343, 345, 347, 349, 350, 358, 359, 360, 363, 365, 366, 367, 368, 375, 377, 378, 390, 392, 403, 412, 413, 414, 415, 421, 422, 425, 427, 428, 441, 442, 443, 445, 458, 461, 465, 466, 468, 469, 471, 472, 473, 476, 485, 486, 487, 488, 489, 491, 500, 503, 504, 505, 507, 510, 511, 513, 514, 518, 520, 523, 525, 530, 542, 556, 557, 563, 564, 565, 566, 567, 569, 570, 573, 588, 589, 590, 596, 600, 604, 605, 613, 61

[4550, 7111]

In [47]:
def search(str_search, vocab_index):
    """Evaluate a boolean query made of terms joined by upper-case ``AND`` / ``OR``.

    ``OR`` has the lowest precedence: the query is split into OR-separated
    groups, each group is evaluated as a conjunction of its terms, and the
    results are united. The operators are case-sensitive and must stand alone
    as words (e.g. separated from the terms by spaces).

    Parameters
    ----------
    str_search : str
        Query such as ``"brasil AND farinha OR politica"``.
    vocab_index : dict
        Inverted index mapping a token to the list of documents containing it.

    Returns
    -------
    list
        Ids of the matching documents, without duplicates, in unspecified order.
    """
    matches = set()

    # Split on OR / strip AND only where they are whole words; a plain
    # str.split("OR") or str.replace("AND", " ") would also cut through query
    # terms that merely contain those letters (e.g. "PORTO", "ANDRE").
    for and_group in re.split(r"\bOR\b", str_search):
        terms = re.sub(r"\bAND\b", " ", and_group)
        if not terms.strip():
            # Nothing between two operators (or a trailing OR): no terms to match.
            continue
        matches.update(get_documents_by_tokens(terms, "and", vocab_index))

    return list(matches)

## Exemplos para teste

In [57]:
# Single-term query.
search("farinha", vocab_index)


[1, 2181, 4550, 7111, 7115, 7215, 2513, 7091, 3990, 5559]

In [58]:
# Another single-term query, used as a reference for the combined queries below.
search("politica", vocab_index)

[2819, 4803, 1030, 7431, 2255, 1168, 7183, 787, 4275, 891, 6013]

In [59]:
# Conjunction of the two terms queried above.
search("farinha AND politica", vocab_index)

[]

In [60]:
# search() splits on OR before handling AND, so this reads as (brasil AND farinha) OR politica.
search("brasil AND farinha OR politica", vocab_index)

[2819,
 4803,
 4550,
 7111,
 1030,
 7431,
 7215,
 1168,
 2255,
 7183,
 787,
 4275,
 3990,
 891,
 6013]

In [61]:
# NOTE: search() only recognises the upper-case operators AND / OR, so the lower-case "and"
# in the last group is not an operator; it is tokenized as an ordinary query term.
search("brasil AND farinha OR politico OR cozinha and casa", vocab_index)

[4550, 7111, 1512, 1835, 4299, 7215, 5041, 3990, 4409, 1852, 989]

In [51]:
# Two-term conjunction.
search("farinha AND brasil", vocab_index)

farinha
[1, 2181, 2513, 3990, 4550, 5559, 7091, 7111, 7115, 7215]
['brasil']
[[4, 5, 6, 8, 9, 10, 11, 12, 14, 16, 19, 22, 23, 26, 27, 30, 32, 33, 36, 37, 38, 39, 41, 43, 45, 59, 62, 64, 65, 67, 68, 76, 84, 85, 89, 90, 94, 95, 101, 102, 104, 105, 108, 110, 115, 117, 120, 122, 126, 127, 130, 133, 135, 136, 138, 140, 146, 149, 150, 155, 156, 157, 158, 159, 166, 168, 170, 172, 173, 175, 178, 180, 182, 186, 188, 195, 197, 198, 199, 200, 203, 204, 206, 211, 213, 217, 219, 220, 221, 228, 230, 231, 236, 238, 247, 255, 260, 265, 269, 272, 275, 276, 280, 287, 288, 291, 292, 299, 300, 302, 303, 304, 306, 307, 311, 312, 315, 318, 325, 326, 327, 328, 333, 334, 343, 345, 354, 355, 357, 359, 363, 364, 365, 366, 379, 381, 383, 386, 398, 400, 403, 404, 405, 406, 407, 416, 422, 424, 426, 427, 430, 431, 432, 433, 438, 444, 445, 449, 451, 454, 455, 456, 457, 458, 461, 464, 467, 471, 475, 485, 489, 492, 497, 498, 500, 504, 506, 507, 510, 511, 512, 516, 519, 523, 524, 525, 528, 530, 531, 533, 535, 539, 540,

[7215, 4550, 3990, 7111]