In [1]:
from app.search import Search
from app.invertedIndex import InvertedIndex

## Inverted Index
In order to create our index we need to specify the pathfile for our dump file, and optionaly a text file with stopwords to be ignored.

In [2]:
index = InvertedIndex("data/ptwiki-v2.trec.xml", "data/stopwords.txt")

## Saving Index
In order to save our index as a compressed json file, we can use the method `export_as_json` from our class InvertedIndex.

optionaly, passing the filename we want to use for our index file and if we want to compress it using the gzip algorithm.

In [3]:
index.export_as_json("index", compressed=True)

## Creating our Queries
We simply create a search object using our text query and the index we want to search on.

In [4]:
search_estados = Search("Estado", index)
search_unidos = Search("Unidos", index)

search_nomes = Search("nomes", index)
search_biblicos = Search("bíblicos", index)

search_winston = Search("Winston", index)
search_churchill = Search("Churchill", index)

## Filtering Searches
Create and print *AND* and *OR* operations on each pair of queries we're interested on, more specifically:
1. nomes, bíblicos (AND e OR);
2. Estados, Unidos (AND e OR);
3. Winston, Churchill (AND e OR)

In [5]:
nomes_and_biblicos = search_biblicos & search_nomes
nomes_or_biblicos = search_biblicos | search_nomes

print("nomes AND bíblicos ->{}\n\nnomes OR bíblicos->{}".format(nomes_and_biblicos, nomes_or_biblicos))

nomes AND bíblicos ->{'298', '685', '136', '821', '997', '758'}

nomes OR bíblicos->{'512', '658', '417', '299', '817', '576', '145', '425', '61', '608', '219', '402', '702', '961', '709', '773', '4', '470', '194', '979', '898', '752', '494', '8', '656', '89', '160', '396', '593', '269', '321', '758', '260', '179', '351', '820', '6', '853', '685', '678', '562', '762', '422', '152', '993', '867', '528', '737', '249', '839', '568', '739', '835', '667', '793', '537', '578', '613', '607', '60', '610', '706', '580', '574', '28', '547', '305', '643', '662', '479', '878', '73', '234', '849', '513', '35', '67', '14', '942', '438', '427', '688', '655', '204', '136', '464', '476', '546', '151', '614', '767', '250', '573', '798', '682', '307', '557', '442', '212', '770', '301', '478', '641', '844', '881', '701', '886', '391', '183', '410', '298', '101', '56', '504', '976', '134', '792', '16', '339', '314', '759', '822', '990', '103', '142', '523', '555', '349', '848', '980', '59', '224', '290', '

In [6]:
estados_and_unidos = search_estados & search_unidos
estados_or_unidos = search_estados | search_unidos

print("Estados AND Unidos -> {}\n\nEstados OR Unidos -> {}".format(estados_and_unidos, estados_or_unidos))

Estados AND Unidos -> {'592', '299', '61', '646', '599', '473', '402', '702', '45', '451', '961', '34', '505', '205', '238', '470', '194', '955', '979', '7', '752', '794', '494', '384', '125', '292', '29', '343', '8', '656', '89', '9', '593', '96', '66', '758', '6', '196', '678', '30', '562', '762', '852', '654', '690', '647', '737', '851', '249', '529', '807', '739', '335', '569', '578', '615', '613', '2', '434', '645', '610', '706', '580', '574', '223', '28', '736', '297', '590', '131', '643', '662', '479', '149', '170', '234', '513', '35', '62', '438', '319', '427', '655', '204', '322', '151', '614', '767', '250', '573', '682', '386', '698', '442', '212', '770', '641', '42', '750', '634', '785', '117', '102', '407', '814', '492', '56', '134', '130', '676', '164', '208', '142', '980', '224', '290', '891', '90', '604', '583', '172', '833', '167', '768', '32', '536', '86', '803', '284', '902', '143', '279', '225', '781', '600', '763', '426', '507', '54', '411', '964', '841', '981', '41

In [7]:
winston_and_churchill = search_winston & search_churchill 
winston_or_churchill = search_winston | search_churchill 

print("Winston AND Churchill -> {}\n\nWinston OR Churchill -> {}".format(winston_and_churchill, winston_or_churchill))

Winston AND Churchill -> {'844', '562', '647', '578'}

Winston OR Churchill -> {'90', '322', '832', '844', '841', '562', '578', '647', '292'}
