In [24]:
import json
import spacy
from collections import defaultdict

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
with open('data/tfidf.json') as f:
     data = json.load(f)

In [4]:
with open('data/vocab.json') as f:
     vocab = json.load(f)

In [5]:
vocab[0:5]

['include', 'help', 'encourage', 'hundredth', 'tuberculosis']

In [7]:
data[0].keys()

dict_keys(['title', 'text', 'url', 'tokenized_text', 'tfidf_vec'])

In [11]:
inverted_index = {}
for i, word in enumerate(vocab):
    inverted_index[word] = []
    for doc in data:
        if doc['tfidf_vec'][i]!= 0:
            inverted_index[word].append((doc['title'],doc['tfidf_vec'][i] ))


In [14]:
inverted_index['hiv']

[('Pandemic', 0.016854449066591115),
 ('Epidemiology of HIV/AIDS', 0.08655461864405645),
 ('HIV/AIDS', 0.046550383136299264),
 ('HIV/AIDS in Yunnan', 0.03892930271132992),
 ('Viral load', 0.03463788351480536),
 ('Virus', 0.004791951205207277)]

In [15]:
for s in data:
    if s["title"] == 'Virus':
        print(s["text"])

A virus is a submicroscopic infectious agent that replicates only inside the living cells of an organism. Viruses infect all types of life forms, from animals and plants to microorganisms, including bacteria and archaea.
Since Dmitri Ivanovsky's 1892 article describing a non-bacterial pathogen infecting tobacco plants and the discovery of the tobacco mosaic virus by Martinus Beijerinck in 1898, more than 6,000  virus species have been described in detail of the millions of types of viruses in the environment. Viruses are found in almost every ecosystem on Earth and are the most numerous type of biological entity. The study of viruses is known as virology, a subspeciality of microbiology.
When infected, a host cell is forced to rapidly produce thousands of identical copies of the original virus. When not inside an infected cell or in the process of infecting a cell, viruses exist in the form of independent particles, or virions, consisting of: (i) the genetic material, i.e., long molecu

In [16]:
def tokenizer(input_string):
    # tokenize, clean, lemmatize
    text_lowercased = nlp(input_string.lower())
    tokens_without_stopwords = [word for word in text_lowercased if not word.is_stop and not word.is_punct and len (word.dep_.strip()) != 0]
    
    token_lemmatized = [token.lemma_ for token in tokens_without_stopwords]
    return token_lemmatized

In [22]:
def search(query,  index = inverted_index):
    query_tokens = tokenizer(query)
    newlist = []
    for token in query_tokens:
        newlist.extend(inverted_index[token])
        
    output = defaultdict(int)
    for k, v in newlist:
        output[k]+= v
    results = [(x,y) for x, y in output.items()]
    return sorted(results, key = lambda x:x[1], reverse=True)

In [25]:
title, score = search(query = 'world health organization')[0]
for d in data:
    if d['title'] == title:
        print(s['text'])

A virus is a submicroscopic infectious agent that replicates only inside the living cells of an organism. Viruses infect all types of life forms, from animals and plants to microorganisms, including bacteria and archaea.
Since Dmitri Ivanovsky's 1892 article describing a non-bacterial pathogen infecting tobacco plants and the discovery of the tobacco mosaic virus by Martinus Beijerinck in 1898, more than 6,000  virus species have been described in detail of the millions of types of viruses in the environment. Viruses are found in almost every ecosystem on Earth and are the most numerous type of biological entity. The study of viruses is known as virology, a subspeciality of microbiology.
When infected, a host cell is forced to rapidly produce thousands of identical copies of the original virus. When not inside an infected cell or in the process of infecting a cell, viruses exist in the form of independent particles, or virions, consisting of: (i) the genetic material, i.e., long molecu

In [27]:
search(query = 'ebola')


[('Plague of Cyprian', 0.047499062175213644),
 ('Science diplomacy and pandemics', 0.027286695292144007)]

In [None]:
for d in data:
    if (d['title'] == 'Science diplomacy and pandemics'):
        print()

In [None]:
#save the inverted index to json file