In [245]:
import json
import itertools
from collections import Counter
from collections import defaultdict
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity

In [111]:
nlp = spacy.load('en_core_web_sm')

In [112]:
with open('data/data.json', 'r') as outfile:
    data = json.load(outfile)

In [113]:
print(data[0].keys())

dict_keys(['title', 'text', 'url'])


In [114]:
text = data[0]['text']

In [115]:
text

'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pandemics include COVID-19 (SARS-CoV-2) and HIV/AIDS.'

In [116]:
text_tokenized = nlp(text.lower())
for token in text_tokenized[:5]:
    print(type(token), token.text, token.pos_, token.dep_)

<class 'spacy.tokens.token.Token'> a DET det
<class 'spacy.tokens.token.Token'> pandemic ADJ nsubj
<class 'spacy.tokens.token.Token'> ( PUNCT punct
<class 'spacy.tokens.token.Token'> from ADP prep
<class 'spacy.tokens.token.Token'> greek PROPN amod


In [117]:
unclassified_tokens = [(token.lemma_, token.dep_)
                      for token 
                      in text_tokenized
                      if token.dep_ == '']

print(unclassified_tokens)

[]


In [118]:
#remove stop words and punctuation

In [119]:
tokens_without_sw = [ word for word in text_tokenized if not word.is_stop and not word.is_punct]
tokens_without_sw[0:10]

[pandemic,
 greek,
 πᾶν,
 pan,
 δῆμος,
 demos,
 people,
 epidemic,
 infectious,
 disease]

In [120]:
#lemmatize tokens

In [121]:
token_lemmas = [token.lemma_ for token in tokens_without_sw if token.dep_]

In [122]:
token_lemmas[0:10]

['pandemic',
 'greek',
 'πᾶν',
 'pan',
 'δῆμος',
 'demos',
 'people',
 'epidemic',
 'infectious',
 'disease']

In [123]:
def tokenizer(document):
    text_lowercased = nlp(document.lower())
    tokens_without_sw = [word for word in text_lowercased if not word.is_stop and not word.is_punct]
    
    token_lemmas = [token.lemma_ for token in tokens_without_sw if token.dep_]
    return token_lemmas

In [124]:
for doc in data:
    doc['tokenized_text'] = tokenizer(doc['text'])

In [125]:
#build corpus vocabulary

In [126]:
tokenized_texts = [i['tokenized_text'] for i in data]
vocab = list(itertools.chain(*tokenized_texts))
vocab = list(set(vocab))

In [157]:
docs_token_counter = []
for docs in data:
    docs_token_counter.append(Counter(docs['tokenized_text']))

In [159]:
# For each token in corpus vocabulary, count in how many documents it occurs
number_docs_with_token  = {}
for token in vocab:
    count_docs = sum([1 for doc in docs_token_counter if token in doc.keys()])
    number_docs_with_token[token] = count_docs

In [164]:
number_docs_with_token['hiv']

6

In [180]:
#compute tfidfs of a document
for i, doc in enumerate(docs_token_counter):
    tfidf_vec = []
    for token in vocab:
        #term frequency
        tf = doc[token] / len(data[i]['tokenized_text'])
        
        #document frequency
        idf = np.log(len(data)/(number_docs_with_token[token]))
        
        tfidf = tf * idf
        tfidf_vec.append(tfidf)
        
    data[i]['tf_idf'] = tfidf_vec

In [184]:
sum(data[2]['tf_idf'])

2.527419345612542

In [185]:
#vectorize query

In [186]:
query = 'highest pandemic casualities'

In [192]:
def vectorize(query, vocab = vocab):
    query_tokenized = tokenizer(query)
    query_token_counter = Counter(query_tokenized)
    query_vec = []
    for token in vocab:
        tf = query_token_counter[token] / len(query_tokenized)
        idf = np.log(len(data) / number_docs_with_token[token])
        tf_idf = tf*idf
        query_vec.append(tf_idf)
    return query_vec

In [196]:
#search document

In [209]:
def search_tfidf(query, data = data):
    #vectorize query
    query_vec = vectorize(query)
    query_arr = np.array(query_vec)
    ranks = []
    for doc in data:
        doc_rank = {}
        doc_arr = np.array(doc['tf_idf'])
        rank = cosine_similarity(query_arr.reshape(1,-1), doc_arr.reshape(1,-1))[0][0]
        if rank > 0:
            doc_rank['title'] = doc['title']
            doc_rank['rank'] = rank
            ranks.append(doc_rank)
    
    return sorted(ranks, key= lambda k: k['rank'], reverse = True)

In [210]:
search_tfidf(query,data)

[{'title': 'Spanish flu', 'rank': 0.13481326476860545},
 {'title': 'Pandemic prevention', 'rank': 0.0710735794641128},
 {'title': 'Targeted immunization strategies', 'rank': 0.057426489066527045},
 {'title': 'COVID-19 pandemic', 'rank': 0.04639029115406248},
 {'title': 'Cholera', 'rank': 0.044987345029628534},
 {'title': 'Pandemic', 'rank': 0.044775690846696845},
 {'title': 'Superspreader', 'rank': 0.04035859935317057},
 {'title': 'HIV/AIDS in Yunnan', 'rank': 0.03473486866775819},
 {'title': 'Pandemic Severity Assessment Framework',
  'rank': 0.03344527594939134},
 {'title': 'Viral load', 'rank': 0.02775773390173221},
 {'title': 'Pandemic severity index', 'rank': 0.024403808152607624},
 {'title': 'Epidemiology of HIV/AIDS', 'rank': 0.018764088153593414},
 {'title': 'Crimson Contagion', 'rank': 0.017821964411586344},
 {'title': 'Plague of Cyprian', 'rank': 0.013316847683256168},
 {'title': 'PREDICT (USAID)', 'rank': 0.013002251981818312},
 {'title': 'Swine influenza', 'rank': 0.0086495

In [211]:
search_tfidf('ebola')

[{'title': 'Plague of Cyprian', 'rank': 0.11852960348756272},
 {'title': 'Science diplomacy and pandemics', 'rank': 0.07107905738603662}]

In [212]:
for d in data:
    if d['title'] == 'Science diplomacy and pandemics':
        print (d['text'])

Science diplomacy is the collaborative efforts by local and global entities to solve global issues using science and technology as a base. In science diplomacy, collaboration takes place to advance science but science can also be used to facilitate diplomatic relations. This allows even conflicting nations to come together through science to find solutions to global issues. Global organizations, researchers, public health officials, countries, government officials, and clinicians have previously worked together to create effective measures of infection control and subsequent treatment. They continue to do so through sharing of resources, research data, ideas, and by putting into effect laws and regulations that can further advance scientific research. Without the collaborative efforts of such entities, the world would not have the vaccines and treatments we now possess for diseases that were once considered deadly such as tuberculosis, tetanus, polio, influenza, etc. Historically, scie

In [234]:
inverted_index = {}
for i, word in enumerate(vocab):
    inverted_index[word] = []
    for doc in data:
        if doc['tf_idf'][i] !=0 :
            inverted_index[word].append((doc['title'], doc['tf_idf'][i]))

In [235]:
inverted_index['aids']

[('Epidemiology of HIV/AIDS', 0.014841816146758571),
 ('HIV/AIDS', 0.025606532601028128),
 ('HIV/AIDS in Yunnan', 0.056828532877720314)]

In [236]:
for d in data:
    if d['title'] == 'HIV/AIDS in Yunnan':
        print(d['text'])

The People's Republic of China's first reported AIDS case was identified in 1985 in a dying tourist. In 1989, the first indigenous cases were reported as an outbreak in 146 infected heroin users in Yunnan province, near China's southwest border.Yunnan is the area most affected by HIV/AIDS in China. In 1989 first infections appeared among needle sharing drug users near the Burmese border. Up until 1993, the disease had remained a problem in the border areas before mobile people (truck drivers, construction and migrant workers and travelers) brought the virus further into the country. In 1995, the provinces of Sichuan and Xinjiang reported their first HIV cases, and by 1998, the virus had spread all over China.
Low awareness of the disease among China's general population appears to be a major culprit. Most Chinese consider HIV/AIDS as a foreign issue, and even educated people are less knowledgeable of the virus, its transmission and prevention, than people in other countries. Until rece

In [284]:
def search(query, index = inverted_index):
    query_tokens = tokenizer(query)
    
    new_list = []
    for token in query_tokens:
        new_list.extend(index[token])

    output = defaultdict(int)
    for k,v in new_list:
        output[k] += v
        
    results = [(x,y) for x , y in output.items()]
    
    return sorted(new_list, key = lambda x: x[1], reverse= True)

In [285]:
title, score = search(query = "world health organization")[0]

In [286]:
title

'Event 201'

In [287]:
for d in data:
    if d['title'] == title:
        print (d['text'])

The Johns Hopkins Center for Health Security (abbreviated CHS; previously the UPMC Center for Health Security, the Center for Biosecurity of UPMC, and the Johns Hopkins Center for Civilian Biodefense Strategies) is an independent, nonprofit organization of the Johns Hopkins Bloomberg School of Public Health, and part of the Environmental Health and Engineering department. It is concerned with the areas of health consequences from epidemics and disasters as well as averting biological weapons development, and implications of biosecurity for the bioeconomy. It is a think tank that does policy research and gives policy recommendations to the United States government as well as the World Health Organization and the UN Biological Weapons Convention.


In [266]:
score

0.11049890590702932

In [290]:
search(query = "Ebola virus")

[('Virus', 0.06659622698501509),
 ('Plague of Cyprian', 0.047499062175213644),
 ('Viral load', 0.033601611922777795),
 ('Crimson Contagion', 0.03351433500869266),
 ('Disease X', 0.03072147375796827),
 ('Swine influenza', 0.02730797667374957),
 ('Science diplomacy and pandemics', 0.027286695292144007),
 ('HIV/AIDS in Yunnan', 0.022636875400608197),
 ('Plague of Cyprian', 0.01592965305968725),
 ('HIV/AIDS', 0.013600020003527455),
 ('Spanish flu', 0.012838824854076293),
 ('Epidemiology of HIV/AIDS', 0.005912036187100423),
 ('COVID-19 pandemic', 0.0050011701466459975)]