In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

In [1]:
import json
import itertools
from collections import Counter

import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the "en_core_web_sm" spaCy pipeline once; the resulting `nlp` object is
# what the `tokenizer` function in this notebook calls to parse text.
import spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
# Load the pre-tokenized corpus into `data`. The encoding is passed explicitly
# because open() otherwise falls back to the platform's locale encoding, which
# is not guaranteed to be UTF-8.
with open('data/tokenized.json', encoding='utf-8') as f:
    data = json.load(f)


In [4]:
# Show which fields the first record carries.
print (data[0].keys())

dict_keys(['title', 'text', 'url', 'tokenized_text'])


In [10]:
# Gather every document's token list.
tokenized_text = [doc['tokenized_text'] for doc in data]
# De-duplicate the tokens and sort them. Iterating a set of strings yields a
# different order from one interpreter run to the next (hash randomization),
# so without sorting the saved vocabulary and the column order of every
# TF-IDF vector would not be reproducible across kernel sessions.
# NOTE(review): sorting assumes all tokens are strings - confirm against the
# tokenization step that produced data/tokenized.json.
vocab = sorted(set(itertools.chain.from_iterable(tokenized_text)))

In [11]:
# Number of distinct tokens in the vocabulary.
len(vocab)

1494

In [12]:
# Write the vocabulary list to disk as JSON.
with open('./vocab.json', 'w') as vocab_fh:
    json.dump(vocab, vocab_fh)

In [13]:
#term frequency
# One Counter per document, in the same order as `data`, mapping each token
# to the number of times it occurs in that document.
token_counter = [Counter(record['tokenized_text']) for record in data]

In [14]:
# Inspect the distinct tokens counted for the document at index 1.
token_counter[1].keys()

dict_keys(['hiv', 'aid', 'human', 'immunodeficiency', 'virus', 'consider', 'author', 'global', 'pandemic', 'currently', 'use', 'term', 'epidemic', 'describe', '2018', 'approximately', '37.9', 'million', 'people', 'infect', 'globally.there', '770,000', 'death', 'aids', '2018.the', '2015', 'burden', 'disease', 'study', 'report', 'publish', 'lancet', 'estimate', 'incidence', 'infection', 'peak', '1997', '3.3', 'year', 'fall', 'rapidly', '2005', '2.6', 'remain', 'stable', '2015.sub', 'saharan', 'africa', 'region', 'affected', 'estimated', '61', 'new', 'occur', 'prevalence', 'ratio', 'western', 'central', 'europe', 'north', 'america', 'low', 'decline', 'mortality', '17', 'see', '0.06', '2000', '0.03', '2017', 'strong', 'steady', 'reduction', 'eastern', 'southern', 'push', '0.11', '0.04', 'progress', 'gradual', 'asia', 'pacific', '0.05', 'latin', 'caribbean', 'middle', 'east', '0.08', '0.09', 'south', 'large', 'population', 'country', 'world', '7.06', 'tanzania', '4.5', 'tanzanian', 'adult',

In [15]:
#document frequency
# For each vocabulary word, count the documents whose token counter contains it.
document_frequency = {
    term: sum(term in counts for counts in token_counter)
    for term in vocab
}

In [17]:
# Spot-check: number of documents whose tokens include 'hiv'.
document_frequency['hiv']

6

In [59]:
#compute tfidf

# IDF depends only on the corpus, so compute it once per token instead of
# once for every (document, token) pair.
n_docs = len(data)
idf_by_token = {
    token: np.log(n_docs / document_frequency[token])
    for token in vocab
}

# Attach a TF-IDF vector (one entry per vocabulary token, in `vocab` order)
# to each record of `data` under the 'tfidf_vec' key.
for i, doc in enumerate(token_counter):
    n_tokens = len(data[i]['tokenized_text'])
    tfidf_vec = []
    for token in vocab:
        # A document with no tokens has term frequency 0 for every token;
        # guarding here avoids a ZeroDivisionError on such records.
        tf = doc[token] / n_tokens if n_tokens else 0.0
        tfidf_vec.append(tf * idf_by_token[token])
    data[i]['tfidf_vec'] = tfidf_vec

In [60]:
# Persist the records, now including each one's 'tfidf_vec', as JSON.
with open('summaries.json', 'w') as summaries_fh:
    json.dump(data, summaries_fh)

In [62]:
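# Vectorize the query: tokenize it the same way as the documents, then weight its terms with TF-IDF over the corpus vocabulary.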

In [63]:
# Example free-text query used by the vectorization and search cells.
query = "highest pandemic casualties"

In [69]:
def tokenizer(document):
    """Return the lemmas of the meaningful tokens in ``document``.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline.
    A token is kept only if it is not a stop word, not punctuation, and its
    dependency label is non-empty after stripping whitespace.
    """
    lemmas = []
    for tok in nlp(document.lower()):
        if tok.is_stop or tok.is_punct:
            continue
        if not tok.dep_.strip():
            # Blank dependency label: drop the token.
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [74]:
def vectorize(query, vocab = vocab):
    """Turn a free-text query into a TF-IDF vector aligned with ``vocab``.

    The query is tokenized with ``tokenizer``. Term frequency is the token's
    count in the query divided by the number of query tokens, and inverse
    document frequency is ``log(len(data) / document_frequency[token])``,
    using the module-level ``data`` and ``document_frequency``.

    Args:
        query: The query text.
        vocab: Tokens defining the vector's positions; defaults to the
            notebook's vocabulary as it was when this function was defined.

    Returns:
        A list with one TF-IDF weight per entry of ``vocab``. If no token
        survives tokenization (for example an empty query), the result is an
        all-zero vector instead of a ZeroDivisionError.
    """
    query_tokenized = tokenizer(query)
    n_query_tokens = len(query_tokenized)
    if n_query_tokens == 0:
        # No usable tokens: every term frequency is zero by definition.
        return [0.0] * len(vocab)

    query_token_counter = Counter(query_tokenized)
    n_docs = len(data)  # loop-invariant, so read it once
    return [
        (query_token_counter[token] / n_query_tokens)
        * np.log(n_docs / document_frequency[token])
        for token in vocab
    ]

In [75]:
# Display the query's TF-IDF vector (one entry per vocabulary token).
vectorize(query)

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [77]:
#build a search function
def search_tfidf(query,docs):
    """Rank ``docs`` against ``query`` by cosine similarity of TF-IDF vectors.

    Each element of ``docs`` is read through its 'tfidf_vec' and 'title'
    keys. Only documents whose similarity is greater than zero are returned,
    as ``{'title': ..., 'rank': ...}`` dicts ordered from highest to lowest
    rank.
    """
    # The query row is the same for every comparison, so shape it once.
    query_row = np.array(vectorize(query)).reshape(1, -1)

    scored = []
    for entry in docs:
        doc_row = np.array(entry['tfidf_vec']).reshape(1, -1)
        score = cosine_similarity(query_row, doc_row)[0][0]
        if not score > 0:
            continue
        scored.append({'title': entry['title'], 'rank': score})

    scored.sort(key=lambda item: item['rank'], reverse=True)
    return scored

In [78]:
# Rank every record in `data` against the example query.
ranking = search_tfidf(query, data)

In [79]:
# Display the matches for `query`, highest similarity first.
ranking

[{'title': 'Unified Victim Identification System',
  'rank': 0.06241706017122516},
 {'title': 'Spanish flu', 'rank': 0.04839028796130693},
 {'title': 'Pandemic prevention', 'rank': 0.025521537041607523},
 {'title': 'Targeted immunization strategies', 'rank': 0.020611811420762342},
 {'title': 'COVID-19 pandemic', 'rank': 0.01671917552289672},
 {'title': 'Cholera', 'rank': 0.016147089734099104},
 {'title': 'Pandemic', 'rank': 0.016073981374157997},
 {'title': 'Superspreader', 'rank': 0.014504005105625184},
 {'title': 'HIV/AIDS in Yunnan', 'rank': 0.012468322544898668},
 {'title': 'Pandemic Severity Assessment Framework',
  'rank': 0.01200526311930191},
 {'title': 'Viral load', 'rank': 0.009963512530115872},
 {'title': 'Pandemic severity index', 'rank': 0.008759140594635537},
 {'title': 'Epidemiology of HIV/AIDS', 'rank': 0.006736388500618247},
 {'title': 'Crimson Contagion', 'rank': 0.006397733043761992},
 {'title': 'Plague of Cyprian', 'rank': 0.004779751602928559},
 {'title': 'PREDICT 

In [81]:
# Try a single-term query against the same records.
search_tfidf('ebola', data)

[{'title': 'Plague of Cyprian', 'rank': 0.11852960348756272},
 {'title': 'Science diplomacy and pandemics', 'rank': 0.07107905738603662}]

In [82]:
# Print the full text of every record titled 'Plague of Cyprian'.
for record in data:
    if record['title'] != 'Plague of Cyprian':
        continue
    print(record['text'])

The Plague of Cyprian was a pandemic that afflicted the Roman Empire about from AD 249 to 262. The plague is thought to have caused widespread manpower shortages for food production and the Roman army, severely weakening the empire during the Crisis of the Third Century. Its modern name commemorates St. Cyprian, bishop of Carthage, an early Christian writer who witnessed and described the plague. The agent of the plague is highly speculative because of sparse sourcing, but suspects have included smallpox, pandemic influenza and viral hemorrhagic fever (filoviruses) like the Ebola virus.
