In [1]:
!pip install spacy
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import json
import collections
import spacy 
import sklearn
import numpy as np
import pandas as pd

In [3]:
with open("./data.json","r") as file:
    data=json.load(file)

In [4]:
nlp=spacy.load("en_core_web_sm")

In [110]:
# for token in doc:
#     print(token.text)
#     print(token.i)
#     print(token.lower_)
#     print(token.pos_)
#     print(token.lemma_)
#     print(token.is_alpha)
#     print(token.is_stop)
#     print("\n")

In [5]:
def preprocess(text):
    text=text.lower()
    doc=nlp(text)
    tokens=[]
    for token in doc:
        if not token.is_stop and token.pos_ not in ("PUNCT","SPACE") and token.is_alpha and token.lower_==token.lemma_:
            tokens.append(token.lemma_)
    return tokens

tokenized_text=[preprocess(articles['text']) for articles in data]

In [6]:
data_1=[{**items,'tokenized_text':token} for items,token in zip(data,tokenized_text)]

In [7]:
json_data = json.dumps(data_1)
with open('data_1.json', 'w') as f:
    f.write(json_data)

In [8]:
vocab=[]
for items in tokenized_text:
    vocab+=items

In [9]:
from collections import Counter
c=dict(Counter(vocab))

In [10]:
#TF-IDF calculation
import math

total_docs=len(data_1)
def tf_calc(tokenized_text):
    length_doc=len(tokenized_text)
    c=Counter(tokenized_text)
    tf=[]
    for word in vocab:
        if word in tokenized_text:
            tf.append(c[word]/length_doc)
        else:
            tf.append(0)
    return tf

tf=[tf_calc(a['tokenized_text']) for a in data_1]

idf_dict={}
len_docs=len(data_1)
for word in vocab:
    word_count=0
    for doc in [x['tokenized_text'] for x in data_1]:
        if word in doc:
            word_count+=1
    
    idf_dict[word]=math.log(len_docs/word_count)


def idf_calc(tokenized_text):
    return [idf_dict[word] if word in tokenized_text else 0 for word in vocab]

idf=[idf_calc(a['tokenized_text']) for a in data_1]

def tf_idf_calc(tf,idf):
    return np.multiply(np.array(tf), np.array(idf))
    
data_2=[{**items,'tf_idfs':tf_idf_calc(tf,idf)} for items,tf,idf in zip(data_1,tf,idf)]

In [96]:
query_text="symptoms of swine flu"

In [97]:
query_text_tokenized=preprocess(query_text)

In [92]:
Inverted_index={items:[(a['title'],a['tf_idfs'][vocab.index(items)]) for a in data_2 if items in a['tokenized_text']] for items in vocab}

In [93]:
Inverted_index

{'pandemic': [('Pandemic', 0.02697671072795339),
  ('Epidemiology of HIV/AIDS', 0.005181502365430073),
  ('Antonine Plague', 0.00999725162271214),
  ('Cholera', 0.002777014339642261),
  ('COVID-19 pandemic', 0.01634166130635638),
  ('Crimson Contagion', 0.024049992111241467),
  ('HIV/AIDS', 0.0024004700224026323),
  ('Pandemic prevention', 0.07389272938526363),
  ('Pandemic Severity Assessment Framework', 0.04302614622433072),
  ('Pandemic severity index', 0.029643013532460412),
  ('Plague of Cyprian', 0.021244159698263296),
  ('PREDICT (USAID)', 0.02655519962282912),
  ('1929–1930 psittacosis pandemic', 0.010363004730860145),
  ('Science diplomacy and pandemics', 0.0072014100672078965),
  ('Spanish flu', 0.021709360275597528),
  ('Swine influenza', 0.010711341024334435),
  ('Unified Victim Identification System', 0.0037600282651793445)],
 'greek': [('Pandemic', 0.05171581806383305)],
 'πᾶν': [('Pandemic', 0.05171581806383305)],
 'pan': [('Pandemic', 0.05171581806383305)],
 'δῆμος': [(

In [37]:
# from functools import reduce

# def search_terms(terms):
#     return reduce(set.intersection,(Inverted_index[term] for term in terms))

In [98]:
from collections import defaultdict

def search_inverted_index(terms):
    terms_list=[]
    for term in terms:
        terms_list+=Inverted_index[term]
    doc_imp=defaultdict(list)
    
    for i in terms_list:
        doc_imp[i[0]].append(i[1])
    
    doc_imp={a:reduce(lambda x,y:x+y,b) for a,b in doc_imp.items()}
    return sorted(doc_imp.items(),key=lambda x:x[1],reverse=True)

In [99]:
search_inverted_index(query_text_tokenized)

[('Swine influenza', 0.3943904596730489),
 ('Spanish flu', 0.1006989957581831),
 ('Pandemic', 0.029711145665104622),
 ('Unified Victim Identification System', 0.016564621034527358)]

In [104]:
with open("./example_queries.json","r") as file:
    queries_data=json.load(file)

with open("./example_query_results.json","r") as file:
    queries_results=json.load(file)

In [102]:
queries_data

['black death',
 'zoonotic diseases',
 'swine flu',
 'cholera transmission',
 'classification of viruses',
 'economic impact of pandemics',
 'pandemic prevention organizations',
 'spread of infectious diseases',
 'prevention of viral infections',
 'common symptoms of coronavirus']

In [106]:
[{query:search_inverted_index(preprocess(query))} for query in queries_data]

[{'black death': [('Pandemic', 0.07788500259696608),
   ('Spanish flu', 0.024068009132662504),
   ('Cholera', 0.021551093144933094),
   ('Antonine Plague', 0.019395983830439785),
   ('1929–1930 psittacosis pandemic', 0.013403728663312048)]},
 {'zoonotic diseases': [('Swine influenza', 0.054757925008764405)]},
 {'swine flu': [('Swine influenza', 0.3943904596730489),
   ('Spanish flu', 0.1006989957581831),
   ('Pandemic', 0.029711145665104622),
   ('Unified Victim Identification System', 0.016564621034527358)]},
 {'cholera transmission': [('Cholera', 0.17035798891615594),
   ('Swine influenza', 0.041562822493799535),
   ('HIV/AIDS in Yunnan', 0.021982115007831756),
   ('1929–1930 psittacosis pandemic', 0.013403728663312048),
   ('Basic reproduction number', 0.010846438326232773),
   ('Virus', 0.008863756051545064)]},
 {'classification of viruses': [('Pandemic severity index',
    0.1515393738614643)]},
 {'economic impact of pandemics': [('Pandemic Severity Assessment Framework',
    0.02

In [105]:
queries_results

[{'query': 'black death',
  'relevant_article_titles': [['Pandemic', 0.047336756359133265],
   ['Cholera', 0.014518164813892178],
   ['Antonine Plague', 0.013233865618817103],
   ['Epidemiology of HIV/AIDS', 0.011947239794765438],
   ['Bills of mortality', 0.008868054280650635],
   ['Spanish flu', 0.008602012652231115],
   ['1929–1930 psittacosis pandemic', 0.008231591054766618],
   ['Pandemic Severity Assessment Framework', 0.008039264160963658],
   ['HIV/AIDS', 0.006826994168437393],
   ['COVID-19 pandemic', 0.005060007442488892],
   ['Swine influenza', 0.004675006876212563]]},
 {'query': 'zoonotic diseases',
  'relevant_article_titles': [['Swine influenza', 0.035414092804581326],
   ['Disease X', 0.029604135108640295],
   ['Pandemic', 0.022322198426744867],
   ['Pandemic prevention', 0.013871651879477165],
   ['HIV/AIDS', 0.013486328216158356],
   ['Targeted immunization strategies', 0.0105545177343848],
   ['Science diplomacy and pandemics', 0.01032995352727023],
   ['HIV/AIDS in Y