In [4]:
!pip install spacy
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import json
import collections
import spacy 
import sklearn
import numpy as np
import pandas as pd

In [6]:
with open("./data.json","r") as file:
    data=json.load(file)

In [7]:
nlp=spacy.load("en_core_web_sm")

In [110]:
# for token in doc:
#     print(token.text)
#     print(token.i)
#     print(token.lower_)
#     print(token.pos_)
#     print(token.lemma_)
#     print(token.is_alpha)
#     print(token.is_stop)
#     print("\n")

In [8]:
def preprocess(text):
    text=text.lower()
    doc=nlp(text)
    tokens=[]
    for token in doc:
        if not token.is_stop and token.pos_ not in ("PUNCT","SPACE") and token.is_alpha and token.lower_==token.lemma_:
            tokens.append(token.lemma_)
    return tokens

tokenized_text=[preprocess(articles['text']) for articles in data]

In [9]:
data_1=[{**items,'tokenized_text':token} for items,token in zip(data,tokenized_text)]

In [10]:
json_data = json.dumps(data_1)
with open('data_1.json', 'w') as f:
    f.write(json_data)

In [11]:
vocab=[]
for items in tokenized_text:
    vocab+=items

In [16]:
from collections import Counter
c=dict(Counter(vocab))

In [90]:
#TF-IDF calculation
import math

total_docs=len(data_1)
def tf_calc(tokenized_text):
    length_doc=len(tokenized_text)
    c=Counter(tokenized_text)
    tf=[]
    for word in vocab:
        if word in tokenized_text:
            tf.append(c[word]/length_doc)
        else:
            tf.append(0)
    return tf

tf=[tf_calc(a['tokenized_text']) for a in data_1]

idf_dict={}
len_docs=len(data_1)
for word in vocab:
    word_count=0
    for doc in [x['tokenized_text'] for x in data_1]:
        if word in doc:
            word_count+=1
    
    idf_dict[word]=math.log(len_docs/word_count)


def idf_calc(tokenized_text):
    return [idf_dict[word] if word in tokenized_text else 0 for word in vocab]

idf=[idf_calc(a['tokenized_text']) for a in data_1]

def tf_idf_calc(tf,idf):
    return np.multiply(np.array(tf), np.array(idf))
    
data_2=[{**items,'tf_idfs':tf_idf_calc(tf,idf)} for items,tf,idf in zip(data_1,tf,idf)]

In [91]:
data_2

[{'title': 'Pandemic',
  'text': 'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pandemics include COVID-19 

In [92]:
query_text="When did the Spanish flu happen?"

In [93]:
query_text_tokenized=preprocess(query_text)

In [94]:
def cosine_similarity(u,v):
    dot_product=np.dot(u,v)
    mag_u=np.sqrt(np.dot(u,u))
    mag_v=np.sqrt(np.dot(v,v))
    return dot_product/mag_u*mag_v

query_text_tfidf=tf_idf_calc(tf_calc(query_text_tokenized),idf_calc(query_text_tokenized))
query_text_tfidf

array([0., 0., 0., ..., 0., 0., 0.])

In [110]:
def query_similarity_computation(query):
    query_text_tokenized=preprocess(query_text)
    query_text_tfidf=tf_idf_calc(tf_calc(query_text_tokenized),idf_calc(query_text_tokenized))
    query_similarity=[cosine_similarity(query_text_tfidf,v['tf_idfs']) for v in data_2]
    data_3=[{'title':items['title'],'cosine_similarity':v} for items,v in zip(data_2,query_similarity)]
    return sorted(data_3,key=lambda k:k['cosine_similarity'],reverse=True)

In [111]:
data_3=query_similarity_computation(query_text)

In [112]:
data_3

[{'title': 'Swine influenza', 'cosine_similarity': 0.7119915632810382},
 {'title': 'Spanish flu', 'cosine_similarity': 0.23664106462492535},
 {'title': 'Pandemic', 'cosine_similarity': 0.12802291994796772},
 {'title': 'Unified Victim Identification System',
  'cosine_similarity': 0.05780574877263571},
 {'title': 'Epidemiology of HIV/AIDS', 'cosine_similarity': 0.0},
 {'title': 'Antonine Plague', 'cosine_similarity': 0.0},
 {'title': 'Basic reproduction number', 'cosine_similarity': 0.0},
 {'title': 'Bills of mortality', 'cosine_similarity': 0.0},
 {'title': 'Cholera', 'cosine_similarity': 0.0},
 {'title': 'COVID-19 pandemic', 'cosine_similarity': 0.0},
 {'title': 'Crimson Contagion', 'cosine_similarity': 0.0},
 {'title': 'Disease X', 'cosine_similarity': 0.0},
 {'title': 'Event 201', 'cosine_similarity': 0.0},
 {'title': 'HIV/AIDS', 'cosine_similarity': 0.0},
 {'title': 'HIV/AIDS in Yunnan', 'cosine_similarity': 0.0},
 {'title': 'Pandemic prevention', 'cosine_similarity': 0.0},
 {'titl