In [1]:
import pandas as pd
import numpy as np

# Load the COVID-19 question/answer table and preview its first five rows
# (five is the default row count for DataFrame.head).
data = pd.read_csv("covid_qa.csv")
data.head()

Unnamed: 0,question,answer
0,Can I get COVID-19 from animals when travellin...,Although the current spread and growth of the ...
1,How can I protect myself and others?,The best way to prevent illness from COVID-19 ...
2,Where did COVID-19 come from?,"It was first found in Wuhan City, Hubei Provin..."
3,Can my pet or other animals get sick from COVI...,There is currently no evidence to suggest that...
4,How can I protect my child from COVID-19?,By having them practice the same things you ha...


In [2]:
import nltk
import spacy

from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize

from matplotlib import pyplot as plt

In [4]:
# Load the English pipeline by its full package name. The bare 'en' shortcut
# link was removed in spaCy v3, so spacy.load('en') fails on current releases.
# Install the model with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [46]:
def tokenize(doc, lemmatized=True, stopword=True, punctuation=True):
    """Split `doc` into lower-cased tokens using the module-level `nlp` pipeline.

    Parameters
    ----------
    doc : str
        Text to tokenize.
    lemmatized : bool
        When True each token is represented by its lemma (`token.lemma_`),
        otherwise by its surface text (`token.text`).
    stopword : bool
        When False, tokens flagged `is_stop` are dropped; when True they are kept.
    punctuation : bool
        When False, tokens flagged `is_punct` are dropped; when True they are kept.

    Returns
    -------
    list of str
        Lower-cased tokens in document order; whitespace-only tokens are
        always skipped.
    """
    kept = []
    for tok in nlp(doc):
        # Filter first: a token excluded by either flag never reaches the output.
        if not stopword and tok.is_stop:
            continue
        if not punctuation and tok.is_punct:
            continue

        form = tok.lemma_ if lemmatized else tok.text
        if form.strip() == '':
            continue

        kept.append(form.lower())
    return kept

In [58]:
def compute_tfidf(docs, lemmatized=True, stopword=True, punctuation=True):
    """Build a smoothed, row-normalized TF-IDF matrix for a collection of documents.

    Parameters
    ----------
    docs : iterable of str
        The documents to index. Row i of the result corresponds to the i-th
        document in iteration order.
    lemmatized, stopword, punctuation : bool
        Passed straight through to `tokenize`.

    Returns
    -------
    smoothed_tf_idf : numpy.ndarray, shape (n_docs, n_words)
        TF-IDF weights passed through sklearn's `normalize` (row-wise).
    smoothed_idf : numpy.ndarray, shape (n_words,)
        log((n_docs + 1) / (df + 1)) + 1 for every vocabulary word.
    words : list of str
        Vocabulary, in the same order as the matrix columns.
    """
    doc_tokens = {
        idx: nltk.FreqDist(tokenize(doc, lemmatized, stopword, punctuation))
        for idx, doc in enumerate(docs)
    }
    n_docs = len(doc_tokens)

    dtm = pd.DataFrame.from_dict(doc_tokens, orient='index')
    # Reindex on 0..n_docs-1 so rows are in document order AND documents that
    # produced no tokens still get an (all-zero) row; otherwise row positions
    # would stop lining up with `docs`.
    dtm = dtm.reindex(range(n_docs))
    dtm = dtm.fillna(0)

    words = dtm.columns.tolist()

    counts = dtm.values
    doc_len = counts.sum(axis=1)
    # Avoid 0/0 for empty documents: their counts are all zero, so dividing by 1
    # leaves the row at zero instead of producing NaN.
    safe_len = np.where(doc_len == 0, 1, doc_len)
    tf = np.divide(counts, safe_len[:, None])

    # Document frequency: number of documents in which each word occurs.
    df = np.sum(np.where(tf > 0, 1, 0), axis=0)

    # Smoothing uses the number of documents (not the length of any one string).
    smoothed_idf = np.log(np.divide(n_docs + 1, df + 1)) + 1

    smoothed_tf_idf = normalize(tf * smoothed_idf)

    return smoothed_tf_idf, smoothed_idf, words

In [60]:
def vectorize_doc(doc, words, idf, lemmatized=True, stopword=True, punctuation=True):
    """Project a single text onto an existing TF-IDF vocabulary.

    Parameters
    ----------
    doc : str
        The text (e.g. a user question) to vectorize.
    words : sequence of str
        Vocabulary, in column order, as returned by `compute_tfidf`.
    idf : array-like, shape (len(words),)
        Inverse document frequencies aligned with `words`.
    lemmatized, stopword, punctuation : bool
        Passed straight through to `tokenize`.

    Returns
    -------
    numpy.ndarray, shape (len(words),)
        Unit-length TF-IDF vector. If none of the text's tokens are in the
        vocabulary, an all-zero vector is returned instead of NaNs.
    """
    doc_tokens = nltk.FreqDist(tokenize(doc, lemmatized, stopword, punctuation))

    # Tokens outside the vocabulary are ignored; missing words count as 0.
    counts = np.array([doc_tokens.get(w, 0) for w in words], dtype=float)

    total = counts.sum()
    if total == 0:
        # Nothing in common with the vocabulary: dividing would give 0/0 = NaN.
        return np.zeros(len(words), dtype=float)

    weighted = (counts / total) * idf

    norm = np.sqrt(np.sum(np.square(weighted)))
    if norm == 0:
        return np.zeros_like(weighted)

    return weighted / norm

In [62]:
def find_answer(doc_vect, tf_idf, docs, top_n=3):
    """Return the documents most similar to a query vector.

    Similarity is the dot product of `doc_vect` with each row of `tf_idf`.

    Parameters
    ----------
    doc_vect : numpy.ndarray, shape (n_words,)
        Query vector, e.g. from `vectorize_doc`.
    tf_idf : numpy.ndarray, shape (n_docs, n_words)
        Document matrix, e.g. from `compute_tfidf`.
    docs : pandas.Series or sequence
        The documents; position i must correspond to row i of `tf_idf`.
    top_n : int, default 3
        How many of the best matches to return.

    Returns
    -------
    The `top_n` best-matching entries of `docs`, most similar first. A pandas
    object is returned for pandas input, otherwise a numpy array.
    """
    sim = np.sum(doc_vect * tf_idf, axis=1)
    top_index = sim.argsort()[::-1][0:top_n]

    # argsort yields row *positions*, so select positionally. Plain docs[...]
    # on a Series is label-based and breaks for any non-default index.
    if hasattr(docs, "iloc"):
        return docs.iloc[top_index]
    return np.asarray(docs)[top_index]

In [64]:
# Demo driver: index the Q&A file, vectorize one sample question, and print the
# best-matching entries.
if __name__ == "__main__":
    
    # Each document is a question and its answer joined by a single space.
    data = pd.read_csv("covid_qa.csv")
    docs = data.apply(lambda x: x["question"] + " " + x["answer"], axis=1)
    
    # Index with lemmas, keeping stopwords but dropping punctuation tokens.
    tf_idf, idf, words = compute_tfidf(docs, lemmatized=True, stopword=True, punctuation=False)
    
    # Try questions here: edit the string below to query the index.
    # The query is tokenized with the same flags that were used to build the index.
    doc = "what is the symptom?"
    vect = vectorize_doc(doc, words, idf, lemmatized=True, stopword=True, punctuation=False)
    
    # Print each returned document followed by a blank line.
    answers = find_answer(vect, tf_idf, docs)
    for a in answers:
        print(a, "\n")

Are symptoms of COVID-19 different in children? No. They have the same symptoms, though it may be milder in children. Reported symptoms include cold-like symptoms, such as fever, runny nose, and cough. Vomiting and diarrhea have also been reported. There is more to be learned about how COVID-19 affects children. 

What are the symptoms of COVID-19? Reported illnesses have ranged from mild symptoms to severe illness and death from confirmed COVID-19 cases. Confirmed means that these people were tested positive for COVID-19. These are the symptoms that can appear 2 to 14 days after exposure: fever, cough, shortness of breath, pneumonia in both lungs. Those who are infected with COVID-19 may also have little to no symptoms.   However, if you're experiencing difficulty breathing or shortness of breath, persistent pain or pressure in the chest, new confusion or inability to arouse, and/or bluish lips or face - please seek medical attention immediately. This is not an all inclusive list - if