Question: What do people want to know about coronavirus?

Step 1: Webscraping
- Get frequently asked questions from the CDC, WHO, and FDA webpages
- Store question-answer pairs in a dictionary

In [2]:
import requests
from bs4 import BeautifulSoup
from functools import reduce

In [3]:
#CDC questions

# NOTE(review): ".../2019-ncov/hcp/faq.html" is listed twice below, so its questions
# are appended to `questions` twice. It is left as-is because `questions` is later
# indexed in parallel with the pickled simplifiedQs list — confirm the cache was
# built from the same list before de-duplicating.
urls = [
    "https://www.cdc.gov/coronavirus/2019-ncov/faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/hcp/faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/prevent-getting-sick/cloth-face-cover-faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/tribal/faq-burial-practice.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/schools-childcare/schools-faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/colleges-universities/faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/large-events/event-planners-and-attendees-faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/retirement/faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/correction-detention/faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/hcp/faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/travelers/faqs.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/covid-data/faq-surveillance.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/general-business-faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/wildland-firefighters-faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/law-enforcement-agencies-faq.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/community/homeless-shelters/faqs.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/php/water.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/lab/biosafety-faqs.html",
    "https://www.cdc.gov/coronavirus/2019-ncov/lab/testing-laboratories.html"
]
questions = []  # every scraped question, in scrape order (duplicates included)
results = {}    # question text -> answer text

for url in urls:
    # A timeout (seconds) keeps one unresponsive page from hanging the whole notebook.
    page = requests.get(url, timeout=30).text
    # Name the parser explicitly, matching the WHO and FDA cells, instead of letting
    # Beautiful Soup pick whichever parser happens to be installed.
    soup = BeautifulSoup(page, "lxml")

    # Each element whose class attribute is "card bar" is treated as one Q/A entry.
    for content in soup.find_all(class_="card bar"):
        questionTag = content.find("span")
        if questionTag is None:
            # Without a <span> there is no question text to key the answer on.
            continue
        question = questionTag.text
        questions.append(question)

        # The answer starts with the first paragraph, if the card has one.
        firstParagraph = content.find("p")
        answer = firstParagraph.text if firstParagraph is not None else ""

        # Append any list items as " item1, item2, ..." after the paragraph text.
        items = [li.text for li in content.find_all("li")]
        if items:
            answer += " " + ", ".join(items)

        results[question] = answer


In [4]:
#WHO questions

questionsHubUrl = "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/question-and-answers-hub"
# A timeout (seconds) keeps an unresponsive server from hanging the notebook.
page = requests.get(questionsHubUrl, timeout=30).text
soup = BeautifulSoup(page, "lxml")

# Collect the site-relative hrefs of the hub's list items and make them absolute.
urls = [
    "http://www.who.int" + link['href']
    for link in soup.find_all(class_="sf-list-vertical__item")
    if link.has_attr('href')
]

for url in urls:
    page = requests.get(url, timeout=30).text
    soup = BeautifulSoup(page, "lxml")
    # Each accordion panel is treated as one Q/A entry: the first <a> supplies the
    # question and the panel's <p> elements supply the answer.
    for content in soup.find_all(class_="sf-accordion__panel"):
        questionTag = content.find('a')
        if questionTag is None:
            # No anchor means no question text to key the answer on.
            continue
        question = questionTag.text.strip()
        questions.append(question)
        # join (unlike reduce without an initial value) also copes with a panel that
        # has no <p> elements, yielding an empty answer instead of a TypeError.
        answer = ' '.join(c.text.strip() for c in content.find_all('p'))
        results[question] = answer

In [5]:
#FDA questions

url = "https://www.fda.gov/medical-devices/emergency-situations-medical-devices/faqs-testing-sars-cov-2"
# A timeout (seconds) keeps an unresponsive server from hanging the notebook.
page = requests.get(url, timeout=30).text
soup = BeautifulSoup(page, "lxml")

qTemp = []  # question texts, in page order
aTemp = []  # answer texts, in page order

for content in soup.find_all(class_="panel-title"):
    question = content.find('a')
    if question is not None:
        # Drop the first three characters (presumably a "Q: " prefix — confirm against
        # the page) and keep only the text before the first '?'.
        qTemp.append(question.text[3:].strip().split('?')[0])

for content in soup.find_all(class_="panel-body"):
    # join also handles a body with no <p> elements (empty string) where reduce
    # without an initial value would raise TypeError.
    answer = ' '.join(c.text.strip() for c in content.find_all('p'))
    # Keep only bodies that start with 'A' and strip the three-character prefix;
    # startswith is safe on an empty string, unlike answer[0].
    if answer.startswith('A'):
        aTemp.append(answer[3:])

# Questions and answers are collected by two independent filters, so their counts
# can drift apart; surface that instead of failing with an IndexError mid-loop.
if len(qTemp) != len(aTemp):
    print("Warning: found %d FDA questions but %d answers; pairs may be misaligned"
          % (len(qTemp), len(aTemp)))

for question, answer in zip(qTemp, aTemp):
    questions.append(question)
    results[question] = answer


In [6]:
# Sanity check: number of distinct questions collected across all sources
# (results is keyed by question text, so a repeated question counts once).
print(len(results))

615


Step 2: Data Cleaning

In [7]:
#Removing stopwords, correcting typos
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

In [8]:
# Build the English stop-word collection once as a set, so the membership test
# clean1 runs for every word is a fast hash lookup.
stopWords = set(stopwords.words("english"))

In [9]:
def clean1(sentence):
    """Spell-correct a sentence and return its non-stop-word, alphabetic tokens.

    Args:
        sentence: Raw question text.

    Returns:
        A list of the words (in sentence order, original casing kept) that consist
        only of letters and are not English stop words.
    """
    # NOTE(review): correct() may also rewrite domain-specific terms it does not
    # recognise — confirm this is acceptable for words such as "corona".
    corrected = TextBlob(sentence).correct()
    # isalpha() drops punctuation and any token containing digits or symbols.
    alphabetic = [w for w in corrected.words if w.isalpha()]
    # The stop-word list is lowercase, so compare case-insensitively; otherwise
    # capitalised stop words ("What", "The", ...) slip through the filter.
    return [w for w in alphabetic if w.lower() not in stopWords]

In [10]:
#Lemmatize and remove duplicates
from textblob import Word

In [11]:
def clean2(words):
    """Lemmatize every word and collapse duplicates.

    Args:
        words: Iterable of word strings.

    Returns:
        A set holding the lemmatized form of each input word.
    """
    return {Word(token).lemmatize() for token in words}

Step 4: Compare the simplified input question to the simplified questions in our dataset, store results

In [12]:
import pickle


# One-off step that builds the cache (kept for reference; uncomment to regenerate):
# with open("simplified_qs.txt", "wb") as file:
#     pickle.dump([clean2(clean1(q)) for q in questions], file)

# Load the cached simplified questions. The context manager closes the file handle,
# which a bare pickle.load(open(...)) leaves open.
# NOTE: unpickling can execute arbitrary code — only load a cache file you created.
with open("simplified_qs.txt", "rb") as file:
    simplifiedQs = pickle.load(file)



In [13]:
#Tokenize words
import nltk.tokenize

def calculateComparisonScore(a, b):
    """Score how strongly two word collections overlap, weighted by part of speech.

    Each word present in both collections contributes according to its POS tag:
    'NNP' adds 3, 'NN' adds 2, 'VB' adds 1, and any other tag adds 0.1.

    Args:
        a: Iterable of words.
        b: Iterable of words.

    Returns:
        The summed weight of the shared words; 0 when nothing is shared.
    """
    shared = set(a) & set(b)
    if not shared:
        # Nothing in common: skip the tagger entirely.
        return 0

    weights = {'NNP': 3, 'NN': 2, 'VB': 1}

    # The tagger takes neighbouring tokens into account, and set iteration order
    # for strings varies between interpreter runs, so tag the words in sorted order
    # to make the score reproducible.
    score = 0
    for _, tag in nltk.pos_tag(sorted(shared)):
        score += weights.get(tag, 0.1)
    return score

In [None]:
#Trying to chunk noun phrases
import numpy as np
from nltk.tokenize import PunktSentenceTokenizer

def findChunk(question, regex):
    """Print each chunk of `question` whose first token is tagged 'NNP'.

    The text is split into sentences, each sentence is tokenized and POS-tagged,
    and the tagged tokens are parsed with an nltk.RegexpParser built from `regex`.
    Every matching chunk is printed on its own line as its words, each followed
    by a space.

    Args:
        question: Text to analyse; empty or whitespace-only text prints nothing.
        regex: Chunk grammar passed to nltk.RegexpParser.

    Returns:
        None.
    """
    if not question or not question.strip():
        # Nothing to tokenize; avoids indexing into an empty sentence list.
        return

    parser = nltk.RegexpParser(regex)
    # Handle every sentence rather than only the first one.
    for sentence in nltk.sent_tokenize(question):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in parser.parse(tagged):
            # Unchunked tokens appear as plain (word, tag) tuples; only subtrees are
            # chunks. Indexing [0][1] on a tuple would read the second character of
            # the word and fail on one-letter words.
            if not isinstance(node, nltk.Tree):
                continue
            if node[0][1] != 'NNP':
                continue
            print("".join(str(leaf[0]) + " " for leaf in node))
        
# Demo: the grammar "NNP: {<NNP>+}" groups each run of one or more consecutive
# NNP-tagged tokens into a chunk, and findChunk prints those chunks.
findChunk("What is the John Stamos compared to the CDC", "NNP: {<NNP>+}") 

In [34]:
inputQuestion = "corona"

# Reduce the query to the same simplified form as the cached questions.
cleaned = clean2(clean1(inputQuestion))

# Highest score seen so far and the index where it occurred. Only a strictly
# higher score replaces the current best, so the earliest top scorer wins ties.
bestMatch, best = 0, 0

for idx, candidate in enumerate(simplifiedQs):
    score = calculateComparisonScore(candidate, cleaned)
    if score > best:
        best, bestMatch = score, idx

# simplifiedQs[i] is looked up as questions[i], so both lists must share one order.
if best != 0:
    matchedQuestion = questions[bestMatch]
    print(matchedQuestion + "\n")
    print(results[matchedQuestion])
else:
    print("Sorry but we don't seem to have an answer for your specific question")


Sorry but we don't seem to have an answer for your specific question
