# Challenge 2: Dangerous Situtations


In [1]:
## Import python packages
import pandas as pd
import numpy as np
import math
from collections import defaultdict

from sklearn.feature_extraction.text import  TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from gensim import corpora
from gensim import models
from gensim import similarities

In [3]:
# get a list of english stop words
en_stops = set(stopwords.words('english'))

## Setup: initialize some constants

In [4]:

GENERAL_RULES = ['Wear a mask',
    'Stay 6 feet from others',
    'Avoid crowds',
    'Avoid poorly ventilated spaces',
    'Wash your hands often',
    'Cover coughs and sneezes',
    'Clean and disinfect frequently touched surfaces daily',
    'Monitor your health daily',
    'Get vaccinated']

rule_shortNames = ['wear_mask',
    'social_distance',
    'avoid_crowds',
    'poor_ventilation',
    'wash_hands',
    'cover_coughs',
    'disinfect_surfaces',
    'monitor_health',
    'vaccine']

PROMPTS = ["If you are sick with COVID-19",
           "If you are an older adult",
           "If you have asthma",
           "If you are at home caring for a newborn and are diagnosed with or test positive for COVID-19"]

PROMPT_KEYS = ['sick', 'older_adult', 'asthma', 'covid_with_newborn']

## Parse the data
Parse the CDC guidelines text file into a dataframe with two columns.  One collumn is the title headers (start with ***) and second column are the sentences that fall under that header
(header index helps id the same header)

In [44]:

def textToDataFrame(text, delimiter):
    """ 
    Takes in input text, splits by delimiter, returns as pandas dataframe with headings as keys e.g. [index, heade, text]
    """
    textArray = text.split(delimiter)
    df = pd.DataFrame(columns=["headerIndex","header", "text"])
    headerIndex = 0
    for line in textArray:
        if len(line) > 0:
            # print(headerIndex)
            # finds the first line in the section and uses that as the heading
            firstNewlineIndex = line.find("\n")
            header = line[0:firstNewlineIndex]
            # print(header)
            # puts the remaining text into dataframe
            df2 = pd.DataFrame({'headerIndex':headerIndex, 'header': header, 'text':(line[firstNewlineIndex + 1:]).replace("\xa0", " ").split("\n")})
            # combines new dataframe with the return dataframe
            df = df.append(df2, ignore_index=True)
            headerIndex += 1
    return df

filename = './data/CDCGuidelines.txt'

with open(filename, encoding="utf8") as myFile:
    data = myFile.read()

df = textToDataFrame(data, "***")
df.sample(10)


Unnamed: 0,headerIndex,header,text
1524,31,Long-Term Effects of COVID-19,Cardiovascular: inflammation of the heart muscle
3523,54,Holiday Tips,Safer Celebrations
2479,43,COVID-19 in Newly Resettled Refugee Populations,"Lack of access to television, radio, or Intern..."
5201,78,What to Do if Your Pet Tests Positive for the ...,While most pets appear to show only mild sympt...
2421,43,COVID-19 in Newly Resettled Refugee Populations,
4867,72,COVID-19 in Children and Teens,"Diarrhea, vomiting, or stomachache"
3346,52,Personal and Social Activities,"Before you go, call and ask if all staff are w..."
3672,58,Visiting Parks and Recreational Facilities,
2160,39,"Pregnancy, Breastfeeding, and Caring for Newborns",If you or someone you know has COVID-19 emerge...
4817,71,For Parents: Multisystem Inflammatory Syndrome...,What we don’t know about MIS-C


In [45]:
headers = df['header'].unique()
#just for nice output in notebook
pd.DataFrame(headers)

Unnamed: 0,0
0,Things to know about the COVID-19 Pandemic
1,Symptoms of Coronavirus
2,COVID-19 Testing Overview
3,Test for Current Infection (Viral Test)
4,Test for Past Infection
...,...
76,COVID-19 And Animals
77,If You Have Pets
78,What to Do if Your Pet Tests Positive for the ...
79,Guidance to Reduce the Risk of SARS-CoV-2 Spre...


## Find best matching titles to the prompts
Next step is to see if we can automatically sort through all the titles 
and find ones that are closely related to one of our categories
to do this, we need to do some preprocessing on the titles to:
 - remove stop words, such as: the, of, to, etc.
 - remove punctuation (i.e., commas, periods, colons, parens)
 - lower case everything
 - remove 's' at end of words to reduce plurals
 

In [46]:
def removeStopWords(text):
    returnText = ''
    for w in text.split(' '):
        if w not in en_stops and len(w)>0:
            returnText += ' ' + w
    returnText = returnText.strip()
    # print(returnText)
    return returnText

# remove words that appear only once
frequency = defaultdict(int)
combined = list(df['text']) + list(df['header']) + GENERAL_RULES

for text in combined:
    for token in text.split(' '):
        frequency[token] += 1
        
def removeSingleOccurances(text):
    returnText = ''
    for w in text.split(' '):
        if frequency[token] > 1:
            returnText += ' ' + w
    returnText = returnText.strip()
    # print(returnText)
    return returnText

def removePlurals(text):
    returnText = ''
    for w in text.split(' '):
        if w.endswith('s'):
            w = w[:-1]
        returnText += ' ' + w
    returnText = returnText.strip()
    return returnText
    
def nlpCleanup(df, columnName):
    df[columnName] = df[columnName].str.replace('\d+', '',regex=True) # for digits
    df[columnName] = df[columnName].str.replace(r'(\b\w{1,2}\b)', '',regex=True) # for word length lt 2
    df[columnName] = df[columnName].str.replace('[^\w\s]', '',regex=True) # for punctuation 
    df[columnName] = df[columnName].apply(removeStopWords)
    df[columnName] = df[columnName].apply(removeSingleOccurances)
    df[columnName] = df[columnName].apply(removePlurals)
    df[columnName] = df[columnName].str.lower()
    return df

# first make copy of unaltered text, this will be needed later
df['header_orig'] = df['header']
df['text_orig'] = df['text']

df = nlpCleanup(df, columnName='header')
df = nlpCleanup(df, columnName='text')

df.sample(10)


Unnamed: 0,headerIndex,header,text,header_orig,text_orig
160,5,home testing,follow manufacturer instruction exactly order ...,At-Home Testing,Follow the manufacturer’s instructions exactly...
820,18,cleaning disinfecting your home,clean hand taking glove handling used item,Cleaning and Disinfecting Your Home,Clean hands after taking off gloves or handlin...
3056,50,large gathering,eat outdoor possible you les likely get spread...,Large Gatherings,"Eat outdoors, if possible. You are less likely..."
2733,48,running essential errand,stay least feet away other shopping line,Running Essential Errands,Stay at least 6 feet away from others while sh...
3972,60,attending sporting event,,Attending Sporting Events,
4904,73,help stop spread covid children,,Help Stop the Spread of COVID-19 in Children,
1575,32,older adult,,Older Adults,
3151,51,small gathering,limit crowding area food served,Small Gatherings,Limit crowding in areas where food is served.
75,2,covid testing overview,viral test tell current infection,COVID-19 Testing Overview,A viral test tells you if you have a current i...
2653,47,returning work,what length time interacting people,Returning to Work,What’s the length of time that you will be int...


Similaryly process the general rules and prompts

In [47]:
genRulesDf = pd.DataFrame(GENERAL_RULES, columns=['rule'])

genRulesDf = nlpCleanup(genRulesDf, columnName='rule')

genRulesDf.head()

Unnamed: 0,rule
0,wear mask
1,stay feet other
2,avoid crowd
3,avoid poorly ventilated space
4,wash hand often


In [48]:
promptDf = pd.DataFrame(PROMPTS, columns=['prompt'])
promptDf = nlpCleanup(promptDf, columnName='prompt')

promptDf.head()

Unnamed: 0,prompt
0,sick covid
1,older adult
2,asthma
3,home caring newborn diagnosed test positive covid


## Now vectorize the headers and prompts using TFIDF

learn the bag of words using the titles, and apply the same vectorizer to prompts

In [49]:
headers = list(pd.Series(df['header'].unique()))

vectorizer = TfidfVectorizer(
    analyzer = 'word',
    # ngram_range=(2,2),
    lowercase = True,
    strip_accents='unicode',
    stop_words='english'
)

headerVects = vectorizer.fit_transform(headers).toarray()
vectorizer.get_feature_names()

['activitie',
 'additional',
 'adult',
 'air',
 'animal',
 'answer',
 'associated',
 'asthma',
 'attending',
 'avoid',
 'beache',
 'behavioral',
 'breastfeeding',
 'care',
 'caregiver',
 'caring',
 'cause',
 'cdc',
 'certain',
 'checklist',
 'children',
 'cleaning',
 'close',
 'communitie',
 'community',
 'condition',
 'contact',
 'coronaviru',
 'covid',
 'crew',
 'cruise',
 'current',
 'deciding',
 'dementia',
 'developmental',
 'disabilitie',
 'disease',
 'disinfecting',
 'disorder',
 'distancing',
 'doctor',
 'drug',
 'effect',
 'errand',
 'essential',
 'event',
 'experiencing',
 'facilitie',
 'facility',
 'food',
 'gathering',
 'getting',
 'glove',
 'guidance',
 'guide',
 'hand',
 'handler',
 'healthy',
 'help',
 'helping',
 'hiring',
 'holiday',
 'home',
 'homelessnes',
 'household',
 'housing',
 'improve',
 'improving',
 'indoor',
 'infection',
 'inflammatory',
 'international',
 'isolate',
 'know',
 'large',
 'learn',
 'likely',
 'live',
 'living',
 'longterm',
 'mask',
 'medica

In [50]:
promptList = list(promptDf['prompt'].values)

promptVects = vectorizer.transform(promptList).toarray()

## Now measure similarity
Using cosine similarities between the tfidf vectors
Return the best matching header for each prompt

In [51]:
def cosineSimilarity(vector1, vector2):
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude
    

sims = []
for j, h in enumerate(headers):
    d = []
    for i, p in enumerate(promptList):
        a = headerVects[j,:]
        b = promptVects[i,:]
        # print(a)
        # print(b)
        cs = cosineSimilarity(a, b)
        d.append(cs)
        # print(cs)
        # break
    sims.append(d)
        
sims = np.array(sims)

bestMatches = np.argmax(sims, axis=0)

relevantHeaders = []
for m in bestMatches:
    relevantHeaders.append(headers[m])

In [52]:
# set the category for each row in df that matches the relevant 
# header found above
df['category'] = ''

for i, h in enumerate(relevantHeaders):
    df['category'][df['header'] == h] = PROMPT_KEYS[i]


### Next step is to limit to rows that match one of the 4 categories

In [53]:
df = df[np.logical_not(df['category']=='')]

headerDf = df[['headerIndex', 'header', 'category']]
headerDf = headerDf.drop_duplicates()

# print out those rows
headerDf

Unnamed: 0,headerIndex,header,category
1127,25,what you are sick,sick
1533,32,older adult,older_adult
2138,39,pregnancy breastfeeding caring newborn,covid_with_newborn
2396,42,people moderate severe asthma,asthma


## Find actionable instructions
Next we want to limit these texts to just ones that start with a verb
Using the part-of-speech tagging capability of nltk

In [54]:
def startsWithVerb(s):
    # returns if the first word in the sentence is a verb = this is a 
    # shortcut way to check for actionable instructions
    ret = False
    if len(s)>0:
        s=s.lower()
        tag_pos_string = pos_tag(word_tokenize(s))
        firstWordPartOfSpeech = tag_pos_string[0][1]
        ret = firstWordPartOfSpeech in ('VB', 'VBP')
    
    return ret

df['text_orig'] = df['text_orig'].str.replace('@', '', regex=False)
df['text_orig'] = df['text_orig'].str.replace('*', '', regex=False)

df['actionable'] = df['text_orig'].apply(startsWithVerb)

df.sample(10)

Unnamed: 0,headerIndex,header,text,header_orig,text_orig,category,actionable
1667,32,older adult,symptom covid get touch healthcare provider wi...,Older Adults,"If you have symptoms of COVID-19, get in touch...",older_adult,False
2411,42,people moderate severe asthma,,People with Moderate to Severe Asthma,,asthma,False
1606,32,older adult,consider level risk,Older Adults,Consider the Level of Risk,older_adult,True
2404,42,people moderate severe asthma,take everyday precaution like washing hand avo...,People with Moderate to Severe Asthma,Take everyday precautions like washing your ha...,asthma,True
1698,32,older adult,,Older Adults,,older_adult,False
1625,32,older adult,keep item hand use venturing mask tissue hand ...,Older Adults,Keep these items on hand and use them when ven...,older_adult,True
1593,32,older adult,then wash hand,Older Adults,Then wash your hands.,older_adult,False
1684,32,older adult,aware single reading higher multiple reading r...,Older Adults,be aware that a single reading higher than 10...,older_adult,True
1686,32,older adult,develop care plan,Older Adults,Develop a Care Plan,older_adult,True
1193,25,what you are sick,immediately wash hand soap water least second ...,What to Do If You Are Sick,Immediately wash your hands with soap and wate...,sick,False


In [55]:
# limit to those that start with verb to give actionable instruction
df = df[df['actionable']==True]
df.sample(10)

Unnamed: 0,headerIndex,header,text,header_orig,text_orig,category,actionable
2262,39,pregnancy breastfeeding caring newborn,consider healthy caregiver covid increased ris...,"Pregnancy, Breastfeeding, and Caring for Newborns",Consider having a healthy caregiver who does n...,covid_with_newborn,True
1190,25,what you are sick,cover cough sneeze,What to Do If You Are Sick,Cover your coughs and sneezes.,sick,True
2154,39,pregnancy breastfeeding caring newborn,wash hand soap water least second soap water a...,"Pregnancy, Breastfeeding, and Caring for Newborns",Wash your hands with soap and water for at lea...,covid_with_newborn,True
2283,39,pregnancy breastfeeding caring newborn,ask child care program plan place protect baby...,"Pregnancy, Breastfeeding, and Caring for Newborns",Ask your child care program about the plans th...,covid_with_newborn,True
1175,25,what you are sick,call call ahead local emergency facility notif...,What to Do If You Are Sick,Call 911 or call ahead to your local emergency...,sick,True
1671,32,older adult,check your symptom,Older Adults,Check Your Symptoms,older_adult,True
1684,32,older adult,aware single reading higher multiple reading r...,Older Adults,be aware that a single reading higher than 10...,older_adult,True
1703,32,older adult,remember importance staying physically active ...,Older Adults,Remember the importance of staying physically ...,older_adult,True
1131,25,what you are sick,keep track symptom,What to Do If You Are Sick,Keep track of your symptoms.,sick,True
2168,39,pregnancy breastfeeding caring newborn,get recommended vaccine getting recommended va...,"Pregnancy, Breastfeeding, and Caring for Newborns",Get recommended vaccines. Getting the recommen...,covid_with_newborn,True


In [56]:
# this shows the number of instructions by category
# note that we cannot have more than 20 instructions per category
# TODO: we have too many instructions for each category right now...
# this is probably because there are some duplicated instructions within each category
# need to do a self-similarity comparison to remove dups
df.groupby('category').count()

Unnamed: 0_level_0,headerIndex,header,text,header_orig,text_orig,actionable
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
asthma,3,3,3,3,3,3
covid_with_newborn,38,38,38,38,38,38
older_adult,22,22,22,22,22,22
sick,16,16,16,16,16,16


## Now compare general to specific rules
Next we want to compare the general rules against the rules that passed all 
previous processing steps and find the union.  Using the LSI model to find phrase 
similarity.  When a phrase is similar enough drop that general rule, 
else add that general rule for that category.
We'll also use phrase similarity to remove duplicate instructions within each category

In [57]:
# this creates a dictionary and bag of words encoding based on all text
headers = list(pd.Series(df['header'].unique()).str.split())
texts = list(df['text'].str.split()) 
generalRules = list(genRulesDf['rule'].str.split())
allWords = texts + headers + generalRules
dictionary = corpora.Dictionary(allWords)
# create corpus based on text (not headers)
corpus = [dictionary.doc2bow(text) for text in texts]
# create 2-d lsi model using gensim
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lsi[corpus])

## TODO 
Next we need to remove duplicate/highly overlapping instructions within this list
For example, "Keep track of your symptoms." and "Monitor your symptoms." are 
very similar instructions given for sick people.  
We need to remove the dups within each category.

### Add a column to show similarity of each row to the general rules

In [58]:
for i, gen in enumerate(genRulesDf.values):
    gen = gen[0].split()
    vec_bow = dictionary.doc2bow(gen)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    df[rule_shortNames[i]] = sims

df

Unnamed: 0,headerIndex,header,text,header_orig,text_orig,category,actionable,wear_mask,social_distance,avoid_crowds,poor_ventilation,wash_hands,cover_coughs,disinfect_surfaces,monitor_health,vaccine
1131,25,what you are sick,keep track symptom,What to Do If You Are Sick,Keep track of your symptoms.,sick,True,0.995549,0.953137,0.0,0.0,0.274072,0.966456,0.277825,0.958774,0.960937
1141,25,what you are sick,take care,What to Do If You Are Sick,Take care of yourself.,sick,True,0.932023,0.834367,0.0,0.0,0.001166,0.999860,0.005071,0.999932,0.999992
1142,25,what you are sick,get rest stay hydrated take overthecounter med...,What to Do If You Are Sick,Get rest and stay hydrated. Take over-the-coun...,sick,True,0.973809,0.903749,0.0,0.0,0.142148,0.992229,0.146013,0.988296,0.989443
1154,25,what you are sick,tell close contact may exposed covid infected ...,What to Do If You Are Sick,Tell your close contacts that they may have be...,sick,True,0.938728,0.844670,0.0,0.0,0.020130,0.999997,0.024034,0.999530,0.999737
1158,25,what you are sick,see covid animal question pet,What to Do If You Are Sick,See COVID-19 and Animals if you have questions...,sick,True,0.923836,0.822055,0.0,0.0,-0.020806,0.999252,-0.016901,0.999947,0.999838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2312,39,pregnancy breastfeeding caring newborn,cover baby head allow baby get hot sign baby m...,"Pregnancy, Breastfeeding, and Caring for Newborns",Do not cover your baby’s head or allow your ba...,covid_with_newborn,True,0.944050,0.853011,0.0,0.0,0.035902,0.999837,0.039805,0.998923,0.999251
2313,39,pregnancy breastfeeding caring newborn,smoke allow anyone smoke around baby,"Pregnancy, Breastfeeding, and Caring for Newborns",Do not smoke or allow anyone to smoke around y...,covid_with_newborn,True,0.956205,0.872735,0.0,0.0,0.074889,0.998371,0.078783,0.996349,0.996978
2403,42,people moderate severe asthma,make sure least day supply medicine,People with Moderate to Severe Asthma,Make sure that you have at least a 30-day supp...,asthma,True,0.544769,0.709711,0.0,0.0,0.979242,0.220170,0.980026,0.192387,0.199949
2404,42,people moderate severe asthma,take everyday precaution like washing hand avo...,People with Moderate to Severe Asthma,Take everyday precautions like washing your ha...,asthma,True,0.633506,0.782736,0.0,0.0,0.951084,0.325888,0.952283,0.298915,0.306266


## Find overlapping/very similar phrases
Using a threshold to find where the general rules are duplicated in the specific rules.  This threshold was derived using some trial and error.

In [59]:
THRESHOLD = 0.9999 

dfs = []

for cat in PROMPT_KEYS:
    print('--------')
    print(cat)
    tempDf = df[df['category'] == cat]
    text = tempDf['text']
    
    for i, rule in enumerate(rule_shortNames):
        dupRule = np.any(tempDf[rule] > THRESHOLD)

        if not dupRule:
            newRule = pd.DataFrame([[cat, GENERAL_RULES[i]]], columns=['category', 'text_orig'])
            tempDf = pd.concat([tempDf,newRule])
        else:
            print('found dup:', rule)
            
    dfs.append(tempDf[['category','text_orig']])

finalDf = pd.concat(dfs)

--------
sick
found dup: social_distance
found dup: wash_hands
found dup: cover_coughs
found dup: disinfect_surfaces
found dup: monitor_health
found dup: vaccine
--------
older_adult
found dup: cover_coughs
found dup: monitor_health
found dup: vaccine
--------
asthma
found dup: wash_hands
found dup: disinfect_surfaces
--------
covid_with_newborn
found dup: wear_mask
found dup: wash_hands
found dup: cover_coughs
found dup: disinfect_surfaces
found dup: monitor_health
found dup: vaccine


### format final output per HTM spec and save to csv

In [60]:
finalDf.columns = ['situation', 'rules']

finalDf

Unnamed: 0,situation,rules
1131,sick,Keep track of your symptoms.
1141,sick,Take care of yourself.
1142,sick,Get rest and stay hydrated. Take over-the-coun...
1154,sick,Tell your close contacts that they may have be...
1158,sick,See COVID-19 and Animals if you have questions...
...,...,...
2312,covid_with_newborn,Do not cover your baby’s head or allow your ba...
2313,covid_with_newborn,Do not smoke or allow anyone to smoke around y...
0,covid_with_newborn,Stay 6 feet from others
0,covid_with_newborn,Avoid crowds


In [61]:
fileName = './submission/Challenge2_submission.csv'

finalDf.to_csv(fileName, index=False)

In [63]:
## Show final number of rules per situation

In [64]:
"""
check counts by category
"""
print(finalDf.groupby('situation').count())

                    rules
situation                
asthma                 10
covid_with_newborn     41
older_adult            28
sick                   19
