# Challenge 2: Dangerous Situtations


In [195]:
## Import python packages
import pandas as pd
import numpy as np
import math
from collections import defaultdict

from sklearn.feature_extraction.text import  TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from gensim import corpora
from gensim import models
from gensim import similarities

In [196]:
# get a list of english stop words
en_stops = set(stopwords.words('english'))

## Setup: initialize some constants

In [197]:

GENERAL_RULES = ['Wear a mask',
    'Stay 6 feet from others',
    'Avoid crowds',
    'Avoid poorly ventilated spaces',
    'Wash your hands often',
    'Cover coughs and sneezes',
    'Clean and disinfect frequently touched surfaces daily',
    'Monitor your health daily',
    'Get vaccinated']

rule_shortNames = ['wear_mask',
    'social_distance',
    'avoid_crowds',
    'poor_ventilation',
    'wash_hands',
    'cover_coughs',
    'disinfect_surfaces',
    'monitor_health',
    'vaccine']

PROMPTS = ["If you are sick with COVID-19",
           "If you are an older adult",
           "If you have asthma",
           "If you are at home caring for a newborn and are diagnosed with or test positive for COVID-19"]

PROMPT_KEYS = ['sick', 'older_adult', 'asthma', 'covid_with_newborn']

## Parse the data
Parse the CDC guidelines text file into a dataframe with two columns.  One collumn is the title headers (start with ***) and second column are the sentences that fall under that header
(header index helps id the same header)

In [198]:

def textToDataFrame(text, delimiter):
    """ 
    Takes in input text, splits by delimiter, returns as pandas dataframe with headings as keys e.g. [index, heade, text]
    """
    textArray = text.split(delimiter)
    df = pd.DataFrame(columns=["headerIndex","header", "text"])
    headerIndex = 0
    for line in textArray:
        if len(line) > 0:
            # print(headerIndex)
            # finds the first line in the section and uses that as the heading
            firstNewlineIndex = line.find("\n")
            header = line[0:firstNewlineIndex]
            # print(header)
            # puts the remaining text into dataframe
            df2 = pd.DataFrame({'headerIndex':headerIndex, 'header': header, 'text':(line[firstNewlineIndex + 1:]).replace("\xa0", " ").split("\n")})
            # combines new dataframe with the return dataframe
            df = df.append(df2, ignore_index=True)
            headerIndex += 1
    return df

filename = './data/CDCGuidelines.txt'

with open(filename, encoding="utf8") as myFile:
    data = myFile.read()

df = textToDataFrame(data, "***")
df.sample(10)


Unnamed: 0,headerIndex,header,text
4167,63,Travel During COVID-19,"If your test is positive, isolate yourself to ..."
3807,59,Playing Sports,Bring your own water to minimize use and touch...
3419,53,Tips for Voters to Reduce Spread of COVID-19,Plan how you can get to your place to vote in ...
4768,70,Ship Crew Well-Being During COVID-19,Use gloves when:
1145,25,What to Do If You Are Sick,Call before you get medical care. Be sure to g...
3768,59,Playing Sports,Know your teammates
1126,24,Living in Shared Housing,
1995,36,People with Disabilities,Prepare
1326,28,Caring for Someone Sick at Home,Provide support
5331,79,Guidance to Reduce the Risk of SARS-CoV-2 Spre...,ENGINEERING CONTROLS\t


In [199]:
headers = df['header'].unique()
#just for nice output in notebook
pd.DataFrame(headers)

Unnamed: 0,0
0,Things to know about the COVID-19 Pandemic
1,Symptoms of Coronavirus
2,COVID-19 Testing Overview
3,Test for Current Infection (Viral Test)
4,Test for Past Infection
...,...
76,COVID-19 And Animals
77,If You Have Pets
78,What to Do if Your Pet Tests Positive for the ...
79,Guidance to Reduce the Risk of SARS-CoV-2 Spre...


## Find best matching titles to the prompts
Next step is to see if we can automatically sort through all the titles 
and find ones that are closely related to one of our categories
to do this, we need to do some preprocessing on the titles to:
 - remove stop words, such as: the, of, to, etc.
 - remove punctuation (i.e., commas, periods, colons, parens)
 - lower case everything
 - remove 's' at end of words to reduce plurals
 

In [200]:
def removeStopWords(text):
    returnText = ''
    for w in text.split(' '):
        if w not in en_stops and len(w)>0:
            returnText += ' ' + w
    returnText = returnText.strip()
    # print(returnText)
    return returnText

# remove words that appear only once
frequency = defaultdict(int)
combined = list(df['text']) + list(df['header']) + GENERAL_RULES

for text in combined:
    for token in text.split(' '):
        frequency[token] += 1
        
def removeSingleOccurances(text):
    returnText = ''
    for w in text.split(' '):
        if frequency[token] > 1:
            returnText += ' ' + w
    returnText = returnText.strip()
    # print(returnText)
    return returnText

def removePlurals(text):
    returnText = ''
    for w in text.split(' '):
        if w.endswith('s'):
            w = w[:-1]
        returnText += ' ' + w
    returnText = returnText.strip()
    return returnText
    
def nlpCleanup(df, columnName):
    df[columnName] = df[columnName].str.replace('\d+', '',regex=True) # for digits
    df[columnName] = df[columnName].str.replace(r'(\b\w{1,2}\b)', '',regex=True) # for word length lt 2
    df[columnName] = df[columnName].str.replace('[^\w\s]', '',regex=True) # for punctuation 
    df[columnName] = df[columnName].apply(removeStopWords)
    df[columnName] = df[columnName].apply(removeSingleOccurances)
    df[columnName] = df[columnName].apply(removePlurals)
    df[columnName] = df[columnName].str.lower()
    df[columnName] = df[columnName].str.strip()
    return df

# first make copy of unaltered text, this will be needed later
df['header_orig'] = df['header']
df['text_orig'] = df['text']

df = nlpCleanup(df, columnName='header')
df = nlpCleanup(df, columnName='text')

df.sample(10)


Unnamed: 0,headerIndex,header,text,header_orig,text_orig
4329,65,know when not travel avoid spreading covid,when postpone travel,Know When Not to Travel to Avoid Spreading COV...,When to Postpone your Travel
1344,28,caring someone sick home,,Caring for Someone Sick at Home,
256,6,contact tracing,test negative symptom continue stay away other...,Contact Tracing,If your test is negative and you do not have s...
5172,77,you have pet,united state evidence animal playing significa...,If You Have Pets,"In the United States, there is no evidence tha..."
3153,51,small gathering,wash dishe dishwasher hot soapy water immediat...,Small Gatherings,Wash dishes in the dishwasher or with hot soap...
1372,29,sick parent caregiver,child need stay home parent caregiver sick,Sick Parents and Caregivers,If a child needs to stay in the home with you ...
3502,54,holiday tip,keep spare mask case mask become wet moisture ...,Holiday Tips,Keep a spare mask in case your mask becomes we...
556,14,improve how your mask protect you,,Improve How Your Mask Protects You,
2808,49,protect yourself when using transportation,wear mask,Protect Yourself When Using Transportation,Wear Masks.
700,18,cleaning disinfecting your home,clean launder item according manufacturer inst...,Cleaning and Disinfecting Your Home,Clean or launder items according to the manufa...


#### Similarly process the general rules and prompts

In [201]:
genRulesDf = pd.DataFrame(GENERAL_RULES, columns=['rule'])

genRulesDf = nlpCleanup(genRulesDf, columnName='rule')

genRulesDf.head()

Unnamed: 0,rule
0,wear mask
1,stay feet other
2,avoid crowd
3,avoid poorly ventilated space
4,wash hand often


In [202]:
promptDf = pd.DataFrame(PROMPTS, columns=['prompt'])
promptDf = nlpCleanup(promptDf, columnName='prompt')

promptDf.head()

Unnamed: 0,prompt
0,sick covid
1,older adult
2,asthma
3,home caring newborn diagnosed test positive covid


## Now vectorize the headers and prompts using TFIDF

learn the bag of words using the titles, and apply the same vectorizer to prompts

In [203]:
headers = list(pd.Series(df['header'].unique()))

vectorizer = TfidfVectorizer(
    analyzer = 'word',
    # ngram_range=(2,2),
    lowercase = True,
    strip_accents='unicode',
    stop_words='english'
)

headerVects = vectorizer.fit_transform(headers).toarray()
vectorizer.get_feature_names()

['activitie',
 'additional',
 'adult',
 'air',
 'animal',
 'answer',
 'associated',
 'asthma',
 'attending',
 'avoid',
 'beache',
 'behavioral',
 'breastfeeding',
 'care',
 'caregiver',
 'caring',
 'cause',
 'cdc',
 'certain',
 'checklist',
 'children',
 'cleaning',
 'close',
 'communitie',
 'community',
 'condition',
 'contact',
 'coronaviru',
 'covid',
 'crew',
 'cruise',
 'current',
 'deciding',
 'dementia',
 'developmental',
 'disabilitie',
 'disease',
 'disinfecting',
 'disorder',
 'distancing',
 'doctor',
 'drug',
 'effect',
 'errand',
 'essential',
 'event',
 'experiencing',
 'facilitie',
 'facility',
 'food',
 'gathering',
 'getting',
 'glove',
 'guidance',
 'guide',
 'hand',
 'handler',
 'healthy',
 'help',
 'helping',
 'hiring',
 'holiday',
 'home',
 'homelessnes',
 'household',
 'housing',
 'improve',
 'improving',
 'indoor',
 'infection',
 'inflammatory',
 'international',
 'isolate',
 'know',
 'large',
 'learn',
 'likely',
 'live',
 'living',
 'longterm',
 'mask',
 'medica

In [204]:
promptList = list(promptDf['prompt'].values)

promptVects = vectorizer.transform(promptList).toarray()

## Now measure similarity
Using cosine similarities between the tfidf vectors
Return the best matching header for each prompt

In [205]:
def cosineSimilarity(vector1, vector2):
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude
    

sims = []
for j, h in enumerate(headers):
    d = []
    for i, p in enumerate(promptList):
        a = headerVects[j,:]
        b = promptVects[i,:]
        # print(a)
        # print(b)
        cs = cosineSimilarity(a, b)
        d.append(cs)
        # print(cs)
        # break
    sims.append(d)
        
sims = np.array(sims)

bestMatches = np.argmax(sims, axis=0)

relevantHeaders = []
for m in bestMatches:
    relevantHeaders.append(headers[m])

In [206]:
# set the category for each row in df that matches the relevant 
# header found above
df['category'] = ''

for i, h in enumerate(relevantHeaders):
    df['category'][df['header'] == h] = PROMPT_KEYS[i]


### Next step is to limit to rows that match one of the 4 categories

In [207]:
df = df[np.logical_not(df['category']=='')]

headerDf = df[['headerIndex', 'header', 'category']]
headerDf = headerDf.drop_duplicates()

# print out those rows
headerDf

Unnamed: 0,headerIndex,header,category
1127,25,what you are sick,sick
1533,32,older adult,older_adult
2138,39,pregnancy breastfeeding caring newborn,covid_with_newborn
2396,42,people moderate severe asthma,asthma


## Find actionable instructions
Next we want to limit these texts to just ones that start with a verb
Using the part-of-speech tagging capability of nltk

In [209]:
def startsWithVerb(s):
    # returns if the first word in the sentence is a verb = this is a 
    # shortcut way to check for actionable instructions
    ret = False
    if len(s)>0:
        s=s.lower()
        tag_pos_string = pos_tag(word_tokenize(s))
        firstWordPartOfSpeech = tag_pos_string[0][1]
        ret = firstWordPartOfSpeech in ('VB', 'VBP')
    
    return ret

df['text_orig'] = df['text_orig'].str.replace('@', '', regex=False)
df['text_orig'] = df['text_orig'].str.replace('*', '', regex=False)

df['actionable'] = df['text_orig'].apply(startsWithVerb)

df.sample(25)

Unnamed: 0,headerIndex,header,text,header_orig,text_orig,category,actionable
1548,32,older adult,may even die,Older Adults,they may even die.,older_adult,False
2214,39,pregnancy breastfeeding caring newborn,,"Pregnancy, Breastfeeding, and Caring for Newborns",,covid_with_newborn,False
2402,42,people moderate severe asthma,prepare covid,People with Moderate to Severe Asthma,Prepare for COVID-19,asthma,False
1540,32,older adult,the risk severe illnes covid increase age olde...,Older Adults,The risk for severe illness with COVID-19 incr...,older_adult,False
1571,32,older adult,healthcare provider contact nearest community ...,Older Adults,"If you don’t have a healthcare provider, conta...",older_adult,False
1182,25,what you are sick,,What to Do If You Are Sick,,sick,False
1145,25,what you are sick,call get medical care sure get care trouble br...,What to Do If You Are Sick,Call before you get medical care. Be sure to g...,sick,False
1597,32,older adult,learn additional information adult disabilitie,Older Adults,Learn additional information for adults with d...,older_adult,False
2226,39,pregnancy breastfeeding caring newborn,care newborn follow hand washing mask recommen...,"Pregnancy, Breastfeeding, and Caring for Newborns","If they have to care for the newborn, they sho...",covid_with_newborn,False
2165,39,pregnancy breastfeeding caring newborn,talk healthcare provider stay healthy take car...,"Pregnancy, Breastfeeding, and Caring for Newborns",Talk to your healthcare provider about how to ...,covid_with_newborn,False


In [210]:
# limit to those that start with verb to give actionable instruction
df = df[df['actionable']==True]
df.sample(10)

Unnamed: 0,headerIndex,header,text,header_orig,text_orig,category,actionable
1158,25,what you are sick,see covid animal question pet,What to Do If You Are Sick,See COVID-19 and Animals if you have questions...,sick,True
2302,39,pregnancy breastfeeding caring newborn,ensure safe sleep baby,"Pregnancy, Breastfeeding, and Caring for Newborns",Ensure safe sleep for your baby,covid_with_newborn,True
1175,25,what you are sick,call call ahead local emergency facility notif...,What to Do If You Are Sick,Call 911 or call ahead to your local emergency...,sick,True
2168,39,pregnancy breastfeeding caring newborn,get recommended vaccine getting recommended va...,"Pregnancy, Breastfeeding, and Caring for Newborns",Get recommended vaccines. Getting the recommen...,covid_with_newborn,True
2297,39,pregnancy breastfeeding caring newborn,check baby growth feeding,"Pregnancy, Breastfeeding, and Caring for Newborns",Check your baby’s growth and feeding.,covid_with_newborn,True
2151,39,pregnancy breastfeeding caring newborn,take step prevent getting covid interact other,"Pregnancy, Breastfeeding, and Caring for Newborns",Take steps to prevent getting COVID-19 when yo...,covid_with_newborn,True
2186,39,pregnancy breastfeeding caring newborn,decide newborn rooming hospital,"Pregnancy, Breastfeeding, and Caring for Newborns",Decide if your newborn is rooming-in with you ...,covid_with_newborn,True
1558,32,older adult,change treatment plan without talking healthca...,Older Adults,Do not change your treatment plan without talk...,older_adult,True
1131,25,what you are sick,keep track symptom,What to Do If You Are Sick,Keep track of your symptoms.,sick,True
1707,32,older adult,get tip staying connected home english spanish...,Older Adults,Get tips on staying connected while at home in...,older_adult,True


In [211]:
# this shows the number of instructions by category
# note that we cannot have more than 20 instructions per category
# TODO: we have too many instructions for each category right now...
# this is probably because there are some duplicated instructions within each category
# need to do a self-similarity comparison to remove dups
df.groupby('category').count()

Unnamed: 0_level_0,headerIndex,header,text,header_orig,text_orig,actionable
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
asthma,3,3,3,3,3,3
covid_with_newborn,38,38,38,38,38,38
older_adult,22,22,22,22,22,22
sick,16,16,16,16,16,16


## Now compare general to specific rules
Next we want to compare the general rules against the rules that passed all 
previous processing steps and find the union.  Using the LSI model to find phrase 
similarity.  When a phrase is similar enough drop that general rule, 
else add that general rule for that category.
We'll also use phrase similarity to remove duplicate instructions within each category

Experimented with num_topics hyper parameter a bit.  Tried 2, 5, and 10.  10 provided reasonable results based on human interpretation of similarity.


In [212]:
# this creates a dictionary and bag of words encoding based on all text
headers = list(pd.Series(df['header'].unique()).str.split())
texts = list(df['text'].str.split()) 
generalRules = list(genRulesDf['rule'].str.split())
allWords = texts + headers + generalRules
dictionary = corpora.Dictionary(allWords)
# create corpus based on text (not headers)
corpus = [dictionary.doc2bow(text) for text in texts]
# create lsi model using gensim
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
index = similarities.MatrixSimilarity(lsi[corpus])

## TODO 
Next we need to remove duplicate/highly overlapping instructions within this list
For example, "Keep track of your symptoms." and "Monitor your symptoms." are 
very similar instructions given for sick people.  
We need to remove the dups within each category.

### Add a column to show similarity of each row to the general rules

In [213]:
for i, gen in enumerate(genRulesDf.values):
    gen = gen[0].split()
    vec_bow = dictionary.doc2bow(gen)
    vec_lsi = lsi[vec_bow]
    sims = index[vec_lsi]
    df[rule_shortNames[i]] = sims

df

Unnamed: 0,headerIndex,header,text,header_orig,text_orig,category,actionable,wear_mask,social_distance,avoid_crowds,poor_ventilation,wash_hands,cover_coughs,disinfect_surfaces,monitor_health,vaccine
1131,25,what you are sick,keep track symptom,What to Do If You Are Sick,Keep track of your symptoms.,sick,True,-0.186307,0.362461,0.0,0.0,0.010062,0.326505,-0.014259,0.244942,0.183463
1141,25,what you are sick,take care,What to Do If You Are Sick,Take care of yourself.,sick,True,-0.010576,-0.031643,0.0,0.0,-0.007789,-0.329172,-0.083289,0.287246,-0.069353
1142,25,what you are sick,get rest stay hydrated take overthecounter med...,What to Do If You Are Sick,Get rest and stay hydrated. Take over-the-coun...,sick,True,-0.156679,0.792615,0.0,0.0,-0.020874,0.457620,-0.187422,-0.080773,0.790896
1154,25,what you are sick,tell close contact may exposed covid infected ...,What to Do If You Are Sick,Tell your close contacts that they may have be...,sick,True,0.053896,0.104484,0.0,0.0,0.005471,0.177780,-0.083671,-0.014206,0.069781
1158,25,what you are sick,see covid animal question pet,What to Do If You Are Sick,See COVID-19 and Animals if you have questions...,sick,True,0.205234,0.215658,0.0,0.0,0.011471,0.141091,-0.046450,-0.106139,0.036375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2312,39,pregnancy breastfeeding caring newborn,cover baby head allow baby get hot sign baby m...,"Pregnancy, Breastfeeding, and Caring for Newborns",Do not cover your baby’s head or allow your ba...,covid_with_newborn,True,0.065271,0.419098,0.0,0.0,-0.010373,0.815870,0.301226,-0.068340,0.640503
2313,39,pregnancy breastfeeding caring newborn,smoke allow anyone smoke around baby,"Pregnancy, Breastfeeding, and Caring for Newborns",Do not smoke or allow anyone to smoke around y...,covid_with_newborn,True,0.232250,0.224551,0.0,0.0,-0.016073,0.635955,0.348191,-0.064881,0.331330
2403,42,people moderate severe asthma,make sure least day supply medicine,People with Moderate to Severe Asthma,Make sure that you have at least a 30-day supp...,asthma,True,0.003816,0.298352,0.0,0.0,0.172642,-0.163970,-0.152979,-0.084192,-0.105751
2404,42,people moderate severe asthma,take everyday precaution like washing hand avo...,People with Moderate to Severe Asthma,Take everyday precautions like washing your ha...,asthma,True,0.052426,0.492487,0.0,0.0,0.706026,0.098933,0.422527,-0.150464,0.165037


## Find overlapping/very similar phrases
Using a threshold to find where the general rules are duplicated in the specific rules.  This threshold was derived using some trial and error.

In [188]:
THRESHOLD = 0.99

dfs = []

for cat in PROMPT_KEYS:
    print('--------')
    print(cat)
    tempDf = df[df['category'] == cat]
    text = tempDf['text'].values
    
    for i, rule in enumerate(rule_shortNames):
        dupRule = np.any(tempDf[rule] > THRESHOLD)

        if not dupRule:
            newRule = pd.DataFrame([[cat, GENERAL_RULES[i]]], columns=['category', 'text_orig'])
            tempDf = pd.concat([tempDf,newRule])
#             print('    ***')
#             print('    No dup found:', rule)
#             print('       ', list(tempDf['text'].values))
        else:
            print('    ***')
            print('    found dup:', rule)
            print('       ', list(tempDf['text'][tempDf[rule] > THRESHOLD].values))
            
    dfs.append(tempDf[['category','text_orig']])

finalDf = pd.concat(dfs)

--------
sick
    ***
    found dup: wash_hands
        ['clean hand often', 'wash hand']
    ***
    found dup: cover_coughs
        ['cover cough sneeze', 'cover mouth nose tissue cough sneeze']
--------
older_adult
--------
asthma
    ***
    found dup: wash_hands
        ['wash hand often soap water least second use hand sanitizer contain least alcohol']
--------
covid_with_newborn
    ***
    found dup: wash_hands
        ['wash hand soap water least second soap water available use hand sanitizer least alcohol', 'wash hand soap water least second holding caring newborn soap water available use hand sanitizer least alcohol', 'wash hand soap water least second touching newborn soap water available use hand sanitizer least alcohol', 'wash hand breastfeeding']


### format final output per HTM spec and save to csv

In [214]:
finalDf.columns = ['situation', 'rules']

finalDf.sample(len(finalDf))

Unnamed: 0,situation,rules
2312,covid_with_newborn,Do not cover your baby’s head or allow your ba...
2279,covid_with_newborn,Keep distance between your baby and people who...
0,covid_with_newborn,Get vaccinated
2313,covid_with_newborn,Do not smoke or allow anyone to smoke around y...
1625,older_adult,Keep these items on hand and use them when ven...
...,...,...
1567,older_adult,Call your healthcare provider about underlying...
1154,sick,Tell your close contacts that they may have be...
1209,sick,"Do not share dishes, drinking glasses, cups, e..."
1703,older_adult,Remember the importance of staying physically ...


In [183]:
fileName = './submission/Challenge2_submission.csv'

finalDf.to_csv(fileName, index=False)

## Show final number of rules per situation

In [184]:
"""
check counts by category
"""
print(finalDf.groupby('situation').count())

                    rules
situation                
asthma                 11
covid_with_newborn     45
older_adult            29
sick                   23
