### Parser

Here is the parser for user input.

In [1]:
import numpy as np
import re
import sklearn as sk
import pickle

In [8]:
def parser(text, punctuation=False):
    """Split `text` into lower-cased tokens.

    Parameters
    ----------
    text : str
        Raw user input.
    punctuation : bool, default False
        When False, a token is a maximal run of word characters
        (letters, digits, underscore). When True, a token starts and ends
        on a word character but may contain any non-whitespace characters
        in between (e.g. "flu-shot", "it's").

    Returns
    -------
    numpy.ndarray
        1-D array of string tokens in order of appearance. The array is
        empty (but still string-typed) when `text` contains no token.
    """
    # Raw strings keep `\w` / `\S` as regex classes rather than (invalid)
    # string escapes. `\w` already matches "_", and `\S` is `[^\s]`.
    pattern = r'\w\S*\w|\w' if punctuation else r'\w+'
    # dtype=str keeps the empty result a string array instead of float64.
    return np.array(re.findall(pattern, text.lower()), dtype=str)

In [9]:
parser("sadadsddsa dsadsdas sadadsd")

array(['sadadsddsa', 'dsadsdas', 'sadadsd'], dtype='<U10')

In [10]:
parser("I need an influenza vaccine")

array(['i', 'need', 'an', 'influenza', 'vaccine'], dtype='<U9')

In [23]:
# Load the pre-built collection of flu-related terms into the notebook-global `flu_set`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open("../data/fluwords.pkl", 'rb') as pickle_file:
    flu_set = pickle.load(pickle_file)

In [24]:
flu_set

['fludshot',
 'influenza vzaccine',
 'slu',
 'olu vaccine',
 'influenzasvaccine',
 'flu vacczine',
 'flu vaccinea',
 'flu ohot',
 'nlu shot',
 'flfu shot',
 'knfluenza vaccine',
 'influenzia vaccine',
 'iyfluenza',
 'fblu vaccine',
 'inflvuenza',
 'flu vacciny',
 'flu shon',
 'influensa',
 'fluu shot',
 'influenza vacciwne',
 'flu tvaccine',
 'flushot',
 'influenza vacctine',
 'flu vaceine',
 'influenzb vaccine',
 'iofluenza vaccine',
 'fqlu',
 'flwu shot',
 'imnfluenza vaccine',
 'influenzag vaccine',
 'flu sjhot',
 'flur',
 'flu vaccdine',
 'influenzan',
 'flu maccine',
 'influenza accine',
 'influenze vaccine',
 'fl ushot',
 'flu wshot',
 'flu vaccline',
 'flu snot',
 'influenza vaccire',
 'flu spot',
 'influenza varccine',
 'flu sthot',
 'flut shot',
 'influenza mvaccine',
 'influenzai',
 'influenza vaccin',
 'influenzea',
 'influenpa',
 'influenzah',
 'infludnza vaccine',
 'iffluenza vaccine',
 'vlu vaccine',
 'flu shat',
 'influfenza',
 'influsenza vaccine',
 'influednza vaccine'

In [25]:
"flu shot" in flu_set

True

In [26]:
"flue" in flu_set

True

In [27]:
# NOTE(review): flu_related is defined in a later cell (In [85]); on a fresh
# top-to-bottom run this call would fail unless that cell is executed first.
flu_related("influenza")

1

In [85]:
def flu_related(text):
    """Score how flu-related `text` is.

    Every token produced by `parser` contributes 1 when it is exactly
    "vaccine", otherwise 1.5 when it is a member of `flu_set`, otherwise
    nothing. The contributions are summed and returned.
    """
    def weight(token):
        # "vaccine" is checked first, so it scores 1 even if it is also in flu_set.
        if token == "vaccine":
            return 1
        return 1.5 if token in flu_set else 0

    return sum(weight(token) for token in parser(text))

In [82]:
flu_related("I have to get a flue shot")

3.0

In [83]:
flu_related("I have to get a flu shot influenza flu")

6.0

In [70]:
# Seed vocabulary of general health terms: general_disease() counts tokens
# found in this set, and edits1() is applied to each entry to build term_set.
general_terms = {"pain", "injury", "accident",
                 "stress", "surgery", "depression",
                 "anxiety", "cold", "migraine",
                 "hospital", "bleeding", "vaccine",
                 "medicine"}

In [34]:
def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase ASCII letter, or
    inserting a lowercase ASCII letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every cut point, including the very end.
        for ch in alphabet:
            candidates.add(head + ch + tail)
        if not tail:
            continue
        # Delete the character right after the cut.
        candidates.add(head + tail[1:])
        # Replace the character right after the cut.
        for ch in alphabet:
            candidates.add(head + ch + tail[1:])
        # Swap the two characters right after the cut.
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates
def edits2(word):
    """Lazily yield every string reachable from `word` by two successive edits.

    The result is a generator and may contain duplicates; wrap it in
    `set(...)` to obtain the distinct candidates.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

In [71]:
# set([edits1(i) for i in general_terms])
# for i in general_terms:
#     print(edits1(i))
# Expand every general term into its one-edit misspellings (edits1 already
# returns a set), then pool them into a single lookup set.
term_list = [edits1(term) for term in general_terms]
term_set = set().union(*term_list)
# Use a context manager so the pickle file is flushed and closed deterministically.
with open("../data/generalwords.pkl", "wb") as pickle_file:
    pickle.dump(term_set, pickle_file)
# {{}.update(term) for term in edits1(i) for i in general_terms}
# list(set().union(*d))

In [78]:
# Seed words for the flu vocabulary; each one is expanded with edits1() to build flu_set.
flu_terms = {"flu", "shot", "vaccine", "influenza", "fluenza"}

In [80]:
# Expand each flu seed word into its one-edit misspellings (edits1 already
# returns a set), then pool them into a single lookup set.
flu_list = [edits1(term) for term in flu_terms]
flu_set = set().union(*flu_list)
# Persist flu_set itself: the original dumped term_set (the general-terms
# vocabulary) into fluwords.pkl, a copy-paste slip. The context manager also
# guarantees the file is flushed and closed.
# NOTE(review): this writes to the working directory, not ../data/fluwords.pkl.
with open("fluwords.pkl", "wb") as pickle_file:
    pickle.dump(flu_set, pickle_file)

In [39]:
edits2("haha")

<generator object edits2.<locals>.<genexpr> at 0x117358a40>

In [72]:
# Persist the seed general terms; the context manager closes the file
# instead of leaving the handle to the garbage collector.
with open("terms.pkl", "wb") as pickle_file:
    pickle.dump(general_terms, pickle_file)

In [76]:
def general_disease(text):
    """Count how many tokens of `text` appear in `general_terms`.

    Tokens come from `parser`; repeated occurrences are counted each time.
    Returns an integer (0 when nothing matches).
    """
    return sum(1 for token in parser(text) if token in general_terms)

In [88]:
# Smoke test: print the (general, flu) score pair for a handful of sample
# sentences so the two scorers can be compared side by side.
x_test = ["I need a vaccine", "I need a flu shot",
          "I have to go to hospital", "I have to go to hospital for a flu shot",
          "I need medicine. My back is killing me", "I just saw a car crash",
          "I need coffee"]
for i in x_test:
    print(general_disease(i), flu_related(i))

1 1
0 3.0
1 0
1 3.0
1 0
0 0
0 0


### Compare Results

Using the parser, the word sets, and the semi-ML scoring functions above, return the class the text belongs to.

In [None]:
def classify(text):
    """Label `text` as "general", "flu" or "none" from its two scores.

    "general" wins only when the general score is strictly higher than the
    flu score; when both scores are zero the text is "none"; every other
    case (including non-zero ties) is "flu".
    """
    gen_score = general_disease(text)
    flu_score = flu_related(text)
    if gen_score > flu_score:
        return "general"
    if gen_score != 0 or flu_score != 0:
        return "flu"
    return "none"

In [102]:
classify("i have the in")

'flu'

In [120]:
# Seed terms grouped by how specific they are to flu.
# NOTE(review): presumably these are the seeds behind the *_set pickles
# loaded in the next section — confirm against the script that builds them.

# Terms treated as flu-specific.
flu_only = {"sneezing", "nasal", "congestion", "flu", "shot", "influenza", "fluenza"}


# Terms shared between flu and general complaints.
flu_general = {"fever", "headache", "cough", "fatigue", "throat",
               "sore", "mucus", "vaccine", "cold", "migraine"}


# Terms treated as general (non-flu) complaints.
general = {"stool", "cramps", "abdominal", "pain", "blood", "nausea", "diarrhea",
           "bloating", "injury", "accident", "stress", "surgery", "depression", "bleeding", "medicine", "hospital" }


### 2nd Try That Actually Works

In [121]:
# Load the three pre-built word collections used by the second-attempt scorers.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open("../data/fluonlywords.pkl", 'rb') as pickle_file:
    flu_only_set = pickle.load(pickle_file)
with open("../data/flugeneralwords.pkl", 'rb') as pickle_file:
    flu_general_set = pickle.load(pickle_file)
with open("../data/generalwords.pkl", 'rb') as pickle_file:
    general_set = pickle.load(pickle_file)

In [124]:
def flu_related_new(text):
    """Score how flu-related `text` is using the second-attempt word sets.

    Each token from `parser` adds 1 when it is in `flu_general_set`
    (terms shared with general complaints), otherwise 1.5 when it is in
    `flu_only_set` (flu-specific terms), otherwise nothing.

    Returns the summed score (0 when nothing matches).
    """
    word_count = 0
    for w in parser(text):
        if w in flu_general_set:
            word_count += 1
        # Use the flu-only vocabulary loaded for this section. The original
        # read the stale `flu_set` global from the first attempt, leaving
        # `flu_only_set` unused and breaking the symmetry with
        # general_disease_new.
        elif w in flu_only_set:
            word_count += 1.5
    return word_count

def general_disease_new(text):
    """Score how strongly `text` points at a general (non-flu) complaint.

    Each token from `parser` adds 1 when it is in `flu_general_set`,
    otherwise 1.5 when it is in `general_set`, otherwise nothing.
    Returns the summed score (0 when nothing matches).
    """
    def weight(token):
        # The shared flu/general vocabulary is checked first and wins on overlap.
        if token in flu_general_set:
            return 1
        return 1.5 if token in general_set else 0

    return sum(weight(token) for token in parser(text))

# gen/flu>0.45
# gen/flu<0.05


def classify(text):
    """Label `text` as "none", "flu" or "general" using the second-attempt scorers.

    Returns "none" when both scores are zero, "flu" when the flu score is
    at least the general score, and "general" otherwise.
    """
    gen_score = general_disease_new(text)
    flu_score = flu_related_new(text)
    if gen_score == 0 and flu_score == 0:
        return "none"
    return "flu" if flu_score >= gen_score else "general"

In [127]:
[classify("I have a flu"), classify("I have a fluenza"),
classify("I think my headache is killing me. There needs to be more hospital around here"), 
classify("There is a pain on my arm. I think I broke it.")]

['flu', 'flu', 'general', 'general']

In [128]:
classify("Where is the closest flu shot")

'flu'

In [129]:
classify("I'm running late. Where is the nearest cvs?")

'none'