### NLP Implementation for Flu Classifier

Here is the improved classifier with Natural Language Processing (nltk library). 
- Parsing is done using built-in functions.
- SnowballStemmer is picked for the stemming tool.

In [19]:
import pickle
import re

In [4]:
import nltk
from nltk.stem.porter import *

porterStemmer = PorterStemmer()

sentence = "Provision Maximum multiply owed caring on go gone going was this"
wordList = nltk.word_tokenize(sentence)

stemWords = [porterStemmer.stem(word) for word in wordList]

print(' '.join(stemWords))

provis maximum multipli owe care on go gone go wa thi


In [7]:
sentence = "I have a fever runny-nose and depression. Not that my headache is that bad."
wordList = nltk.word_tokenize(sentence)

stemWords = [porterStemmer.stem(word) for word in wordList]

print(' '.join(stemWords))

I have a fever runny-nos and depress . not that my headach is that bad .


In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
import nltk
from nltk.stem.snowball import SnowballStemmer

snowBallStemmer = SnowballStemmer("english")

sentence = "Provision Maximum multiply owed caring on go gone going was this"
wordList = nltk.word_tokenize(sentence)

stemWords = [snowBallStemmer.stem(word) for word in wordList]

print(' '.join(stemWords))

provis maximum multipli owe care on go gone go was this


In [6]:
sentence = "I have a fever runny-nose and depression. Not that my headache is that bad."
wordList = nltk.word_tokenize(sentence)
stemWords = [snowBallStemmer.stem(word) for word in wordList]

print(' '.join(stemWords))

i have a fever runny-nos and depress . not that my headach is that bad .


*Conclusion:* Both of the stemming tools are working fine. We'll just stick with SnowBall since it was initially advised as a better tool. 

In [10]:
keywords = {"sneezing", "nasal", "congestion", "flu", "shot", "influenza", "fluenza",
            "fever", "headache", "cough", "fatigue", "throat",
            "sore", "mucus", "vaccine", "cold", "migraine",
            "stool", "cramps", "abdominal", "pain", "blood", "nausea", "diarrhea",
            "bloating", "injury", "accident", "stress", "surgery", "depression",
            "bleeding", "medicine", "hospital"}
stem_keywords = [snowBallStemmer.stem(word) for word in keywords]
print(' '.join(stem_keywords))

pain blood fever hospit surgeri cramp accid injuri shot vaccin stress flu bleed headach nasal bloat diarrhea migrain sneez nausea stool congest depress throat cold fatigu mucus sore influenza fluenza cough medicin abdomin


In [11]:
# Actual keywords
flu_only = {"sneezing", "nasal", "congestion", "flu", "shot", "influenza", "fluenza"}
flu_general = {"fever", "headache", "cough", "fatigue", "throat",
               "sore", "mucus", "vaccine", "cold", "migraine"}
general = {"stool", "cramps", "abdominal", "pain", "blood", "nausea", "diarrhea",
           "bloating", "injury", "accident", "stress", "surgery", "depression", "bleeding", "medicine", "hospital"}

In [12]:
stem_flu_only = [snowBallStemmer.stem(word) for word in flu_only]
stem_flu_general = [snowBallStemmer.stem(word) for word in flu_general]
stem_general = [snowBallStemmer.stem(word) for word in general]

In [14]:
stem_flu_only, stem_flu_general, stem_general

(['influenza', 'fluenza', 'flu', 'sneez', 'congest', 'nasal', 'shot'],
 ['mucus',
  'fever',
  'sore',
  'cough',
  'migrain',
  'headach',
  'vaccin',
  'throat',
  'cold',
  'fatigu'],
 ['pain',
  'blood',
  'hospit',
  'depress',
  'stress',
  'accid',
  'bleed',
  'surgeri',
  'diarrhea',
  'medicin',
  'cramp',
  'nausea',
  'stool',
  'injuri',
  'abdomin',
  'bloat'])

In [15]:
def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [20]:
flu_only_list = [set().union(edits1(i)) for i in stem_flu_only]
flu_only_set =  set().union(*flu_only_list)
# flu_only_set
pickle.dump(flu_only_set, open("../data/nlp_fluonlywords.pkl", "wb"))

In [21]:
flu_general_list = [set().union(edits1(i)) for i in stem_flu_general]
flu_general_set =  set().union(*flu_general_list)
# flu_general_set
pickle.dump(flu_general_set, open("../data/nlp_flugeneralwords.pkl", "wb"))

In [22]:
general_list = [set().union(edits1(i)) for i in stem_general]
general_set =  set().union(*general_list)
# general_set
pickle.dump(general_set, open("../data/nlp_generalwords.pkl", "wb"))

In [25]:
# Use nltk tokenizer. 

def flu_related_new(text):
    words = nltk.word_tokenize(text)
    word_count = 0
    for w in words:
        if w in flu_general_set:
            word_count+=1
        elif w in flu_only_set:
            word_count+=1.5
    return word_count

def general_disease_new(text):
    words = nltk.word_tokenize(text)
    word_count = 0
    for w in words:
        if w in flu_general_set:
            word_count+=1
        elif w in general_set:
            word_count+=1.5
    return word_count

# gen/flu>0.45
# gen/flu<0.05


def nlp_classify(text):
    gen, flu = general_disease_new(text), flu_related_new(text)
    if gen == 0 and flu ==0:
        return "neither"
    elif flu >= gen:
        return "flu"
    else:
        return "general"

In [27]:
nlp_classify("I have flu shot next to my apartment where the hospital just crashed")

'flu'

In [28]:
nlp_classify("I am in pcikle, please help me")

'neither'