# Context Based Sentence Classification

### This program classifies sentences by context, predicting whether each sentence was more likely spoken by a "patient" or by a "doctor".


<strong>Run the cell below to import all the required modules and define the helper functions.</strong>

In [1]:
# Importing all the required modules and the helper functions 

import logging
import pickle
import re
import urllib.request

import numpy as np
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from nltk import sent_tokenize
from nltk import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# the following three functions are helpers that build features from the sentences and run predictions
def get_feature(text,feature_dimension,wordset,model, label = None):
    """Turn raw text samples into one averaged word-vector row per sentence.

    Each sample is lower-cased and split into sentences. A sentence is
    represented by the mean of the vectors of its alphanumeric, in-vocabulary
    words. Sentences without any such word are skipped, so the number of
    returned rows can be smaller than the number of sentences.

    Parameters
    ----------
    text : iterable of str
        Samples (lines or paragraphs) to convert.
    feature_dimension : int
        Length of a single word vector.
    wordset : container of str
        Vocabulary; only words found in it contribute to a sentence vector.
    model : object
        Source of the word vectors: either an object exposing them through a
        ``wv`` attribute (a trained Word2Vec model) or any ``word -> vector``
        mapping.
    label : number, optional
        When given, it is appended to every row as an extra last column.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(n_kept_sentences, feature_dimension)`` (one extra
        column when ``label`` is given), or ``None`` when no sentence
        produced a feature vector.
    """
    # Index the keyed vectors when the model exposes them (gensim deprecated
    # indexing the full model with `model[word]`); otherwise index the object
    # itself so plain mappings keep working.
    vectors = getattr(model, "wv", model)

    # Rows are collected in a list and stacked once at the end; concatenating
    # onto a growing array inside the loop copies it on every sentence.
    rows = []

    for sample in text:
        for sentence in sent_tokenize(sample.lower()):
            known_words = [word for word in word_tokenize(sentence)
                           if word in wordset and word.isalnum()]

            # nothing to average: the sentence contributes no row
            if not known_words:
                continue

            feature_vector = np.zeros(feature_dimension)
            for word in known_words:
                feature_vector = feature_vector + vectors[word]
            feature_vector = feature_vector / float(len(known_words))

            if label is not None:
                feature_vector = np.append(feature_vector, label)

            rows.append(feature_vector)

    if not rows:
        return None

    return np.vstack(rows)


def generate_features(feature_dimension,wordset,model):
    """Build the labelled feature matrix from "patient.txt" and "doctor.txt".

    Both files are read from the current working directory and converted
    with ``get_feature``; patient sentences get label 0 and doctor sentences
    label 1, stored as the last column of every row.

    Parameters
    ----------
    feature_dimension : int
        Length of a single word vector.
    wordset : container of str
        Vocabulary passed through to ``get_feature``.
    model : object
        Word-vector source passed through to ``get_feature``.

    Returns
    -------
    numpy.ndarray
        Patient rows followed by doctor rows.

    Raises
    ------
    ValueError
        If neither file yields a single feature vector.
    """
    # The `with` blocks close the files on exit; no explicit close() is needed.
    with open("patient.txt") as patfile:
        patient = patfile.readlines()

    with open("doctor.txt") as docfile:
        doctor = docfile.readlines()

    patient_features = get_feature(patient,feature_dimension,wordset,model,label=0)
    doctor_features = get_feature(doctor,feature_dimension,wordset,model,label=1)

    # get_feature returns None when a file yields no usable sentence; such a
    # side is left out instead of being handed to np.concatenate.
    parts = [part for part in (patient_features, doctor_features) if part is not None]
    if not parts:
        raise ValueError("No features could be generated from patient.txt and doctor.txt")

    return np.concatenate(parts)

def predict(clf, text,feature_dimension, wordset, model):
    """Classify each sentence of ``text`` and print "sentence : speaker" lines.

    Parameters
    ----------
    clf : object
        Fitted classifier exposing ``predict(features)``; a prediction of 0 is
        reported as "patient", anything else as "doctor".
    text : str
        One or more sentences to classify.
    feature_dimension : int
        Length of a single word vector.
    wordset : container of str
        Vocabulary passed through to ``get_feature``.
    model : object
        Word-vector source passed through to ``get_feature``.

    Returns
    -------
    numpy.ndarray
        Predictions for the sentences that could be turned into features, in
        sentence order; empty when no sentence could be featurised.
    """
    sentences = sent_tokenize(text.lower())

    features = get_feature([text],feature_dimension,wordset,model)

    # get_feature returns None when no sentence has an in-vocabulary word
    if features is None:
        pred = np.array([])
    else:
        pred = clf.predict(features)

    # get_feature skips sentences without an alphanumeric in-vocabulary word,
    # so `pred` can be shorter than `sentences`. The same filter is applied
    # here so every prediction is printed next to the sentence it belongs to.
    row = 0
    for sentence in sentences:
        has_known_word = any(word in wordset and word.isalnum()
                             for word in word_tokenize(sentence))

        if has_known_word and row < len(pred):
            ret = "patient" if pred[row] == 0 else "doctor"
            row += 1
        else:
            ret = "unknown"

        print("{} : {}".format(sentence,ret))

    print()

    return pred



## Data collection 

The data is crawled from www.askthedoctor.com.
The website contains questions asked by patients and the corresponding answers given by doctors.
The data is organized into categories based on diseases. Here, each category is visited in turn and the corresponding text is stored as either "patient" data or "doctor" data.
Run the code below to collect data from the above website.
Two files, "patient.txt" and "doctor.txt", are saved.

<strong>Note that it might take several minutes depending on the internet connection.</strong>

<strong>Skip the cell below if the data is already saved.</strong>

In [None]:

# Crawl every disease category and write one sentence per line to
# "patient.txt" (questions) and "doctor.txt" (answers).
base_url = "https://www.askthedoctor.com/browse-medical-questions"

# Each HTTP response is closed as soon as its page has been parsed.
with urllib.request.urlopen(base_url) as base_f:
    base_soup = BeautifulSoup(base_f,"lxml")

# categories of diseases, as (url, title) pairs
categories = [
    (base_anchor["href"],base_anchor["title"])
    for base_div in base_soup.findAll("div",{"class":"disease_column"})
    for base_anchor in base_div.findAll("a",{"itemtype":"https://schema.org/category"})
]


print("Collecting data ... ")

# The `with` block closes both output files on exit.
with open("patient.txt","w") as patientfile, open("doctor.txt", "w") as doctorfile:
    for url, topic in categories:
        print(topic)

        try:
            with urllib.request.urlopen(url) as f:
                soup = BeautifulSoup(f,"lxml")

            divs = soup.findAll('div',{"class":"question_az"})

            for div in divs:
                inner_url = div.find('a')['href']
                with urllib.request.urlopen(inner_url) as inner_f:
                    inner_soup = BeautifulSoup(inner_f,"lxml")

                # commas are replaced by spaces and runs of dots are collapsed
                # to a single dot before the text is split into sentences
                question = inner_soup.find('span',{"class":"quesans"})
                question = question.text.replace(","," ")
                question = re.sub('[.]+', '.',question)

                # keep only sentences with more than three tokens
                for token in sent_tokenize(question):
                    if len(word_tokenize(token)) > 3:
                        patientfile.write("{}\n".format(token))

                answer = inner_soup.find('span', {"class": "answer quesans"})
                answer = answer.text.replace(""" \n(adsbygoogle = window.adsbygoogle || []).push({});""","").replace("\n"," ").replace("   "," ").replace(","," ")
                answer = re.sub('[.]+', '.',answer)

                for token in sent_tokenize(answer):
                    if len(word_tokenize(token)) > 3:
                        doctorfile.write("{}\n".format(token))

        # Best effort: a failing category is reported and skipped. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt able
        # to stop the crawl, and the cause is printed instead of hidden.
        except Exception as exc:
            print("Error ................ {} ({!r})".format(topic, exc))

print("Data saved !")

## Word2Vec

Word2vec is a word-embedding method in which each word in a sentence is mapped to its corresponding vector representation.
Here the whole dataset, both patient and doctor sentences, is used to learn the word embeddings.

The Python library "gensim" is used to train the word2vec model.

<strong>Run the code below to generate a word2vec model.</strong>
<strong>Skip the cell below if the model is already created.</strong>

In [2]:
# Read both corpora; the `with` block closes the files on exit.
with open("patient.txt","r") as patfile, open("doctor.txt","r") as docfile:
    data_matrix = patfile.readlines()
    data_matrix.extend(docfile.readlines())

# converting the whole data into lower case
data_matrix = [sample.lower() for sample in data_matrix]

print("The Dataset consists of {} sentences.".format(len(data_matrix)))

# Formatting the data to provide as input to gensim package's word2vec model:
# one list of alphanumeric tokens per sentence
words_matrix = []
for sample in data_matrix:
    for sentence in sent_tokenize(sample):
        words_matrix.append([word for word in word_tokenize(sentence) if word.isalnum()])


# Show gensim's training progress in the cell output
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

# Parameters required for training word2vec model
num_features = 300    # Word vector dimensionality
min_word_count = 5   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words


# Initialize and train the model (this will take some time)
# NOTE(review): `size=` and `init_sims()` are the gensim < 4.0 spellings;
# confirm the installed gensim version before upgrading.
print("Training model...")
model = Word2Vec(words_matrix, workers=num_workers,
            size=num_features, min_count = min_word_count,
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "word2vec_model"
model.save(model_name)

print("Word2Vec model saved !")

The Dataset consists of 25601 sentences.


2017-09-16 12:37:47,969 : INFO : collecting all words and their counts
2017-09-16 12:37:47,973 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-16 12:37:48,139 : INFO : PROGRESS: at sentence #10000, processed 149033 words, keeping 9961 word types


Training model...


2017-09-16 12:37:48,270 : INFO : PROGRESS: at sentence #20000, processed 294765 words, keeping 13963 word types
2017-09-16 12:37:48,330 : INFO : collected 15461 word types from a corpus of 375367 raw words and 25415 sentences
2017-09-16 12:37:48,335 : INFO : Loading a fresh vocabulary
2017-09-16 12:37:48,403 : INFO : min_count=5 retains 4644 unique words (30% of original 15461, drops 10817)
2017-09-16 12:37:48,406 : INFO : min_count=5 leaves 357715 word corpus (95% of original 375367, drops 17652)
2017-09-16 12:37:48,470 : INFO : deleting the raw counts dictionary of 15461 items
2017-09-16 12:37:48,477 : INFO : sample=0.001 downsamples 55 most-common words
2017-09-16 12:37:48,481 : INFO : downsampling leaves estimated 262582 word corpus (73.4% of prior 357715)
2017-09-16 12:37:48,494 : INFO : estimated required memory for 4644 words and 300 dimensions: 13467600 bytes
2017-09-16 12:37:48,554 : INFO : resetting layer weights
2017-09-16 12:37:48,797 : INFO : training model with 4 workers 

Word2Vec model saved !


## Testing Word2Vec Model 

Given a word, the trained model should return similar words, based on the context words that appeared alongside it in the sentences of the dataset.

In [3]:
# Load the saved model and print the nearest neighbours of a few probe words.
model = Word2Vec.load("word2vec_model")

word_vectors = model.wv.syn0

print("The model has {} words in the vocabulary and the dimension of the vectors is {}".format(word_vectors.shape[0],word_vectors.shape[1]))


# (heading, query) pairs: the vocabulary is lower-cased, so "I" is queried as "i".
# Similarity is queried on `model.wv`, like the other vector accesses in this
# cell, rather than through the deprecated model-level shortcut.
probe_words = [("I", "i"), ("swelling", "swelling"), ("headache", "headache"), ("fever", "fever")]
for heading, query in probe_words:
    print("{}\n{}\n".format(heading, model.wv.most_similar(query)))

2017-09-16 12:37:59,313 : INFO : loading Word2Vec object from word2vec_model
2017-09-16 12:37:59,734 : INFO : loading wv recursively from word2vec_model.wv.* with mmap=None
2017-09-16 12:37:59,737 : INFO : setting ignored attribute syn0norm to None
2017-09-16 12:37:59,744 : INFO : setting ignored attribute cum_table to None
2017-09-16 12:37:59,749 : INFO : loaded word2vec_model
2017-09-16 12:37:59,800 : INFO : precomputing L2-norms of word weight vectors


The model has 4644 words in the vocabulary and the dimension of the vectors is 300
I
[('addressed', 0.6825687885284424), ('you', 0.67529296875), ('about', 0.6483484506607056), ('appointments', 0.6423941850662231), ('party', 0.6292402744293213), ('answered', 0.6146000027656555), ('anxious', 0.6127475500106812), ('helped', 0.6091926097869873), ('efforts', 0.6089515686035156), ('now', 0.5993724465370178)]

swelling
[('inflammation', 0.9732186198234558), ('ligaments', 0.9714838266372681), ('node', 0.9705648422241211), ('lymph', 0.9691843390464783), ('behind', 0.9660965204238892), ('itching', 0.9660079479217529), ('weakness', 0.9643855690956116), ('lymphedema', 0.9634043574333191), ('discomfort', 0.9624897241592407), ('heart', 0.9600017666816711)]

headache
[('vomiting', 0.988715410232544), ('muscle', 0.98659348487854), ('mild', 0.9836856126785278), ('severe', 0.9831716418266296), ('drowsiness', 0.9827138781547546), ('burning', 0.9813575148582458), ('pains', 0.9811801910400391), ('weakness'

## Saving the features

Each sentence vector is formed by averaging the vectors corresponding to each word in a sentence. 

The whole set of features is divided into train and test features.

<strong>This might take some time.</strong>
<strong>Skip the cell if the features are already saved.</strong>

In [4]:
# Build the sentence features, split them into train/test sets and save them.
TEST_SIZE = 2000   # number of sentences held out for evaluation
SPLIT_SEED = 0     # fixed seed so the train/test split is reproducible

model = Word2Vec.load("word2vec_model")
word_vectors = model.wv.syn0

feature_dimension = word_vectors.shape[1]

# all words in the vocabulary
wordset = set(model.wv.index2word)


print("Generating features ...")
features = generate_features(feature_dimension,wordset,model)

# Without this check a small dataset would silently yield an empty training set.
assert features.shape[0] > TEST_SIZE, "dataset must contain more than TEST_SIZE sentences"

# dividing the dataset into train and test datasets.
# A dedicated RandomState keeps the shuffle reproducible without touching
# numpy's global random state.
rng = np.random.RandomState(SPLIT_SEED)
indices = rng.permutation(features.shape[0])
test_idx,training_idx = indices[:TEST_SIZE], indices[TEST_SIZE:]
test_features, train_features = features[test_idx,:], features[training_idx,:]

# the last column holds the label, the remaining columns are the features
train_labels =  train_features[:,-1]
train_features = train_features[:,:-1]

test_labels =  test_features[:,-1]
test_features = test_features[:,:-1]

# Saving features 
np.save("train_features.npy",train_features)
np.save("train_labels.npy", train_labels)
np.save("test_features.npy",test_features)
np.save("test_labels.npy",test_labels)

print("Features saved")

2017-09-16 12:38:12,414 : INFO : loading Word2Vec object from word2vec_model
2017-09-16 12:38:12,768 : INFO : loading wv recursively from word2vec_model.wv.* with mmap=None
2017-09-16 12:38:12,770 : INFO : setting ignored attribute syn0norm to None
2017-09-16 12:38:12,774 : INFO : setting ignored attribute cum_table to None
2017-09-16 12:38:12,778 : INFO : loaded word2vec_model


Generating features ...
Features saved


## Training Classifier Model

A simple SVM classifier is trained on the sentence vectors.

<strong>Run the following code to train a simple SVM classifier and save the model. This might take some time.</strong>
<strong>Skip the cell if the model is already generated.</strong>

In [5]:
# Loading train features
train_features = np.load("train_features.npy")
train_labels = np.load("train_labels.npy")


from sklearn.svm import SVC
clf = SVC(kernel="linear",C=100)

print("Training classifier ....")
clf.fit(train_features,train_labels)

import pickle 

with open("clf_model.pkl","wb") as clffile:
    pickle.dump(clf,clffile)
    
clffile.close()

print("Classifier Model saved !")

Training classifier ....
Classifier Model saved !


## Evaluating Classifier Model

In [6]:
# Load the trained classifier and measure its accuracy on the held-out set.
# NOTE: unpickling executes code from the file; only load a "clf_model.pkl"
# that was produced by this notebook.
# The `with` block closes the file on exit; no explicit close() is needed.
with open("clf_model.pkl","rb") as clffile:
    clf = pickle.load(clffile)

test_features = np.load("test_features.npy")
test_labels = np.load("test_labels.npy")


pred = clf.predict(test_features)

# accuracy as a percentage
acc = accuracy_score(test_labels,pred)*100

print("The accuracy for {} samples for the model : {}%".format(test_features.shape[0],acc))

The accuracy for 2000 samples for the model : 86.1%


## Trying out some random sentences

In [10]:
# Load the saved classifier and word2vec model, then classify a few
# hand-written examples sentence by sentence.
# The `with` block closes the file on exit; no explicit close() is needed.
with open("clf_model.pkl","rb") as clffile:
    clf = pickle.load(clffile)

model = Word2Vec.load("word2vec_model")
word_vectors = model.wv.syn0

feature_dimension = word_vectors.shape[1]

# all words in the vocabulary
wordset = set(model.wv.index2word)


sample_texts = [
    "i still cough few times a day. what should i do?",
    "i have severe pain in my abdomen. do i have to go to the doctor? wash your hands everytime and follow hygenic practices",
    "i have a sore throat. it has been there for the past week.",
    "do you have sore throat? Does your throat feel itchy? Do you have flu?",
    "you should apply neomycin ointment on your chin",
    "do you think I have infection which is causing my blood pressure to rise? Yes, your blood pressure is increasing because of infection",
    "Are you comfortable? If you are not comfortable, please let me know. No I am not comfortable and in too much in pain right now.",
]

# predict() prints one "sentence : speaker" line per sentence; `res` keeps
# the predictions of the most recent text.
for text in sample_texts:
    res = predict(clf, text,feature_dimension, wordset, model)


2017-09-16 12:53:55,008 : INFO : loading Word2Vec object from word2vec_model
2017-09-16 12:53:55,306 : INFO : loading wv recursively from word2vec_model.wv.* with mmap=None
2017-09-16 12:53:55,309 : INFO : setting ignored attribute syn0norm to None
2017-09-16 12:53:55,312 : INFO : setting ignored attribute cum_table to None
2017-09-16 12:53:55,318 : INFO : loaded word2vec_model


i still cough few times a day. : patient
what should i do? : patient

i have severe pain in my abdomen. : patient
do i have to go to the doctor? : patient
wash your hands everytime and follow hygenic practices : doctor

i have a sore throat. : patient
it has been there for the past week. : patient

do you have sore throat? : doctor
does your throat feel itchy? : doctor
do you have flu? : doctor

you should apply neomycin ointment on your chin : doctor

do you think i have infection which is causing my blood pressure to rise? : patient
yes, your blood pressure is increasing because of infection : doctor

are you comfortable? : doctor
if you are not comfortable, please let me know. : doctor
no i am not comfortable and in too much in pain right now. : patient

