## Importing Libraries

In [None]:
#Importing all the necessary libraries
import pandas as pd
import numpy as np
import string
import regex as re
import matplotlib.pyplot as plt
import nltk
import seaborn as sns


from pymed import PubMed

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , LSTM ,Embedding, Input, SpatialDropout1D, Flatten, SimpleRNN, Bidirectional
from keras.callbacks import EarlyStopping
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

## Data Pre-processing

In [None]:


# Create the PubMed client that is used to run the queries below.
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="MyTool", email="my@email.address")

from pprint import pprint

# One query is issued per (disease, keyword) pair: the disease has to appear in
# the article title and the keyword anywhere in the text.
# NOTE(review): 'sign off' looks like a typo for 'sign of' - confirm before changing the query.
conversations_keywords = ['symptoms', 'presence of', 'sign off', 'suggestion', 'clue',  'hint of']
diseases = ['diabetes', 'hypertension', 'arthritis', 'tuberculosis', 'pneumonia']
            #'peptic ulcer', 'gastroenteritis']

# The accumulator lives OUTSIDE both loops. Re-creating it per query would keep
# only the articles of the very last (disease, keyword) combination.
abstracts = []
for disease in diseases:
    for keyword in conversations_keywords:
        query = '('+ disease +'[Title]) AND ('+ keyword + '[Text Word])'
        print('Excuting query: ' + query)

        # Execute the query against the API
        results = pubmed.query(query, max_results=2000)

        # Keep each abstract together with the disease it was retrieved for
        abstracts.extend((article.abstract, disease) for article in results)

# An article can match several keywords for the same disease, so identical
# (abstract, disease) rows are collapsed before saving.
abstracts_df = pd.DataFrame(abstracts, columns=['abstract', 'disease']).drop_duplicates()
# index=False keeps the file at exactly two columns, which is what the
# loading cell expects when it assigns the column names.
abstracts_df.to_csv('disease_data.csv', index=False)


In [None]:
# Read the CSV file that was created
disease_data = pd.read_csv('disease_data.csv')
disease_data

In [None]:
disease_data.columns

In [None]:
disease_data.shape

In [None]:
disease_data.head()

In [None]:
disease_data.columns= ['abstract' , 'disease']

In [None]:
# Checking the dataset based on disease
print(disease_data['disease'].value_counts())

In [None]:
#Removing the disease words from abstract data to prevent overfitting.

disease_data['abstract'] = disease_data['abstract'].str.replace('tuberculosis','')
disease_data['abstract'] = disease_data['abstract'].str.replace('arthritis','')
disease_data['abstract'] = disease_data['abstract'].str.replace('diabetes','')
disease_data['abstract'] = disease_data['abstract'].str.replace('peptic','')
disease_data['abstract'] = disease_data['abstract'].str.replace('ulcer','')
disease_data['abstract'] = disease_data['abstract'].str.replace('gastroenteritis','')
disease_data['abstract'] = disease_data['abstract'].str.replace('pneumonia','')
disease_data['abstract'] = disease_data['abstract'].str.replace('diabetes','')

disease_data.head()

In [None]:
# Dropping the na's
disease_data.dropna(inplace=True)

In [None]:
# Removing punctuations and numbers
disease_data["abstract"] = disease_data["abstract"].str.replace('[^a-zA-Z\s]', '')

In [None]:
# Lemmatization helper: fetch the WordNet resources, then build the
# whitespace tokenizer and the lemmatizer that the helper relies on.
nltk.download('wordnet')
nltk.download('omw-1.4')
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    """Split *text* on whitespace and return the list of lemmatized tokens."""
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:
# Typecasting the abstract column to string
disease_data["abstract"] = disease_data["abstract"].astype(str)

In [None]:
# Applying the lemmanization function to abstract column
disease_data['abstract'] = disease_data.abstract.apply(lemmatize_text)

In [None]:
disease_data.shape

In [None]:
disease_data.head()

In [None]:
disease_data['abstract'] = disease_data['abstract'].astype(str) 

In [None]:
# Splitting the data into training and test dataset (80/20)

train, test = train_test_split(disease_data, test_size=0.2)

## Machine Learning Models

In [None]:
# Pipeline to vectorize, applying Tfid transformation and Naive Bayes

nb = Pipeline([('vect', CountVectorizer(stop_words='english')),('tfidf', TfidfTransformer()),('nb', MultinomialNB())])
nb = nb.fit(train.abstract, train.disease)

In [None]:
# Calculating the accuracy

y_pred = nb.predict(test.abstract)
print(accuracy_score(test.disease,y_pred)) # 0.8368491921005387

In [None]:
# Classification report

print(classification_report(test.disease,y_pred))

In [None]:
# Visualizing the confusion matrix

fig, ax = plt.subplots(figsize=(15,10))
s = sns.heatmap(confusion_matrix(test.disease,y_pred),xticklabels=["arthritis", "diabetes", "gastroenteritis", "hypertension", "peptic ulcer", "pneumonia", "tuberculosis"] ,yticklabels=["arthritis", "diabetes", "gastroenteritis", "hypertension", "peptic ulcer", "pneumonia", "tuberculosis"], annot=True, fmt='.4g' )
plt.ylabel('Actual label');
plt.xlabel('Predicted label');


In [None]:
# Testing on some user generated symptoms

p = nb.predict(['feelings of sickness or weakness, weight loss, fever, \
  and night sweats. coughing, chest pain, and the coughing up of blood.', 'I get really tired and have vision problem',
'Patient: Doctor, I’ve headache since yesterday evening.  Doctor: Have you taken any medicine so far?\ Patient: Saridon, but the headache hasn’t disappeared. \ Doctor: You’ve a running nose. Looks like your headacheis a result of\ sinus infection, and not the regular one that results from anxiety and fatigue. Lemme check.\  (The doctor checks the patient thoroughly.)\ Doctor: It’s quite clear that the infection in your sinus is the reason for your headache. I’ll prescribe an antibiotic to clear the infection and a pain reliever to relieve the pain.\ Patient: Thank you, doctor.'])
p

In [None]:
# Pipeline to vectorize, applying Tfid transformation and Linear Support Vector Classification

svc = Pipeline([('vect', CountVectorizer(stop_words='english')),('tfidf', TfidfTransformer()),('svc', svm.LinearSVC())])
svc = svc.fit(train.abstract, train.disease)

In [None]:
# Calculating the accuracy

y_pred = svc.predict(test.abstract)
print(accuracy_score(test.disease,y_pred)) # 0.9429982046678635

In [None]:
print(classification_report(test.disease,y_pred))

In [None]:
fig, ax = plt.subplots(figsize=(15,10))
s = sns.heatmap(confusion_matrix(test.disease,y_pred),xticklabels=["arthritis", "diabetes", "gastroenteritis", "hypertension", "peptic ulcer", "pneumonia", "tuberculosis"] ,yticklabels=["arthritis", "diabetes", "gastroenteritis", "hypertension", "peptic ulcer", "pneumonia", "tuberculosis"], annot=True, fmt='.4g' )
plt.ylabel('Actual label');
plt.xlabel('Predicted label');


In [None]:
# Testing on some user generated symptoms

p = svc.predict(['feelings of sickness or weakness, weight loss, fever, \
  and night sweats. coughing, chest pain, and the coughing up of blood.', 'I get really tired and have vision problem',
'Patient: Doctor, I’ve headache since yesterday evening.  Doctor: Have you taken any medicine so far?\ Patient: Saridon, but the headache hasn’t disappeared. \ Doctor: You’ve a running nose. Looks like your headacheis a result of\ sinus infection, and not the regular one that results from anxiety and fatigue. Lemme check.\  (The doctor checks the patient thoroughly.)\ Doctor: It’s quite clear that the infection in your sinus is the reason for your headache. I’ll prescribe an antibiotic to clear the infection and a pain reliever to relieve the pain.\ Patient: Thank you, doctor.'])
p

In [None]:
# Pipeline to vectorize, applying Tfid transformation and Logistic Regression

lr  = Pipeline([('vect', CountVectorizer(stop_words='english')),('tfidf', TfidfTransformer()),('lr', LogisticRegression())])
lr = lr.fit(train.abstract, train.disease)

In [None]:
# Calculating the accuracy

y_pred = lr.predict(test.abstract)
print(accuracy_score(test.disease,y_pred)) # 0.9355924596050269

In [None]:
print(classification_report(test.disease,y_pred))

In [None]:
fig, ax = plt.subplots(figsize=(15,10))
s = sns.heatmap(confusion_matrix(test.disease,y_pred),xticklabels=["arthritis", "diabetes", "gastroenteritis", "hypertension", "peptic ulcer", "pneumonia", "tuberculosis"] ,yticklabels=["arthritis", "diabetes", "gastroenteritis", "hypertension", "peptic ulcer", "pneumonia", "tuberculosis"], annot=True, fmt='.4g' )
plt.ylabel('Actual label');
plt.xlabel('Predicted label');


In [None]:
p = lr.predict(['Feelings of sickness or weakness, weight loss, fever, and night sweats', 
                'i feel thirsty most of the time, i have experienced weight loss, increase \
                in  appetite, blurry vision, numb, tingling hands', 
                'I have burning stomach, heartburn and nausea from last few days',
                'i feel pain and stiffness in my knees, I cannot move my knees much'])
p

In [None]:
# Pipeline to vectorize, applying Tfid transformation and Random Forest

rf  = Pipeline([('vect', CountVectorizer(stop_words='english')),('tfidf', TfidfTransformer()),('rf', RandomForestClassifier())])
rf = rf.fit(train.abstract, train.disease)

In [None]:
y_pred = rf.predict(test.abstract)
print(accuracy_score(test.disease,y_pred)) # 0.9196588868940754

In [None]:
print(classification_report(test.disease,y_pred))

In [None]:
fig, ax = plt.subplots(figsize=(15,10))
s = sns.heatmap(confusion_matrix(test.disease,y_pred),xticklabels=["arthritis", "diabetes", "gastroenteritis", "hypertension", "peptic ulcer", "pneumonia", "tuberculosis"] ,yticklabels=["arthritis", "diabetes", "gastroenteritis", "hypertension", "peptic ulcer", "pneumonia", "tuberculosis"], annot=True, fmt='.4g' )
plt.ylabel('Actual label');
plt.xlabel('Predicted label');


In [None]:
# Testing on some user generated symptoms

p = rf.predict(['feelings of sickness or weakness, weight loss, fever, \
  and night sweats. coughing, chest pain, and the coughing up of blood.', 'I get really tired and have vision problem',
'Patient: Doctor, I’ve headache since yesterday evening.  Doctor: Have you taken any medicine so far?\ Patient: Saridon, but the headache hasn’t disappeared. \ Doctor: You’ve a running nose. Looks like your headacheis a result of\ sinus infection, and not the regular one that results from anxiety and fatigue. Lemme check.\  (The doctor checks the patient thoroughly.)\ Doctor: It’s quite clear that the infection in your sinus is the reason for your headache. I’ll prescribe an antibiotic to clear the infection and a pain reliever to relieve the pain.\ Patient: Thank you, doctor.'])
p

## Neural Networks

In [None]:
nltk.download('stopwords')

# The abstracts (already lemmatized by the cells above) are cleaned once more
# before they are tokenized for the neural networks.

disease_data = disease_data.reset_index(drop=True)
# Raw string: the pattern text is unchanged, but the backslashes are no longer
# parsed as (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

def clean_text(text):
    """
        Normalise one abstract before it is tokenized for the neural networks.

        The text is lowercased, characters matched by REPLACE_BY_SPACE_RE become
        spaces, characters matched by BAD_SYMBOLS_RE are deleted, stand-alone
        runs of the letter x are dropped and English stopwords are removed.

        text: a string

        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # separators -> single space
    text = BAD_SYMBOLS_RE.sub('', text) # delete every other unwanted symbol
    # Only whole tokens made of repeated 'x' are removed (presumably masking
    # placeholders such as "xxxx"). Deleting every 'x' character, as before,
    # corrupted real words ("toxic" -> "toic", "examination" -> "eamination").
    text = re.sub(r'\bx{2,}\b', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
    return text
# Clean every abstract, then strip any remaining digit runs.
disease_data['abstract'] = disease_data['abstract'].apply(clean_text)
# regex=True is required: without it recent pandas versions search for the
# literal text "\d+" and no digits are removed.
disease_data['abstract'] = disease_data['abstract'].str.replace(r'\d+', '', regex=True)

In [None]:
# Using different tokenizer for Neural Networks

# Size of the vocabulary kept by the tokenizer (most frequent words).
MAX_NB_WORDS = 5000
# Number of word indices per abstract after padding/truncation.
MAX_SEQUENCE_LENGTH = 100
# Width of the embedding vectors used by the recurrent model.
EMBEDDING_DIM = 50
# Raw string: same filter characters as before, without the invalid "\]" escape.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(disease_data['abstract'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
X = tokenizer.texts_to_sequences(disease_data['abstract'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
Y = pd.get_dummies(disease_data['disease']).values
print('Shape of label tensor:', Y.shape)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
def run_rnn_model(X_train, X_test, Y_train, Y_test):
    """Train and evaluate a two-layer bidirectional LSTM classifier.

    The sequence length and the number of output classes are read from the
    training arrays, so the network always matches the data it is given
    (a fixed output size fails as soon as the number of diseases changes).

    Args:
        X_train: Training inputs; ``X_train.shape[1]`` is used as the
            sequence length of the embedding layer.
        X_test: Inputs used for the final evaluation.
        Y_train: Training labels; ``Y_train.shape[1]`` is used as the number
            of softmax outputs.
        Y_test: Labels matching ``X_test``.

    Returns:
        A ``(model, accuracy)`` tuple: the fitted model and the accuracy
        returned by ``evaluate`` on the test arrays.
    """
    rnn_model = Sequential()
    rnn_model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
    rnn_model.add(SpatialDropout1D(0.2))
    rnn_model.add(Bidirectional(LSTM(250, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
    rnn_model.add(Bidirectional(LSTM(250, dropout=0.2, recurrent_dropout=0.2)))
    rnn_model.add(Dense(Y_train.shape[1], activation='softmax'))
    rnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 10
    batch_size = 64

    # 10% of the training data is held back for validation; training stops
    # early once the validation loss has not improved for 3 epochs.
    history = rnn_model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    accr = rnn_model.evaluate(X_test,Y_test)
    print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    # val_loss comes from the validation split, not from the test set
    plt.plot(history.history['val_loss'], label='validation')
    plt.legend()
    plt.show();
    return rnn_model, accr[1]

rnn_model = run_rnn_model(X_train, X_test, Y_train, Y_test)[0]    

In [None]:
accr = rnn_model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Y_pred = rnn_model.predict(X_test)

In [None]:
print(classification_report(Y_test, Y_pred.round()))

In [None]:
# Testing on some user generated symptoms

new_complaint = ['Feelings of sickness or weakness, weight loss, fever, and night sweats', 
                'i feel thirsty most of the time, i have experienced weight loss, increase \
                in  appetite, blurry vision, numb, tingling hands', 
                'I have burning stomach, heartburn and nausea from last few days',
                'i feel pain and stiffness in my knees, I cannot move my knees much'] 
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = rnn_model.predict(padded)
# The label names are derived the same way as the one-hot targets, so output
# position i always corresponds to labels[i].
labels = list(pd.get_dummies(disease_data['disease']).columns)
for p in pred:
  print(p, labels[np.argmax(p)])

In [None]:
# Tokenizer settings for the convolutional model (same values as for the
# recurrent model).
# Size of the vocabulary kept by the tokenizer (most frequent words).
MAX_NB_WORDS = 5000
# Number of word indices per abstract after padding/truncation.
MAX_SEQUENCE_LENGTH = 100
# Embedding width constant; run_cnn_model uses its own width of 64.
EMBEDDING_DIM = 50
# Raw string: same filter characters as before, without the invalid "\]" escape.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(disease_data['abstract'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
X = tokenizer.texts_to_sequences(disease_data['abstract'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
Y = pd.get_dummies(disease_data['disease']).values
print('Shape of label tensor:', Y.shape)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
def run_cnn_model(X_train, X_test, y_train, y_test):
    """Train and evaluate a 1D-convolutional text classifier.

    The function works only on the arrays it receives: the labels passed as
    ``y_train``/``y_test`` are the ones used for fitting and evaluation, and
    the sequence length and number of classes are read from the training
    arrays instead of module-level variables or fixed numbers.

    Args:
        X_train: Training inputs; ``X_train.shape[1]`` is used as the
            sequence length of the embedding layer.
        X_test: Inputs used for the final evaluation.
        y_train: Training labels; ``y_train.shape[1]`` is used as the number
            of softmax outputs.
        y_test: Labels matching ``X_test``.

    Returns:
        A ``(model, accuracy)`` tuple: the fitted model and the accuracy
        returned by ``evaluate`` on the test arrays.
    """
    # Building the CNN Model
    cnn_model = Sequential()
    # Embedding over a vocabulary of MAX_NB_WORDS indices, 64 dimensions per word
    cnn_model.add(Embedding(MAX_NB_WORDS, 64, input_length=X_train.shape[1]))
    cnn_model.add(Conv1D(264, 3, padding='same', activation='relu'))
    cnn_model.add(Conv1D(232, kernel_size=3, activation='relu'))
    cnn_model.add(MaxPooling1D())
    cnn_model.add(Flatten())
    cnn_model.add(Dense(250, activation='relu'))
    cnn_model.add(Dense(y_train.shape[1], activation='softmax'))
    cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    cnn_model.summary()

    epochs = 10
    batch_size = 64

    # 10% of the training data is held back for validation; training stops
    # early once the validation loss has not improved for 3 epochs.
    history = cnn_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    accr = cnn_model.evaluate(X_test,y_test)
    print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    # val_loss comes from the validation split, not from the test set
    plt.plot(history.history['val_loss'], label='validation')
    plt.legend()
    plt.show();
    return cnn_model, accr[1]

cnn_model = run_cnn_model(X_train, X_test, Y_train, Y_test)[0]

In [None]:
accr = cnn_model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Y_pred = cnn_model.predict(X_test)

print(classification_report(Y_test, Y_pred.round()))


In [None]:
# Testing on some user generated symptoms

new_complaint = ['Feelings of sickness or weakness, weight loss, fever, and night sweats', 
                'i feel thirsty most of the time, i have experienced weight loss, increase \
                in  appetite, blurry vision, numb, tingling hands', 
                'I have burning stomach, heartburn and nausea from last few days',
                'i feel pain and stiffness in my knees, I cannot move my knees much'] 
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = cnn_model.predict(padded)
# The label names are derived the same way as the one-hot targets, so output
# position i always corresponds to labels[i] (a fixed list goes out of sync
# when the set of diseases in the data changes).
labels = list(pd.get_dummies(disease_data['disease']).columns)
for p in pred:
  print(p, labels[np.argmax(p)])

### -------- ###