In [122]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import difflib

In [37]:
import pickle
import numpy as np

In [1]:
sample_list = ['Abdominal pain',
 'Abdominal redness',
 'Abdominal swelling',
 'Abnormal sweating',
 'Acne',
 'Allergy',
 'Anal Fissure',
 'Anal pain',
 'Anemia',
 'Anhedonia',
 'Ankle pain',
 'Anxiety',
 'Appendicitis',
 'Arm pain',
 'Arm swelling',
 'Arm weakness',
 'Armpit pain',
 'Armpit swelling',
 'Attention deficit',
 'Back pain',
 'Bad breath',
 'Bad or bitter taste',
 'Black or tarry stool',
 'Bladder Infection (UTI)',
 'Blood in stool',
 'Blood in urine',
 'Blurry vision',
 'Bronchitis',
 'Bulging eye',
 'Burning or painful urination',
 'Buttocks pain',
 'Calf pain',
 'Calf swelling',
 'Cheek pain',
 'Cheek swelling',
 'Chest pain',
 'Chills',
 'Confusion',
 'Constipation',
 'Cool bluish skin',
 'Cough',
 'COVID-19',
 'Decreased appetite',
 'Decreased hearing',
 'Decreased urination',
 'Decreased vision',
 'Dental pain',
 'Diarrhea',
 'Difficulty urinating',
 'Dizziness',
 'Drooping eyelid',
 'Dry mouth',
 'Dry mucous membranes',
 'Dry skin',
 'Ear discharge',
 'Ear pain',
 'Ear pressure',
 'Ear swelling',
 'Elbow pain',
 'Emotional stress',
 'Erectile dysfunction',
 'Excessive thirst',
 'Excessive urination',
 'Eye deviation',
 'Eye discharge',
 'Eye dryness',
 'Eye floaters',
 'Eye Infection',
 'Eye pain',
 'Eye redness',
 'Eyelid pain',
 'Eyelid redness',
 'Eyelid swelling',
 'Facial droop',
 'Facial lesions',
 'Facial numbness or tingling',
 'Facial pain',
 'Facial swelling',
 'Fainting (passing out)',
 'Fatigue',
 'Feeling cold',
 'Feeling down',
 'Feeling faint',
 'Fever',
 'Finger discoloration',
 'Finger pain',
 'Flank pain',
 'Flashing lights in vision',
 'Flatulence',
 'Food Poisoning',
 'Foot fungus',
 'Foot numbness or tingling',
 'Foot pain',
 'Foot redness',
 'Foot sores',
 'Foot swelling',
 'Forearm pain',
 'Foreign body in the eye',
 'Frequent burping',
 'Frequent night urination',
 'Frequent urination',
 'Gastroenteritis',
 'Genital lesions',
 'Goiter',
 'Groin pain',
 'Groin swelling',
 'Hair loss',
 'Hand numbness or tingling',
 'Hand pain',
 'Hand redness',
 'Hand swelling',
 'Headache',
 'Heart palpitations']


In [20]:
data = ' '.join(sample_list)

In [186]:
data = data.lower()

In [187]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

In [188]:
pickle.dump(tokenizer,open('tokenizer.pkl','wb'))

In [190]:
sequences = tokenizer.texts_to_sequences([data])[0]

In [191]:
sequences[:20]

[14, 1, 14, 6, 14, 2, 36, 37, 38, 39, 22, 40, 22, 1, 41, 42, 43, 1, 44, 45]

In [192]:
vocab_size = len(tokenizer.word_index)+1

In [193]:
vocab_size

126

In [194]:
sequence_col = []
for i in range(1,len(sequences)):
    words_num = sequences[i-1:i+1]
    sequence_col.append(words_num)

In [195]:
X = []
y = []

for seq in sequence_col:
    X.append(seq[0])
    y.append(seq[1])

X = np.array(X)
y = np.array(y)

In [196]:
y = to_categorical(y, num_classes=vocab_size)
y

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [197]:
pred_model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax")
])

In [198]:
 pred_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1, 10)             1260      
                                                                 
 lstm_4 (LSTM)               (None, 1, 1000)           4044000   
                                                                 
 lstm_5 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense_4 (Dense)             (None, 1000)              1001000   
                                                                 
 dense_5 (Dense)             (None, 126)               126126    
                                                                 
Total params: 13,176,386
Trainable params: 13,176,386
Non-trainable params: 0
_________________________________________________________________


In [199]:
model_checkpoint = ModelCheckpoint('model.h5', monitor='loss', verbose= 1, save_best_only=True, model='auto')
reduce_func = ReduceLROnPlateau(monitor='loss', factor=0.2, patience = 3, min_lr=0.0001, verbose =1 )

In [200]:
pred_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001),run_eagerly=True)

In [201]:
pred_model.fit(X,y, epochs=100, batch_size=64, callbacks=[model_checkpoint, reduce_func])

Epoch 1/100
Epoch 1: loss improved from inf to 4.83473, saving model to model.h5
Epoch 2/100
Epoch 2: loss improved from 4.83473 to 4.82144, saving model to model.h5
Epoch 3/100
Epoch 3: loss improved from 4.82144 to 4.78979, saving model to model.h5
Epoch 4/100
Epoch 4: loss improved from 4.78979 to 4.69862, saving model to model.h5
Epoch 5/100
Epoch 5: loss improved from 4.69862 to 4.53823, saving model to model.h5
Epoch 6/100
Epoch 6: loss improved from 4.53823 to 4.50811, saving model to model.h5
Epoch 7/100
Epoch 7: loss improved from 4.50811 to 4.44661, saving model to model.h5
Epoch 8/100
Epoch 8: loss improved from 4.44661 to 4.42842, saving model to model.h5
Epoch 9/100
Epoch 9: loss improved from 4.42842 to 4.41431, saving model to model.h5
Epoch 10/100
Epoch 10: loss improved from 4.41431 to 4.36716, saving model to model.h5
Epoch 11/100
Epoch 11: loss improved from 4.36716 to 4.32887, saving model to model.h5
Epoch 12/100
Epoch 12: loss improved from 4.32887 to 4.26396, sav

<keras.callbacks.History at 0x12342632e88>

In [202]:
pred_model.fit(X,y, epochs=100, batch_size=64, callbacks=[model_checkpoint, reduce_func])

Epoch 1/100
Epoch 1: loss did not improve from 1.92333
Epoch 2/100
Epoch 2: loss did not improve from 1.92333
Epoch 3/100
Epoch 3: loss improved from 1.92333 to 1.92169, saving model to model.h5
Epoch 4/100
Epoch 4: loss improved from 1.92169 to 1.92132, saving model to model.h5
Epoch 5/100
Epoch 5: loss improved from 1.92132 to 1.90623, saving model to model.h5
Epoch 6/100
Epoch 6: loss improved from 1.90623 to 1.90141, saving model to model.h5
Epoch 7/100
Epoch 7: loss did not improve from 1.90141
Epoch 8/100
Epoch 8: loss improved from 1.90141 to 1.89723, saving model to model.h5
Epoch 9/100
Epoch 9: loss improved from 1.89723 to 1.88831, saving model to model.h5
Epoch 10/100
Epoch 10: loss did not improve from 1.88831
Epoch 11/100
Epoch 11: loss improved from 1.88831 to 1.88168, saving model to model.h5
Epoch 12/100
Epoch 12: loss improved from 1.88168 to 1.87666, saving model to model.h5
Epoch 13/100
Epoch 13: loss improved from 1.87666 to 1.86746, saving model to model.h5
Epoch 1

<keras.callbacks.History at 0x1234eec8dc8>

In [203]:
pred_model.fit(X,y, epochs=100, batch_size=64, callbacks=[model_checkpoint, reduce_func])

Epoch 1/100
Epoch 1: loss did not improve from 1.63193
Epoch 2/100
Epoch 2: loss did not improve from 1.63193
Epoch 3/100
Epoch 3: loss improved from 1.63193 to 1.62872, saving model to model.h5
Epoch 4/100
Epoch 4: loss improved from 1.62872 to 1.62724, saving model to model.h5
Epoch 5/100
Epoch 5: loss did not improve from 1.62724
Epoch 6/100
Epoch 6: loss improved from 1.62724 to 1.62009, saving model to model.h5
Epoch 7/100
Epoch 7: loss did not improve from 1.62009
Epoch 8/100
Epoch 8: loss did not improve from 1.62009
Epoch 9/100
Epoch 9: loss did not improve from 1.62009
Epoch 10/100
Epoch 10: loss did not improve from 1.62009
Epoch 11/100
Epoch 11: loss improved from 1.62009 to 1.61916, saving model to model.h5
Epoch 12/100
Epoch 12: loss did not improve from 1.61916
Epoch 13/100
Epoch 13: loss improved from 1.61916 to 1.61647, saving model to model.h5
Epoch 14/100
Epoch 14: loss did not improve from 1.61647
Epoch 15/100
Epoch 15: loss improved from 1.61647 to 1.61046, saving m

<keras.callbacks.History at 0x12355d9e2c8>

In [204]:
pred_model.fit(X,y, epochs=200, batch_size=64, callbacks=[model_checkpoint, reduce_func])

Epoch 1/200
Epoch 1: loss did not improve from 1.51486
Epoch 2/200
Epoch 2: loss did not improve from 1.51486
Epoch 3/200
Epoch 3: loss improved from 1.51486 to 1.51325, saving model to model.h5
Epoch 4/200
Epoch 4: loss did not improve from 1.51325
Epoch 5/200
Epoch 5: loss did not improve from 1.51325
Epoch 6/200
Epoch 6: loss improved from 1.51325 to 1.50577, saving model to model.h5
Epoch 7/200
Epoch 7: loss did not improve from 1.50577
Epoch 8/200
Epoch 8: loss improved from 1.50577 to 1.50483, saving model to model.h5
Epoch 9/200
Epoch 9: loss did not improve from 1.50483
Epoch 10/200
Epoch 10: loss did not improve from 1.50483
Epoch 11/200
Epoch 11: loss improved from 1.50483 to 1.50398, saving model to model.h5
Epoch 12/200
Epoch 12: loss did not improve from 1.50398
Epoch 13/200
Epoch 13: loss improved from 1.50398 to 1.50196, saving model to model.h5
Epoch 14/200
Epoch 14: loss improved from 1.50196 to 1.50014, saving model to model.h5
Epoch 15/200
Epoch 15: loss improved fro

<keras.callbacks.History at 0x12355d74488>

In [220]:
def pred_next_words(text):
    for _ in range(3):
        x = difflib.get_close_matches(text.lower(),list(tokenizer.word_index.keys()))
        predicted_word = []
        text = x[0].lower()
        text_to_seq = tokenizer.texts_to_sequences([text])[0]
        text_to_seq = np.array(text_to_seq)
        preds = np.argsort(pred_model.predict(text_to_seq), axis=1)[:,-5:]
        for key,value in tokenizer.word_index.items():
            if value in preds:
                predicted_word.append(key)
        return predicted_word, text
        
    

In [221]:
word_to_search = "Eye"
next_predictions,word_to_search = pred_next_words(word_to_search)
[f"{word_to_search} {i}" for i in next_predictions if word_to_search.lower() != i.lower()]

['eye pain', 'eye frequent', 'eye infection', 'eye dryness', 'eye floaters']

In [222]:
word_to_search = "Foos"
next_predictions, word_to_search = pred_next_words(word_to_search)
[f"{word_to_search} {i}" for i in next_predictions]

['foot pain', 'foot swelling', 'foot redness', 'foot numbness', 'foot sores']