In [42]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [43]:
# read tab delimited data
data = pd.read_csv("./data/data_processed.txt", sep="\t")
# remove all na rows
data = data.dropna(axis=0, how="all")
# remove all na columns
data = data.dropna(axis=1, how="all")
# strip strings
data = data.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
data.head()

Unnamed: 0.1,Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,gender,age,provisional_diagnosis,advised_investigations,advised_investigations.1,advised_investigations.2,management,surgical_management
0,0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,upto 4 weeks,allergy,antihistamine,nasal spray - steroid,,,
1,1,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,1-12 months,allergy,antihistamine,nasal spray - steroid,,,
2,2,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,1-12 years,allergy,antihistamine,nasal spray - steroid,,,
3,3,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,13-18 years,allergy,antihistamine,nasal spray - steroid,,,
4,4,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,19-25 years,allergy,antihistamine,nasal spray - steroid,,,


In [44]:
symptoms_corpus = (
    data.subjective_symptom.replace(np.nan, "")
    + "|"
    + data.associated_symptoms.replace(np.nan, "")
    + "|"
    + data.investigations_done.replace(np.nan, "")
    + "|"
    + data.gender.replace(np.nan, "")
    + "|"
    + data.age.replace(np.nan, "")
).dropna()

symptoms_corpus

0       nasal|sneezing|asthma|stress|enhancing factors...
1       nasal|sneezing|asthma|stress|enhancing factors...
2       nasal|sneezing|asthma|stress|enhancing factors...
3       nasal|sneezing|asthma|stress|enhancing factors...
4       nasal|sneezing|asthma|stress|enhancing factors...
                              ...                        
1182    abdomen|loose stools|certain foods/beverages|b...
1183    abdomen|loose stools|certain foods/beverages|b...
1184    abdomen|loose stools|certain foods/beverages|b...
1185    abdomen|loose stools|certain foods/beverages|b...
1186    abdomen|loose stools|certain foods/beverages|b...
Length: 1187, dtype: object

In [45]:
symptoms_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    split="|",
    # oov_token="<OOV>",
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    filters="",
)
symptoms_tokeniser.fit_on_texts(symptoms_corpus)

symptoms_tokeniser.index_word

{1: 'male',
 2: 'female',
 3: 'nasal',
 4: 'blood tests',
 5: 'anus',
 6: 'abdomen',
 7: 'ct scan pns',
 8: 'vomiting',
 9: 'fever',
 10: 'blockage in nose',
 11: 'bleeding',
 12: 'pain in abdomen',
 13: 'blood ige',
 14: '1-12 years',
 15: '13-18 years',
 16: '19-25 years',
 17: '26-35 years',
 18: '36-50 years',
 19: 'upto 4 weeks',
 20: '1-12 months',
 21: '51-65 years',
 22: 'above 65 years',
 23: 'runny nose',
 24: 'anxiety',
 25: 'discharge',
 26: 'painless',
 27: 'stress',
 28: 'persistent',
 29: 'pain',
 30: 'hard stool',
 31: 'ultrasound',
 32: 'unable to smell',
 33: 'headache',
 34: 'leakage',
 35: 'per rectal exam',
 36: 'severe',
 37: 'sneezing',
 38: 'scrotum',
 39: 'sticky stools',
 40: 'swelling',
 41: 'color doppler',
 42: 'pain in rectum',
 43: 'local tenderness',
 44: 'reducing factors',
 45: 'bleeding profile',
 46: 'bad smell',
 47: 'increases on bending',
 48: 'while passing stool',
 49: 'cracked skin',
 50: 'itching',
 51: 'bloating',
 52: 'mucus',
 53: 'loose st

In [46]:
symptoms_sequences = symptoms_tokeniser.texts_to_sequences(symptoms_corpus)

symptoms_sequences


[[3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 19],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 20],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 14],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 15],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 16],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 17],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 18],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 21],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 2, 22],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 19],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 20],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 14],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 15],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 16],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 17],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 18],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 21],
 [3, 37, 82, 27, 59, 83, 84, 4, 13, 1, 22],
 [3, 37, 60, 59, 44, 4, 13, 2, 19],
 [3, 37, 60, 59, 44, 4, 13, 2, 20],
 [3, 37, 60, 59, 44, 4, 13, 2, 14],
 [3, 37, 60, 59, 44, 4, 13, 2, 15],
 [3, 37, 60, 59, 44, 4, 13, 2, 16],
 [3, 37, 60, 59, 44, 4, 13, 

In [47]:
symptoms_padded = tf.keras.preprocessing.sequence.pad_sequences(
    symptoms_sequences, padding="pre", maxlen=25
)
symptoms_padded = np.array(symptoms_padded)

symptoms_padded

array([[  0,   0,   0, ...,  13,   2,  19],
       [  0,   0,   0, ...,  13,   2,  20],
       [  0,   0,   0, ...,  13,   2,  14],
       ...,
       [  0,   0,   0, ..., 116,   1,  18],
       [  0,   0,   0, ..., 116,   1,  21],
       [  0,   0,   0, ..., 116,   1,  22]], dtype=int32)

In [48]:
causes_corpus = (
    data.subjective_symptom.replace(np.nan, "")
    + "|"
    + data.provisional_diagnosis.replace(np.nan, "")
).dropna()

causes_corpus

0                                nasal|allergy
1                                nasal|allergy
2                                nasal|allergy
3                                nasal|allergy
4                                nasal|allergy
                         ...                  
1182    abdomen|intolerance to foods/beverages
1183    abdomen|intolerance to foods/beverages
1184    abdomen|intolerance to foods/beverages
1185    abdomen|intolerance to foods/beverages
1186    abdomen|intolerance to foods/beverages
Length: 1187, dtype: object

In [49]:
causes_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    split="\n",
    # oov_token="<OOV>",
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    filters="",
)
causes_tokeniser.fit_on_texts(causes_corpus)

causes_tokeniser.index_word


{1: 'nasal|nasal polyposis',
 2: 'nasal|allergy',
 3: 'anus|incontinence',
 4: 'nasal|vasomotor',
 5: 'nasal|deviated nasal septum',
 6: 'nasal|mass',
 7: 'nasal|cancer',
 8: 'nasal|sinusitis',
 9: 'anus|piles/haemorrhoids',
 10: 'nasal|nose pricking',
 11: 'nasal|hypertension',
 12: 'nasal|trauma',
 13: 'nasal|infection',
 14: 'nasal|rhinitis',
 15: 'nasal|csf leak',
 16: 'nasal|fracture',
 17: 'nasal|fungal sinusitis',
 18: 'anus|anal fissure',
 19: 'anus|prolapsed',
 20: 'anus|grade 1 and 2',
 21: 'anus|cancer',
 22: 'anus|proctalgia',
 23: 'anus|fistula',
 24: 'anus|std',
 25: 'anus|irritable bowel syndrome',
 26: 'anus|colitis',
 27: 'anus|dermatitis',
 28: 'anus|thread worms',
 29: 'anus|diaper rash',
 30: 'abdomen|gerd',
 31: 'abdomen|hyperacidity',
 32: 'abdomen|gastroenteritis',
 33: 'abdomen|appendicitis',
 34: 'abdomen|typhlitis',
 35: 'abdomen|right ureteric stone',
 36: 'abdomen|intestinal obstruction',
 37: 'abdomen|cholecystitis',
 38: 'abdomen|pancreatitis',
 39: 'abdom

In [50]:
causes_sequences = causes_tokeniser.texts_to_sequences(causes_corpus)

causes_sequences


[[2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1]

In [51]:
causes_padded = tf.keras.preprocessing.sequence.pad_sequences(
    causes_sequences, padding="pre"
)
causes_padded = np.array(causes_padded) - 1

causes_padded


array([[ 1],
       [ 1],
       [ 1],
       ...,
       [43],
       [43],
       [43]], dtype=int32)

In [52]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=len(symptoms_tokeniser.index_word) + 1,
            output_dim=16,
            # input_length=max(map(len, symptoms_padded)),
            input_length=25,
        ),
        # tf.keras.layers.LSTM(8),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        # tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(32, activation="relu"),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(len(causes_tokeniser.index_word), activation="softmax"),
    ]
)
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 16)            2112      
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 57)                1881      
                                                                 
Total params: 4,537
Trainable params: 4,537
Non-trainable params: 0
_________________________________________________________________


In [53]:
tf.keras.backend.clear_session()
tf.random.set_seed(42)
model.fit(
    x=symptoms_padded,
    y=causes_padded,
    epochs=200,
    callbacks=[tf.keras.callbacks.TensorBoard(log_dir="logs")],
)


Epoch 1/200
 1/38 [..............................] - ETA: 13s - loss: 4.0430 - accuracy: 0.0000e+00

2023-02-11 09:16:06.058132: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

<keras.callbacks.History at 0x29a94c3a0>

In [54]:
model.evaluate(x=symptoms_padded, y=causes_padded)




2023-02-11 09:17:24.922637: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




[0.5439066290855408, 0.6512215733528137]

In [55]:
with open(file="./data/symptoms_tokeniser.json", mode="w") as f:
    f.write(symptoms_tokeniser.to_json())

with open(file="./data/causes_tokeniser.json", mode="w") as f:
    f.write(causes_tokeniser.to_json())

model.save("./data/model.h5")

In [56]:
with open(file="./data/symptoms_tokeniser.json", mode="r") as f:
    loaded_symptoms_tokeniser = tf.keras.preprocessing.text.tokenizer_from_json(
        f.read()
    )
with open(file="./data/causes_tokeniser.json", mode="r") as f:
    loaded_causes_tokeniser = tf.keras.preprocessing.text.tokenizer_from_json(
        f.read())

loaded_model = tf.keras.models.load_model("./data/model.h5")
loaded_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 16)            2112      
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 57)                1881      
                                                                 
Total params: 4,537
Trainable params: 4,537
Non-trainable params: 0
_________________________________________________________________


In [57]:
test_subjective_symptom = ["nasal"]
test_associative_symptoms = ["runny nose", "blockage in nose"]
test_investigations_done = ["blood tests"]
test_gender = ["female"]
test_age = ["13-18 years"]

test_symptoms_corpus = (
    test_subjective_symptom
    + test_associative_symptoms
    + test_investigations_done
    + test_gender
    + test_age
)
test_symptoms_corpus = "|".join(test_symptoms_corpus)
print(test_symptoms_corpus)

test_symptoms_sequences = symptoms_tokeniser.texts_to_sequences([test_symptoms_corpus])
print(test_symptoms_sequences)

test_symptoms_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_symptoms_sequences, padding="pre", maxlen=25
)
test_symptoms_padded = np.array(test_symptoms_padded)
print(test_symptoms_padded)

test_causes_probabilities = loaded_model.predict(test_symptoms_padded)[0]
# print(test_causes_probabilities)

test_causes_rankings = np.argsort(test_causes_probabilities).tolist()
# print(test_causes_rankings)

# print(causes_tokeniser.index_word)
print(
    causes_tokeniser.index_word[test_causes_rankings[-1] + 1],
    round(test_causes_probabilities[test_causes_rankings[-1]] * 100, 2),
    "%",
)
print(
    causes_tokeniser.index_word[test_causes_rankings[-2] + 1],
    round(test_causes_probabilities[test_causes_rankings[-2]] * 100, 2),
    "%",
)
print(
    causes_tokeniser.index_word[test_causes_rankings[-3] + 1],
    round(test_causes_probabilities[test_causes_rankings[-3]] * 100, 2),
    "%",
)

nasal|runny nose|blockage in nose|blood tests|female|13-18 years
[[3, 23, 10, 4, 2, 15]]
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3 23 10  4  2
  15]]
nasal|nasal polyposis 45.25 %
nasal|deviated nasal septum 27.61 %
nasal|vasomotor 26.38 %


2023-02-11 09:17:25.546377: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
