In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# read tab delimited data
data = pd.read_csv("./data/data_processed.txt", sep="\t")
# remove all na rows
data = data.dropna(axis=0, how="all")
# remove all na columns
data = data.dropna(axis=1, how="all")
# strip strings
data = data.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
data.head()

Unnamed: 0.1,Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,gender,age,provisional_diagnosis,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,upto 4 weeks,allergy,antihistamine,nasal spray - steroid,,,,,
1,1,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,1-12 months,allergy,antihistamine,nasal spray - steroid,,,,,
2,2,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,1-12 years,allergy,antihistamine,nasal spray - steroid,,,,,
3,3,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,13-18 years,allergy,antihistamine,nasal spray - steroid,,,,,
4,4,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,19-25 years,allergy,antihistamine,nasal spray - steroid,,,,,


In [3]:
symptoms_corpus = (
    data.subjective_symptom.replace(np.nan, "")
    + "|"
    + data.associated_symptoms.replace(np.nan, "")
    + "|"
    + data.investigations_done.replace(np.nan, "")
    + "|"
    + data.gender.replace(np.nan, "")
    + "|"
    + data.age.replace(np.nan, "")
).dropna()

symptoms_corpus

0       nasal|sneezing|asthma|stress|enhancing factors...
1       nasal|sneezing|asthma|stress|enhancing factors...
2       nasal|sneezing|asthma|stress|enhancing factors...
3       nasal|sneezing|asthma|stress|enhancing factors...
4       nasal|sneezing|asthma|stress|enhancing factors...
                              ...                        
2459    chest pain|dull ache chest wall|pain in chest ...
2460    chest pain|dull ache chest wall|pain in chest ...
2461    chest pain|dull ache chest wall|pain in chest ...
2462    chest pain|dull ache chest wall|pain in chest ...
2463    chest pain|dull ache chest wall|pain in chest ...
Length: 2464, dtype: object

In [4]:
symptoms_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    split="|",
    # oov_token="<OOV>",
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    filters="",
)
symptoms_tokeniser.fit_on_texts(symptoms_corpus)

symptoms_tokeniser.index_word

{1: 'male',
 2: 'female',
 3: 'nasal',
 4: 'throat and mouth',
 5: 'blood tests',
 6: 'anus',
 7: 'abdomen',
 8: 'ct scan pns',
 9: 'vomiting',
 10: 'fever',
 11: 'blockage in nose',
 12: 'ear',
 13: '1-12 years',
 14: 'above 60 years',
 15: '13-18 years',
 16: '19-25 years',
 17: '26-35 years',
 18: '36-50 years',
 19: 'above 50 years',
 20: 'upto 4 weeks',
 21: '1-12 months',
 22: '51-65 years',
 23: 'above 65 years',
 24: 'bleeding',
 25: 'pain in abdomen',
 26: 'blood ige',
 27: 'runny nose',
 28: 'anxiety',
 29: 'discharge',
 30: 'painless',
 31: 'stress',
 32: 'persistent',
 33: 'pain',
 34: 'hard stool',
 35: 'musculoskeletal',
 36: 'ultrasound',
 37: 'unable to smell',
 38: 'headache',
 39: 'leakage',
 40: 'oral burning sensation',
 41: 'per rectal exam',
 42: 'severe',
 43: 'sneezing',
 44: 'scrotum',
 45: 'sticky stools',
 46: 'chest pain',
 47: 'swelling',
 48: 'color doppler',
 49: 'pain in rectum',
 50: 'local tenderness',
 51: 'reducing factors',
 52: 'bleeding profile',


In [5]:
symptoms_sequences = symptoms_tokeniser.texts_to_sequences(symptoms_corpus)

symptoms_sequences


[[3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 20],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 21],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 13],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 15],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 16],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 17],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 18],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 22],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 23],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 14],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 2, 19],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 20],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 21],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 13],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 15],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 16],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 17],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 18],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 22],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 23],
 [3, 43, 125, 31, 74, 126, 127, 5, 26, 1, 14],
 [3, 43, 125,

In [6]:
symptoms_padded = tf.keras.preprocessing.sequence.pad_sequences(
    symptoms_sequences, padding="pre", maxlen=25
)
symptoms_padded = np.array(symptoms_padded)

symptoms_padded

array([[  0,   0,   0, ...,  26,   2,  20],
       [  0,   0,   0, ...,  26,   2,  21],
       [  0,   0,   0, ...,  26,   2,  13],
       ...,
       [  0,   0,   0, ..., 244,   1,  23],
       [  0,   0,   0, ..., 244,   1,  14],
       [  0,   0,   0, ..., 244,   1,  19]], dtype=int32)

In [7]:
causes_corpus = (
    data.subjective_symptom.replace(np.nan, "")
    + "|"
    + data.provisional_diagnosis.replace(np.nan, "")
).dropna()

causes_corpus

0                       nasal|allergy
1                       nasal|allergy
2                       nasal|allergy
3                       nasal|allergy
4                       nasal|allergy
                    ...              
2459    chest pain|chest wall myalgia
2460    chest pain|chest wall myalgia
2461    chest pain|chest wall myalgia
2462    chest pain|chest wall myalgia
2463    chest pain|chest wall myalgia
Length: 2464, dtype: object

In [8]:
causes_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    split="\n",
    # oov_token="<OOV>",
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    filters="",
)
causes_tokeniser.fit_on_texts(causes_corpus)

causes_tokeniser.index_word


{1: 'nasal|nasal polyposis',
 2: 'nasal|allergy',
 3: 'anus|incontinence',
 4: 'nasal|vasomotor',
 5: 'nasal|deviated nasal septum',
 6: 'nasal|mass',
 7: 'nasal|cancer',
 8: 'nasal|sinusitis',
 9: 'anus|piles/haemorrhoids',
 10: 'throat and mouth|oral submucous fibrosis osf',
 11: 'nasal|nose pricking',
 12: 'nasal|hypertension',
 13: 'nasal|trauma',
 14: 'nasal|infection',
 15: 'nasal|rhinitis',
 16: 'nasal|csf leak',
 17: 'nasal|fracture',
 18: 'nasal|fungal sinusitis',
 19: 'anus|anal fissure',
 20: 'anus|prolapsed',
 21: 'anus|grade 1 and 2',
 22: 'anus|cancer',
 23: 'anus|proctalgia',
 24: 'anus|fistula',
 25: 'anus|std',
 26: 'anus|irritable bowel syndrome',
 27: 'anus|colitis',
 28: 'anus|dermatitis',
 29: 'anus|thread worms',
 30: 'anus|diaper rash',
 31: 'abdomen|gerd',
 32: 'abdomen|hyperacidity',
 33: 'abdomen|gastroenteritis',
 34: 'abdomen|appendicitis',
 35: 'abdomen|typhlitis',
 36: 'abdomen|right ureteric stone',
 37: 'abdomen|intestinal obstruction',
 38: 'abdomen|cho

In [9]:
causes_sequences = causes_tokeniser.texts_to_sequences(causes_corpus)

causes_sequences


[[2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [4],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5],
 [5]

In [10]:
causes_padded = tf.keras.preprocessing.sequence.pad_sequences(
    causes_sequences, padding="pre"
)
causes_padded = np.array(causes_padded) - 1

causes_padded


array([[ 1],
       [ 1],
       [ 1],
       ...,
       [88],
       [88],
       [88]], dtype=int32)

In [11]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=len(symptoms_tokeniser.index_word) + 1,
            output_dim=16,
            # input_length=max(map(len, symptoms_padded)),
            input_length=25,
        ),
        # tf.keras.layers.LSTM(8),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        # tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(32, activation="relu"),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(len(causes_tokeniser.index_word), activation="softmax"),
    ]
)
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

model.summary()

Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 16)            4272      
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 104)               3432      
                                                                 
Total params: 8,248
Trainable params: 8,248
Non-trainable params: 0
_________________________________________________________________


In [12]:
tf.keras.backend.clear_session()
tf.random.set_seed(42)
model.fit(
    x=symptoms_padded,
    y=causes_padded,
    epochs=200,
    callbacks=[tf.keras.callbacks.TensorBoard(log_dir="logs")],
)


Epoch 1/200


2023-05-10 16:35:29.167515: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

<keras.callbacks.History at 0x29e50e3b0>

In [13]:
model.evaluate(x=symptoms_padded, y=causes_padded)




[0.46650293469429016, 0.6964285969734192]

In [14]:
with open(file="./data/symptoms_tokeniser.json", mode="w") as f:
    f.write(symptoms_tokeniser.to_json())

with open(file="./data/causes_tokeniser.json", mode="w") as f:
    f.write(causes_tokeniser.to_json())

model.save("./data/model.h5")

In [15]:
with open(file="./data/symptoms_tokeniser.json", mode="r") as f:
    loaded_symptoms_tokeniser = tf.keras.preprocessing.text.tokenizer_from_json(
        f.read()
    )
with open(file="./data/causes_tokeniser.json", mode="r") as f:
    loaded_causes_tokeniser = tf.keras.preprocessing.text.tokenizer_from_json(
        f.read())

loaded_model = tf.keras.models.load_model("./data/model.h5")
loaded_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 16)            4272      
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 104)               3432      
                                                                 
Total params: 8,248
Trainable params: 8,248
Non-trainable params: 0
_________________________________________________________________


In [16]:
test_subjective_symptom = ["nasal"]
test_associative_symptoms = ["runny nose", "blockage in nose"]
test_investigations_done = ["blood tests"]
test_gender = ["female"]
test_age = ["13-18 years"]

test_symptoms_corpus = (
    test_subjective_symptom
    + test_associative_symptoms
    + test_investigations_done
    + test_gender
    + test_age
)
test_symptoms_corpus = "|".join(test_symptoms_corpus)
print(test_symptoms_corpus)

test_symptoms_sequences = symptoms_tokeniser.texts_to_sequences([test_symptoms_corpus])
print(test_symptoms_sequences)

test_symptoms_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_symptoms_sequences, padding="pre", maxlen=25
)
test_symptoms_padded = np.array(test_symptoms_padded)
print(test_symptoms_padded)

test_causes_probabilities = loaded_model.predict(test_symptoms_padded)[0]
# print(test_causes_probabilities)

test_causes_rankings = np.argsort(test_causes_probabilities).tolist()
# print(test_causes_rankings)

# print(causes_tokeniser.index_word)
print(
    causes_tokeniser.index_word[test_causes_rankings[-1] + 1],
    round(test_causes_probabilities[test_causes_rankings[-1]] * 100, 2),
    "%",
)
print(
    causes_tokeniser.index_word[test_causes_rankings[-2] + 1],
    round(test_causes_probabilities[test_causes_rankings[-2]] * 100, 2),
    "%",
)
print(
    causes_tokeniser.index_word[test_causes_rankings[-3] + 1],
    round(test_causes_probabilities[test_causes_rankings[-3]] * 100, 2),
    "%",
)

nasal|runny nose|blockage in nose|blood tests|female|13-18 years
[[3, 27, 11, 5, 2, 15]]
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3 27 11  5  2
  15]]
nasal|vasomotor 39.6 %
nasal|nasal polyposis 35.3 %
nasal|deviated nasal septum 24.67 %
