In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# read tab delimited data
data = pd.read_csv("./data/data_processed.txt", sep="\t")
# remove all na rows
data = data.dropna(axis=0, how="all")
# remove all na columns
data = data.dropna(axis=1, how="all")
# strip strings
data = data.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)
data.head()

Unnamed: 0.1,Unnamed: 0,subjective_symptom,associated_symptoms,investigations_done,gender,age,provisional_diagnosis,advised_investigations,advised_investigations.1,advised_investigations.2,management,management.1,management.2,surgical_management
0,0,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,upto 4 weeks,allergy,antihistamine,nasal spray - steroid,,,,,
1,1,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,1-12 months,allergy,antihistamine,nasal spray - steroid,,,,,
2,2,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,1 year,allergy,antihistamine,nasal spray - steroid,,,,,
3,3,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,2 years,allergy,antihistamine,nasal spray - steroid,,,,,
4,4,nasal,sneezing|asthma|stress|enhancing factors|itchy...,blood tests|blood ige,female,3 years,allergy,antihistamine,nasal spray - steroid,,,,,


In [3]:
symptoms_corpus = (
    data.subjective_symptom.replace(np.nan, "")
    + "|"
    + data.associated_symptoms.replace(np.nan, "")
    + "|"
    + data.investigations_done.replace(np.nan, "")
    + "|"
    + data.gender.replace(np.nan, "")
    + "|"
    + data.age.replace(np.nan, "")
).dropna()

symptoms_corpus

0        nasal|sneezing|asthma|stress|enhancing factors...
1        nasal|sneezing|asthma|stress|enhancing factors...
2        nasal|sneezing|asthma|stress|enhancing factors...
3        nasal|sneezing|asthma|stress|enhancing factors...
4        nasal|sneezing|asthma|stress|enhancing factors...
                               ...                        
23161    chest pain|dull ache chest wall|pain in chest ...
23162    chest pain|dull ache chest wall|pain in chest ...
23163    chest pain|dull ache chest wall|pain in chest ...
23164    chest pain|dull ache chest wall|pain in chest ...
23165    chest pain|dull ache chest wall|pain in chest ...
Length: 23166, dtype: object

In [4]:
symptoms_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    split="|",
    # oov_token="<OOV>",
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    filters="",
)
symptoms_tokeniser.fit_on_texts(symptoms_corpus)

symptoms_tokeniser.index_word

{1: 'male',
 2: 'female',
 3: 'nasal',
 4: 'throat and mouth',
 5: 'blood tests',
 6: 'anus',
 7: 'abdomen',
 8: 'ct scan pns',
 9: 'vomiting',
 10: 'fever',
 11: 'blockage in nose',
 12: 'ear',
 13: 'bleeding',
 14: 'pain in abdomen',
 15: 'blood ige',
 16: 'runny nose',
 17: 'anxiety',
 18: 'discharge',
 19: 'painless',
 20: 'stress',
 21: 'persistent',
 22: 'pain',
 23: 'hard stool',
 24: 'musculoskeletal',
 25: 'ultrasound',
 26: 'unable to smell',
 27: 'headache',
 28: 'leakage',
 29: 'oral burning sensation',
 30: 'per rectal exam',
 31: 'severe',
 32: 'sneezing',
 33: 'scrotum',
 34: 'sticky stools',
 35: 'chest pain',
 36: 'swelling',
 37: 'color doppler',
 38: 'pain in rectum',
 39: 'local tenderness',
 40: 'reducing factors',
 41: 'bleeding profile',
 42: 'bad smell',
 43: 'increases on bending',
 44: 'while passing stool',
 45: 'cracked skin',
 46: 'itching',
 47: 'bloating',
 48: 'mucus',
 49: 'loose stools',
 50: 'pain in right lower abdomen',
 51: 'painful swallowing',
 5

In [5]:
symptoms_sequences = symptoms_tokeniser.texts_to_sequences(symptoms_corpus)

symptoms_sequences

[[3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 215],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 216],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 155],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 156],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 157],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 158],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 159],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 160],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 161],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 162],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 163],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 164],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 165],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 166],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 167],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 168],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 169],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 170],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 171],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2, 172],
 [3, 32, 217, 20, 63, 218, 219, 5, 15, 2

In [6]:
symptoms_padded = tf.keras.preprocessing.sequence.pad_sequences(
    symptoms_sequences, padding="pre", maxlen=25
)
symptoms_padded = np.array(symptoms_padded)

symptoms_padded

array([[  0,   0,   0, ...,  15,   2, 215],
       [  0,   0,   0, ...,  15,   2, 216],
       [  0,   0,   0, ...,  15,   2, 155],
       ...,
       [  0,   0,   0, ..., 336,   1, 152],
       [  0,   0,   0, ..., 336,   1, 153],
       [  0,   0,   0, ..., 336,   1, 154]], dtype=int32)

In [7]:
causes_corpus = (
    data.subjective_symptom.replace(np.nan, "")
    + "|"
    + data.provisional_diagnosis.replace(np.nan, "")
).dropna()

causes_corpus

0                        nasal|allergy
1                        nasal|allergy
2                        nasal|allergy
3                        nasal|allergy
4                        nasal|allergy
                     ...              
23161    chest pain|chest wall myalgia
23162    chest pain|chest wall myalgia
23163    chest pain|chest wall myalgia
23164    chest pain|chest wall myalgia
23165    chest pain|chest wall myalgia
Length: 23166, dtype: object

In [8]:
causes_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    split="\n",
    # oov_token="<OOV>",
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    filters="",
)
causes_tokeniser.fit_on_texts(causes_corpus)

causes_tokeniser.index_word

{1: 'nasal|nasal polyposis',
 2: 'nasal|allergy',
 3: 'anus|incontinence',
 4: 'nasal|vasomotor',
 5: 'nasal|deviated nasal septum',
 6: 'nasal|mass',
 7: 'nasal|cancer',
 8: 'nasal|sinusitis',
 9: 'anus|piles/haemorrhoids',
 10: 'throat and mouth|oral submucous fibrosis osf',
 11: 'nasal|nose pricking',
 12: 'nasal|hypertension',
 13: 'nasal|trauma',
 14: 'nasal|infection',
 15: 'nasal|rhinitis',
 16: 'nasal|csf leak',
 17: 'nasal|fracture',
 18: 'nasal|fungal sinusitis',
 19: 'anus|anal fissure',
 20: 'anus|prolapsed',
 21: 'anus|grade 1 and 2',
 22: 'anus|cancer',
 23: 'anus|proctalgia',
 24: 'anus|fistula',
 25: 'anus|std',
 26: 'anus|irritable bowel syndrome',
 27: 'anus|colitis',
 28: 'anus|dermatitis',
 29: 'anus|thread worms',
 30: 'anus|diaper rash',
 31: 'abdomen|gerd',
 32: 'abdomen|hyperacidity',
 33: 'abdomen|gastroenteritis',
 34: 'abdomen|appendicitis',
 35: 'abdomen|typhlitis',
 36: 'abdomen|right ureteric stone',
 37: 'abdomen|intestinal obstruction',
 38: 'abdomen|cho

In [9]:
causes_sequences = causes_tokeniser.texts_to_sequences(causes_corpus)

causes_sequences

[[2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2]

In [10]:
causes_padded = tf.keras.preprocessing.sequence.pad_sequences(
    causes_sequences, padding="pre"
)
causes_padded = np.array(causes_padded) - 1

causes_padded

array([[ 1],
       [ 1],
       [ 1],
       ...,
       [88],
       [88],
       [88]], dtype=int32)

In [11]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=len(symptoms_tokeniser.index_word) + 1,
            output_dim=16,
            # input_length=max(map(len, symptoms_padded)),
            input_length=25,
        ),
        # tf.keras.layers.LSTM(8),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        # tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(32, activation="relu"),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(len(causes_tokeniser.index_word), activation="softmax"),
    ]
)
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

model.summary()

Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 16)            5744      
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 104)               3432      
                                                                 
Total params: 9,720
Trainable params: 9,720
Non-trainable params: 0
_________________________________________________________________


In [12]:
tf.keras.backend.clear_session()
tf.random.set_seed(42)
model.fit(
    x=symptoms_padded,
    y=causes_padded,
    epochs=50,
    callbacks=[tf.keras.callbacks.TensorBoard(log_dir="logs")],
)

Epoch 1/50


2023-05-21 18:07:46.417403: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2a9582aa0>

In [13]:
model.evaluate(x=symptoms_padded, y=causes_padded)



[0.4599478840827942, 0.6976603865623474]

In [14]:
with open(file="./data/symptoms_tokeniser.json", mode="w") as f:
    f.write(symptoms_tokeniser.to_json())

with open(file="./data/causes_tokeniser.json", mode="w") as f:
    f.write(causes_tokeniser.to_json())

model.save("./data/model.h5")

In [15]:
with open(file="./data/symptoms_tokeniser.json", mode="r") as f:
    loaded_symptoms_tokeniser = tf.keras.preprocessing.text.tokenizer_from_json(
        f.read()
    )
with open(file="./data/causes_tokeniser.json", mode="r") as f:
    loaded_causes_tokeniser = tf.keras.preprocessing.text.tokenizer_from_json(f.read())

loaded_model = tf.keras.models.load_model("./data/model.h5")
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 16)            5744      
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 104)               3432      
                                                                 
Total params: 9,720
Trainable params: 9,720
Non-trainable params: 0
_________________________________________________________________


In [16]:
test_subjective_symptom = ["nasal"]
test_associative_symptoms = ["runny nose", "blockage in nose"]
test_investigations_done = ["blood tests"]
test_gender = ["female"]
test_age = ["13-18 years"]

test_symptoms_corpus = (
    test_subjective_symptom
    + test_associative_symptoms
    + test_investigations_done
    + test_gender
    + test_age
)
test_symptoms_corpus = "|".join(test_symptoms_corpus)
print(test_symptoms_corpus)

test_symptoms_sequences = symptoms_tokeniser.texts_to_sequences([test_symptoms_corpus])
print(test_symptoms_sequences)

test_symptoms_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_symptoms_sequences, padding="pre", maxlen=25
)
test_symptoms_padded = np.array(test_symptoms_padded)
print(test_symptoms_padded)

test_causes_probabilities = loaded_model.predict(test_symptoms_padded)[0]
# print(test_causes_probabilities)

test_causes_rankings = np.argsort(test_causes_probabilities).tolist()
# print(test_causes_rankings)

# print(causes_tokeniser.index_word)
print(
    causes_tokeniser.index_word[test_causes_rankings[-1] + 1],
    round(test_causes_probabilities[test_causes_rankings[-1]] * 100, 2),
    "%",
)
print(
    causes_tokeniser.index_word[test_causes_rankings[-2] + 1],
    round(test_causes_probabilities[test_causes_rankings[-2]] * 100, 2),
    "%",
)
print(
    causes_tokeniser.index_word[test_causes_rankings[-3] + 1],
    round(test_causes_probabilities[test_causes_rankings[-3]] * 100, 2),
    "%",
)

nasal|runny nose|blockage in nose|blood tests|female|13-18 years
[[3, 16, 11, 5, 2]]
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3 16 11  5
   2]]
nasal|deviated nasal septum 34.52 %
nasal|vasomotor 32.6 %
nasal|nasal polyposis 32.54 %
