In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf


In [2]:
data = pd.read_csv("./data/data.psv", sep="|")


In [3]:
data.describe()


Unnamed: 0,SUBJECTIVE SYMPTOM,OBJECTIVE SYMPTOM,GENDER,CAUSE
count,15,15,15,15
unique,2,8,2,8
top,vomiting,motion sickness; feeling queasy; feeling clamm...,female,motion sicknesses
freq,9,2,8,2


In [4]:
data.head()


Unnamed: 0,SUBJECTIVE SYMPTOM,OBJECTIVE SYMPTOM,GENDER,CAUSE
0,vomiting,motion sickness; feeling queasy; feeling clamm...,male,motion sicknesses
1,vomiting,motion sickness; feeling queasy; feeling clamm...,female,motion sicknesses
2,vomiting,first trimester in pregnancy; feeling nausea; ...,female,early pregnancy
3,vomiting,cramping; feeling nausea; diarrhoea,male,food poisoning
4,vomiting,cramping; feeling nausea; diarrhoea,female,food poisoning


In [5]:
data.columns = data.columns.str.lower().str.replace(" ", "_")


In [6]:
data.head()


Unnamed: 0,subjective_symptom,objective_symptom,gender,cause
0,vomiting,motion sickness; feeling queasy; feeling clamm...,male,motion sicknesses
1,vomiting,motion sickness; feeling queasy; feeling clamm...,female,motion sicknesses
2,vomiting,first trimester in pregnancy; feeling nausea; ...,female,early pregnancy
3,vomiting,cramping; feeling nausea; diarrhoea,male,food poisoning
4,vomiting,cramping; feeling nausea; diarrhoea,female,food poisoning


In [7]:
symptoms_corpus = (
    data.subjective_symptom + ";" + data.objective_symptom + ";" + data.gender
)
symptoms_corpus = list(
    map(lambda item: item.replace("; ", ";"), symptoms_corpus))
symptoms_corpus = list(
    map(lambda item: item.replace(" ;", ";"), symptoms_corpus))
symptoms_corpus


['vomiting;motion sickness;feeling queasy;feeling clammy;feeling sick to your stomach;male',
 'vomiting;motion sickness;feeling queasy;feeling clammy;feeling sick to your stomach;female',
 'vomiting;first trimester in pregnancy;feeling nausea;morning sickness;female',
 'vomiting;cramping;feeling nausea;diarrhoea;male',
 'vomiting;cramping;feeling nausea;diarrhoea;female',
 'vomiting;feeling pain in one side of head;feeling pain in both sides of head;pain that throbs or pulsates;sensitive to light and sound;sensitive to smell and touch;feeling nausea;male',
 'vomiting;feeling pain in one side of head;feeling pain in both sides of head;pain that throbs or pulsates;sensitive to light and sound;sensitive to smell and touch;feeling nausea;female',
 'vomiting;feeling nausea;male',
 'vomiting;feeling nausea;female',
 'pain;sharp shooting, searing, or stabbing pain;tingling sensations;numbness;extreme sensitivity to touch;insensitivity to heat or cold;male',
 'pain;sharp shooting, searing, or 

In [8]:
symptoms_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    split=";",
    # oov_token="<OOV>",
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    filters="",
)
symptoms_tokeniser.fit_on_texts(symptoms_corpus)
symptoms_tokeniser.index_word


{1: 'vomiting',
 2: 'female',
 3: 'male',
 4: 'feeling nausea',
 5: 'pain',
 6: 'motion sickness',
 7: 'feeling queasy',
 8: 'feeling clammy',
 9: 'feeling sick to your stomach',
 10: 'cramping',
 11: 'diarrhoea',
 12: 'feeling pain in one side of head',
 13: 'feeling pain in both sides of head',
 14: 'pain that throbs or pulsates',
 15: 'sensitive to light and sound',
 16: 'sensitive to smell and touch',
 17: 'sharp shooting, searing, or stabbing pain',
 18: 'tingling sensations',
 19: 'numbness',
 20: 'extreme sensitivity to touch',
 21: 'insensitivity to heat or cold',
 22: 'shooting, burning, or stabbing pain',
 23: 'difficulty sleeping or resting',
 24: 'a chronic sensation of feeling unpleasant or abnormal',
 25: 'emotional problems as a result of chronic pain',
 26: 'spontaneous pain, or pain that occurs without a trigger',
 27: 'loss of sleep',
 28: 'difficulty expressing how you are feeling',
 29: 'neuromatrix of pain',
 30: 'anxiety',
 31: 'depression',
 32: 'fatigue',
 33: '

In [9]:
symptoms_sequences = symptoms_tokeniser.texts_to_sequences(symptoms_corpus)
symptoms_sequences


[[1, 6, 7, 8, 9, 3],
 [1, 6, 7, 8, 9, 2],
 [1, 36, 4, 37, 2],
 [1, 10, 4, 11, 3],
 [1, 10, 4, 11, 2],
 [1, 12, 13, 14, 15, 16, 4, 3],
 [1, 12, 13, 14, 15, 16, 4, 2],
 [1, 4, 3],
 [1, 4, 2],
 [5, 17, 18, 19, 20, 21, 3],
 [5, 17, 18, 19, 20, 21, 2],
 [5, 22, 23, 24, 25, 26, 27, 28, 3],
 [5, 22, 23, 24, 25, 26, 27, 28, 2],
 [5, 29, 30, 31, 32, 33, 34, 35, 3],
 [5, 29, 30, 31, 32, 33, 34, 35, 2]]

In [10]:
symptoms_padded = tf.keras.preprocessing.sequence.pad_sequences(
    symptoms_sequences, padding="pre"
)
symptoms_padded = np.array(symptoms_padded)
symptoms_padded


array([[ 0,  0,  0,  1,  6,  7,  8,  9,  3],
       [ 0,  0,  0,  1,  6,  7,  8,  9,  2],
       [ 0,  0,  0,  0,  1, 36,  4, 37,  2],
       [ 0,  0,  0,  0,  1, 10,  4, 11,  3],
       [ 0,  0,  0,  0,  1, 10,  4, 11,  2],
       [ 0,  1, 12, 13, 14, 15, 16,  4,  3],
       [ 0,  1, 12, 13, 14, 15, 16,  4,  2],
       [ 0,  0,  0,  0,  0,  0,  1,  4,  3],
       [ 0,  0,  0,  0,  0,  0,  1,  4,  2],
       [ 0,  0,  5, 17, 18, 19, 20, 21,  3],
       [ 0,  0,  5, 17, 18, 19, 20, 21,  2],
       [ 5, 22, 23, 24, 25, 26, 27, 28,  3],
       [ 5, 22, 23, 24, 25, 26, 27, 28,  2],
       [ 5, 29, 30, 31, 32, 33, 34, 35,  3],
       [ 5, 29, 30, 31, 32, 33, 34, 35,  2]], dtype=int32)

In [11]:
causes_corpus = data.subjective_symptom + "|" + data.cause
causes_corpus


0     vomiting|motion sicknesses
1     vomiting|motion sicknesses
2       vomiting|early pregnancy
3        vomiting|food poisoning
4        vomiting|food poisoning
5              vomiting|migraine
6              vomiting|migraine
7         vomiting|miscellaneous
8         vomiting|miscellaneous
9          pain|nociceptive pain
10         pain|nociceptive pain
11         pain|neuropathic pain
12         pain|neuropathic pain
13    pain|central sensitisation
14    pain|central sensitisation
dtype: object

In [12]:
causes_tokeniser = tf.keras.preprocessing.text.Tokenizer(
    split="\n",
    # oov_token="<OOV>",
    # filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    filters="",
)
causes_tokeniser.fit_on_texts(causes_corpus)
causes_tokeniser.index_word


{1: 'vomiting|motion sicknesses',
 2: 'vomiting|food poisoning',
 3: 'vomiting|migraine',
 4: 'vomiting|miscellaneous',
 5: 'pain|nociceptive pain',
 6: 'pain|neuropathic pain',
 7: 'pain|central sensitisation',
 8: 'vomiting|early pregnancy'}

In [13]:
causes_sequences = causes_tokeniser.texts_to_sequences(causes_corpus)
causes_sequences


[[1], [1], [8], [2], [2], [3], [3], [4], [4], [5], [5], [6], [6], [7], [7]]

In [14]:
causes_padded = tf.keras.preprocessing.sequence.pad_sequences(
    causes_sequences, padding="pre"
)
causes_padded = np.array(causes_padded) - 1
causes_padded


array([[0],
       [0],
       [7],
       [1],
       [1],
       [2],
       [2],
       [3],
       [3],
       [4],
       [4],
       [5],
       [5],
       [6],
       [6]], dtype=int32)

In [15]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=len(symptoms_tokeniser.index_word) + 1,
            output_dim=16,
            input_length=max(map(len, symptoms_padded)),
        ),
        # tf.keras.layers.LSTM(8),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GlobalAveragePooling1D(),
        # tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(32, activation="relu"),
        # tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(8, activation="softmax"),
    ]
)

model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

model.summary()


Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9, 16)             608       
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 8)                 264       
                                                                 
Total params: 1,416
Trainable params: 1,416
Non-trainable params: 0
_________________________________________________________________


2022-12-07 13:30:51.727717: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-07 13:30:51.727894: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [16]:
tf.keras.backend.clear_session()
tf.random.set_seed(42)
model.fit(
    x=symptoms_padded,
    y=causes_padded,
    epochs=500,
    callbacks=[tf.keras.callbacks.TensorBoard(log_dir="logs")],
)


Epoch 1/500


2022-12-07 13:30:51.950450: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-07 13:30:52.249854: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 7

<keras.callbacks.History at 0x29bab29a0>

In [17]:
model.evaluate(x=symptoms_padded, y=causes_padded)




2022-12-07 13:31:01.525094: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[0.028227563947439194, 1.0]

In [18]:
test_symptoms = ["vomiting; cramping; feeling nausea; diarrhoea"]
test_gender = ["female"]

test_symptoms_corpus = [
    a + ";" + b for a in test_symptoms for b in test_gender]
test_symptoms_corpus = test_symptoms_corpus[0].split(";")
test_symptoms_corpus = [str(item).lower().strip()
                        for item in test_symptoms_corpus]
test_symptoms_corpus = ";".join(test_symptoms_corpus)
print(test_symptoms_corpus)

test_symptoms_sequences = symptoms_tokeniser.texts_to_sequences(
    [test_symptoms_corpus])
print(test_symptoms_sequences)

test_symptoms_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_symptoms_sequences, padding="pre", maxlen=max(map(len, symptoms_padded))
)
test_symptoms_padded = np.array(test_symptoms_padded)
print(test_symptoms_padded)

test_causes_probabilities = model.predict(test_symptoms_padded)[0]
print(test_causes_probabilities)

test_causes_rankings = np.argsort(test_causes_probabilities).tolist()
print(test_causes_rankings)

print(causes_tokeniser.index_word)
print(
    causes_tokeniser.index_word[test_causes_rankings[-1] + 1],
    round(test_causes_probabilities[test_causes_rankings[-1]] * 100, 2),
)
print(
    causes_tokeniser.index_word[test_causes_rankings[-2] + 1],
    round(test_causes_probabilities[test_causes_rankings[-2]] * 100, 2),
)
print(
    causes_tokeniser.index_word[test_causes_rankings[-3] + 1],
    round(test_causes_probabilities[test_causes_rankings[-3]] * 100, 2),
)


vomiting;cramping;feeling nausea;diarrhoea;female
[[1, 10, 4, 11, 2]]
[[ 0  0  0  0  1 10  4 11  2]]
[1.9055639e-03 9.4886553e-01 9.9270073e-05 4.7563728e-02 5.1261450e-04
 8.7122899e-04 1.8821189e-05 1.6331261e-04]
[6, 2, 7, 4, 5, 0, 3, 1]
{1: 'vomiting|motion sicknesses', 2: 'vomiting|food poisoning', 3: 'vomiting|migraine', 4: 'vomiting|miscellaneous', 5: 'pain|nociceptive pain', 6: 'pain|neuropathic pain', 7: 'pain|central sensitisation', 8: 'vomiting|early pregnancy'}
vomiting|food poisoning 94.89
vomiting|miscellaneous 4.76
vomiting|motion sicknesses 0.19


2022-12-07 13:31:01.681066: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [19]:
with open(file="./data/symptoms_tokeniser.json", mode="w") as f:
    f.write(symptoms_tokeniser.to_json())

with open(file="./data/causes_tokeniser.json", mode="w") as f:
    f.write(causes_tokeniser.to_json())

model.save("./data/model.h5")


In [20]:
with open(file="./data/symptoms_tokeniser.json", mode="r") as f:
    loaded_symptoms_tokeniser = tf.keras.preprocessing.text.tokenizer_from_json(
        f.read()
    )
with open(file="./data/causes_tokeniser.json", mode="r") as f:
    loaded_causes_tokeniser = tf.keras.preprocessing.text.tokenizer_from_json(
        f.read())

loaded_model = tf.keras.models.load_model("./data/model.h5")
loaded_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9, 16)             608       
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 8)                 264       
                                                                 
Total params: 1,416
Trainable params: 1,416
Non-trainable params: 0
_________________________________________________________________


In [21]:
test_symptoms2 = [
    "vomiting; motion sickness; feeling queasy; feeling clammy; feeling sick to your stomach"
]
test_gender2 = ["male"]

test_symptoms_corpus2 = [
    a + ";" + b for a in test_symptoms2 for b in test_gender2]
test_symptoms_corpus2 = test_symptoms_corpus2[0].split(";")
test_symptoms_corpus2 = [str(item).lower().strip()
                         for item in test_symptoms_corpus2]
test_symptoms_corpus2 = ";".join(test_symptoms_corpus2)
print(test_symptoms_corpus2)

test_symptoms_sequences2 = loaded_symptoms_tokeniser.texts_to_sequences(
    [test_symptoms_corpus2]
)
print(test_symptoms_sequences2)

test_symptoms_padded2 = tf.keras.preprocessing.sequence.pad_sequences(
    test_symptoms_sequences2, padding="pre", maxlen=max(map(len, symptoms_padded))
)
test_symptoms_padded2 = np.array(test_symptoms_padded2)
print(test_symptoms_padded2)

test_causes_probabilities2 = loaded_model.predict(test_symptoms_padded2)[0]
print(test_causes_probabilities2)

test_causes_rankings2 = np.argsort(test_causes_probabilities2).tolist()
print(test_causes_rankings2)

print(loaded_causes_tokeniser.index_word)
print(
    loaded_causes_tokeniser.index_word[test_causes_rankings2[-1] + 1],
    round(test_causes_probabilities2[test_causes_rankings2[-1]] * 100, 2),
)
print(
    loaded_causes_tokeniser.index_word[test_causes_rankings2[-2] + 1],
    round(test_causes_probabilities2[test_causes_rankings2[-2]] * 100, 2),
)
print(
    loaded_causes_tokeniser.index_word[test_causes_rankings2[-3] + 1],
    round(test_causes_probabilities2[test_causes_rankings2[-3]] * 100, 2),
)


vomiting;motion sickness;feeling queasy;feeling clammy;feeling sick to your stomach;male
[[1, 6, 7, 8, 9, 3]]
[[0 0 0 1 6 7 8 9 3]]
[9.8055696e-01 4.0651774e-03 3.4805629e-03 2.5458355e-03 1.7725195e-05
 5.1258743e-04 1.5792497e-04 8.6631486e-03]
[4, 6, 5, 3, 2, 1, 7, 0]
{1: 'vomiting|motion sicknesses', 2: 'vomiting|food poisoning', 3: 'vomiting|migraine', 4: 'vomiting|miscellaneous', 5: 'pain|nociceptive pain', 6: 'pain|neuropathic pain', 7: 'pain|central sensitisation', 8: 'vomiting|early pregnancy'}
vomiting|motion sicknesses 98.06
vomiting|early pregnancy 0.87
vomiting|food poisoning 0.41


2022-12-07 13:31:01.941518: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [22]:
test_symptoms3 = [
    "vomiting; motion sickness; feeling queasy; feeling clammy; feeling sick to your stomach"
]
test_gender3 = ["male"]
test_symptoms_corpus3 = [
    a + ";" + b for a in test_symptoms3 for b in test_gender3]
test_symptoms_corpus3 = test_symptoms_corpus3[0].split(";")
test_symptoms_corpus3 = [str(item).lower().strip()
                         for item in test_symptoms_corpus3]
test_symptoms_corpus3 = ";".join(test_symptoms_corpus3)
print(test_symptoms_corpus3)

test_symptoms_sequences3 = loaded_symptoms_tokeniser.texts_to_sequences(
    [test_symptoms_corpus3]
)
print(test_symptoms_sequences3)

test_symptoms_padded3 = tf.keras.preprocessing.sequence.pad_sequences(
    test_symptoms_sequences3, padding="pre", maxlen=max(map(len, symptoms_padded))
)
test_symptoms_padded3 = np.array(test_symptoms_padded3)
print(test_symptoms_padded3)

test_causes_probabilities3 = loaded_model.predict(test_symptoms_padded3)[0]
print(test_causes_probabilities3)

test_causes_rankings3 = np.argsort(test_causes_probabilities3).tolist()
print(test_causes_rankings3)

print(
    loaded_causes_tokeniser.index_word[test_causes_rankings3[-1] + 1],
    round(test_causes_probabilities3[test_causes_rankings3[-1]] / 1, 4) * 100,
)
print(
    loaded_causes_tokeniser.index_word[test_causes_rankings3[-2] + 1],
    round(test_causes_probabilities3[test_causes_rankings3[-2]] / 1, 4) * 100,
)
print(
    loaded_causes_tokeniser.index_word[test_causes_rankings3[-3] + 1],
    round(test_causes_probabilities3[test_causes_rankings3[-3]] / 1, 4) * 100,
)


vomiting;motion sickness;feeling queasy;feeling clammy;feeling sick to your stomach;male
[[1, 6, 7, 8, 9, 3]]
[[0 0 0 1 6 7 8 9 3]]
[9.8055696e-01 4.0651774e-03 3.4805629e-03 2.5458355e-03 1.7725195e-05
 5.1258743e-04 1.5792497e-04 8.6631486e-03]
[4, 6, 5, 3, 2, 1, 7, 0]
vomiting|motion sicknesses 98.06
vomiting|early pregnancy 0.8699999999999999
vomiting|food poisoning 0.41000000000000003
