In [1]:
import pickle

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the raw dataset from disk.
df = pd.read_csv('data_notp.csv')

# Replace the Disease_ID column with integer class codes. The fitted encoder
# is kept so predicted codes can be mapped back to the original labels later.
label_encoder = LabelEncoder()
encoded_disease_ids = label_encoder.fit_transform(df['Disease_ID'])
df['Disease_ID'] = encoded_disease_ids

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Build the vocabulary from the training descriptions only.
# An explicit OOV token is reserved so that words never seen during training
# (in the test split or in new queries) map to a dedicated index instead of
# being silently dropped from the sequence.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['Patient_Description'])

# Index 0 is reserved for padding, so the embedding needs one extra row
# on top of the entries in word_index (which already includes the OOV token).
total_words = len(tokenizer.word_index) + 1

In [5]:
# Turn each description into a list of vocabulary indices, applying the same
# fitted tokenizer to the training split first and the test split second.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['Patient_Description'])
    for split in (train_data, test_data)
)

In [6]:
# Pad the training sequences, then pad/truncate the test sequences to the
# resulting training width so both arrays share the same second dimension.
train_padded_sequences = pad_sequences(train_sequences)
max_sequence_length = train_padded_sequences.shape[1]
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Convert labels to one-hot encoding. The class count comes from the full
# frame so the train and test label matrices have the same number of columns.
num_classes = len(set(df['Disease_ID']))
train_labels = tf.keras.utils.to_categorical(train_data['Disease_ID'], num_classes=num_classes)
test_labels = tf.keras.utils.to_categorical(test_data['Disease_ID'], num_classes=num_classes)

In [7]:
# Display the padded sequence length (second dimension of the training array);
# the same value is used as maxlen for the test data and for new queries.
train_padded_sequences.shape[1]

78

In [8]:
# Text classifier: Embedding -> LSTM -> two hidden dense layers -> softmax.
# One output unit per distinct encoded Disease_ID.
num_classes = len(set(df['Disease_ID']))

model = Sequential()
model.add(Embedding(total_words, 64, input_length=train_padded_sequences.shape[1]))
model.add(LSTM(100))
# The hidden layers need a non-linearity: Dense defaults to a linear
# activation, and two stacked linear layers collapse into a single affine map,
# adding parameters without adding capacity.
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model. categorical_crossentropy matches the one-hot label
# matrices produced by to_categorical.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [15]:
# Train for five epochs; loss and accuracy on the held-out split are reported
# after each epoch via validation_data.
model.fit(
    x=train_padded_sequences,
    y=train_labels,
    epochs=5,
    verbose=1,
    validation_data=(test_padded_sequences, test_labels),
)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12585583590>

In [16]:
# Two free-text sample descriptions used to smoke-test the trained model.
new_queries = [
    "I have Sneezing, Runny recently. The Runny is a bit troublesome. It's a bit inconvenient but not too disruptive to my daily activities. I've had similar issues in the past but never this persistent..",
    "I have Chest, Wheezing, Shortness in the last few days. The Wheezing is a bit bothersome. It's a bit inconvenient but not too disruptive to my daily activities. I haven't seen a doctor yet, but I'm keeping an eye on the symptoms..",
]

In [17]:
# Tokenize and pad new queries to the same width the model was trained on
new_sequences = tokenizer.texts_to_sequences(new_queries)
new_padded_sequences = pad_sequences(new_sequences, maxlen=train_padded_sequences.shape[1])

# Make predictions: one row of softmax scores per query
predictions = model.predict(new_padded_sequences)

# Pick the highest-scoring class for every row in a single vectorized call,
# then map the encoded indices back to the original Disease_ID labels through
# the encoder's public API instead of indexing classes_ row by row.
predicted_indices = predictions.argmax(axis=1)
predicted_classes = label_encoder.inverse_transform(predicted_indices).tolist()
print("Predicted Classes:", predicted_classes)

Predicted Classes: [18, 28]


In [18]:
# Write the trained model to disk in the native .keras format.
model.save(filepath="disease_notp.keras")

In [19]:
# Persist the fitted tokenizer next to the saved model so the same
# word-to-index mapping can be reloaded later.
# `pickle` is imported in the notebook's top import cell rather than here, so
# later cells do not depend on this cell having been run first.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# artifact from a trusted location.
with open('disease_tokenizer_notp.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
# Persist the fitted label encoder so predicted class indices can be decoded
# back to Disease_ID values outside this notebook.
with open('disease_label_encoder_notp.pickle', 'wb') as encoder_file:
    pickle.dump(
        label_encoder,
        encoder_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )