<a href="https://colab.research.google.com/github/adithya-prabhu-22/Natural_language_processing_from_scratch/blob/main/disease_prediction_ANN_NLP_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import word_tokenize
import nltk

In [7]:
# Fetch the NLTK tokenizer data packages ('punkt' and 'punkt_tab') needed for tokenization.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Load the symptom-query dataset from the Colab workspace and preview it.
csv_path = "/content/synthetic_long_queries_medical_data.csv"
df = pd.read_csv(csv_path)
print(df.head())

                                       symptom_query    medical_branch
0  This 56-year-old female has been dealing with ...        Cardiology
1  This 35-year-old female has been dealing with ...  General Medicine
2  Chief complaint is sudden memory loss. The iss...         Neurology
3  Chief complaint is a high fever. The issue has...  General Medicine
4  This 26-year-old male has been dealing with ps...       Dermatology


In [5]:
def preprocess(text):
    """Normalize a raw query string and split it into word tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed (digits and hyphens vanish, so "56-year-old"
    becomes "yearold"), and the result is tokenized with ``word_tokenize``.

    Args:
        text: The raw query. Non-string values (e.g. ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        list[str]: The tokens, or an empty list for missing/non-string input.
    """
    # A missing cell reaches us as None/NaN when this is applied over a
    # DataFrame column; calling .lower() on it would raise AttributeError.
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return word_tokenize(text)

In [8]:
# Tokenize every symptom query and store the token lists in a new "tokens" column.
df["tokens"] = df["symptom_query"].apply(preprocess)

In [9]:
# Preview the first two rows to check the new "tokens" column.
df.head(2)

Unnamed: 0,symptom_query,medical_branch,tokens
0,This 56-year-old female has been dealing with ...,Cardiology,"[this, yearold, female, has, been, dealing, wi..."
1,This 35-year-old female has been dealing with ...,General Medicine,"[this, yearold, female, has, been, dealing, wi..."


In [10]:
# Train Word2Vec on the tokenized queries: 100-dimensional vectors, a context
# window of 5, min_count=1 so no token is dropped for being rare, 4 worker threads.
# NOTE(review): the embeddings are trained on every row of df, including rows
# that are later assigned to the test split — confirm this is intended.
sentences = df["tokens"].tolist()
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [11]:
def get_avg_embedding(tokens, model, vector_size=100):
    """Return the mean word vector of the tokens known to the model.

    Args:
        tokens: Iterable of word strings.
        model: Object exposing a ``wv`` mapping that supports ``word in wv``
            and ``wv[word]`` (e.g. a trained gensim Word2Vec model).
        vector_size: Length of the zero vector returned when no token is in
            the vocabulary. Only used as a fallback when ``model.wv`` does not
            expose its own ``vector_size``.

    Returns:
        numpy.ndarray: 1-D array; the element-wise mean of the matching word
        vectors, or all zeros when none of the tokens are in the vocabulary.
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not vectors:
        # Size the fallback from the model itself so it always has the same
        # length as the averaged vectors; a hard-coded 100 would break
        # np.vstack for models trained with a different dimensionality.
        dim = getattr(keyed_vectors, "vector_size", vector_size)
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

# Attach one averaged Word2Vec vector per row, computed from that row's tokens.
df["embedding"] = df["tokens"].apply(lambda x: get_avg_embedding(x, w2v_model))

In [12]:
# Preview the first row to check the new "embedding" column.
df.head(1)

Unnamed: 0,symptom_query,medical_branch,tokens,embedding
0,This 56-year-old female has been dealing with ...,Cardiology,"[this, yearold, female, has, been, dealing, wi...","[0.012486379, -0.08283161, -0.31297094, 0.0691..."


In [14]:
# Sanity check: the embedding stored under index label 0 is a 1-D vector.
df['embedding'][0].shape

(100,)

In [16]:
# Stack the per-row embedding vectors into a 2-D feature matrix (one row per query).
X = np.vstack(df["embedding"].values)
# Target labels: the medical branch of each query, still as raw (un-encoded) values.
y = df["medical_branch"].values

In [24]:
import tensorflow
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras import optimizers
from keras.layers import Dense

In [53]:
# Create an empty Keras Sequential container for the classifier.
model=keras.models.Sequential()

In [54]:
# Define the input dimension based on the embedding size
input_dim = X.shape[1]

# The output layer needs one unit per unique medical branch
num_classes = len(np.unique(y))

# Build the whole network in one expression so that re-running this cell
# recreates the model instead of appending duplicate layers to an existing one.
# An explicit Input layer replaces the `input_shape=` argument on Dense, which
# Keras 3 deprecates for Sequential models.
model = keras.models.Sequential([
    keras.Input(shape=(input_dim,)),
    Dense(units=128, activation='relu'),
    Dense(units=num_classes, activation='softmax'),
])

# Compile the model (categorical cross-entropy expects one-hot encoded targets)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

In [55]:
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified (no `stratify=y`), so class
# proportions may differ between the two splits — confirm this is acceptable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels to integers.
# The encoder is fitted on the full label array `y` rather than on `y_train`
# alone, so the class-to-index mapping always covers every medical branch.
# Otherwise `transform(y_test)` raises on a label that happens to be absent
# from the training split, and the number of encoded classes could disagree
# with the number of unique branches in `y`.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded_labels = label_encoder.transform(y_train)
y_test_encoded_labels = label_encoder.transform(y_test)

In [66]:
from tensorflow.keras.utils import to_categorical

# Convert the encoded labels to one-hot encoded vectors.
# num_classes is passed explicitly: by default to_categorical sizes its output
# from the largest label present, so a split that happens to lack the
# highest-indexed class would get fewer columns than the other split.
n_label_classes = len(label_encoder.classes_)
y_train_encoded = to_categorical(y_train_encoded_labels, num_classes=n_label_classes)
y_test_encoded = to_categorical(y_test_encoded_labels, num_classes=n_label_classes)

In [68]:
# The model was already compiled with these same settings when it was defined.
# NOTE(review): this second compile looks redundant — confirm before removing.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 10 epochs in mini-batches of 32. The held-out test split is passed
# as validation data, so the reported val_* metrics are test-set metrics.
model.fit(x_train, y_train_encoded, epochs=10, batch_size=32, validation_data=(x_test, y_test_encoded))

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.7045 - loss: 1.2162 - val_accuracy: 1.0000 - val_loss: 0.0875
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9995 - loss: 0.0587 - val_accuracy: 1.0000 - val_loss: 0.0252
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9999 - loss: 0.0200 - val_accuracy: 1.0000 - val_loss: 0.0128
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0109 - val_accuracy: 1.0000 - val_loss: 0.0082
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0066 - val_accuracy: 1.0000 - val_loss: 0.0051
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0048 - val_accuracy: 1.0000 - val_loss: 0.0037
Epoch 7/10
[1m250/250[0m 

<keras.src.callbacks.history.History at 0x789662f6bf20>

In [70]:
# Classify a single free-text query with the trained network.
custom_query = "i have read rashes near my leg and itchyness and head ache"
custom_query = preprocess(custom_query)

# Average the query's word vectors and add a leading batch axis (1 row).
custom_embedding = np.array(get_avg_embedding(custom_query, w2v_model)).reshape(1, -1)
predictions = model.predict(custom_embedding)

# The most probable class index per row ...
predicted_class_index = predictions.argmax(axis=1)

# ... mapped back to its original string label.
predicted_specialty = label_encoder.inverse_transform(predicted_class_index)

print("Predicted Medical Specialty:", predicted_specialty[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Predicted Medical Specialty: Gastroenterology
