In [3]:
# ========== basics ==========
import re
import json
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from typing import List, Union

warnings.filterwarnings("ignore")

# ==========  text  ==========
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# ==========   saving ==========
import joblib
import pickle as pkl
import ast

# ========== Scikit-learn:   ==========
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    mean_squared_error, r2_score, roc_curve, auc, ConfusionMatrixDisplay
)

# ========== Scikit-learn:   ==========
from sklearn.preprocessing import (
    LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# ========== Scikit-learn:   ==========
from sklearn.linear_model import (
    LinearRegression, Lasso, Ridge, ElasticNet, LassoCV, HuberRegressor, LogisticRegression
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors

# ========== Scikit-learn: ==========
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# ========== TensorFlow / Keras ==========
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import (
    Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D,
    LSTM, GRU, Bidirectional, Flatten, BatchNormalization , MaxPooling1D
)
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
df = pd.read_csv('healthcare.csv')
df.head()

Unnamed: 0,Symptoms,Disease
0,Abdominal aortic aneurysms often grow slowly w...,Abdominal aortic aneurysm
1,"The main sign of acanthosis nigricans is dark,...",Acanthosis nigricans
2,Achalasia symptoms generally appear gradually ...,Achalasia
3,The pain associated with Achilles tendinitis t...,Achilles tendinitis
4,Although it's possible to have no signs or sym...,Achilles tendon rupture


In [5]:
class TextPreprocessor:
    def __init__(self, text_column: str = "Symptoms", diagnosis_column: str = "Disease"):
        self.text_column = text_column
        self.diagnosis_column = diagnosis_column
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def remove_disease_from_symptoms(self, symptoms: str, disease: str) -> str:
        if pd.isna(symptoms) or pd.isna(disease):
            return symptoms
        pattern = re.compile(re.escape(disease), re.IGNORECASE)
        return pattern.sub("", symptoms)

    def clean_text(self, text: str) -> str:
        if pd.isna(text):
            return ""
        text = text.lower()
        text = re.sub(r"[^a-z\s]", "", text)  # Keep only letters and spaces
        text = re.sub(r"\s+", " ", text)      # Normalize whitespace
        return text.strip()

    def remove_stopwords_and_lemmatize(self, text: str) -> str:
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in self.stop_words]
        lemmatized = [self.lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(lemmatized)

    def full_preprocess(self, text: str) -> str:
        text = self.clean_text(text)
        return self.remove_stopwords_and_lemmatize(text)

    def preprocess_dataframe(self, df: pd.DataFrame) -> tuple:
        null_count = df[[self.text_column, self.diagnosis_column]].isna().sum().sum()
        df = df.dropna(subset=[self.text_column, self.diagnosis_column])
        duplicate_count = df.duplicated().sum()
        df = df.drop_duplicates()

        # Remove disease from symptoms
        df["cleaned_symptoms"] = df.apply(
            lambda row: self.remove_disease_from_symptoms(row[self.text_column], row[self.diagnosis_column]),
            axis=1
        )

        # Apply full cleaning: lowercase, remove non-alpha, stopwords, lemmatization
        df["cleaned_symptoms"] = df["cleaned_symptoms"].apply(self.full_preprocess)

        return df, null_count, duplicate_count, df.shape

In [6]:
preprocessor = TextPreprocessor(text_column="Symptoms", diagnosis_column="Disease")

df_cleaned, null_count, duplicate_count, final_shape = preprocessor.preprocess_dataframe(df)
df_cleaned.head()

Unnamed: 0,Symptoms,Disease,cleaned_symptoms
0,Abdominal aortic aneurysms often grow slowly w...,Abdominal aortic aneurysm,often grow slowly without noticeable symptom m...
1,"The main sign of acanthosis nigricans is dark,...",Acanthosis nigricans,main sign dark thick velvety skin body fold cr...
2,Achalasia symptoms generally appear gradually ...,Achalasia,symptom generally appear gradually get worse t...
3,The pain associated with Achilles tendinitis t...,Achilles tendinitis,pain associated typically begin mild ache back...
4,Although it's possible to have no signs or sym...,Achilles tendon rupture,although possible sign symptom people seek med...


In [7]:
print(f"Nulls removed: {null_count}")
print(f"Duplicates removed: {duplicate_count}")
print(f"Final shape: {final_shape}")

Nulls removed: 33
Duplicates removed: 61914
Final shape: (191904, 3)


In [8]:
X = df_cleaned["cleaned_symptoms"]
y = df_cleaned["Disease"]

X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_raw)

# Only keep test samples whose labels are known to the encoder
known_labels_mask = y_test_raw.isin(label_encoder.classes_)
X_test = X_test_raw[known_labels_mask]
y_test = y_test_raw[known_labels_mask]
y_test_encoded = label_encoder.transform(y_test)

In [10]:
pkl.dump(label_encoder, open('label_encoder.pkl', 'wb'))

In [11]:
X_train_final, X_val_raw, y_train_final_encoded, y_val_encoded = train_test_split(
    X_train_raw, y_train_encoded, test_size=0.2, random_state=42
)

In [12]:
mlp_clf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', MLPClassifier(random_state=42, max_iter=1000))  
])

mlp_clf.fit(X_train_final, y_train_final_encoded)

In [13]:
y_train_pred = mlp_clf.predict(X_train_final)
y_val_pred = mlp_clf.predict(X_val_raw)

print("Training Accuracy:", accuracy_score(y_train_final_encoded, y_train_pred))
print("Validation Accuracy:", accuracy_score(y_val_encoded, y_val_pred))
print("Training Classification Report:\n", classification_report(y_train_final_encoded, y_train_pred))

Training Accuracy: 0.8480760149163803
Validation Accuracy: 0.8158931770070021
Training Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       1.00      1.00      1.00         1
           2       1.00      0.25      0.40         4
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         1
          18       1.00      1.00      1.00         3
          19       1.00      1.00      1.00         1
          22       1.00      1.00      1.00         1
          26       0.00 

In [14]:
y_test_pred = mlp_clf.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test_encoded, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test_encoded, y_test_pred))

Test Accuracy: 0.8226591908183948
Test Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3
          56       0.00      0.00      0.00         3
          58       0.00      0.00      0.00         1
         106       0.00      0.00      0.00         4
         201       0.00      0.00      0.00         0
         248       0.00      0.00      0.00         3
         253       0.00      0.00      0.00         0
         257       0.00      0.00      0.00         2
         275       0.00      0.00      0.00         4
         301       0.00      0.00      0.00         0
         311       0.00      0.00      0.00         2
         349       0.00      0.00      0.00         0
         374       0.00      0.00      0.00         1
         379       0.00      0.00      0.00         0
         388      

In [15]:
pkl.dump(mlp_clf, open('mlp_model.pkl', 'wb'))

In [16]:
log_reg_clf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000))
])

log_reg_clf.fit(X_train_final, y_train_final_encoded)

In [17]:
y_train_pred = log_reg_clf.predict(X_train_final)
y_val_pred = log_reg_clf.predict(X_val_raw)

print("Training Accuracy:", accuracy_score(y_train_final_encoded, y_train_pred))
print("Validation Accuracy:", accuracy_score(y_val_encoded, y_val_pred))
print("Training Classification Report:\n", classification_report(y_train_final_encoded, y_train_pred))

Training Accuracy: 0.8190493250175056
Validation Accuracy: 0.8111382510991695
Training Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3
          19       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         1
          26       0.00 

In [18]:
y_test_pred = log_reg_clf.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test_encoded, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test_encoded, y_test_pred))

Test Accuracy: 0.8157361341440943
Test Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         3
          56       0.00      0.00      0.00         3
          58       0.00      0.00      0.00         1
         106       0.00      0.00      0.00         4
         248       0.00      0.00      0.00         3
         257       0.00      0.00      0.00         2
         275       0.00      0.00      0.00         4
         311       0.00      0.00      0.00         2
         374       0.00      0.00      0.00         1
         388       0.00      0.00      0.00         1
         397       0.00      0.00      0.00         4
         555       0.00      0.00      0.00         1
         558       0.00      0.00      0.00         1
         571       0.00      0.00      0.00         1
         658      

In [19]:
pkl.dump(log_reg_clf, open('log_reg_model.pkl', 'wb'))

In [20]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_final)

X_train_seq = tokenizer.texts_to_sequences(X_train_final)
X_val_seq = tokenizer.texts_to_sequences(X_val_raw)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train_final_encoded, num_classes=num_classes)
y_val_cat = to_categorical(y_val_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

vocab_size = len(tokenizer.word_index) + 1

In [21]:
pkl.dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [22]:
# Tokenization and padding (already done, but add post padding/truncating)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

embedding_dim = 128  # Increased embedding dimension

embedding_dim = 100  # smaller
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model_lstm.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))  # One layer
model_lstm.add(Dense(64, activation='relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(num_classes, activation='softmax'))

model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-5)

model_lstm.fit(X_train_pad, y_train_cat,
               epochs=10,
               batch_size=64,
               validation_data=(X_val_pad, y_val_cat),
               callbacks=[early_stop, reduce_lr])

Epoch 1/10
[1m1920/1920[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m227s[0m 115ms/step - accuracy: 0.0765 - loss: 5.0349 - val_accuracy: 0.4508 - val_loss: 2.1085 - learning_rate: 0.0010
Epoch 2/10
[1m1920/1920[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 123ms/step - accuracy: 0.4215 - loss: 2.2005 - val_accuracy: 0.6358 - val_loss: 1.3545 - learning_rate: 0.0010
Epoch 3/10
[1m1920/1920[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 125ms/step - accuracy: 0.5592 - loss: 1.5782 - val_accuracy: 0.7014 - val_loss: 1.0913 - learning_rate: 0.0010
Epoch 4/10
[1m1920/1920[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 128ms/step - accuracy: 0.6234 - loss: 1.3115 - val_accuracy: 0.7334 - val_loss: 0.9539 - learning_rate: 0.0010
Epoch 5/10
[1m1920/1920[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 127ms/step - accuracy: 0.6609 - loss: 1.1561 - val_accuracy: 0.7556 - val_loss: 0.8708 - learning_rate: 0.0010
Epoch 6/10
[1m1920/1920[0m [32m━━━━━━

<keras.src.callbacks.history.History at 0x1aede7e6260>

In [23]:
# Predict on the training set
y_train_pred_prob = model_lstm.predict(X_train_pad)
y_train_pred = np.argmax(y_train_pred_prob, axis=1)

# Only use target_names for classes present in y_train_final_encoded
unique_labels = np.unique(y_train_final_encoded)
target_names = label_encoder.inverse_transform(unique_labels)

print(classification_report(y_train_final_encoded, y_train_pred, labels=unique_labels, target_names=target_names))

[1m3839/3839[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 13ms/step
                                                                         precision    recall  f1-score   support

                                (vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         5
                                                             ACL injury       0.33      1.00      0.50         1
                                                                   AIDS       0.75      0.75      0.75         4
                                                                   ARDS       0.00      0.00      0.00         1
                                              Abdominal aortic aneurysm       0.00      0.00      0.00         1
                                             Abdominal aortic aneurysm        0.00      0.00      0.00         1
                                                      Abdominal hernia        0.00      0.00      0.00         1
                

In [24]:
# Evaluate the LSTM model on the test set
test_loss, test_accuracy = model_lstm.evaluate(X_test_pad, y_test_cat, verbose=1)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate predictions and classification report
y_test_pred_prob = model_lstm.predict(X_test_pad)
y_test_pred = np.argmax(y_test_pred_prob, axis=1)
print(classification_report(y_test_encoded, y_test_pred, labels=unique_labels, target_names=target_names))

[1m1188/1188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 13ms/step - accuracy: 0.7950 - loss: 0.5787
Test Loss: 0.5814
Test Accuracy: 0.7948
[1m1188/1188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step
                                                                         precision    recall  f1-score   support

                                (vertigo) Paroymsal  Positional Vertigo       0.00      0.00      0.00         2
                                                             ACL injury       0.00      0.00      0.00         0
                                                                   AIDS       0.00      0.00      0.00         1
                                                                   ARDS       0.00      0.00      0.00         0
                                              Abdominal aortic aneurysm       0.00      0.00      0.00         0
                                             Abdominal aortic aneurysm        0.00      0

In [25]:
pkl.dump(model_lstm, open('lstm_model.pkl', 'wb'))