In [1]:
# ========== basics ==========
import re
import json
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from typing import List, Union

warnings.filterwarnings("ignore")

# ==========  text  ==========
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# ==========   saving ==========
import joblib
import pickle as pkl
import ast

# ========== Scikit-learn:   ==========
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    mean_squared_error, r2_score, roc_curve, auc, ConfusionMatrixDisplay
)

# ========== Scikit-learn:   ==========
from sklearn.preprocessing import (
    LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, PolynomialFeatures
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# ========== Scikit-learn:   ==========
from sklearn.linear_model import (
    LinearRegression, Lasso, Ridge, ElasticNet, LassoCV, HuberRegressor, LogisticRegression
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors

# ========== Scikit-learn: ==========
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# ========== TensorFlow / Keras ==========
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import (
    Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D,
    LSTM, GRU, Bidirectional, Flatten, BatchNormalization , MaxPooling1D
)
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,diagnosis,text
0,personality disorder,"Patient reported these symptoms: depression ,h..."
1,attention deficit hyperactivity disorder (adhd),"Patient reported these symptoms: depression ,d..."
2,muscle spasm,Patient reported these symptoms: abnormal invo...
3,diverticulitis,Patient reported these symptoms: sharp abdomin...
4,polycystic ovarian syndrome (pcos),"Patient reported these symptoms: hot flashes ,..."


In [3]:
train.shape

(469195, 2)

In [None]:
class TextPreprocessor:
    def __init__(self, text_column: str = "text", diagnosis_column: str = "diagnosis"):
        self.text_column = text_column
        self.diagnosis_column = diagnosis_column
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

        self.diagnosis_patterns = [
            r"the patient may have .*",
            r"based on these symptoms.*",
            r"what disease may the patient have\?",
            r"may indicate.*",
        ]

    def remove_diagnosis(self, text: str) -> str:
        for pattern in self.diagnosis_patterns:
            text = re.sub(pattern, "", text, flags=re.IGNORECASE)
        return text

    def clean_and_tokenize(self, text: str) -> list:
        text = text.lower()
        text = self.remove_diagnosis(text)
        text = re.sub(r"[^a-z\s]", "", text)  # Remove punctuation and numbers
        tokens = word_tokenize(text)
        return tokens

    def remove_stopwords_and_lemmatize(self, tokens: list) -> list:
        return [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]

    def preprocess_text(self, text: str) -> str:
        tokens = self.clean_and_tokenize(text)
        clean_tokens = self.remove_stopwords_and_lemmatize(tokens)
        return ' '.join(clean_tokens)

    def preprocess_dataframe(self, df: pd.DataFrame) -> tuple:
        # Count nulls before dropping
        null_count = df[[self.text_column, self.diagnosis_column]].isna().sum().sum()

        # Drop missing values
        df = df.dropna(subset=[self.text_column, self.diagnosis_column])

        # Count duplicates before dropping
        duplicate_count = df.duplicated().sum()

        # Drop duplicates
        df = df.drop_duplicates()

        # Apply preprocessing
        df["cleaned_text"] = df[self.text_column].apply(self.preprocess_text)

        # Return cleaned df + stats
        return df, null_count, duplicate_count, df.shape


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ziad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
preprocessor = TextPreprocessor(text_column="text", diagnosis_column="diagnosis")

In [6]:
train_cleaned, nulls, dups, final_shape = preprocessor.preprocess_dataframe(train)

print(f"Missing values removed: {nulls}")
print(f"Duplicates removed: {dups}")
print(f"Final dataset shape: {final_shape}")

Missing values removed: 0
Duplicates removed: 105332
Final dataset shape: (363863, 3)


In [7]:
label_encoder = LabelEncoder()
train_cleaned['diagnosis'] = label_encoder.fit_transform(train_cleaned['diagnosis'])

In [8]:
pkl.dump(label_encoder, open('label_encoder.pkl', 'wb'))

In [9]:
# Filter out rare classes
X = train_cleaned["cleaned_text"]
y = train_cleaned["diagnosis"]
valid_classes = y.value_counts()[y.value_counts() >= 2].index
X = X[y.isin(valid_classes)]
y = y[y.isin(valid_classes)]

In [10]:
# Perform train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("clf", LogisticRegression(max_iter=1000))
])

pipeline.fit(X_train, y_train)

In [12]:
y_pred = pipeline.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n")

# Filter target names to include only the classes present in y_val
filtered_classes = [label_encoder.classes_[i] for i in sorted(y_val.unique())]
print(classification_report(y_val, y_pred, target_names=filtered_classes))

Validation Accuracy: 0.8324767767822789

Classification Report:

                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       0.94      0.94      0.94        16
                                        abdominal hernia       0.97      0.88      0.92       102
                                         abscess of nose       0.84      0.80      0.82        59
                                     abscess of the lung       0.00      0.00      0.00         2
                                  abscess of the pharynx       0.78      0.83      0.81        65
                                    acanthosis nigricans       0.00      0.00      0.00         3
                                               acariasis       1.00      0.50      0.67         4
                                               achalasia       1.00      0.20      0.33        10
                                                    

In [13]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,diagnosis,text
0,psychotic disorder,"Patient reported these symptoms: depression ,i..."
1,neurosis,Patient reported these symptoms: anxiety and n...
2,liver disease,"Patient reported these symptoms: nausea ,diarr..."
3,bursitis,"Patient reported these symptoms: arm pain ,kne..."
4,jaw disorder,"Patient reported these symptoms: headache ,too..."


In [14]:
test_cleaned, nulls, dups, final_shape = preprocessor.preprocess_dataframe(test)

print(f"Missing values removed: {nulls}")
print(f"Duplicates removed: {dups}")
print(f"Final dataset shape: {final_shape}")

Missing values removed: 0
Duplicates removed: 487
Final dataset shape: (24208, 3)


In [15]:
test_cleaned['diagnosis'] = label_encoder.transform(test_cleaned['diagnosis'])

In [16]:
X_test = test_cleaned["cleaned_text"]
y_test = test_cleaned['diagnosis']

In [17]:
# Filter out rare classes in test
test_class_counts = y_test.value_counts()
test_valid_classes = test_class_counts[test_class_counts >= 2].index
mask = y_test.isin(test_valid_classes)
X_test_filtered = X_test[mask]
y_test_filtered = y_test[mask]

In [18]:
# Predict
y_pred = pipeline.predict(X_test_filtered)
unique_labels = sorted(y_test_filtered.unique())
test_filtered_classes = [label_encoder.classes_[i] for i in unique_labels]

print("Test Accuracy:", accuracy_score(y_test_filtered, y_pred))
print(classification_report(y_test_filtered, y_pred, labels=unique_labels, target_names=test_filtered_classes))

Test Accuracy: 0.8503646610309962
                                                 precision    recall  f1-score   support

                      abdominal aortic aneurysm       1.00      1.00      1.00        14
                               abdominal hernia       1.00      0.94      0.97        36
                                abscess of nose       0.80      0.92      0.86        26
                            abscess of the lung       0.00      0.00      0.00         2
                         abscess of the pharynx       0.80      0.97      0.88        29
                           acanthosis nigricans       1.00      0.14      0.25         7
                                      acariasis       1.00      0.50      0.67         2
                                      achalasia       1.00      0.44      0.62         9
                                           acne       0.95      0.63      0.76        59
                              actinic keratosis       0.93      0.73      0

In [19]:
pkl.dump(pipeline, open('logistic_model.pkl', 'wb'))

In [20]:
mlp_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("clf", MLPClassifier(max_iter=1000, random_state=42))
])

mlp_pipeline.fit(X_train, y_train)

In [21]:
y_pred = pipeline.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n")

# Filter target names to include only the classes present in y_val
filtered_classes = [label_encoder.classes_[i] for i in sorted(y_val.unique())]
print(classification_report(y_val, y_pred, target_names=filtered_classes))

Validation Accuracy: 0.8324767767822789

Classification Report:

                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       0.94      0.94      0.94        16
                                        abdominal hernia       0.97      0.88      0.92       102
                                         abscess of nose       0.84      0.80      0.82        59
                                     abscess of the lung       0.00      0.00      0.00         2
                                  abscess of the pharynx       0.78      0.83      0.81        65
                                    acanthosis nigricans       0.00      0.00      0.00         3
                                               acariasis       1.00      0.50      0.67         4
                                               achalasia       1.00      0.20      0.33        10
                                                    

In [22]:
# Predict
y_pred = mlp_pipeline.predict(X_test_filtered)
unique_labels = sorted(y_test_filtered.unique())
test_filtered_classes = [label_encoder.classes_[i] for i in unique_labels]

print("Test Accuracy:", accuracy_score(y_test_filtered, y_pred))
print(classification_report(y_test_filtered, y_pred, labels=unique_labels, target_names=test_filtered_classes))

Test Accuracy: 0.8608486656721366
                                                 precision    recall  f1-score   support

                      abdominal aortic aneurysm       1.00      0.93      0.96        14
                               abdominal hernia       0.95      1.00      0.97        36
                                abscess of nose       0.92      0.88      0.90        26
                            abscess of the lung       1.00      1.00      1.00         2
                         abscess of the pharynx       0.96      0.90      0.93        29
                           acanthosis nigricans       1.00      0.71      0.83         7
                                      acariasis       1.00      1.00      1.00         2
                                      achalasia       1.00      0.44      0.62         9
                                           acne       0.84      0.63      0.72        59
                              actinic keratosis       0.98      0.67      0

In [23]:
pkl.dump(mlp_pipeline, open('mlp_model.pkl', 'wb'))

In [24]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50

In [25]:
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_size, embedding_dim, input_length=max_len))

model_lstm.add(Bidirectional(LSTM(128, return_sequences=True, activation='tanh')))
model_lstm.add(Dropout(0.4))

model_lstm.add(Bidirectional(LSTM(64, activation='tanh')))
model_lstm.add(Dropout(0.4))

model_lstm.add(Dense(256, activation='relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(128, activation='relu'))

model_lstm.add(Dense(num_classes, activation='softmax'))

model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model_lstm.fit(X_train_pad, y_train_cat, epochs=10, batch_size=12, validation_split=0.1)

Epoch 1/10
[1m   59/21832[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m15:41[0m 43ms/step - accuracy: 0.0016 - loss: 6.5955

KeyboardInterrupt: 

In [None]:
# Tokenize and pad X_val
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

# Now you can use X_val_pad for prediction:
y_pred_probs = model_lstm.predict(X_val_pad)  # Get predicted probabilities

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1) # Get class with highest probability

print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n")

# Get unique labels from y_val and corresponding target names
unique_labels = sorted(y_val.unique())
filtered_classes = [label_encoder.classes_[i] for i in unique_labels]

# Generate and print the classification report
print(classification_report(y_val, y_pred, labels=unique_labels, target_names=filtered_classes))

[1m2275/2275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step
Validation Accuracy: 0.7967350079700984

Classification Report:

                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       0.64      0.56      0.60        16
                                        abdominal hernia       0.98      0.88      0.93       102
                                         abscess of nose       0.70      0.78      0.74        59
                                     abscess of the lung       0.00      0.00      0.00         2
                                  abscess of the pharynx       0.68      0.68      0.68        65
                                    acanthosis nigricans       0.00      0.00      0.00         3
                                               acariasis       0.00      0.00      0.00         4
                                               achalasia       0.50     

In [None]:
# Filter out classes with fewer than 2 samples in test set
test_class_counts = y_test.value_counts()
test_valid_classes = test_class_counts[test_class_counts >= 2].index

# Get boolean mask
mask = y_test.isin(test_valid_classes)

# Apply the mask correctly (works even if X_test is a NumPy array)
X_test_filtered = X_test[mask.to_numpy()] # This is still a Series of text data
y_test_filtered = y_test[mask]

# Tokenize and pad X_test_filtered similar to X_train
X_test_filtered_seq = tokenizer.texts_to_sequences(X_test_filtered) # Convert text to sequences
X_test_filtered_pad = pad_sequences(X_test_filtered_seq, maxlen=max_len) # Pad sequences

# Make predictions using the padded sequences
y_pred_probs = model_lstm.predict(X_test_filtered_pad) # Now use padded data

# Get the class with the highest probability for each sample
y_pred = np.argmax(y_pred_probs, axis=1) # Convert probabilities to predicted class labels

# Calculate and print metrics
print("Test Accuracy:", accuracy_score(y_test_filtered, y_pred))
print("\nTest Classification Report:\n")

# Filter target names and set labels to match the unique classes in y_test_filtered
unique_labels = sorted(y_test_filtered.unique())
test_filtered_classes = [label_encoder.classes_[i] for i in unique_labels]
print(classification_report(y_test_filtered, y_pred, labels=unique_labels, target_names=test_filtered_classes))

[1m755/755[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step
Test Accuracy: 0.8042433283606829

Test Classification Report:

                                                 precision    recall  f1-score   support

                      abdominal aortic aneurysm       0.71      0.71      0.71        14
                               abdominal hernia       0.92      0.94      0.93        36
                                abscess of nose       0.67      0.92      0.77        26
                            abscess of the lung       0.00      0.00      0.00         2
                         abscess of the pharynx       0.69      0.86      0.77        29
                           acanthosis nigricans       0.00      0.00      0.00         7
                                      acariasis       0.00      0.00      0.00         2
                                      achalasia       0.80      0.44      0.57         9
                                           acne       1.00 