In [1]:


import pandas as pd
import spacy
import scispacy

# Install the en_core_sci_sm model if not already installed

import en_core_sci_sm
import re

# Load SciSpacy model
nlp = en_core_sci_sm.load()

# Load dataset (replace with your file path if needed)
df = pd.read_csv('Symptom2Disease.csv')

# Display the first few rows to identify the relevant text column
print(df.head())

# Replace with your actual text column name
TEXT_COLUMN = 'text'  # Change this if necessary

# Example synonym dictionary for normalization
synonym_dict = {

    "cephalalgia": "headache",
    "dyspnea": "shortness of breath",
    "pyrexia": "fever",
    "tussis": "cough",
    "douloure": "douleur",
    "painful": "pain",
}

# Stopwords list (extend as needed)
stopwords = {"the", "and", "of", "a", "an"}

# Function to clean and preprocess text
def preprocess_text(text):
    if pd.isna(text):
        return []

    # Lowercase and remove non-alphabetic characters
    text = re.sub(r'[^a-zA-ZÀ-ÿ\s]', '', text.lower())

    # Tokenization and lemmatization
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stopwords and not token.is_space]

    # Normalization using synonym dictionary
    normalized = [synonym_dict.get(token, token) for token in tokens]

    return normalized

# Apply preprocessing
df['processed_symptoms'] = df[TEXT_COLUMN].apply(preprocess_text)

# Display processed results
print(df[['processed_symptoms']].head())

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


   Unnamed: 0      label                                               text
0           0  Psoriasis  I have been experiencing a skin rash on my arm...
1           1  Psoriasis  My skin has been peeling, especially on my kne...
2           2  Psoriasis  I have been experiencing joint pain in my fing...
3           3  Psoriasis  There is a silver like dusting on my skin, esp...
4           4  Psoriasis  My nails have small dents or pits in them, and...
                                  processed_symptoms
0  [I, have, be, experience, skin, rash, on, my, ...
1  [my, skin, have, be, peel, especially, on, my,...
2  [I, have, be, experience, joint, pain, in, my,...
3  [there, be, silver, like, dust, on, my, skin, ...
4  [my, nail, have, small, dent, or, pit, in, the...


In [2]:
def extract_entities(text):
    if pd.isna(text):
        return []
    doc = nlp(text)
    return [ent.text for ent in doc.ents]

df['entities'] = df[TEXT_COLUMN].apply(extract_entities)
print(df[['entities']].head())


                                            entities
0  [skin rash, arms, legs, torso, weeks, itchy, d...
1  [My, skin, peeling, knees, elbows, scalp, peel...
2  [joint pain, fingers, wrists, knees, pain, ach...
3  [silver, skin, scalp, dusting, small scales, f...
4  [My, nails, small dents, pits, inflammatory, t...


In [1]:

import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re
from collections import defaultdict

import scispacy
import en_core_sci_md

# Load the en_core_sci_md model
nlp = en_core_sci_md.load()

# Medical synonym dictionary (extend as needed)
MEDICAL_SYNONYMS = {
    "cephalalgia": "headache",
    "pruritus": "itch",
    "pyrexia": "fever",
    "dyspnea": "shortness of breath",
    "arthralgia": "joint pain",
    "myalgia": "muscle pain",
    "nausea": "nausea",
    "vertigo": "dizziness",
    "rhinorrhea": "runny nose",
    "tachycardia": "rapid heart",
    "epistaxis": "nosebleed",
    "odynophagia": "painful swallowing",
    "paresthesia": "tingling",
    "syncope": "fainting",
    "erythema": "redness",
    "edema": "swelling"
}

def clean_medical_text(text):
    """Comprehensive medical text cleaning pipeline"""
    if not isinstance(text, str):
        return ""

    # Initial cleaning
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers

    # Process with spaCy
    doc = nlp(text)

    # Lemmatization with medical context preservation
    cleaned_tokens = []
    for token in doc:
        # Skip stopwords unless they're medically relevant
        if token.text in STOP_WORDS and token.text not in ['left', 'right', 'upper', 'lower']:
            continue

        # Lemmatize
        lemma = token.lemma_.lower()

        # Apply medical synonym normalization
        lemma = MEDICAL_SYNONYMS.get(lemma, lemma)

        # Keep only meaningful tokens
        if len(lemma) > 2 and lemma not in STOP_WORDS:
            cleaned_tokens.append(lemma)

    # Join back to string
    return ' '.join(cleaned_tokens)

# Load the dataset
df = pd.read_csv('Symptom2Disease.csv')

# Apply cleaning
df['cleaned_text'] = df['text'].apply(clean_medical_text)

# Verify cleaning
print("Original text:", df['text'].iloc[0])
print("Cleaned text:", df['cleaned_text'].iloc[0])

# Save cleaned data
df[['label', 'cleaned_text']].to_csv('cleaned_medical_symptoms.csv', index=False)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


Original text: I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.
Cleaned text: experience skin rash arm leg torso past week red itchy cover dry scaly patch


In [3]:
#preparation de donne

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Chargement des données
df = pd.read_csv('cleaned_medical_symptoms.csv')

# Nettoyage des données
df = df.dropna()
df['cleaned_text'] = df['cleaned_text'].str.lower()

# Encodage des labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Vectorisation du texte
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_text'])
y = df['label_encoded']

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
#2. Modèle LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaned_text'])
X_seq = tokenizer.texts_to_sequences(df['cleaned_text'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Split pour LSTM
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_pad, y, test_size=0.2, random_state=42)

# Modèle LSTM
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(len(le.classes_), activation='softmax'))
model_lstm.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

# Entraînement
history = model_lstm.fit(X_train_lstm, y_train_lstm,
                        validation_data=(X_test_lstm, y_test_lstm),
                        epochs=10, batch_size=64)

Epoch 1/10




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 182ms/step - accuracy: 0.1402 - loss: 3.1662 - val_accuracy: 0.3000 - val_loss: 3.1135
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 177ms/step - accuracy: 0.3243 - loss: 3.0637 - val_accuracy: 0.2750 - val_loss: 2.8973
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 181ms/step - accuracy: 0.3234 - loss: 2.7598 - val_accuracy: 0.4208 - val_loss: 2.3809
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 179ms/step - accuracy: 0.4611 - loss: 2.1592 - val_accuracy: 0.5833 - val_loss: 1.6452
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 181ms/step - accuracy: 0.6351 - loss: 1.5352 - val_accuracy: 0.7458 - val_loss: 1.1282
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 182ms/step - accuracy: 0.7875 - loss: 1.0040 - val_accuracy: 0.8250 - val_loss: 0.8501
Epoch 7/10
[1m15/15[0m [32m━━━━━━━━━

In [5]:
#adaboost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Modèle AdaBoost avec arbre de décision comme estimateur de base
ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=100,
    learning_rate=0.5,
    random_state=42
)

ada.fit(X_train, y_train)

In [6]:
#random forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42
)

rf.fit(X_train, y_train)

In [8]:
#evaluation du models

from sklearn.metrics import classification_report, f1_score, roc_auc_score

# Fonction pour évaluer les modèles
def evaluate_model(model, X_test, y_test, model_type='sklearn'):
    if model_type == 'lstm':
        y_pred = model.predict(X_test).argmax(axis=1)
    else:
        y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Calcul du F1-score macro
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"F1-Score (macro): {f1:.4f}")

    # Calcul de l'AUC-ROC (nécessite les probabilités pour chaque classe)
    if model_type == 'lstm':
        y_proba = model.predict(X_test)
    else:
        y_proba = model.predict_proba(X_test)

    auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
    print(f"AUC-ROC (OvR): {auc:.4f}")

    return f1, auc

# Évaluation LSTM
print("Evaluation du modèle LSTM:")
f1_lstm, auc_lstm = evaluate_model(model_lstm, X_test_lstm, y_test_lstm, 'lstm')

# Évaluation AdaBoost
print("\nEvaluation du modèle AdaBoost:")
f1_ada, auc_ada = evaluate_model(ada, X_test, y_test)

# Évaluation Random Forest
print("\nEvaluation du modèle Random Forest:")
f1_rf, auc_rf = evaluate_model(rf, X_test, y_test)

Evaluation du modèle LSTM:
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       1.00      1.00      1.00        10
               Bronchial Asthma       1.00      1.00      1.00        11
           Cervical spondylosis       1.00      1.00      1.00         7
                    Chicken pox       0.70      0.58      0.64        12
                    Common Cold       0.77      0.83      0.80        12
                         Dengue       0.90      0.75      0.82        12
          Dimorphic Hemorrhoids       1.00      1.00      1.00         7
               Fungal infection       0.93      1.00      0.96        13
                   Hypertension       1.00      1.00      1.00        10
                       Impetigo       0.91      0.91      0.91        11
                       J

In [9]:
#validation croisee
from sklearn.model_selection import cross_val_score

# Validation croisée pour Random Forest
scores = cross_val_score(rf, X, y, cv=5, scoring='f1_macro')
print(f"F1-Score moyen en validation croisée (RF): {scores.mean():.4f} (+/- {scores.std():.4f})")

# Validation croisée pour AdaBoost
scores = cross_val_score(ada, X, y, cv=5, scoring='f1_macro')
print(f"F1-Score moyen en validation croisée (AdaBoost): {scores.mean():.4f} (+/- {scores.std():.4f})")




F1-Score moyen en validation croisée (RF): 0.8864 (+/- 0.0396)
F1-Score moyen en validation croisée (AdaBoost): 0.7335 (+/- 0.0492)


In [10]:
from sklearn.model_selection import GridSearchCV

# Optimisation pour Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                          cv=5, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Meilleurs paramètres pour Random Forest:")
print(grid_search.best_params_)
print(f"Meilleur F1-Score: {grid_search.best_score_:.4f}")

Meilleurs paramètres pour Random Forest:
{'max_depth': 15, 'min_samples_split': 2, 'n_estimators': 300}
Meilleur F1-Score: 0.9164


In [1]:
# train_model.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import save_model
import pickle

# Load and preprocess data
df = pd.read_csv('cleaned_medical_symptoms.csv')
le = LabelEncoder()
y = le.fit_transform(df['label'])

tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])
X_seq = tokenizer.texts_to_sequences(df['cleaned_text'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2)

# Build model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(le.classes_), activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

# Save model and encoders
model.save("best_model.keras")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)


Epoch 1/10




[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 189ms/step - accuracy: 0.1397 - loss: 3.1647 - val_accuracy: 0.4042 - val_loss: 3.1108
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 186ms/step - accuracy: 0.3943 - loss: 3.0598 - val_accuracy: 0.4417 - val_loss: 2.9232
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 191ms/step - accuracy: 0.4224 - loss: 2.7930 - val_accuracy: 0.3792 - val_loss: 2.4020
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 187ms/step - accuracy: 0.4748 - loss: 2.1768 - val_accuracy: 0.6458 - val_loss: 1.6337
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 191ms/step - accuracy: 0.6421 - loss: 1.4976 - val_accuracy: 0.7583 - val_loss: 0.9815
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 192ms/step - accuracy: 0.8021 - loss: 0.8978 - val_accuracy: 0.8458 - val_loss: 0.6819
Epoch 7/10
[1m15/15[0m [32m━━━━━━━━━