In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/models/label_encoder_hybrid.joblib
/kaggle/input/models/hybrid_nn_disease_predictor.h5
/kaggle/input/ananyadisease-prediction-resources/Final_Augmented_dataset_Diseases_and_Symptoms.csv
/kaggle/input/ananyadisease-prediction-resources/Disease precaution.csv
/kaggle/input/ananyadisease-prediction-resources/model_xgboost_v3.joblib
/kaggle/input/ananyadisease-prediction-resources/label_encoder_xgboost_v3.joblib


In [2]:
# Import necessary libraries — already available on Kaggle
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier

# Use RapidFuzz instead of fuzzywuzzy[speedup] (Kaggle doesn't support [speedup])
try:
    from rapidfuzz import process, fuzz  # ✅ Fast and Kaggle-compatible
except ImportError:
    from fuzzywuzzy import process, fuzz  # Fallback (slower)


2025-07-08 10:09:57.654449: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751969397.831531      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751969397.883832      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import pandas as pd
import json
import re
from sklearn.feature_selection import VarianceThreshold

# 🗂️ Load dataset (make sure it's in your Kaggle dataset folder)
df = pd.read_csv("/kaggle/input/ananyadisease-prediction-resources/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# 🧹 Sanitize column names (same as used in training)
def sanitize_name(name):
    name = re.sub(r'[^a-zA-Z0-9\s]', '_', name)
    name = re.sub(r'\s+', '_', name)
    return name.strip('_')

X = df.drop("diseases", axis=1)
X.columns = [sanitize_name(col) for col in X.columns]

# 🔍 Apply VarianceThreshold
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)
selected_features = X.columns[selector.get_support()].tolist()

# 💾 Save selected features to JSON in /kaggle/working
with open("/kaggle/working/selected_features.json", "w") as f:
    json.dump(selected_features, f)

print(f"✅ selected_features.json created with {len(selected_features)} features in /kaggle/working/")


✅ selected_features.json created with 146 features in /kaggle/working/


In [4]:
# ================================
# STEP 0: Imports & Setup
# ================================
import pandas as pd
import numpy as np
import joblib
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import LabelEncoder
import re
import json
import warnings
warnings.filterwarnings('ignore')

# ================================
# STEP 1: Load Existing Model & Data
# ================================
model = joblib.load("/kaggle/input/ananyadisease-prediction-resources/model_xgboost_v3.joblib")
label_encoder = joblib.load("/kaggle/input/ananyadisease-prediction-resources/label_encoder_xgboost_v3.joblib")

augmented_df = pd.read_csv("/kaggle/input/ananyadisease-prediction-resources/Final_Augmented_dataset_Diseases_and_Symptoms.csv")
precaution_df = pd.read_csv("/kaggle/input/ananyadisease-prediction-resources/Disease precaution.csv")

with open("/kaggle/working/selected_features.json") as f:
    valid_symptoms = json.load(f)

# Extract original symptoms (before sanitization)
original_symptoms = augmented_df.columns.tolist()
original_symptoms.remove("diseases")

# ================================
# STEP 2: Symptom Embedding
# ================================
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
symptom_embeddings = embed_model.encode(original_symptoms, convert_to_tensor=True)

# ================================
# STEP 3: Matching Function
# ================================
def match_symptoms_with_embeddings(user_inputs, symptom_embeddings, original_symptoms, threshold=0.55):
    matched = []
    for inp in user_inputs:
        query_embedding = embed_model.encode(inp, convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, symptom_embeddings)[0]
        top_idx = torch.argmax(cos_scores).item()
        top_score = cos_scores[top_idx].item()

        if top_score >= threshold:
            matched_symptom = original_symptoms[top_idx]
            matched.append(matched_symptom)
            print(f"✔️ Matched '{inp}' → '{matched_symptom}' (score: {top_score:.2f})")
        else:
            print(f"❌ No match found for '{inp}' (score: {top_score:.2f})")
    return matched

# ================================
# STEP 4: Precaution Lookup
# ================================
from fuzzywuzzy import process

def get_precautions(disease_name):
    candidates = precaution_df['Disease'].dropna().tolist()
    match = process.extractOne(disease_name, candidates, score_cutoff=60)
    if match:
        matched_disease = match[0]
        row = precaution_df[precaution_df['Disease'] == matched_disease]
        if not row.empty:
            return row.iloc[0][['Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4']].dropna().tolist()
    return ["No precautions found."]

# ================================
# STEP 5: Prediction Function
# ================================
def sanitize_name(name):
    name = re.sub(r'[^a-zA-Z0-9\s]', '_', name)
    name = re.sub(r'\s+', '_', name)
    return name.strip('_')

def predict_disease_v2(user_symptoms):
    matched = match_symptoms_with_embeddings(user_symptoms, symptom_embeddings, original_symptoms)

    if not matched:
        return {"Predicted Disease": "None", "Confidence": 0.0, "Precautions": ["No valid symptoms"], "Top Predictions": []}

    matched_clean = [sanitize_name(s) for s in matched]

    vector = pd.Series(0, index=valid_symptoms)
    for symptom in matched_clean:
        if symptom in vector.index:
            vector[symptom] = 1

    vector = vector.values.reshape(1, -1)
    y_pred = model.predict(vector)[0]
    probs = model.predict_proba(vector)[0]

    predicted = label_encoder.inverse_transform([y_pred])[0]
    confidence = probs[y_pred]

    top_indices = np.argsort(probs)[-3:][::-1]
    top_3 = [(label_encoder.inverse_transform([i])[0], probs[i]) for i in top_indices]

    return {
        "Predicted Disease": predicted,
        "Confidence": round(confidence, 3),
        "Top Predictions": [(d, round(p, 3)) for d, p in top_3],
        "Precautions": get_precautions(predicted)
    }

# ================================
# STEP 6: Test Example
# ================================
test_input = ["my head hurts", "dizzy", "burning in chest"]
print(predict_disease_v2(test_input))



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✔️ Matched 'my head hurts' → 'headache' (score: 0.65)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✔️ Matched 'dizzy' → 'dizziness' (score: 0.81)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✔️ Matched 'burning in chest' → 'burning chest pain' (score: 0.89)
{'Predicted Disease': 'tension headache', 'Confidence': 0.186, 'Top Predictions': [('tension headache', 0.186), ('autonomic nervous system disorder', 0.107), ('trigeminal neuralgia', 0.06)], 'Precautions': ['bath twice', 'avoid fatty spicy food', 'drink plenty of water', 'avoid too many products']}


In [5]:
# ================================
# STEP 0: Install Dependencies (Kaggle usually already has them)
# ================================
# You don't need !pip install commands in Kaggle if packages already exist.
# For completeness, uncomment if needed:
# !pip install sentence-transformers scikit-learn tensorflow pandas

# ================================
# STEP 1: Import Libraries
# ================================
import pandas as pd
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# ================================
# STEP 2: Load Dataset
# ================================
df = pd.read_csv("/kaggle/input/ananyadisease-prediction-resources/Final_Augmented_dataset_Diseases_and_Symptoms.csv")
df = df.dropna(subset=["diseases"])

# ================================
# STEP 3: Convert Symptom Indicators into Sentences
# ================================
symptom_columns = df.columns.drop("diseases")
df["symptom_sentence"] = df[symptom_columns].apply(
    lambda row: ", ".join([col.replace("_", " ") for col in symptom_columns if row[col] == 1]), axis=1
)

# ================================
# STEP 4: Generate Sentence Embeddings
# ================================
embedder = SentenceTransformer("all-MiniLM-L6-v2")
X_embeddings = embedder.encode(df["symptom_sentence"].tolist(), show_progress_bar=True)

# ================================
# STEP 5: Encode Target Labels
# ================================
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df["diseases"])
y_onehot = to_categorical(y_encoded)
num_classes = y_onehot.shape[1]

# ================================
# STEP 6: Train-Test Split
# ================================
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y_onehot, test_size=0.2, random_state=42)

# ================================
# STEP 7: Define and Train Neural Network
# ================================
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_embeddings.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# ================================
# STEP 8: Save Model and Encoder
# ================================
model.save("/kaggle/working/disease_predictor_nn.h5")
with open("/kaggle/working/label_encoder_nn.json", "w") as f:
    json.dump(label_encoder.classes_.tolist(), f)

# ================================
# STEP 9: Evaluate
# ================================
loss, accuracy = model.evaluate(X_test, y_test)
print(f"✅ Test Accuracy: {accuracy:.2f}")


Batches:   0%|          | 0/7718 [00:00<?, ?it/s]

I0000 00:00:1751969661.859342      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15207 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/10


I0000 00:00:1751969666.688398      82 service.cc:148] XLA service 0x78e648036450 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751969666.689103      82 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1751969666.904496      82 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  71/4939[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 2ms/step - accuracy: 0.0048 - loss: 6.5669

I0000 00:00:1751969668.357852      82 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4939/4939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 0.3393 - loss: 2.9889 - val_accuracy: 0.7557 - val_loss: 0.7923
Epoch 2/10
[1m4939/4939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.7006 - loss: 0.9522 - val_accuracy: 0.7979 - val_loss: 0.6099
Epoch 3/10
[1m4939/4939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.7529 - loss: 0.7616 - val_accuracy: 0.8153 - val_loss: 0.5343
Epoch 4/10
[1m4939/4939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.7777 - loss: 0.6675 - val_accuracy: 0.8244 - val_loss: 0.5009
Epoch 5/10
[1m4939/4939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.7896 - loss: 0.6243 - val_accuracy: 0.8296 - val_loss: 0.4731
Epoch 6/10
[1m4939/4939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - accuracy: 0.7954 - loss: 0.5940 - val_accuracy: 0.8374 - val_loss: 0.4563
Epoch 7/10
[1m4939/4

In [6]:
import pandas as pd

# 🔽 Load dataset (update the path as per your Kaggle dataset name)
df = pd.read_csv("/kaggle/input/ananyadisease-prediction-resources/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# 💬 Convert binary symptom indicators to natural language descriptions
def row_to_description(row):
    return ', '.join([col.replace('_', ' ') for col in df.columns[1:] if row[col] == 1])

# 🧪 Generate the description column
df["description"] = df.apply(row_to_description, axis=1)

# 💾 Save as hybrid_dataset.csv in /kaggle/working
df.to_csv("/kaggle/working/hybrid_dataset.csv", index=False)

print("✅ hybrid_dataset.csv created with", len(df), "rows.")


✅ hybrid_dataset.csv created with 246945 rows.


In [7]:
# 📦 Install required packages (Run this cell first in Kaggle)
!pip install -q sentence-transformers transformers fuzzywuzzy[speedup]

# 📚 Imports
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf

# ✅ Load hybrid dataset
data = pd.read_csv("/kaggle/working/hybrid_dataset.csv")

# ➗ Split features
X_structured = data.drop(columns=["diseases", "description"]).to_numpy()
X_text = data["description"].tolist()
y = data["diseases"]

# 🧠 Sentence embeddings
embedder = SentenceTransformer("all-MiniLM-L6-v2")
X_embed = embedder.encode(X_text)

# 🔀 Concatenate SBERT embeddings + structured symptoms
X_combined = np.concatenate([X_structured, X_embed], axis=1)

# 🔡 Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 🧪 Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)

# 🧠 Hybrid model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_combined.shape[1],)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# 💾 Save model and encoder to /kaggle/working
model.save("/kaggle/working/hybrid_nn_disease_predictor.h5")
joblib.dump(label_encoder, "/kaggle/working/label_encoder_hybrid.joblib")

print("✅ Hybrid model and label encoder saved.")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

Batches:   0%|          | 0/7718 [00:00<?, ?it/s]

Epoch 1/10
[1m5557/5557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - accuracy: 0.5012 - loss: 2.3304 - val_accuracy: 0.8334 - val_loss: 0.4849
Epoch 2/10
[1m5557/5557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.7924 - loss: 0.6537 - val_accuracy: 0.8494 - val_loss: 0.4157
Epoch 3/10
[1m5557/5557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8127 - loss: 0.5687 - val_accuracy: 0.8519 - val_loss: 0.4008
Epoch 4/10
[1m5557/5557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8223 - loss: 0.5300 - val_accuracy: 0.8529 - val_loss: 0.3948
Epoch 5/10
[1m5557/5557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8258 - loss: 0.5119 - val_accuracy: 0.8544 - val_loss: 0.3831
Epoch 6/10
[1m5557/5557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8304 - loss: 0.4897 - val_accuracy: 0.8530 - val_loss: 0.3789
Epoch 7/10

In [8]:
# 📚 Imports
import numpy as np
import pandas as pd
import torch
import joblib
from transformers import AutoTokenizer, AutoModel
from tensorflow.keras.models import load_model
from fuzzywuzzy import process

# 🧠 Load SBERT
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
sbert_model = AutoModel.from_pretrained(model_name)

def sentence_to_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = sbert_model(**inputs).last_hidden_state.mean(dim=1)
    return output.squeeze().numpy()

# 📦 Load trained model & encoder
model = load_model("/kaggle/input/models/hybrid_nn_disease_predictor.h5")
label_encoder = joblib.load("/kaggle/input/models/label_encoder_hybrid.joblib")

# 📖 Load precautions
precautions_df = pd.read_csv("/kaggle/input/ananyadisease-prediction-resources/Disease precaution.csv")

# 🧪 Final prediction function
def predict_disease(text):
    emb = sentence_to_embedding(text).reshape(1, -1)

    # Zero vector for structured input (if you want to add later)
    dummy_structured = np.zeros((1, model.input_shape[1] - emb.shape[1]))
    full_input = np.concatenate([dummy_structured, emb], axis=1)

    probs = model.predict(full_input)[0]
    top_indices = probs.argsort()[-3:][::-1]
    top_preds = [(label_encoder.inverse_transform([i])[0], float(probs[i])) for i in top_indices]
    top_disease = top_preds[0][0]

    # Precaution match
    match = process.extractOne(top_disease, precautions_df["Disease"], score_cutoff=60)
    if match:
        row = precautions_df[precautions_df["Disease"] == match[0]]
        precautions = row.iloc[0, 1:].dropna().tolist()
    else:
        precautions = ["No specific precautions found."]

    return {
        "Predicted Disease": top_disease,
        "Confidence": float(probs[top_indices[0]]),
        "Top Predictions": top_preds,
        "Precautions": precautions
    }

# 🔍 Example usage
print(predict_disease("not feeling thirsty"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306ms/step
{'Predicted Disease': 'pneumoconiosis', 'Confidence': 0.1563972681760788, 'Top Predictions': [('pneumoconiosis', 0.1563972681760788), ('autonomic nervous system disorder', 0.13649477064609528), ('lung cancer', 0.11580994725227356)], 'Precautions': ['consult doctor', 'medication', 'rest', 'follow up']}
