In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [3]:
# Load the raw health dataset from the CSV that sits next to the notebook.
DATA_PATH = "final_health_dataset_all_genders.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check the load.
df.head()


Unnamed: 0,age,gender,bmi,screen_time,hydration,sun_exposer,activity_level,stress_level,sleep_hours,fatigue_level,...,Anemia,Dyslipidemia,Lung_Cancer,Colorectal_Cancer,Blood_Cancer,Glioma,Pituitary_Tumor,Metastatic_Brain_Tumor,Breast_Cancer,Prostate_Cancer
0,29,0,27.6,2,1,5,7,9,8,9,...,1,1,0,0,1,1,1,1,0,0
1,57,0,33.8,6,5,5,4,7,5,5,...,1,1,1,1,0,0,1,1,1,0
2,39,0,30.0,13,1,1,8,3,8,2,...,1,1,1,0,0,0,1,0,0,0
3,46,0,27.6,9,1,2,9,4,5,6,...,1,1,1,0,0,0,1,1,0,0
4,25,0,33.5,12,7,3,7,6,8,9,...,0,1,0,0,1,1,1,0,0,0


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,age,gender,bmi,screen_time,hydration,sun_exposer,activity_level,stress_level,sleep_hours,fatigue_level,...,Anemia,Dyslipidemia,Lung_Cancer,Colorectal_Cancer,Blood_Cancer,Glioma,Pituitary_Tumor,Metastatic_Brain_Tumor,Breast_Cancer,Prostate_Cancer
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,46.766667,0.499333,26.500067,8.115333,5.503667,3.993667,5.477667,5.451,6.491,5.511667,...,0.462333,0.974667,0.613333,0.482,0.356667,0.316,0.791333,0.668,0.254667,0.284667
std,16.826901,0.500083,4.900893,3.706293,2.858329,2.585315,2.847608,2.886932,1.724213,2.84732,...,0.498662,0.157162,0.487067,0.499759,0.479095,0.46499,0.406423,0.47101,0.435746,0.451331
min,18.0,0.0,18.0,2.0,1.0,0.0,1.0,1.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,0.0,22.2,5.0,3.0,2.0,3.0,3.0,5.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,47.0,0.0,26.6,8.0,6.0,4.0,5.0,5.0,7.0,6.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,62.0,1.0,30.8,11.0,8.0,6.0,8.0,8.0,8.0,8.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,75.0,1.0,35.0,14.0,10.0,8.0,10.0,10.0,9.0,10.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# (rows, columns) of the raw frame.
df.shape


(3000, 41)

In [13]:
# Column names of the raw frame: lifestyle/vital features, the free-text
# `symptoms` column, the `*_Risk_%` columns and the per-disease label columns.
df.columns


Index(['age', 'gender', 'bmi', 'screen_time', 'hydration', 'sun_exposer',
       'activity_level', 'stress_level', 'sleep_hours', 'fatigue_level',
       'systolic_bp', 'diastolic_bp', 'symptoms', 'Diabetes_Risk_%',
       'Hypertension_Risk_%', 'Cardiovascular_Disease_Risk_%',
       'Chronic_Kidney_Disease_Risk_%', 'Anemia_Risk_%', 'Dyslipidemia_Risk_%',
       'Lung_Cancer_Risk_%', 'Colorectal_Cancer_Risk_%', 'Blood_Cancer_Risk_%',
       'Glioma_Risk_%', 'Pituitary_Tumor_Risk_%',
       'Metastatic_Brain_Tumor_Risk_%', 'Breast_Cancer_Risk_%',
       'Prostate_Cancer_Risk_%', 'Diabetes', 'Hypertension',
       'Cardiovascular_Disease', 'Chronic_Kidney_Disease', 'Anemia',
       'Dyslipidemia', 'Lung_Cancer', 'Colorectal_Cancer', 'Blood_Cancer',
       'Glioma', 'Pituitary_Tumor', 'Metastatic_Brain_Tumor', 'Breast_Cancer',
       'Prostate_Cancer'],
      dtype='object')

In [15]:
# Peek at the raw symptom text (comma-separated symptom phrases per row).
# NOTE: `df` loses this column in the later drop cell, so this lookup only
# works on the freshly loaded frame.
df["symptoms"].head()


0    unexplained weight change, blurred vision, bon...
1    pale skin, chest pain, abdominal pain, nipple ...
2    dizziness, pale skin, chest pain, slow wound h...
3    chest pain, coughing blood, severe headache, o...
4    persistent headache, headache, unexplained wei...
Name: symptoms, dtype: object

In [17]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


In [19]:
# TF-IDF vectoriser for the free-text `symptoms` column.
# NOTE(review): the symptom strings are comma-separated lists, and the fitted
# vocabulary shown further down contains bigrams such as "appetite chest" and
# "weakness leg" that appear to straddle two adjacent symptoms rather than
# name a single one — consider tokenising on commas instead; TODO confirm.
tfidf = TfidfVectorizer(
    max_features=500,       # cap the vocabulary at 500 terms (dataset has ~3000 rows)
    ngram_range=(1, 2),     # unigrams and bigrams, to capture two-word medical phrases
    stop_words="english"
)


In [21]:
# Learn the vocabulary / IDF weights and vectorise every row in one step.
# NOTE(review): this is fitted on all rows, before the train/test split further
# down, so held-out rows contribute to the vocabulary and IDF weights. Fit on
# the training rows only if a strict hold-out estimate is required.
X_symptoms = tfidf.fit_transform(df["symptoms"])


In [23]:
# Densify the TF-IDF matrix into a frame aligned with `df`, prefixing every
# vocabulary term with "symptom_" so the columns cannot clash with the raw ones.
symptom_columns = ["symptom_" + term for term in tfidf.get_feature_names_out()]
symptom_features = pd.DataFrame(
    X_symptoms.toarray(), index=df.index, columns=symptom_columns
)


In [25]:
# Remove the raw text column now that it has been vectorised.
# `df` is overwritten in place, so without errors="ignore" a second run of this
# cell would raise KeyError because the column is already gone; with it the
# cell is safe to re-run.
df = df.drop(columns=["symptoms"], errors="ignore")


In [27]:
# Place the TF-IDF symptom columns to the right of the remaining raw columns
# (rows are aligned on the shared index).
df_final = pd.concat(objs=[df, symptom_features], axis="columns")


In [29]:
# (rows, columns) after appending the TF-IDF symptom columns.
df_final.shape


(3000, 540)

In [31]:
# Preview the combined frame (raw columns followed by `symptom_*` TF-IDF weights).
df_final.head()

Unnamed: 0,age,gender,bmi,screen_time,hydration,sun_exposer,activity_level,stress_level,sleep_hours,fatigue_level,...,symptom_weakness leg,symptom_weakness persistent,symptom_weakness severe,symptom_weakness shortness,symptom_weakness vision,symptom_weakness weak,symptom_weight,symptom_weight change,symptom_wound,symptom_wound healing
0,29,0,27.6,2,1,5,7,9,8,9,...,0.0,0.0,0.355262,0.0,0.0,0.0,0.114381,0.114381,0.0,0.0
1,57,0,33.8,6,5,5,4,7,5,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,0,30.0,13,1,1,8,3,8,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133578,0.133578
3,46,0,27.6,9,1,2,9,4,5,6,...,0.0,0.0,0.0,0.236332,0.0,0.0,0.0,0.0,0.0,0.0
4,25,0,33.5,12,7,3,7,6,8,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.108759,0.108759,0.0,0.0


In [33]:
# Abbreviated column index of the combined frame.
df_final.columns

Index(['age', 'gender', 'bmi', 'screen_time', 'hydration', 'sun_exposer',
       'activity_level', 'stress_level', 'sleep_hours', 'fatigue_level',
       ...
       'symptom_weakness leg', 'symptom_weakness persistent',
       'symptom_weakness severe', 'symptom_weakness shortness',
       'symptom_weakness vision', 'symptom_weakness weak', 'symptom_weight',
       'symptom_weight change', 'symptom_wound', 'symptom_wound healing'],
      dtype='object', length=540)

In [71]:
# Full, untruncated list of column names in the combined frame.
df_final.columns.tolist()



['age',
 'gender',
 'bmi',
 'screen_time',
 'hydration',
 'sun_exposer',
 'activity_level',
 'stress_level',
 'sleep_hours',
 'fatigue_level',
 'systolic_bp',
 'diastolic_bp',
 'Diabetes_Risk_%',
 'Hypertension_Risk_%',
 'Cardiovascular_Disease_Risk_%',
 'Chronic_Kidney_Disease_Risk_%',
 'Anemia_Risk_%',
 'Dyslipidemia_Risk_%',
 'Lung_Cancer_Risk_%',
 'Colorectal_Cancer_Risk_%',
 'Blood_Cancer_Risk_%',
 'Glioma_Risk_%',
 'Pituitary_Tumor_Risk_%',
 'Metastatic_Brain_Tumor_Risk_%',
 'Breast_Cancer_Risk_%',
 'Prostate_Cancer_Risk_%',
 'Diabetes',
 'Hypertension',
 'Cardiovascular_Disease',
 'Chronic_Kidney_Disease',
 'Anemia',
 'Dyslipidemia',
 'Lung_Cancer',
 'Colorectal_Cancer',
 'Blood_Cancer',
 'Glioma',
 'Pituitary_Tumor',
 'Metastatic_Brain_Tumor',
 'Breast_Cancer',
 'Prostate_Cancer',
 'symptom_abdominal',
 'symptom_abdominal pain',
 'symptom_ankle',
 'symptom_ankle swelling',
 'symptom_appetite',
 'symptom_appetite chest',
 'symptom_appetite memory',
 'symptom_appetite persistent'

In [75]:
# Binary label columns the classifier predicts. The order here is the order of
# the model's outputs, so it must stay stable.
TARGET_DISEASES = [
    "Diabetes",
    "Hypertension",
    "Cardiovascular_Disease",
    "Chronic_Kidney_Disease",
    "Anemia",
    "Dyslipidemia",
    "Lung_Cancer",
    "Colorectal_Cancer",
    "Blood_Cancer",
    "Glioma",
    "Pituitary_Tumor",
    "Metastatic_Brain_Tumor",
    "Breast_Cancer",
    "Prostate_Cancer",
]

# Columns removed from the feature matrix: each disease's "<name>_Risk_%"
# column followed by the label columns themselves.
REM_DATA = [f"{disease}_Risk_%" for disease in TARGET_DISEASES] + TARGET_DISEASES

In [79]:
# Feature matrix: everything except the label columns and the `*_Risk_%`
# columns listed in REM_DATA (presumably excluded because they are derived
# from, or leak, the labels — TODO confirm).
x = df_final.drop(columns = REM_DATA)

In [104]:
# Confirm that only the lifestyle/vital features and `symptom_*` columns remain.
x.columns


Index(['age', 'gender', 'bmi', 'screen_time', 'hydration', 'sun_exposer',
       'activity_level', 'stress_level', 'sleep_hours', 'fatigue_level',
       ...
       'symptom_weakness leg', 'symptom_weakness persistent',
       'symptom_weakness severe', 'symptom_weakness shortness',
       'symptom_weakness vision', 'symptom_weakness weak', 'symptom_weight',
       'symptom_weight change', 'symptom_wound', 'symptom_wound healing'],
      dtype='object', length=512)

In [83]:
# Multi-label target matrix: one column per disease, in TARGET_DISEASES order.
y = df_final[TARGET_DISEASES]

In [85]:
# Preview the label matrix.
y.head()

Unnamed: 0,Diabetes,Hypertension,Cardiovascular_Disease,Chronic_Kidney_Disease,Anemia,Dyslipidemia,Lung_Cancer,Colorectal_Cancer,Blood_Cancer,Glioma,Pituitary_Tumor,Metastatic_Brain_Tumor,Breast_Cancer,Prostate_Cancer
0,1,0,0,0,1,1,0,0,1,1,1,1,0,0
1,1,0,1,0,1,1,1,1,0,0,1,1,1,0
2,1,1,1,0,1,1,1,0,0,0,1,0,0,0
3,1,0,0,0,1,1,1,0,0,0,1,1,0,0
4,1,1,1,0,0,1,0,0,1,1,1,0,0,0


In [87]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column and keep the fitted scaler for inference.
# `x` is a DataFrame, so the scaler also records the column names it was
# fitted on.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in the next cell, so held-out rows influence the scaling statistics. Fit on
# the training rows only if a strict hold-out estimate is required.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)


In [89]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# One independent random forest is trained per disease label.
# class_weight="balanced" reweights each label's classes by inverse frequency;
# the fixed seed makes the forests reproducible.
base_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    class_weight="balanced",
    random_state=42,
)
model = MultiOutputClassifier(base_forest)

model.fit(X_train, y_train)


In [96]:
from sklearn.metrics import classification_report

# Predict all labels for the held-out rows and print per-disease
# precision / recall / F1.
y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred, target_names=TARGET_DISEASES)
print(report)


                        precision    recall  f1-score   support

              Diabetes       0.98      0.99      0.99       569
          Hypertension       0.93      1.00      0.96       319
Cardiovascular_Disease       0.99      1.00      0.99       423
Chronic_Kidney_Disease       0.91      0.79      0.85        53
                Anemia       0.99      0.96      0.97       271
          Dyslipidemia       0.98      1.00      0.99       588
           Lung_Cancer       0.94      0.94      0.94       362
     Colorectal_Cancer       1.00      1.00      1.00       290
          Blood_Cancer       1.00      1.00      1.00       211
                Glioma       0.99      0.97      0.98       194
       Pituitary_Tumor       1.00      1.00      1.00       474
Metastatic_Brain_Tumor       1.00      1.00      1.00       410
         Breast_Cancer       1.00      1.00      1.00       146
       Prostate_Cancer       1.00      1.00      1.00       186

             micro avg       0.98     

In [98]:
import joblib

# Persist the three artifacts inference needs: the fitted classifier, the
# fitted scaler and the fitted TF-IDF vectoriser. Files are written to the
# current working directory; the last call's return value (the list of written
# paths) is what the cell displays.
joblib.dump(model, "disease_classifier.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(tfidf, "symptom_tfidf.pkl")


['symptom_tfidf.pkl']

In [116]:
import joblib
import numpy as np
from scipy.sparse import hstack

# ===============================
# Target Diseases (CLASS ORDER)
# ===============================
# Label names in the order of the classifier's outputs (position i of a
# prediction corresponds to TARGET_DISEASES[i]).
TARGET_DISEASES = [
    "Diabetes", "Hypertension", "Cardiovascular_Disease", "Chronic_Kidney_Disease",
    "Anemia", "Dyslipidemia",
    "Lung_Cancer", "Colorectal_Cancer", "Blood_Cancer",
    "Glioma", "Pituitary_Tumor", "Metastatic_Brain_Tumor",
    "Breast_Cancer", "Prostate_Cancer",
]

# ===============================
# Load trained artifacts
# ===============================
# Reload the classifier, scaler and TF-IDF vectoriser written by the training
# section. joblib.load unpickles its input, which can execute arbitrary code:
# only load artifact files that you produced yourself or otherwise trust.
model, scaler, tfidf = (
    joblib.load(artifact_path)
    for artifact_path in ("disease_classifier.pkl", "scaler.pkl", "symptom_tfidf.pkl")
)

# ===============================
# NEW USER INPUT (EDIT HERE)
# ===============================
# One patient record to score. Keys mirror the training columns; `symptoms`
# is a comma-separated list of symptom phrases, like the training text.
user_data = {
    "age": 52,
    "gender": 1,  # 0 = Female, 1 = Male
    "bmi": 31.2,
    "screen_time": 7,
    "hydration": 4,
    "sun_exposer": 2,
    "activity_level": 3,
    "stress_level": 8,
    "sleep_hours": 5,
    "fatigue_level": 7,
    "systolic_bp": 133,
    "diastolic_bp": 80,
    "symptoms": ", ".join([
        "pale skin",
        "chest pain",
        "abdominal pain",
        "nipple discharge",
        "coughing blood",
        "blurred vision",
        "severe headache",
        "cognitive decline",
        "breast lump",
        "leg swelling",
        "vision problems",
        "persistent fatigue",
        "chest discomfort",
        "fatigue",
    ]),
}

# ===============================
# Build numerical features
# (ORDER MUST MATCH TRAINING)
# ===============================
# pandas is only needed to label the feature row; imported here so this cell
# also works when it is run on its own.
import pandas as pd

# Numeric feature names, in the same order as the training frame's leading
# columns.
NUMERIC_COLUMNS = [
    "age", "gender", "bmi", "screen_time", "hydration", "sun_exposer",
    "activity_level", "stress_level", "sleep_hours", "fatigue_level",
    "systolic_bp", "diastolic_bp",
]
numerical_features = np.array([[user_data[name] for name in NUMERIC_COLUMNS]])

# ===============================
# TF-IDF for symptoms
# ===============================
symptom_vector = tfidf.transform([user_data["symptoms"]])

# ===============================
# Combine features
# ===============================
X = hstack([numerical_features, symptom_vector])

# Label the row with the same column names the training frame used
# (numeric names, then "symptom_<term>" for each TF-IDF term). The scaler was
# fitted on a named DataFrame, so handing it a named frame makes scikit-learn
# check that the names and their order match what it saw during fit, instead
# of relying on a comment. A bare ndarray would skip that check and emit a
# "does not have valid feature names" warning.
symptom_columns = [f"symptom_{term}" for term in tfidf.get_feature_names_out()]
X_named = pd.DataFrame(X.toarray(), columns=NUMERIC_COLUMNS + symptom_columns)

# ===============================
# Scale features
# ===============================
X_scaled = scaler.transform(X_named)

# ===============================
# Predict (MULTI-LABEL)
# ===============================
def _positive_probability(classes, class_probs):
    """Return the probability assigned to class label 1.

    `classes` is a fitted estimator's `classes_` array and `class_probs` is the
    matching row of its `predict_proba` output. The column for label 1 is
    looked up by value rather than assumed to be index 1; if the estimator
    never saw a positive example during training, the probability is 0.0.
    """
    positive_columns = np.flatnonzero(np.asarray(classes) == 1)
    if positive_columns.size == 0:
        return 0.0
    return float(class_probs[positive_columns[0]])


prediction = model.predict(X_scaled)[0]
proba_raw = model.predict_proba(X_scaled)

# A multi-output classifier returns one (n_samples, n_classes) array per label.
# Pair each array with its fitted estimator so the positive-class column comes
# from `classes_`; indexing column 1 directly raises IndexError for a label
# whose training data contained a single class.
if isinstance(proba_raw, list):
    probabilities = np.array([
        _positive_probability(estimator.classes_, class_probs[0])
        for estimator, class_probs in zip(model.estimators_, proba_raw)
    ])
else:
    probabilities = proba_raw[0]

# ===============================
# Output (CORRECT)
# ===============================
print("\n===== PREDICTION RESULT =====")

# Labels the classifier marked positive, reported with their probability.
positive_idx = [i for i in range(len(TARGET_DISEASES)) if prediction[i] == 1]
found = bool(positive_idx)
for i in positive_idx:
    print(f"{TARGET_DISEASES[i]} → POSITIVE ({round(probabilities[i]*100, 2)} %)")

if not found:
    print("No disease detected above threshold.")

# Three highest-probability labels, highest first, whether or not they were
# predicted positive.
print("\nTop 3 Risk Diseases:")
top3_idx = np.argsort(probabilities)[:-4:-1]
for rank_idx in top3_idx:
    print(f"{TARGET_DISEASES[rank_idx]} → {round(probabilities[rank_idx]*100, 2)} %")

print("=============================\n")





===== PREDICTION RESULT =====
Diabetes → POSITIVE (99.69 %)
Cardiovascular_Disease → POSITIVE (97.71 %)
Anemia → POSITIVE (99.49 %)
Dyslipidemia → POSITIVE (99.46 %)
Lung_Cancer → POSITIVE (98.42 %)
Colorectal_Cancer → POSITIVE (92.3 %)
Pituitary_Tumor → POSITIVE (97.04 %)
Metastatic_Brain_Tumor → POSITIVE (99.27 %)
Breast_Cancer → POSITIVE (84.58 %)

Top 3 Risk Diseases:
Diabetes → 99.69 %
Anemia → 99.49 %
Dyslipidemia → 99.46 %

