In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import RFE

# Load dataset
file_path = "Disease_symptom_and_patient_profile_dataset.csv"
df = pd.read_csv(file_path)

# Handle missing values
df = df.ffill()

# Ensure 'Disease' column is all strings
df['Disease'] = df['Disease'].astype(str)

# Encode categorical features into numerical values
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object' and col != 'Disease':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le  # Store encoder for inverse transformation if needed

# Define features (X) and target (y)
X = df.drop(columns=["Disease", "Outcome Variable"])  # Remove target & redundant columns
y = df["Disease"]

# Handle rare diseases (merge categories with <3 occurrences)
disease_counts = y.value_counts()
rare_diseases = disease_counts[disease_counts < 3].index
y = y.replace(rare_diseases, "Other")

# Encode the updated target variable
disease_encoder = LabelEncoder()
y = disease_encoder.fit_transform(y)

# Apply SMOTE with k_neighbors adjusted or use RandomOverSampler if needed
try:
    smote = SMOTE(k_neighbors=min(3, y.min()), random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
except ValueError:
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X, y)

# Split dataset into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Feature Selection with Recursive Feature Elimination (RFE)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(rf_model, n_features_to_select=10)
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

# Hyperparameter tuning for Random Forest
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_

# Train XGBoost model
xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=7, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Train LightGBM model
lgb_model = LGBMClassifier(n_estimators=300, learning_rate=0.05, max_depth=7)
lgb_model.fit(X_train, y_train)

# Predictions using best RF model
y_pred_rf = rf_best.predict(X_test)

# Predictions using XGBoost
y_pred_xgb = xgb_model.predict(X_test)

# Predictions using LightGBM
y_pred_lgb = lgb_model.predict(X_test)

# Evaluate models
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)

print(f"Optimized Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(f"XGBoost Accuracy: {accuracy_xgb * 100:.2f}%")
print(f"LightGBM Accuracy: {accuracy_lgb * 100:.2f}%")

print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("\nLightGBM Classification Report:\n", classification_report(y_test, y_pred_lgb))




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42
[LightGBM] [Info] Number of data points in the train set: 2948, number of used features: 8
[LightGBM] [Info] Start training from score -3.698423
[LightGBM] [Info] Start training from score -3.645077
[LightGBM] [Info] Start training from score -3.671394
[LightGBM] [Info] Start training from score -3.558065
[LightGBM] [Info] Start training from score -3.582163
[LightGBM] [Info] Start training from score -3.684817
[LightGBM] [Info] Start training from score -3.594433
[LightGBM] [Info] Start training from score -3.658149
[LightGBM] [Info] Start training from score -3.671394
[LightGBM] [Info] Start training from score -3.534535
[LightGBM] [Info] Start training from score -3.645077
[LightGBM] [Info] Start training from score -3.671394
[LightGBM] [Info] Start training from score -3.698423
[LightGBM] [I



Optimized Random Forest Accuracy: 90.11%
XGBoost Accuracy: 89.70%
LightGBM Accuracy: 89.97%

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.79      0.84        24
           1       1.00      1.00      1.00        20
           2       0.79      1.00      0.88        22
           3       1.00      0.92      0.96        13
           4       1.00      1.00      1.00        15
           5       1.00      1.00      1.00        23
           6       0.76      1.00      0.86        16
           7       1.00      1.00      1.00        21
           8       0.87      0.91      0.89        22
           9       0.92      1.00      0.96        11
          10       0.95      0.90      0.92        20
          11       1.00      0.77      0.87        22
          12       1.00      1.00      1.00        24
          13       0.91      1.00      0.95        20
          14       1.00      0.94      0.97        18
    

In [2]:
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_

In [3]:
rf_best.predict(X_train)

array([25,  4, 35, ...,  5, 36, 32])

In [4]:
rf_best.predict(X_test)

array([36, 31, 30, 19, 34,  7, 16, 16, 31, 35,  6, 29, 36,  8, 37,  3, 28,
        9, 19,  5,  1, 17,  1,  4,  9, 17, 21, 16, 21,  2, 34,  2, 21,  7,
       20, 32, 13,  8, 27, 25, 12,  7,  6, 29, 29, 33, 28, 27, 19, 37, 29,
       14, 10, 23, 25,  9, 17, 28, 25, 32, 24, 32, 30, 32,  2, 25, 19, 18,
        8, 29,  8, 16, 27,  2, 22, 37, 29, 20,  4, 21, 28, 25, 18, 20, 10,
       21,  0, 34, 37, 36,  6, 36, 15, 13,  0, 30, 15, 31, 32, 21, 37, 17,
        6, 25, 15, 13, 11,  6, 19, 37, 37,  1,  5, 11, 34, 13, 14,  2, 37,
       19, 34, 37, 13, 21, 16, 24,  5,  4,  6,  4,  4, 21, 23, 21,  6, 22,
       22, 19,  3, 25, 11,  8, 13, 35, 16,  0,  9, 24,  2, 13, 25, 15,  5,
        8, 37, 14, 25,  2, 21, 15, 37, 19,  3, 13, 31, 37,  8,  8, 20, 15,
        6, 29, 14, 30,  8, 12, 11, 14, 21, 17,  2, 18, 20, 31, 33,  0,  6,
       20, 22,  1, 31,  2, 16, 21, 15,  7, 34, 35, 20,  4, 22,  9,  4,  5,
       26, 16, 29, 13, 21, 12, 15, 25, 24,  2, 29,  7, 17, 33, 37,  4,  1,
        5,  5, 16, 14, 20

In [5]:
import pickle

In [6]:
with open("disease_prediction_model.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)

In [11]:
df.Disease.unique()

array(['Influenza', 'Common Cold', 'Eczema', 'Asthma', 'Hyperthyroidism',
       'Allergic Rhinitis', 'Anxiety Disorders', 'Diabetes',
       'Gastroenteritis', 'Pancreatitis', 'Rheumatoid Arthritis',
       'Depression', 'Liver Cancer', 'Stroke', 'Urinary Tract Infection',
       'Dengue Fever', 'Hepatitis', 'Kidney Cancer', 'Migraine',
       'Muscular Dystrophy', 'Sinusitis', 'Ulcerative Colitis',
       'Bipolar Disorder', 'Bronchitis', 'Cerebral Palsy',
       'Colorectal Cancer', 'Hypertensive Heart Disease',
       'Multiple Sclerosis', 'Myocardial Infarction (Heart...',
       'Urinary Tract Infection (UTI)', 'Osteoporosis', 'Pneumonia',
       'Atherosclerosis', 'Chronic Obstructive Pulmonary...', 'Epilepsy',
       'Hypertension', 'Obsessive-Compulsive Disorde...', 'Psoriasis',
       'Rubella', 'Cirrhosis', 'Conjunctivitis (Pink Eye)',
       'Liver Disease', 'Malaria', 'Spina Bifida', 'Kidney Disease',
       'Osteoarthritis', 'Klinefelter Syndrome', 'Acne', 'Brain Tumor',


In [4]:
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,1,0,1,1,19,0,1,2,1
1,Common Cold,0,1,1,0,25,0,2,2,0
2,Eczema,0,1,1,0,25,0,2,2,0
3,Asthma,1,1,0,1,25,1,2,2,1
4,Asthma,1,1,0,1,25,1,2,2,1


In [5]:
import numpy as np

# Example new patient data (modify as needed)
new_patient = {
    "Fever": 1,
    "Cough": 1,
    "Fatigue": 0,
    "Difficulty Breathing": 1,
    "Age": 30,
    "Gender": 1,  # Assuming 1 = Male, 0 = Female
    "Blood Pressure": 2,
    "Cholesterol Level": 1
}

# Convert new patient data to DataFrame
new_patient_df = pd.DataFrame([new_patient])

# Ensure columns match training features
new_patient_df = new_patient_df[X.columns]

# Apply feature selection transformation
new_patient_df_transformed = rfe.transform(new_patient_df)

# Predict disease using the best model
predicted_label = rf_best.predict(new_patient_df_transformed)

# Decode the predicted disease label
predicted_disease = disease_encoder.inverse_transform(predicted_label)

print(f"Predicted Disease: {predicted_disease[0]}")


Predicted Disease: Asthma


In [6]:
import numpy as np

# Example new patient data (modify as needed)
new_patient = {
    "Fever": 1,
    "Cough": 0,
    "Fatigue": 1,
    "Difficulty Breathing": 1,
    "Age": 60,
    "Gender": 1,  # Assuming 1 = Male, 0 = Female
    "Blood Pressure": 2,
    "Cholesterol Level": 1
}

# Convert new patient data to DataFrame
new_patient_df = pd.DataFrame([new_patient])

# Ensure columns match training features
new_patient_df = new_patient_df[X.columns]

# Apply feature selection transformation
new_patient_df_transformed = rfe.transform(new_patient_df)

# Predict disease using the best model
predicted_label = rf_best.predict(new_patient_df_transformed)

# Decode the predicted disease label
predicted_disease = disease_encoder.inverse_transform(predicted_label)

print(f"Predicted Disease: {predicted_disease[0]}")


Predicted Disease: Common Cold


In [8]:
import pickle

In [22]:
# Save model & encoders
with open("disease_prediction_model.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)

with open("label_encoders.pkl", "wb") as le_file:
    pickle.dump(label_encoders, le_file)

with open("disease_encoder.pkl", "wb") as de_file:
    pickle.dump(disease_encoder, de_file)

print("Model and encoders saved successfully!")

Model and encoders saved successfully!


In [23]:
import pickle

# Assuming `model` is your trained model
with open("disease_prediction_model.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)



In [24]:
with open("disease_prediction_model.pkl", "rb") as model_file:
    try:
        model = pickle.load(model_file)
        print("Model loaded successfully!")
    except EOFError:
        print("Error: Model file is corrupted or incomplete.")



Model loaded successfully!


In [14]:
import os
print(os.listdir())  # List all files in the current directory


['.gitignore', '.qodo', 'disease_encoder.pkl', 'disease_prediction_model.pkl', 'Disease_symptom_and_patient_profile_dataset.csv', 'label_encoders.pkl', 'Untitled6.ipynb']


In [15]:
from google.colab import drive
drive.mount('/content/drive')  # Mount Google Drive


ModuleNotFoundError: No module named 'google.colab'

In [25]:
model_path = "disease_prediction_model.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)


In [26]:
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)


In [18]:
import os
os.remove("disease_prediction_model.pkl")  # Delete corrupted file


In [27]:
import os

file_path = "disease_prediction_model.pkl"

# Check if the file exists
if os.path.exists(file_path):
    print(f"File found: {file_path}")
    print(f"File size: {os.path.getsize(file_path)} bytes")
else:
    print("❌ File not found!")


File found: disease_prediction_model.pkl
File size: 817 bytes


In [28]:
import pickle

file_path = "disease_prediction_model.pkl"

try:
    with open(file_path, "rb") as file:
        model = pickle.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")


✅ Model loaded successfully!


In [29]:
import sys
print(sys.executable)
print(sys.version)


c:\Python312\python.exe
3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]
