# **Early Disease Prediction**

## **Importing Libraries**

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## **Loading Training Data**

In [3]:
# Load the training set and discard the 'Unnamed: 133' column in one
# expression, so re-running this cell always rebuilds train_df from the file.
# NOTE(review): 'Unnamed: 133' is presumably an artifact column produced by
# the CSV layout rather than a real feature -- confirm against the raw file.
train_df = pd.read_csv('Training.csv').drop(columns='Unnamed: 133')

## **Loading Testing Data**

In [4]:
# Read the held-out test set; unlike the training load, no column is dropped
# here before the features/target split below.
test_df = pd.read_csv(filepath_or_buffer='Testing.csv')

## **Data Preprocessing**

In [5]:
# Split each frame positionally: every column except the last is a feature,
# the last column is the target.
# NOTE(review): this relies on Training.csv and Testing.csv sharing the same
# column order -- confirm if either file is regenerated.
X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

## **Target Column Encoding**

In [6]:
# Learn the class <-> integer mapping from the training labels only, then
# apply that same mapping to both splits (a test label that was never seen in
# training makes transform() raise).
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

## **Logistic Regression**

In [7]:
# 1. Logistic Regression
print("=== Logistic Regression ===")

# Standardise the features, then fit a logistic-regression classifier.
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# fit() returns the fitted pipeline, so training and inference can be chained.
lr_pred = lr_pipeline.fit(X_train, y_train_encoded).predict(X_test)

# Report held-out accuracy and the per-class breakdown with readable labels.
print("Accuracy:", accuracy_score(y_test_encoded, lr_pred))
print("Classification Report:")
print(
    classification_report(
        y_test_encoded,
        lr_pred,
        target_names=label_encoder.classes_,
    )
)
print("\n" + "="*50 + "\n")

=== Logistic Regression ===
Accuracy: 0.9761904761904762
Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            

## **Random Forest**

In [8]:
# 2. Random Forest with GridSearch
print("=== Random Forest ===")

# Search space; every key is prefixed with the pipeline step name
# ('classifier') so GridSearchCV routes it to the forest.
param_grid_rf = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

# Single-step pipeline: no scaler in front of the forest.
rf_pipeline = Pipeline(steps=[
    ('classifier', RandomForestClassifier(random_state=42)),
])

# 5-fold cross-validated search over the grid, scored by accuracy, using all cores.
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train_encoded)

# Evaluate the best configuration found by the search on the held-out test set.
best_rf = grid_rf.best_estimator_
rf_pred = best_rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test_encoded, rf_pred))
print("Best Parameters:", grid_rf.best_params_)
print("Classification Report:")
print(
    classification_report(
        y_test_encoded,
        rf_pred,
        target_names=label_encoder.classes_,
    )
)
print("\n" + "="*50 + "\n")

=== Random Forest ===
Accuracy: 0.9761904761904762
Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         

## **XGBoost**

In [9]:
# 3. XGBoost
print("=== XGBoost ===")

# Multi-class gradient-boosted trees with a fixed configuration (no search).
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(label_encoder.classes_),
    eval_metric='mlogloss',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    tree_method='hist',  # histogram-based tree construction
    n_jobs=-1,           # use all cores
    random_state=42,
)

# fit() returns the fitted estimator, so training and inference can be chained.
xgb_pred = xgb_model.fit(X_train, y_train_encoded).predict(X_test)

print("Accuracy:", accuracy_score(y_test_encoded, xgb_pred))
print("Classification Report:")
print(
    classification_report(
        y_test_encoded,
        xgb_pred,
        target_names=label_encoder.classes_,
    )
)

# Persist the XGBoost model together with everything inference needs: the
# label encoder (to map class indices back to disease names) and the training
# column order (to rebuild feature vectors in the right order).
model_data = dict(
    model=xgb_model,
    label_encoder=label_encoder,
    feature_names=X_train.columns.tolist(),
)

joblib.dump(model_data, 'disease_prediction_xgboost.pkl')
print("\nModel and label encoder saved successfully!")

=== XGBoost ===
Accuracy: 0.9761904761904762
Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       0.50      1.00      0.67         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold 

## **Prediction on Test Data**

In [10]:
# Prediction function
def predict_disease(symptoms):
    """Predict the most likely disease(s) from a mapping of symptoms.

    The function is self-contained: it loads the bundle stored in
    'disease_prediction_xgboost.pkl' (model, label encoder and feature names)
    and uses only objects from that bundle, so it does not depend on any
    notebook-level variable being defined.

    Parameters
    ----------
    symptoms : dict
        Maps a symptom (feature) name to its value. Features that are not
        mentioned default to 0. Keys that are not among the saved feature
        names are ignored.

    Returns
    -------
    dict
        'primary_prediction' : label returned by the model's ``predict``.
        'all_predictions'    : dict mapping every class label to its
                               predicted probability.
        'top3_predictions'   : list of (label, probability) pairs for the
                               three most probable classes, most probable
                               first.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load bundles that this project produced itself.
    loaded_data = joblib.load('disease_prediction_xgboost.pkl')
    model = loaded_data['model']
    encoder = loaded_data['label_encoder']
    features = loaded_data['feature_names']

    # Start from "no symptoms present", then overlay the recognised ones.
    symptom_dict = dict.fromkeys(features, 0)
    for symptom, value in symptoms.items():
        if symptom in symptom_dict:
            symptom_dict[symptom] = value

    # Single-row feature matrix, in the column order stored with the model.
    symptoms_array = np.array(
        [symptom_dict[feature] for feature in features]
    ).reshape(1, -1)

    # Make prediction
    pred_encoded = model.predict(symptoms_array)
    disease = encoder.inverse_transform(pred_encoded)
    class_probs = model.predict_proba(symptoms_array)[0]

    # argsort is ascending: keep the last three indices and reverse them so
    # the most probable class comes first.
    top3_indices = np.argsort(class_probs)[-3:][::-1]
    top3_diseases = encoder.inverse_transform(top3_indices)
    top3_probs = class_probs[top3_indices]

    return {
        'primary_prediction': disease[0],
        # Use the encoder loaded from the bundle (not a notebook global) so
        # the labels always match the model that produced the probabilities.
        'all_predictions': dict(zip(encoder.classes_, class_probs)),
        'top3_predictions': list(zip(top3_diseases, top3_probs)),
    }