# 🩺 AI Powered Leukemia Detection Model
This notebook implements an AI-powered model to detect leukemia using machine learning techniques. It includes data preprocessing, model training, evaluation, and prediction functionalities.

# 🛠️ Standard Imports
This cell imports all the necessary libraries and modules required for data processing, visualization, and machine learning.

In [None]:
# Standard imports - DO NOT CHANGE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import warnings
warnings.filterwarnings('ignore')

# 🧠 Leukemia Detection Model Class
This cell defines the `LeukemiaDetectionModel` class, which includes methods for loading data, preprocessing, training models, and making predictions.

In [None]:
#LeukemiaDetectionModel class - DO NOT CHANGE
class LeukemiaDetectionModel:
    """Tabular classification pipeline for a leukemia-status target.

    The intended call order is ``load_data`` -> ``explore_data`` (optional) ->
    ``preprocess_data`` -> ``select_features`` -> ``train_models`` ->
    ``evaluate_models``. Every step returns ``self`` so calls can be chained.

    Note that the whole dataset is used for training *and* evaluation; there
    is no hold-out split, so reported metrics are training metrics.
    """

    def __init__(self, target_column=None, id_column=None):
        """Initialize the model.

        Args:
            target_column: Name of the label column. Auto-detected in
                ``load_data`` when left as ``None``.
            id_column: Name of an identifier column to exclude from the
                features. Auto-detected in ``load_data`` when ``None``.
        """
        self.target_column = target_column
        self.id_column = id_column
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.trained_models = None
        # Set when a non-numeric target gets label-encoded, so the integer
        # classes can be mapped back to the original labels.
        self.target_encoder = None

    @staticmethod
    def _looks_like_id_column(column_name):
        """Return True when ``column_name`` contains "id" as a separate word.

        A plain substring test would also match feature names such as
        ``Platelet_Width`` or ``Lipid``, so "id" must be delimited by a
        non-letter/string boundary (``patient_id``, ``ID``) or appear as a
        camel-case suffix (``PatientID``, ``patientId``).
        """
        import re

        name = str(column_name)
        if re.search(r'(?:^|[^a-z])id(?:$|[^a-z])', name.lower()):
            return True
        return re.search(r'[a-z](?:ID|Id)(?:$|[^a-z])', name) is not None

    @staticmethod
    def _is_categorical_dtype(dtype):
        """Return True for object, string and pandas categorical dtypes."""
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    def auto_detect_columns(self):
        """Detect the target and ID columns from the loaded column names.

        The target is the first column whose name contains both "leukemia"
        and "status" (case-insensitive). The ID column is the first column
        whose name contains "id" as a separate word; it stays ``None`` when
        no column qualifies.

        Raises:
            ValueError: If no target column can be detected, or if an
                explicitly configured target/ID column is not in the data.
        """
        if self.target_column is None:
            for col in self.df.columns:
                lowered = str(col).lower()
                if 'leukemia' in lowered and 'status' in lowered:
                    self.target_column = col
                    print(f"Detected target column: {self.target_column}")
                    break
            if self.target_column is None:
                raise ValueError("Unable to detect target column related to leukemia status. Please specify it explicitly.")
        elif self.target_column not in self.df.columns:
            raise ValueError(f"Target column '{self.target_column}' not found in the data.")

        if self.id_column is None:
            for col in self.df.columns:
                if col != self.target_column and self._looks_like_id_column(col):
                    self.id_column = col
                    print(f"Detected ID column: {self.id_column}")
                    break
        elif self.id_column not in self.df.columns:
            raise ValueError(f"ID column '{self.id_column}' not found in the data.")

    def load_data(self, data_path=None):
        """Load the dataset, detect key columns and impute missing values.

        Rows whose target is missing are dropped (an imputed label would
        corrupt the classes). Remaining gaps are filled with the column mean
        for numeric columns and with ``'Unknown'`` for everything else.

        Args:
            data_path: Path to a CSV, Excel (.xlsx) or JSON file.

        Returns:
            self

        Raises:
            ValueError: If ``data_path`` is missing or has an unsupported
                extension, or if the target column cannot be determined.
        """
        if data_path is None:
            raise ValueError("data_path must be provided")

        self.df = self._load_data_from_path(data_path)

        # Automatically detect target and ID columns if not provided
        self.auto_detect_columns()

        # Never impute the label itself: drop unlabeled rows instead.
        missing_target = self.df[self.target_column].isnull()
        if missing_target.any():
            print(f"Dropping {int(missing_target.sum())} row(s) with a missing target value")
            self.df = self.df.loc[~missing_target].reset_index(drop=True)

        # Handle missing values in the remaining columns
        if self.df.isnull().values.any():
            self.df = self.df.fillna(self.df.mean(numeric_only=True))
            self.df = self.df.fillna('Unknown')

        return self

    def _encode_target(self):
        """Label-encode the target column in place when it is not numeric.

        Numeric targets are left untouched, so calling this repeatedly is
        harmless. The fitted encoder is kept in ``self.target_encoder``.
        """
        target = self.df[self.target_column]
        if not self._is_categorical_dtype(target.dtype):
            return

        encoder = LabelEncoder()
        self.df[self.target_column] = encoder.fit_transform(target)
        self.target_encoder = encoder
        print("\nConverted target values:")
        for i, label in enumerate(encoder.classes_):
            print(f"{label} -> {i}")

    def _detect_feature_types(self):
        """Populate ``numerical_features`` and ``categorical_features``.

        The target and ID columns are excluded. Boolean and other dtypes
        (e.g. datetimes) are placed in neither list.
        """
        excluded = {self.target_column, self.id_column}
        self.numerical_features = []
        self.categorical_features = []
        for col in self.df.columns:
            if col in excluded:
                continue
            dtype = self.df[col].dtype
            if self._is_categorical_dtype(dtype):
                self.categorical_features.append(col)
            elif pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
                self.numerical_features.append(col)

    def explore_data(self, max_plots=6):
        """Print a data summary and plot numerical feature distributions.

        Side effects: encodes a non-numeric target column in place and sets
        ``numerical_features`` / ``categorical_features``.

        Args:
            max_plots: Maximum number of numerical features to plot.

        Returns:
            self
        """
        print("\nClass Distribution for", self.target_column)
        print(self.df[self.target_column].value_counts(normalize=True))

        # Convert target to numerical if it's categorical
        self._encode_target()

        # Automatically identify numerical and categorical columns
        self._detect_feature_types()

        print("\nNumerical features:", self.numerical_features)
        print("Categorical features:", self.categorical_features)

        # Plot distributions of numerical features
        n_plots = min(len(self.numerical_features), max_plots)
        if n_plots > 0:
            # Grow the grid with the number of plots; a fixed 2x3 grid fails
            # as soon as max_plots exceeds 6.
            n_cols = 3
            n_rows = max(2, (n_plots + n_cols - 1) // n_cols)
            plt.figure(figsize=(15, 5 * n_rows))
            for i, feature in enumerate(self.numerical_features[:n_plots], 1):
                plt.subplot(n_rows, n_cols, i)
                sns.histplot(data=self.df, x=feature, hue=self.target_column, multiple="stack")
                plt.title(f'Distribution of {feature}')
            plt.tight_layout()
            plt.show()

        return self

    def preprocess_data(self):
        """Encode categorical features and scale the full feature matrix.

        Works with or without a prior ``explore_data`` call. Sets
        ``X_train``, ``y_train`` and ``X_train_scaled``; fits one
        ``LabelEncoder`` per categorical feature and fits ``self.scaler``.
        The entire dataset is used as training data (no hold-out split).

        Returns:
            self
        """
        # Make preprocessing independent of explore_data(): both helpers are
        # no-ops when their work has already been done.
        self._encode_target()
        if not hasattr(self, 'categorical_features'):
            self._detect_feature_types()

        df_processed = self.df.copy()

        # Encode categorical variables
        for feature in self.categorical_features:
            self.label_encoders[feature] = LabelEncoder()
            df_processed[feature] = self.label_encoders[feature].fit_transform(df_processed[feature])

        # Prepare features and target
        exclude_cols = [col for col in [self.target_column, self.id_column] if col is not None]
        X = df_processed.drop(exclude_cols, axis=1).values
        y = df_processed[self.target_column].values

        # Use the entire dataset for training
        self.X_train = X
        self.y_train = y

        # Scale the features
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)

        return self

    def select_features(self, n_features=10):
        """Record the feature list; no actual selection is performed.

        Args:
            n_features: Accepted for backward compatibility and ignored;
                every non-target, non-ID column is kept, in column order
                (the same order as the columns of ``X_train``).

        Returns:
            self

        Raises:
            ValueError: If ``preprocess_data`` has not been run yet.
        """
        if not hasattr(self, 'X_train_scaled') or not hasattr(self, 'y_train'):
            raise ValueError("Data must be preprocessed before feature selection.")

        # Use all features
        excluded = (self.target_column, self.id_column)
        self.selected_features = [col for col in self.df.columns if col not in excluded]
        print(f"Using all features: {self.selected_features}")

        return self

    def train_models(self):
        """Grid-search and fit Logistic Regression and Random Forest models.

        Each grid search uses 3-fold cross-validation scored by precision.
        The best estimator per model is stored in ``self.trained_models``.

        Returns:
            self

        Raises:
            ValueError: If ``preprocess_data`` has not been run yet.
        """
        from sklearn.model_selection import GridSearchCV

        if not hasattr(self, 'X_train_scaled') or not hasattr(self, 'y_train'):
            raise ValueError("Data must be preprocessed before training.")

        # Define hyperparameter grids
        param_grids = {
            'Logistic Regression': {
                'C': [0.1, 1, 10],
                'penalty': ['l2'],
                'solver': ['lbfgs']
            },
            'Random Forest': {
                'n_estimators': [50, 100, 200],
                'max_depth': [10, 20, None],
                'class_weight': ['balanced', 'balanced_subsample']
            }
        }

        self.models = {
            'Logistic Regression': LogisticRegression(max_iter=1000),
            'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1)
        }

        self.trained_models = {}
        for name, model in self.models.items():
            print(f"\nTraining {name} with hyperparameter tuning...")
            grid_search = GridSearchCV(model, param_grids[name], cv=3, scoring='precision')
            grid_search.fit(self.X_train_scaled, self.y_train)
            best_model = grid_search.best_estimator_
            # Classifier.score() is mean accuracy, so label it as such.
            train_score = best_model.score(self.X_train_scaled, self.y_train)
            print(f"{name} best parameters: {grid_search.best_params_}")
            print(f"{name} training accuracy: {train_score:.4f}")
            self.trained_models[name] = best_model

        return self

    def evaluate_models(self, threshold=0.6):
        """Evaluate the trained models on the training data.

        Args:
            threshold: Minimum predicted probability of class 1 required to
                predict class 1. The default of 0.6 (above the usual 0.5)
                trades recall for precision.

        Returns:
            self. ``self.results`` maps each model name to a dict with the
            keys ``'accuracy'`` and ``'predictions'``.

        Raises:
            ValueError: If ``train_models`` has not been run yet.
        """
        if not self.trained_models:
            raise ValueError("Models must be trained before evaluation.")

        self.results = {}
        for name, model in self.trained_models.items():
            print(f"\nEvaluating {name}...")
            y_pred_proba = model.predict_proba(self.X_train_scaled)[:, 1]
            y_pred = (y_pred_proba >= threshold).astype(int)
            accuracy = accuracy_score(self.y_train, y_pred)
            self.results[name] = {
                'accuracy': accuracy,
                'predictions': y_pred
            }
            print(f"\nClassification Report for {name}:")
            print(classification_report(self.y_train, y_pred))
        return self

    def _load_data_from_path(self, data_path):
        """Read ``data_path`` into a DataFrame based on its file extension.

        The extension check is case-insensitive and accepts ``str`` as well
        as path-like objects.

        Raises:
            ValueError: If the extension is not .csv, .xlsx or .json.
        """
        extension_source = str(data_path).lower()
        if extension_source.endswith('.csv'):
            return pd.read_csv(data_path)
        if extension_source.endswith('.xlsx'):
            return pd.read_excel(data_path)
        if extension_source.endswith('.json'):
            return pd.read_json(data_path)
        raise ValueError("Unsupported file format. Please provide a CSV, Excel, or JSON file.")

    def predict_unlabeled_data(self, user_input):
        """Predict the target class for a single observation.

        Args:
            user_input: Mapping from feature name to raw value. Categorical
                features are given as their original labels; they are
                encoded here with the encoders fitted in ``preprocess_data``.

        Returns:
            dict mapping each trained model's name to its predicted class.

        Raises:
            ValueError: If the models have not been trained or
                ``select_features`` has not been run.
            KeyError: If ``user_input`` lacks one of the features.
        """
        if not self.trained_models:
            raise ValueError("Models must be trained before prediction.")
        if not hasattr(self, 'selected_features'):
            raise ValueError("Features must be selected before prediction.")

        # Align user input with training features, applying the same label
        # encoding that the training data went through.
        aligned_input = []
        for feature in self.selected_features:
            value = user_input[feature]
            encoder = self.label_encoders.get(feature)
            if encoder is not None:
                value = encoder.transform([value])[0]
            aligned_input.append(value)

        # Scale with the scaler fitted on the training data
        aligned_input_scaled = self.scaler.transform([aligned_input])

        # Use the trained models to predict
        return {
            name: model.predict(aligned_input_scaled)[0]
            for name, model in self.trained_models.items()
        }

# 🚀 Run the Leukemia Detection Model
This cell creates an instance of the `LeukemiaDetectionModel` class, loads the dataset, explores the data, preprocesses it, registers the feature list (all features are used; no feature selection is performed), trains the models, and evaluates their performance on the training data.

In [None]:
# Create and run the model with automatic column detection
model = LeukemiaDetectionModel()

model.load_data(
    data_path='synthetic_leukemia_biomarkers.csv'  # Replace with your data file path
)

model.explore_data()

model.preprocess_data()

# Register the feature list. select_features() keeps every feature; the
# n_features argument is currently ignored by the implementation.
model.select_features(n_features=10)

model.train_models()

model.evaluate_models()

# Generate a confusion matrix for each trained model.
# NOTE: predict() applies the default 0.5 decision threshold, whereas
# evaluate_models() thresholds the class-1 probability at 0.6, so these counts
# can differ from the classification reports printed above.
for name, trained_model in model.trained_models.items():
    print(f'Confusion Matrix for {name}:')
    y_pred = trained_model.predict(model.X_train_scaled)  # Use training data

    # Combine ground truth and predictions into a DataFrame
    results_df = pd.DataFrame({
        'Ground_Truth': model.y_train,  # Use training labels
        'Prediction': y_pred
    })

    # Display the DataFrame
    print("Combined Results DataFrame:")
    print(results_df)

    # Create and display confusion matrix
    cm = confusion_matrix(model.y_train, y_pred)  # Use training labels
    print("\nConfusion Matrix:")
    print(cm)

    # Visualize the confusion matrix. confusion_matrix() orders rows/columns by
    # sorted class value, so index 0 is class 0 and index 1 is class 1. Class 1
    # is reported as "Has Leukemia" elsewhere in this notebook, so it must be
    # the second label here.
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Does Not Have Leukemia', 'Has Leukemia'])
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix for {name}')
    plt.show()

# Instructions for interpreting classification metrics
print("\nHow to Interpret the Classification Report:")
print("Precision: The proportion of true positive predictions out of all positive predictions. High precision means fewer false positives.")
print("Recall: The proportion of true positive predictions out of all actual positives. High recall means fewer false negatives.")
print("F1-Score: The harmonic mean of precision and recall. A balanced metric when precision and recall are equally important.")

# 🔍 Predict Leukemia Status
This cell prompts the user for a value for every feature used in training and predicts whether the person has leukemia using each of the trained models.

In [None]:
# Predict leukemia status for a new dataset using user input with all features
def get_user_input():
    """Prompt for a value for every model feature and return them as a dict.

    Reads the module-level ``model``. Features that have a fitted label
    encoder in ``model.label_encoders`` are treated as categorical: the user
    picks one of the known categories and the original label is returned
    unencoded. All other features are read as floats. Invalid entries are
    re-prompted instead of aborting the whole session.

    Returns:
        dict mapping feature name to the entered value.
    """
    print("\nPlease provide the following inputs for the features:")
    user_data = {}
    for feature in model.selected_features:
        encoder = model.label_encoders.get(feature)

        if encoder is not None:
            # Categorical feature: only labels seen during training are valid.
            options = {str(label): label for label in encoder.classes_}
            while True:
                raw = input(f"Enter value for {feature} (Options {list(options)}): ").strip()
                if raw in options:
                    user_data[feature] = options[raw]
                    break
                print(f"'{raw}' is not a known category for {feature}. Please try again.")
            continue

        min_val = model.df[feature].min()
        max_val = model.df[feature].max()
        while True:
            raw = input(f"Enter value for {feature} (Range [{min_val}, {max_val}]): ")
            try:
                user_data[feature] = float(raw)
            except ValueError:
                print(f"'{raw}' is not a valid number. Please try again.")
            else:
                break
    return user_data

# Get user input and make predictions
user_input = get_user_input()

# Delegate feature alignment, scaling and prediction to the model instead of
# duplicating that logic here, so the notebook and the class cannot drift apart.
predictions = model.predict_unlabeled_data(user_input)

# Convert predictions to human-readable labels and display them
# (class 1 is reported as "Has Leukemia", any other class as not having it).
print("\nPredictions:")  # Single print statement for all predictions
for model_name, prediction in predictions.items():
    result = "Has Leukemia" if prediction == 1 else "Does Not Have Leukemia"
    print(f"Prediction from {model_name}: {result}")