# 🧠 Machine Learning Web App (SAIR Project)

This project is part of the **SAIR (Sudanese Artificial Intelligence Road)** community learning path.  
After completing the Regression and Classification modules, this project focuses on applying that knowledge in a practical way.

---

## 🎯 Project Goal
The goal is to build an **interactive web application** that allows users to perform common machine learning tasks without writing code.

---

## ⚙️ What the App Does
- 🗂️ **Upload Dataset:** Users can upload their own dataset (CSV or similar).  
- 📊 **Select Problem Type:** Choose whether the dataset is for **regression** or **classification**.  
- 🤖 **Choose or Upload Model:** Users can either:
  - Upload a pre-trained model, **or**
  - Select a model from the provided list (e.g., Linear Regression, Logistic Regression, Random Forest, etc.).
- 🔁 **Full Pipeline Mode:** The app can automatically:
  - Preprocess the data  
  - Split into train/test sets  
  - Train the model  
  - Evaluate its performance  
  - Show predictions and metrics visually  

---

## 💡 Purpose
To make machine learning **simple, fast, and accessible** — allowing anyone to test and train models through an intuitive web interface.

---



    
    
    
    🛠️ Production Environment Setup

In [41]:
# 🛠️ Advanced imports for production ML

# Suppress warnings for cleaner outputs
import warnings
warnings.filterwarnings('ignore')

# 🔧 Core Python libraries
import numpy as np           # Efficient numerical computations
import pandas as pd          # Data manipulation and analysis
import matplotlib.pyplot as plt  # Basic plotting
import seaborn as sns        # Advanced visualization
from scipy import stats      # Statistical functions
import joblib               # Save/load large models and preprocessing objects
import json                 # Handle JSON configs and outputs
from datetime import datetime  # Timestamping for logs
import os                   # File system operations
import time                 # Time tracking for experiments
import gradio as gr


# 🧰 Sklearn libraries - expanded for advanced ML workflows
from sklearn.datasets import fetch_california_housing  # Real-world dataset
from sklearn.model_selection import (
    train_test_split,     # Split data into train/test sets
    cross_val_score,      # Cross-validation scoring
    GridSearchCV,         # Hyperparameter tuning (grid search)
    RandomizedSearchCV    # Hyperparameter tuning (randomized search)
)
from sklearn.preprocessing import (
    StandardScaler,       # Feature scaling (zero-mean, unit variance)
    RobustScaler,         # Scaling robust to outliers
    PolynomialFeatures    # Generate polynomial features for non-linear relationships
)
from sklearn.pipeline import Pipeline, FeatureUnion  # Build modular pipelines
from sklearn.compose import ColumnTransformer         # Apply different preprocessing to columns
from sklearn.feature_selection import (
    SelectKBest,          # Univariate feature selection
    f_regression,         # Scoring function for regression
    RFE                   # Recursive feature elimination
)
from sklearn.linear_model import (
    LinearRegression,     # Baseline regression
    Ridge,                # L2-regularized regression
    Lasso,                # L1-regularized regression
    ElasticNet            # Combination of L1 and L2 regularization
)
from sklearn.ensemble import (
    RandomForestRegressor,       # Ensemble of decision trees
    GradientBoostingRegressor,   # Boosted trees for regression
    VotingRegressor              # Combine multiple regressors
)
from sklearn.svm import SVR               # Support Vector Regression
from sklearn.metrics import (
    mean_squared_error,  # Regression metric
    r2_score,            # Regression metric
    mean_absolute_error  # Regression metric
)
from sklearn.inspection import (
    permutation_importance,       # Feature importance
    PartialDependenceDisplay      # Partial dependence plots
)

# 🧪 Advanced model tracking with MLflow
import mlflow                  # Experiment tracking
import mlflow.sklearn          # Log sklearn models
from mlflow.models.signature import infer_signature  # Auto-capture input/output schema for reproducible deployment

import gradio as gr
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from xgboost import XGBClassifier # <--- New Import
from lightgbm import LGBMClassifier # <--- New Import
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV


# 🧠 Getting data from the user

In [42]:
class Config:
    """Project-wide constants shared by the training and tuning code.

    Defining the class also creates the model and experiment folders
    (relative to the current working directory) if they are missing.
    """

    # --- Reproducibility and resampling ---
    RANDOM_STATE = 42   # seed handed to every estimator / search that accepts one
    TEST_SIZE = 0.2     # intended share of rows held out for the final test
    VAL_SIZE = 0.2      # intended share of rows held out for validation/tuning
    CV_FOLDS = 5        # number of cross-validation folds
    N_JOBS = -1         # passed as ``n_jobs``; -1 asks for all available cores

    # --- Output locations ---
    MODEL_DIR = "models"
    EXPERIMENT_DIR = "experiments"

    # Create both folders up front; the temporary loop variable is removed
    # afterwards so it does not linger as a class attribute.
    for _folder in (MODEL_DIR, EXPERIMENT_DIR):
        os.makedirs(_folder, exist_ok=True)
    del _folder


config = Config()

# Initialize MLflow for experiment tracking.
# The tracking URI is built with pathlib rather than by gluing "file://" onto
# the absolute path: ``as_uri()`` yields a well-formed file URI on every
# platform (drive letters, backslashes) and percent-encodes characters such
# as spaces, which the hand-built string did not.
from pathlib import Path

mlflow.set_tracking_uri(Path(os.path.abspath(config.EXPERIMENT_DIR)).as_uri())
experiment_name = "AutoML_Benchmark"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///home/abdelhadi/SAIR_2_project/experiments/371399585566611001', creation_time=1763800319038, experiment_id='371399585566611001', last_update_time=1763800319038, lifecycle_stage='active', name='AutoML_Benchmark', tags={}>

In [43]:


# --- Function 1: Load and Store Data ---


# This is tied to the first button.
# It returns the dataframe twice: once to see, once to save.
def load_and_store_data(file):
    """Read an uploaded CSV into a DataFrame for display and for storage.

    Args:
        file: Either a filesystem path, or an upload object that exposes the
            path through a ``name`` attribute. ``None`` means that nothing
            has been uploaded yet.

    Returns:
        tuple: ``(df, df)`` - the same DataFrame twice (one copy of the
        reference to show, one to keep) - or ``(None, None)`` when there is
        no file or it cannot be read as CSV.
    """
    if file is None:
        return None, None

    # Accept both a bare path and an object carrying its path in ``.name``.
    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except Exception as exc:
        # UI boundary: still report "nothing loaded", but leave a trace of
        # the reason instead of discarding it silently.
        print(f"Could not read uploaded file: {exc}")
        return None, None
    return df, df

## 🔍 Step 2: Exploratory Data Analysis (EDA)


In [44]:

# --- 2. EDA Functions ---
def find_missing_values(df):
    """Summarise which columns contain missing values.

    Returns ``None`` when no data is loaded; otherwise a two-column frame
    ("Column", "Missing Count") listing only columns with at least one
    missing entry. The frame is empty when nothing is missing.
    """
    if df is None:
        return None

    null_counts = df.isnull().sum()
    with_gaps = null_counts[null_counts > 0]
    if with_gaps.empty:
        return pd.DataFrame(columns=["Column", "Missing Count"])

    # Move the column names out of the index into a regular column.
    table = with_gaps.to_frame(name="Missing Count")
    table = table.reset_index()
    return table.rename(columns={'index': 'Column'})

def find_categorical_columns(df):
    """List the object/category-typed columns of ``df``.

    Returns ``None`` when no data is loaded; otherwise a one-column frame
    ("Categorical Columns"), which is empty when there are no such columns.
    """
    if df is None:
        return None

    header = ["Categorical Columns"]
    text_like = df.select_dtypes(include=['object', 'category']).columns
    if text_like.empty:
        return pd.DataFrame(columns=header)
    return pd.DataFrame(text_like, columns=header)

def get_shape(df):
    """Return a one-sentence description of the dataset's dimensions."""
    if df is not None:
        n_rows, n_cols = df.shape
        return f"Dataset has {n_rows} rows and {n_cols} columns."
    return "No data loaded."

# --- 3. Split X/y Function ---


In [45]:

# --- 3. Split X/y Function ---
def _coerce_target_dtype(y):
    """Cast a one-column target frame to int only when nothing is lost."""
    column = y.iloc[:, 0]
    if pd.api.types.is_bool_dtype(column) or pd.api.types.is_integer_dtype(column):
        return y.astype(int)
    if pd.api.types.is_float_dtype(column):
        values = column.to_numpy(dtype=float)
        # Whole-number floats (e.g. labels that became 0.0/1.0 because the
        # column once held NaN) are safe to convert back to integers.
        if np.isfinite(values).all() and (values == np.floor(values)).all():
            return y.astype(int)
    # Continuous targets and non-numeric labels are passed through untouched.
    return y


def split_and_save_data(df, y_column_name):
    """Separate the target column from the features.

    Rows whose target is missing are dropped first. The target is converted
    to integers only when that is lossless (boolean, integer, or
    whole-number float values); continuous regression targets and string
    labels are kept as they are instead of being truncated or rejected.

    Args:
        df: The loaded dataset, or ``None`` if nothing was uploaded.
        y_column_name: Name of the target column.

    Returns:
        tuple: ``(status, X, y, X, y)``. ``X`` and ``y`` are returned twice
        (once to show, once to store); on any validation failure the four
        data slots are ``None`` and ``status`` explains the problem.
    """
    if df is None:
        return "Please upload a file first.", None, None, None, None
    if not y_column_name:
        return "Please enter a 'y' column name.", None, None, None, None
    if y_column_name not in df.columns:
        return f"Error: '{y_column_name}' is not in the dataset.", None, None, None, None

    df_cleaned = df.dropna(subset=[y_column_name])
    # Double brackets keep ``y`` a one-column DataFrame rather than a Series.
    y = _coerce_target_dtype(df_cleaned[[y_column_name]])
    X = df_cleaned.drop(columns=[y_column_name])

    return f"Success! Split '{y_column_name}'. Data is ready for preprocessing.", X, y, X, y

# Preprocessing the dataset (handling missing values and categorical columns) and splitting it into train, validation, and test sets

In [46]:

# --- 4. Preprocessing Function (Matches your latest code) ---
def preprocess_data(X, y):
    """Split the data three ways, then impute and one-hot encode it.

    The data is split 60/20/20 into train/validation/test. Numeric columns
    are median-imputed and object/category columns are one-hot encoded; the
    transformer is fitted on the training split only.

    Returns:
        tuple: nine items - ``(status, preview_df, X_train, y_train, X_val,
        y_val, X_test, y_test, X_train_df)``. The processed ``X_*`` values
        are dense arrays, ``preview_df`` holds the first five processed
        training rows and ``X_train_df`` is the full processed training
        frame used for plotting. On failure, or when inputs are missing,
        only ``status`` is set and the other eight items are ``None``.
    """
    if X is None or y is None:
        return ("Please split your data in Tab 3 first.",) + (None,) * 8

    try:
        # Hold out 20% for testing, then 25% of the remaining 80% (= 20% of
        # the total) for validation.
        X_rest, X_test, y_rest, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(
            X_rest, y_rest, test_size=0.25, random_state=42)

        # Collected only for the status report below.
        numeric_nulls = X_train.select_dtypes(include=np.number).isnull().sum()
        imputed_cols = numeric_nulls[numeric_nulls > 0].index.tolist()
        encoded_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

        num_cols = X.select_dtypes(include=np.number).columns
        cat_cols = X.select_dtypes(include=['object', 'category']).columns
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', SimpleImputer(strategy='median'), num_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
            ])

        # Fit on the training split only, then apply to all three splits.
        preprocessor.fit(X_train)
        train_arr, val_arr, test_arr = (
            preprocessor.transform(part) for part in (X_train, X_val, X_test)
        )

        status = (
            f"Preprocessing complete!\n"
            f"Data split (Train/Val/Test): {len(X_train)} / {len(X_val)} / {len(X_test)}\n"
            f"Processed X_train shape: {train_arr.shape}\n\n"
            f"--- Columns Fixed ---\n"
            f"Imputed (filled missing): {imputed_cols}\n"
            f"Encoded (text to numbers): {encoded_cols}"
        )

        # The encoder may produce sparse output; densify it so pandas can
        # build a DataFrame from it.
        if hasattr(train_arr, "toarray"):
            train_arr, val_arr, test_arr = (
                train_arr.toarray(), val_arr.toarray(), test_arr.toarray()
            )

        X_train_df = pd.DataFrame(train_arr, columns=preprocessor.get_feature_names_out())

        return (status, X_train_df.head(),
                train_arr, y_train,
                val_arr, y_val,
                test_arr, y_test,
                X_train_df)

    except Exception as e:
        return (f"An error occurred: {str(e)}",) + (None,) * 8


## 🔍 Graphs of the dataset

In [47]:
# --- 5. Plotting Functions (Using X_train_df) ---
def plot_all_distributions(X_train_df):
    """Draw one histogram (with KDE) per column on a square grid.

    Args:
        X_train_df: Processed training features as a DataFrame, or ``None``.

    Returns:
        The matplotlib figure, or ``None`` when there is no data, there are
        no columns, or plotting fails (the error is printed).
    """
    if X_train_df is None:
        return None

    fig = None
    try:
        cols = X_train_df.columns.tolist()
        if not cols:
            return None

        grid_size = int(np.ceil(np.sqrt(len(cols))))
        # squeeze=False keeps ``axes`` a 2-D array even for a 1x1 grid, so a
        # single-column frame no longer breaks on ``.flatten()``.
        fig, axes = plt.subplots(grid_size, grid_size, figsize=(16, 12), squeeze=False)
        axes = axes.flatten()

        for ax, col in zip(axes, cols):
            sns.histplot(X_train_df[col], kde=True, ax=ax)
            ax.set_title(col, fontsize=10)

        # Hide the grid cells that have no column to show.
        for spare in axes[len(cols):]:
            spare.axis('off')
        fig.tight_layout()

        return fig
    except Exception as e:
        print(e)
        # Do not leave a half-drawn figure registered with pyplot.
        if fig is not None:
            plt.close(fig)
        return None

def plot_correlation(X_train_df):
    """Render an annotated correlation heatmap of the numeric columns.

    Returns the matplotlib figure, or ``None`` when no data is available or
    plotting fails (the error is printed).
    """
    if X_train_df is None:
        return None

    try:
        fig = plt.figure(figsize=(12, 10))
        sns.heatmap(
            X_train_df.corr(numeric_only=True),
            annot=True,
            cmap='coolwarm',
            fmt=".2f",
            square=True,
            linewidths=1,
            cbar_kws={"shrink": 0.8},
        )
        plt.title("Correlation Heatmap")
    except Exception as e:
        print(e)
        return None
    return fig

# Get the task type (regression or classification) from the user

In [48]:
# --- Helper to save the task type ---
def update_task_type(new_value):
    """Pass the chosen task type through unchanged so it can be stored."""
    selected = new_value
    return selected

# Outlier handler and the pipeline

In [49]:

# NEW: Outlier Handler for Robust Models
class OutlierHandler(BaseEstimator, TransformerMixin):
    """Clip every feature to its IQR fences.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, learned in :meth:`fit`. :meth:`transform` clips
    values to those fences instead of dropping rows.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the interquartile range to place the fences.

    Attributes
    ----------
    lower_bounds_ : list of float or None
        Per-column lower fence; ``None`` until :meth:`fit` has run.
    upper_bounds_ : list of float or None
        Per-column upper fence; ``None`` until :meth:`fit` has run.
    """

    def __init__(self, factor=1.5):
        self.factor = factor
        self.lower_bounds_ = None
        self.upper_bounds_ = None

    def fit(self, X, y=None):
        """Learn the per-column fences from ``X`` (array-like, 2-D)."""
        # np.asarray lets DataFrames and nested lists through as well as
        # arrays; positional ``X[:, i]`` indexing would fail on a DataFrame.
        data = np.asarray(X, dtype=float)
        q1, q3 = np.percentile(data, [25, 75], axis=0)
        spread = self.factor * (q3 - q1)
        self.lower_bounds_ = (q1 - spread).tolist()
        self.upper_bounds_ = (q3 + spread).tolist()
        return self

    def transform(self, X):
        """Return a float copy of ``X`` with each column clipped to its fences."""
        if self.lower_bounds_ is None or self.upper_bounds_ is None:
            raise ValueError("OutlierHandler must be fitted before calling transform().")
        # Working in float avoids truncating the fractional fences when the
        # input is an integer array; np.clip returns a new array, so the
        # caller's data is not modified.
        data = np.asarray(X, dtype=float)
        return np.clip(data, self.lower_bounds_, self.upper_bounds_)
    




# Create comprehensive preprocessing pipeline
# Module-level outlier-clipping + scaling pipeline.
# NOTE(review): the functions visible in this file build their own pipelines,
# so confirm whether this global is still used anywhere.
preprocessor = Pipeline(
    steps=[
        # Clip extreme values to the IQR fences first ...
        ('outlier_handler', OutlierHandler(factor=1.5)),
        # ... then scale with RobustScaler.
        ('scaler', RobustScaler()),
    ]
)





# Using the preprocessor pipeline in the UI



In [50]:


def scale_and_clean_data(X_train_processed, X_val_processed, X_test_processed, task_type):
    """Scale the three splits (and clip outliers for regression).

    The pipeline is fitted on the training split only and then applied to
    all three splits. Regression uses ``OutlierHandler`` followed by
    ``RobustScaler``; classification uses ``RobustScaler`` alone.

    Args:
        X_train_processed, X_val_processed, X_test_processed: Output of the
            first preprocessing stage.
        task_type: ``"Regression"`` or ``"Classification"``.

    Returns:
        tuple: ``(status, X_train_scaled, X_val_scaled, X_test_scaled)``.
        When the input is missing, the task type is not recognised, or
        processing fails, the three data slots are ``None`` and ``status``
        carries the message. The function always returns a 4-tuple.
    """
    # 1. Safety checks
    if X_train_processed is None:
        return "Please run Stage 1 (Preprocessing) first.", None, None, None
    if task_type not in ("Regression", "Classification"):
        # Previously this case fell off the end of the function and returned
        # a bare None instead of the expected 4-tuple.
        return f"Error: Unknown task type '{task_type}'", None, None, None

    try:
        # 2. Choose the pipeline steps and the matching status text.
        if task_type == "Regression":
            steps = [
                ('outlier_handler', OutlierHandler(factor=1.5)),  # clip outliers
                ('scaler', RobustScaler()),                       # scale robustly
            ]
            status = (f"‚úÖ Advanced Processing Complete (Regression)\n"
                      f"‚Ä¢ Outliers Capped (Winsorized)\n"
                      f"‚Ä¢ Data Scaled (RobustScaler)")
        else:
            steps = [('scaler', RobustScaler())]
            status = (f"‚úÖ Advanced Processing Complete (classification)\n"
                      f"‚Ä¢ Data Scaled (RobustScaler)")

        # 3. Fit on the training data only.
        advanced_pipeline = Pipeline(steps)
        advanced_pipeline.fit(X_train_processed)

        # 4. Transform all splits with the fitted pipeline.
        X_train_scaled = advanced_pipeline.transform(X_train_processed)
        X_val_scaled = advanced_pipeline.transform(X_val_processed)
        X_test_scaled = advanced_pipeline.transform(X_test_processed)

        return status, X_train_scaled, X_val_scaled, X_test_scaled

    except Exception as e:
        return f"Error: {str(e)}", None, None, None

# Overview of the dataset

In [51]:
from ydata_profiling import ProfileReport

# --- 2.5 Profiling Function ---
def generate_profile_report(df, *, minimal=False):
    """Build an HTML profiling report for the uploaded dataset.

    Args:
        df: The loaded dataset, or ``None`` if nothing was uploaded.
        minimal: Forwarded to ``ProfileReport``. The default (``False``)
            keeps the existing behaviour of producing the full report; pass
            ``True`` to request the reduced report when speed matters.

    Returns:
        str: The report as HTML, or a plain-text message when no data is
        loaded or report generation fails.
    """
    if df is None:
        return "Please upload a file in Tab 1 first."

    try:
        profile = ProfileReport(df, title="Dataset Profiling Report", minimal=minimal)
        return profile.to_html()

    except Exception as e:
        return f"Error generating report: {str(e)}"

# Functions for regression

In [52]:
# --- Define Model Portfolio (Global) ---
# Ensemble that averages three freshly constructed regressors.
voting_ensemble = VotingRegressor([
    ('ridge', Ridge(random_state=config.RANDOM_STATE)),
    ('rf', RandomForestRegressor(random_state=config.RANDOM_STATE, n_jobs=config.N_JOBS)),
    ('gb', GradientBoostingRegressor(random_state=config.RANDOM_STATE))
])

# Regression portfolio: display name -> estimator instance. The ensemble is
# listed last so the iteration order stays the same.
advanced_models = {
    # Linear family
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=config.RANDOM_STATE),
    'Lasso': Lasso(random_state=config.RANDOM_STATE),
    'ElasticNet': ElasticNet(random_state=config.RANDOM_STATE),
    # Tree ensembles
    'Random Forest': RandomForestRegressor(
        n_estimators=100,
        random_state=config.RANDOM_STATE,
        n_jobs=config.N_JOBS,
    ),
    'Gradient Boosting': GradientBoostingRegressor(random_state=config.RANDOM_STATE),
    # Kernel method
    'SVR': SVR(),
    # Combination of the above families
    'Voting Ensemble': voting_ensemble,
}




def evaluate_model_advanced(model, X_train, y_train, X_val, y_val, model_name):
    """Fit one regressor, score it, log it to MLflow and plot its predictions.

    Args:
        model: Estimator with ``fit``/``predict``; it is fitted in place.
        X_train, y_train: Training features and target.
        X_val, y_val: Validation features and target used for ranking.
        model_name: Display name used in the run name, plot title and metrics.

    Returns:
        tuple: ``(metrics, fig, model, val_r2)`` where ``metrics`` is a dict
        of rounded scores, ``fig`` is a scatter plot of actual versus
        predicted validation values, and ``model`` is the fitted estimator.
        On any failure the error is printed and
        ``(None, None, None, -inf)`` is returned.
    """
    fig = None
    try:
        # 1. Train (only the fit itself is timed).
        start_time = time.time()
        model.fit(X_train, y_train)
        duration = time.time() - start_time

        # 2. Validation scores - used for ranking.
        val_preds = model.predict(X_val)
        val_r2 = r2_score(y_val, val_preds)
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

        # 3. Training score - diagnostics only.
        train_preds = model.predict(X_train)
        train_r2 = r2_score(y_train, train_preds)

        # 4. A large positive gap means the model fits the training data
        #    much better than the validation data.
        gap = train_r2 - val_r2

        # 5. Cross-validation on the training split (stability check).
        cv_scores = cross_val_score(model, X_train, y_train, cv=config.CV_FOLDS, scoring='r2', n_jobs=config.N_JOBS)
        cv_mean = cv_scores.mean()

        # 6. MLflow logging. The validation RMSE was previously computed and
        #    then thrown away; it is now recorded with the other metrics.
        with mlflow.start_run(run_name=f"Train_{model_name}", nested=True):
            mlflow.log_param("model_name", model_name)
            mlflow.log_metric("val_r2", val_r2)
            mlflow.log_metric("val_rmse", val_rmse)
            mlflow.log_metric("train_r2", train_r2)
            mlflow.log_metric("overfitting_gap", gap)
            mlflow.sklearn.log_model(model, "model")

        # 7. Actual vs. predicted scatter with the ideal y = x line.
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.scatterplot(x=y_val, y=val_preds, alpha=0.6, ax=ax)
        min_v, max_v = y_val.min(), y_val.max()
        ax.plot([min_v, max_v], [min_v, max_v], 'r--', lw=2)
        ax.set_title(f"{model_name} (Val R¬≤: {val_r2:.2f})", fontsize=10)

        metrics = {
            "Model": model_name,
            "Val R¬≤": round(val_r2, 4),
            "Train R¬≤": round(train_r2, 4),
            "Overfitting": round(gap, 4),
            "CV R¬≤": round(cv_mean, 4),
            "Time (s)": round(duration, 2)
        }
        return metrics, fig, model, val_r2

    except Exception as e:
        print(f"Error training {model_name}: {e}")
        # Do not leave a half-built figure registered with pyplot.
        if fig is not None:
            plt.close(fig)
        return None, None, None, -np.inf

# Functions for classification

In [53]:



#--- CLASSIFICATION models  (Updated) ---
classification_models = {
    # Classification portfolio: display name -> estimator instance.
    # Wherever an estimator accepts a seed it receives config.RANDOM_STATE.

    # 📊 Linear models
    'Logistic Regression': LogisticRegression(
        random_state=config.RANDOM_STATE,
        max_iter=1000,
        C=0.1,
        solver='liblinear'
    ),

    # 🌲 Tree-based methods
    'Decision Tree': DecisionTreeClassifier(random_state=config.RANDOM_STATE),
    'Random Forest': RandomForestClassifier(
        random_state=config.RANDOM_STATE,
        n_jobs=config.N_JOBS,
        n_estimators=200,
        max_depth=15
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        random_state=config.RANDOM_STATE,
        n_estimators=200,
        learning_rate=0.1
    ),
    'AdaBoost': AdaBoostClassifier(random_state=config.RANDOM_STATE),

    # 🚀 Gradient-boosting libraries (XGBoost / LightGBM)
    'XGBoost': XGBClassifier(
        random_state=config.RANDOM_STATE,
        n_jobs=config.N_JOBS,
        eval_metric='logloss',
        n_estimators=200, # Reduced slightly for speed in web app
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8
    ),
    
    'LightGBM': LGBMClassifier(
        random_state=config.RANDOM_STATE,
        n_jobs=config.N_JOBS,
        verbose=-1,
        n_estimators=200, # Reduced slightly for speed
        learning_rate=0.05,
        num_leaves=31
    ),

    # 🔍 Instance-based methods
    'K-Nearest Neighbors': KNeighborsClassifier(n_jobs=config.N_JOBS),

    # 🎯 Kernel methods
    'Support Vector Machine': SVC(
        random_state=config.RANDOM_STATE,
        probability=True,
        kernel='rbf',
        C=1.0
    ),
}



# Classification training function

def evaluate_classification_model(model, X_train, y_train, X_val, y_val, model_name):
    """Fit one classifier, score it, log it to MLflow and plot its confusion matrix.

    Returns ``(metrics, fig, model, val_acc)``: a dict of rounded scores,
    the confusion-matrix figure, the fitted estimator and the validation
    accuracy used for ranking. On any failure the error is printed and
    ``(None, None, None, -inf)`` is returned.
    """
    try:
        # Train, timing only the fit itself.
        fit_started = time.time()
        model.fit(X_train, y_train)
        duration = time.time() - fit_started

        # Predictions on both splits.
        val_preds = model.predict(X_val)
        train_preds = model.predict(X_train)

        # Accuracy on both splits; weighted F1 on validation.
        val_acc = accuracy_score(y_val, val_preds)
        train_acc = accuracy_score(y_train, train_preds)
        val_f1 = f1_score(y_val, val_preds, average='weighted')

        # Train-minus-validation gap as an overfitting indicator.
        gap = train_acc - val_acc

        # Cross-validated accuracy on the training split (stability check).
        cv_mean = cross_val_score(
            model, X_train, y_train,
            cv=config.CV_FOLDS, scoring='accuracy', n_jobs=config.N_JOBS,
        ).mean()

        # Record the run in MLflow.
        with mlflow.start_run(run_name=f"Train_{model_name}", nested=True):
            mlflow.log_param("model_name", model_name)
            for metric_name, metric_value in (
                ("val_accuracy", val_acc),
                ("val_f1", val_f1),
                ("overfitting_gap", gap),
            ):
                mlflow.log_metric(metric_name, metric_value)
            mlflow.sklearn.log_model(model, "model")

        # Confusion matrix heatmap: true label (rows) vs predicted (columns).
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.heatmap(confusion_matrix(y_val, val_preds),
                    annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f"{model_name} (Acc: {val_acc:.2%})", fontsize=10)
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        plt.tight_layout()

        summary = {
            "Model": model_name,
            "Val Accuracy": round(val_acc, 4),
            "Train Accuracy": round(train_acc, 4),
            "Overfitting": round(gap, 4),
            "CV Accuracy": round(cv_mean, 4),
            "F1 Score": round(val_f1, 4),
            "Time (s)": round(duration, 2)
        }
        return summary, fig, model, val_acc

    except Exception as e:
        print(f"Error training {model_name}: {e}")
        return None, None, None, -np.inf



# for training 

In [54]:
# --- 9. Auto-ML Pipeline Function (Fixed) ---
def train_models_pipeline(X_train_final, y_train, X_val_final, y_val, X_test_final, y_test, task_type):
    """Train every model in the portfolio, rank them and score the winner.

    Each model for the chosen task is trained and evaluated on the
    validation split. The results form a leaderboard and a bar chart; the
    model with the best validation score is saved to ``best_model.pkl`` and
    scored once on the held-out test split.

    Args:
        X_train_final, y_train: Training features and target.
        X_val_final, y_val: Validation features and target (used for ranking).
        X_test_final, y_test: Test features and target (final check only).
        task_type: ``"Regression"`` or ``"Classification"``.

    Returns:
        tuple: ``(leaderboard, fig, best_model_path, status)``. When inputs
        are missing, the task type is unknown, or no model trains, the first
        three items are ``None`` and ``status`` holds the message. When
        models trained but none produced a comparable score, the leaderboard
        and chart are still returned with ``best_model_path`` set to ``None``.
    """
    # 1. Safety checks
    if X_train_final is None or y_train is None:
        return None, None, None, "Please run Advanced Processing (Tab 5) first."

    # Validate the task before any training setup happens.
    if task_type == "Regression":
        portfolio, worker, score_column = advanced_models, evaluate_model_advanced, "Val R¬≤"
    elif task_type == "Classification":
        portfolio, worker, score_column = classification_models, evaluate_classification_model, "Val Accuracy"
    else:
        return None, None, None, f"Error: Unknown task type '{task_type}'"
    is_regression = task_type == "Regression"

    # Estimators and metrics expect 1-D targets; np.ravel also flattens
    # one-column DataFrames and Series.
    y_train = np.ravel(y_train)
    y_val = np.ravel(y_val)
    y_test = np.ravel(y_test)

    # 2. Setup
    mlflow.set_experiment("AutoML_Benchmark")
    results_list = []
    best_score = -np.inf
    best_model = None
    best_model_name = ""

    # 3. Train every model and keep track of the winner.
    for name, model in portfolio.items():
        try:
            metrics, model_fig, trained_model, score = worker(
                model, X_train_final, y_train, X_val_final, y_val, name
            )
        except Exception as e:
            print(f"Skipping {name} due to error: {e}")
            continue

        # The per-model figure is not shown anywhere; close it so pyplot
        # does not accumulate one open figure per trained model.
        if model_fig is not None:
            plt.close(model_fig)

        if not metrics:
            continue
        results_list.append(metrics)
        if score > best_score:
            best_score = score
            best_model = trained_model
            best_model_name = name

    if not results_list:
        if is_regression:
            return None, None, None, "Error: No models were trained successfully. Check your data shape."
        return None, None, None, "Error: No classification models trained."

    # 4. Leaderboard, best validation score first.
    leaderboard = pd.DataFrame(results_list).sort_values(by=score_column, ascending=False)

    # 5. Bar chart of validation scores.
    try:
        fig = plt.figure(figsize=(10, 6))
        if is_regression:
            sns.barplot(data=leaderboard, x='Val R¬≤', y='Model', palette='viridis')
            plt.title('Model Comparison: Validation R¬≤ (Higher is Better)', fontsize=14)
            plt.xlabel('R¬≤ Score')
            plt.ylabel('')
            plt.xlim(0, 1)
            plt.grid(axis='x', linestyle='--', alpha=0.7)
        else:
            sns.barplot(data=leaderboard, x='Val Accuracy', y='Model', palette='magma')
            plt.title('Classification Leaderboard: Validation Accuracy', fontsize=14)
            plt.xlim(0, 1)
        plt.tight_layout()
    except Exception as e:
        print(f"Plotting error: {e}")
        fig = None

    # A NaN score never compares greater than the running best, so it is
    # possible to have results without a winner; report that instead of
    # failing on ``None.predict``.
    if best_model is None:
        return leaderboard, fig, None, "Error: Could not select a best model (no valid validation score)."

    # 6. Save the best model.
    best_model_path = "best_model.pkl"
    joblib.dump(best_model, best_model_path)

    # 7. Final check on the held-out test split.
    test_preds = best_model.predict(X_test_final)
    if is_regression:
        test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
        test_r2 = r2_score(y_test, test_preds)
        status = (f"‚úÖ Training Complete!\n"
                  f"üèÜ Best Model: {best_model_name}\n"
                  f"üìà Best R¬≤: {best_score:.4f}\n"
                  f"üöÄ FINAL TEST SCORES (Held-out Data):\n"
                  f"‚Ä¢ R¬≤ Score: {test_r2:.4f}\n"
                  f"‚Ä¢ RMSE:     {test_rmse:.4f}\n"
                  f"üíæ Saved to: {best_model_path}")
    else:
        test_score = accuracy_score(y_test, test_preds)
        test_metric_name = "Accuracy"
        status = (f"‚úÖ Training Complete! Mode: {task_type}\n"
                  f"üèÜ Best Model: {best_model_name}\n"
                  f"üìä Val Score:  {best_score:.4f}\n"
                  f"üöÄ TEST SET {test_metric_name}: {test_score:.4f}\n"
                  f"üíæ Saved to: {best_model_path}")

    return leaderboard, fig, best_model_path, status

# Hyperparameter tuning

# Grids for regression and classification


In [55]:
# Define comprehensive hyperparameter grids

    #Regression
# Regression search spaces, keyed by the same display names as the
# regression model portfolio so a model's grid can be looked up by name.
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200, 300],  # Number of trees
        'max_depth': [None, 10, 20, 30],       # Tree depth
        'min_samples_split': [2, 5, 10],       # Minimum samples to split
        'min_samples_leaf': [1, 2, 4],         # Minimum samples per leaf
        # 1.0 (= all features) replaces the former 'auto' option, which
        # current scikit-learn releases reject; for regressors the two
        # mean the same thing.
        'max_features': [1.0, 'sqrt', 'log2']  # Features to consider for splits
    },

    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],        # Number of boosting stages
        'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size shrinkage
        'max_depth': [3, 4, 5, 6],             # Maximum depth per tree
        'min_samples_split': [2, 5, 10],       # Minimum samples to split
        'subsample': [0.8, 0.9, 1.0]           # Fraction of samples for fitting
    },

    # Keyed as 'Ridge' so it matches the portfolio entry of the same name.
    'Ridge': {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],  # Regularization strength
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']  # Algorithm
    },

    'Voting Ensemble': {
        # "<estimator name>__<parameter>" addresses a member of the ensemble.
        'ridge__alpha': [0.1, 1.0, 10.0],
        'rf__n_estimators': [50, 100],
        'rf__max_depth': [10, 20],
        'gb__n_estimators': [50, 100],
        'gb__learning_rate': [0.05, 0.1]
    }
}

# Backward-compatible alias: the grid used to be registered under this name.
param_grids['Ridge Regression'] = param_grids['Ridge']



    # 2. Classification Grids (Your New Definitions)
# Classification search spaces, keyed by model display name.
# A value is either one dict of parameter -> candidate values, or a list of
# such dicts when some parameter combinations are only valid together.
param_grids_classification = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 'auto' is deprecated in newer scikit-learn, so it is left out here.
        'max_features': ['sqrt', 'log2'] 
    },
    
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 4, 5, 6],
        'min_samples_split': [2, 5, 10],
        'subsample': [0.8, 0.9, 1.0]
    },
    
    # Two separate sub-grids so that each penalty is only paired with a
    # solver that supports it.
    'Logistic Regression': [
        # Grid 1: ElasticNet (requires saga)
        {
            'penalty': ['elasticnet'],
            'solver': ['saga'],
            'l1_ratio': [0.2, 0.5, 0.8],
            'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
        },
        # Grid 2: L1/L2 (works with liblinear or saga)
        {
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
        }
    ],
    
    "LightGBM": {
        "num_leaves": [15, 31, 63],
        "max_depth": [-1, 5, 10],
        "learning_rate": [0.05, 0.1, 0.2],
        "n_estimators": [100, 200],
        "subsample": [0.7, 0.9, 1.0],
        "colsample_bytree": [0.7, 0.9, 1.0]
    },
    
    "XGBoost": {
        "learning_rate": [0.05, 0.1, 0.2],
        "max_depth": [3, 5, 7],
        "n_estimators": [100, 200, 400],
        "subsample": [0.7, 0.9, 1.0],
        "colsample_bytree": [0.7, 0.9, 1.0]
    },
    
    'Support Vector Machine': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf']
    },
    
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance']
    }
}

In [56]:

# --- 10. Hyperparameter Tuning Function ---
def _tuning_error(message):
    """Build the failure result with `message` in the status (first) slot."""
    return message, None, None, "Error"


def _fit_random_search(base_model, grid, scoring, X_train, y_train):
    """Fit a RandomizedSearchCV (n_iter=20) using the shared `config` settings."""
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=grid,
        n_iter=20,  # Try 20 random combinations
        cv=config.CV_FOLDS,
        scoring=scoring,
        n_jobs=config.N_JOBS,
        random_state=config.RANDOM_STATE,
    )
    search.fit(X_train, y_train)
    return search


def _save_tuned_model(best_model, model_name):
    """Dump the tuned estimator with joblib and return the file name used."""
    filename = f"tuned_{model_name.replace(' ', '_')}.pkl"
    joblib.dump(best_model, filename)
    return filename


def _tune_regression(base_model, grid, model_name,
                     X_train, y_train, X_val, y_val, X_test, y_test):
    """Search, log, score and plot a regression model; return (status, fig, filename)."""
    search = _fit_random_search(base_model, grid, 'r2', X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_
    best_cv_score = search.best_score_

    # Log the search outcome to the active MLflow run
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_r2", best_cv_score)
    mlflow.sklearn.log_model(best_model, "tuned_model")

    # Validation scores
    preds = best_model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, preds))
    val_r2 = r2_score(y_val, preds)

    # Test scores (final held-out check)
    test_preds = best_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    test_r2 = r2_score(y_test, test_preds)

    # Record the scores shown to the user in the tracking run as well
    mlflow.log_metric("val_r2", val_r2)
    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("test_r2", test_r2)
    mlflow.log_metric("test_rmse", test_rmse)

    status = (f"‚úÖ Tuning Complete for {model_name}!\n"
              f"-----------------------------------\n"
              f"üèÜ Best CV R¬≤:      {best_cv_score:.4f}\n"
              f"üìä Validation R¬≤:   {val_r2:.4f}\n"
              f"üìâ Validation RMSE: {val_rmse:.4f}\n\n"
              f"üöÄ FINAL TEST SCORES (Held-out Data):\n"
              f"‚Ä¢ R¬≤ Score: {test_r2:.4f}\n"
              f"‚Ä¢ RMSE:     {test_rmse:.4f}\n"
              f"‚öôÔ∏è Best Parameters:\n{best_params}")

    # Actual-vs-predicted scatter on the validation set, with the y = x reference line
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x=y_val, y=preds, alpha=0.6, ax=ax, color='#2ca02c')
    min_v, max_v = y_val.min(), y_val.max()
    ax.plot([min_v, max_v], [min_v, max_v], 'r--', lw=2, label="Ideal Fit")
    ax.set_title(f"Tuned {model_name} (Val R¬≤: {val_r2:.2f})")
    ax.set_xlabel("Actual")
    ax.set_ylabel("Predicted")
    ax.legend()
    # Lay out this figure explicitly instead of pyplot's "current" figure,
    # then drop it from pyplot's registry so figures do not pile up across runs.
    fig.tight_layout()
    plt.close(fig)

    return status, fig, _save_tuned_model(best_model, model_name)


def _tune_classification(base_model, grid, model_name,
                         X_train, y_train, X_val, y_val, X_test, y_test):
    """Search, log, score and plot a classifier; return (status, fig, filename)."""
    search = _fit_random_search(base_model, grid, 'accuracy', X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_
    best_cv_score = search.best_score_

    # Validation and test accuracy
    val_preds = best_model.predict(X_val)
    val_score = accuracy_score(y_val, val_preds)
    test_preds = best_model.predict(X_test)
    test_score = accuracy_score(y_test, test_preds)

    # Log the search outcome and all reported scores to the active MLflow run
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_accuracy", best_cv_score)
    mlflow.log_metric("val_accuracy", val_score)
    mlflow.log_metric("test_accuracy", test_score)
    mlflow.sklearn.log_model(best_model, "tuned_model")

    status = (f"‚úÖ Tuning Complete for {model_name}!\n"
              f"-----------------------------------\n"
              f"üèÜ Best CV Acc:   {best_cv_score:.2%}\n"
              f"üìä Val Acc:       {val_score:.2%}\n"
              f"üöÄ Test Set Acc:  {test_score:.2%}\n\n"
              f"‚öôÔ∏è Best Parameters:\n{best_params}")

    # Confusion matrix on the test set
    fig, ax = plt.subplots(figsize=(8, 6))
    cm = confusion_matrix(y_test, test_preds)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', ax=ax)
    ax.set_title(f"Tuned {model_name} (Test Acc: {test_score:.2%})")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    plt.close(fig)  # keep pyplot from accumulating one open figure per run

    return status, fig, _save_tuned_model(best_model, model_name)


def tune_models_pipeline(X_train_final, y_train, X_val_final, y_val, X_test_final, y_test, model_name, task_type):
    """Tune one model with RandomizedSearchCV and report its scores.

    Args:
        X_train_final, X_val_final, X_test_final: feature sets for the
            train / validation / test splits.
        y_train, y_val, y_test: matching targets; flattened with np.ravel.
        model_name: key looked up in `advanced_models` / `param_grids`
            (Regression) or `classification_models` /
            `param_grids_classification` (Classification).
        task_type: "Regression" or "Classification".

    Returns:
        A 4-tuple `(status, figure, filename, state)`:
        status text, matplotlib figure (or None), path of the saved model
        file (or None), and "Done" / "Error".  Every failure puts its message
        in the first slot so it is displayed wherever the status text is.
    """
    # 1. Safety checks: every split is used below, so all of them must exist
    splits = (X_train_final, y_train, X_val_final, y_val, X_test_final, y_test)
    if any(part is None for part in splits):
        return _tuning_error("Please run Advanced Processing (Tab 5) first.")

    # Flatten targets to 1-D
    y_train = np.ravel(y_train)
    y_val = np.ravel(y_val)
    y_test = np.ravel(y_test)

    # 2. Pick the model, its grid and the task-specific routine
    if task_type == "Regression":
        if model_name not in advanced_models:
            return _tuning_error(f"Error: Model '{model_name}' not found in portfolio.")
        if model_name not in param_grids:
            return _tuning_error(f"Error: No hyperparameter grid defined for '{model_name}'.")
        base_model = advanced_models[model_name]
        grid = param_grids[model_name]
        run_tuning = _tune_regression
    elif task_type == "Classification":
        if model_name not in classification_models:
            return _tuning_error(f"Error: Model '{model_name}' not found in Classification list.")
        if model_name not in param_grids_classification:
            return _tuning_error(f"Error: No grid for '{model_name}'.")
        base_model = classification_models[model_name]
        grid = param_grids_classification[model_name]
        run_tuning = _tune_classification
    else:
        return "Unknown Error", None, None, "Error"

    # 3. Run inside one MLflow run. This is the UI boundary, so any failure is
    #    turned into a status message instead of propagating to the caller.
    try:
        with mlflow.start_run(run_name=f"Tune_{model_name}"):
            status, fig, filename = run_tuning(
                base_model, grid, model_name,
                X_train_final, y_train,
                X_val_final, y_val,
                X_test_final, y_test,
            )
        return status, fig, filename, "Done"
    except Exception as e:
        return _tuning_error(f"Tuning failed: {str(e)}")

## 🔍 THE UI SETUP WITH GRADIO AND THE BUTTON PROCESSORS

In [57]:

# --- The UI (using gr.Blocks) ---
# Builds the tabbed Gradio app. Data is passed between tabs through gr.State
# components; each button wires one processing function to its inputs/outputs.
with gr.Blocks(title="Auto-ML App") as demo:

    # --- State Components ---
    # Raw data and the X / y split (Tab 1)
    df_state = gr.State()
    X_state = gr.State()
    y_state = gr.State()

    # Outputs of the preprocessing step (Tab 3)
    X_train_processed_state = gr.State()
    y_train_state = gr.State()
    X_val_processed_state = gr.State()
    y_val_state = gr.State()
    X_test_processed_state = gr.State()
    y_test_state = gr.State()

    # This holds the DataFrame for plotting (Tab 4)
    X_train_df_state = gr.State()

    # Task type chosen by the user; defaults to "Regression"
    task_type_state = gr.State(value="Regression")

    # States for the final data produced by Tab 5
    X_train_final_state = gr.State()
    X_val_final_state = gr.State()
    X_test_final_state = gr.State()

    gr.Markdown("<h1 style='text-align: center;'>Automated Model Trainer</h1>")

    with gr.Tabs():

        # --- Tab 1: Upload & Split ---
        with gr.TabItem("1. Upload & Split"):
            gr.Markdown("# üìà  Data Splitter")

            with gr.Row():
                file_input = gr.File(label="Upload your CSV", file_types=[".csv"])
                upload_button = gr.Button("1. Upload Data")

            df_output = gr.DataFrame(label="Data Preview ")

            gr.Markdown("---")

            y_column_input = gr.Textbox(label="Enter Target ('y') Column Name", placeholder="e.g., median_house_value")
            split_button = gr.Button("2. Split Data")
            status_output = gr.Textbox(label="Status")

            with gr.Row():
                X_output = gr.DataFrame(label="Features (X)")
                y_output = gr.DataFrame(label="Target (y)")

            upload_button.click(fn=load_and_store_data, inputs=[file_input], outputs=[df_output, df_state])
            split_button.click(fn=split_and_save_data, inputs=[df_state, y_column_input], outputs=[status_output, X_output, y_output, X_state, y_state])

        # --- Tab 2.5: Advanced Profiling ---
        with gr.TabItem("2.5. Advanced Profiling"):
            gr.Markdown("## üìë Comprehensive Data Report")
            gr.Markdown("Generate a full automated report (histograms, correlations, warnings) .")

            report_btn = gr.Button("Generate Profile Report", variant="primary")

            # gr.HTML renders the report markup returned by the handler
            report_output = gr.HTML(label="Profiling Report")

            # The report is generated from the original dataframe (df_state)
            report_btn.click(
                fn=generate_profile_report,
                inputs=[df_state],
                outputs=[report_output]
            )

        # --- Tab 2: EDA ---
        with gr.TabItem("2. EDA"):
            gr.Markdown("## Simple Exploratory Data Analysis")
            with gr.Row():
                missing_button = gr.Button("Find Missing Values")
                categorical_button = gr.Button("Find Categorical Columns")
                shape_button = gr.Button("Show Data Shape")

            with gr.Row():
                missing_output = gr.DataFrame(label="Missing Values")
                categorical_output = gr.DataFrame(label="Categorical Columns")
            shape_output = gr.Textbox(label="Data Shape")

            shape_button.click(fn=get_shape, inputs=[df_state], outputs=[shape_output])
            missing_button.click(fn=find_missing_values, inputs=[df_state], outputs=[missing_output])
            categorical_button.click(fn=find_categorical_columns, inputs=[df_state], outputs=[categorical_output])

        # --- Tab 3: Preprocessing ---
        with gr.TabItem("3. Preprocessing"):
            gr.Markdown("Run the full preprocessing pipeline.")

            # Task type selector; its value is mirrored into task_type_state below
            task_type_radio = gr.Radio(
                choices=["Regression", "Classification"],
                label="Select Task Type",
                value="Regression",
                info="This choice will be used in the next tab for Outlier Analysis."
            )
            preprocess_button = gr.Button("3. Run Preprocessing")
            preprocess_status = gr.Textbox(label="Status", lines=4)
            preprocess_preview = gr.DataFrame(label="Processed X_train Preview")

            # Keep task_type_state in sync whenever the radio selection changes
            task_type_radio.change(
                fn=update_task_type,
                inputs=[task_type_radio],
                outputs=[task_type_state]
            )

            # Nine outputs: status, preview, the six split states, and the
            # training DataFrame that Tab 4 plots from
            preprocess_button.click(
                fn=preprocess_data,
                inputs=[X_state, y_state],
                outputs=[
                    preprocess_status,
                    preprocess_preview,
                    X_train_processed_state, y_train_state,
                    X_val_processed_state, y_val_state,
                    X_test_processed_state, y_test_state,
                    X_train_df_state
                ]
            )

        # --- Tab 4: Model EDA ---
        with gr.TabItem("4. Model EDA"):
            gr.Markdown("Explore the processed training data.")
            with gr.Row():
                plot_dists_btn = gr.Button("Plot All Distributions")
                plot_corr_btn = gr.Button("Generate Correlation Heatmap")

            # Both buttons draw into the same plot area
            plot_output = gr.Plot(label="Plot Output")

            plot_dists_btn.click(fn=plot_all_distributions, inputs=[X_train_df_state], outputs=[plot_output])
            plot_corr_btn.click(fn=plot_correlation, inputs=[X_train_df_state], outputs=[plot_output])

        # --- Tab 5: Advanced Processing ---
        with gr.TabItem("5. Advanced Processing"):
            gr.Markdown("## üöÄ Stage 2: Advanced Transformation")
            gr.Markdown("Apply **Outlier Capping** and **Robust Scaling**")

            advanced_button = gr.Button("Run Advanced Pipeline")
            advanced_status = gr.Textbox(label="Status", lines=4)

            # Inputs: the processed splits from Tab 3 plus the task type
            # Outputs: status text plus the final train / val / test states
            advanced_button.click(
                fn=scale_and_clean_data,
                inputs=[X_train_processed_state, X_val_processed_state, X_test_processed_state, task_type_state],
                outputs=[advanced_status, X_train_final_state, X_val_final_state, X_test_final_state]
            )

        # --- Tab 6: Auto-ML Training ---
        with gr.TabItem("6. Train Model"):
            gr.Markdown("## ü§ñ Auto-ML Training")
            gr.Markdown("Train the entire portfolio of models using your **Cleaned & Scaled Data**.")

            train_all_btn = gr.Button("üöÄ Run Auto-ML Experiment", variant="primary")
            train_status = gr.Textbox(label="Experiment Status", lines=8, max_lines=20, interactive=False)

            with gr.Row():
                leaderboard_output = gr.DataFrame(label="üèÜ Model Leaderboard")
                model_file_output = gr.File(label="üíæ Download Best Model")

            comparison_plot = gr.Plot(label="üìä R¬≤ Score Comparison")

            # Training uses the final states from Tab 5 and the targets from Tab 3
            train_all_btn.click(
                fn=train_models_pipeline,
                inputs=[
                    X_train_final_state,
                    y_train_state,
                    X_val_final_state,
                    y_val_state,
                    X_test_final_state,
                    y_test_state,
                    task_type_state
                ],
                outputs=[leaderboard_output, comparison_plot, model_file_output, train_status]
            )

        # --- Tab 7: Hyperparameter Tuning ---
        with gr.TabItem("7. Tune Model"):
            gr.Markdown("## üéõÔ∏è Hyperparameter Tuning")
            gr.Markdown("Optimize a specific model to find the best settings using **RandomizedSearchCV**.")

            # Offer every model that has a grid for either task. Listing only
            # the regression grids made classification-only models impossible
            # to select. dict.fromkeys de-duplicates while keeping order.
            tunable_models = list(dict.fromkeys([*param_grids, *param_grids_classification]))

            with gr.Row():
                tune_model_selector = gr.Dropdown(
                    choices=tunable_models,
                    label="Select Model to Tune",
                    value="Random Forest"
                )
                tune_btn = gr.Button("Start Tuning", variant="primary")

            tune_status = gr.Textbox(label="Tuning Results", lines=10)
            # tune_models_pipeline returns four values; the fourth is a short
            # run state / message, so it gets its own read-only component
            tune_state = gr.Textbox(label="Run State", interactive=False)

            with gr.Row():
                tune_plot = gr.Plot(label="Tuned Model Performance")
                tuned_model_file = gr.File(label="Download Tuned Model")

            tune_btn.click(
                fn=tune_models_pipeline,
                inputs=[
                    X_train_final_state,
                    y_train_state,
                    X_val_final_state,
                    y_val_state,
                    X_test_final_state,
                    y_test_state,
                    tune_model_selector,
                    task_type_state
                ],
                # One component per returned value, in return order
                outputs=[tune_status, tune_plot, tuned_model_file, tune_state]
            )

demo.launch()

* Running on local URL:  http://127.0.0.1:7868
* To create a public link, set `share=True` in `launch()`.


