<a href="https://colab.research.google.com/github/abdurrahman16/streamlitapp.py/blob/main/streamlit_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import streamlit as st
import pandas as pd
import numpy as np
import pickle
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import shap
from sklearn.preprocessing import StandardScaler

# Page-level settings (title, icon, layout, sidebar state), gathered in one
# mapping and handed to Streamlit in a single call.
_page_settings = {
    "page_title": "Breast Cancer Diagnosis ML Dashboard",
    "page_icon": "🏥",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_page_settings)

# Cached loader for every artifact the dashboard reads from disk
@st.cache_data
def load_data():
    """Read the saved ML pipeline artifacts and bundle them into one dict.

    All files are read from paths under ``breast_cancer_ml_results/``:
    the pickled models bundle, four ``.npy`` test-set arrays, the pickled
    SHAP and LIME results, and the SHAP feature-importance CSV.

    NOTE(review): ``pickle.load`` can execute arbitrary code stored in the
    file. These artifacts are presumably produced by this project's own
    training pipeline -- confirm they never come from an untrusted source.

    Returns:
        dict with the keys ``models``, ``scaler``, ``model_performance``,
        ``X_test_scaled``, ``X_test_original``, ``y_test``,
        ``feature_names``, ``shap_values``, ``feature_importance`` and
        ``lime_data``.
    """
    def _unpickle(path):
        # Binary mode is required by pickle; the handle closes on exit.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Trained models bundle (also carries the scaler and performance metrics)
    models_bundle = _unpickle('breast_cancer_ml_results/models/trained_models.pkl')

    # Held-out test arrays
    scaled_features = np.load('breast_cancer_ml_results/data/X_test_scaled.npy')
    original_features = np.load('breast_cancer_ml_results/data/X_test_original.npy')
    labels = np.load('breast_cancer_ml_results/data/y_test.npy')
    names = np.load('breast_cancer_ml_results/data/feature_names.npy')

    # Explainability outputs
    shap_bundle = _unpickle('breast_cancer_ml_results/explainability/shap/shap_results.pkl')
    lime_bundle = _unpickle('breast_cancer_ml_results/explainability/lime/lime_results.pkl')
    importance_table = pd.read_csv('breast_cancer_ml_results/explainability/shap/feature_importance.csv')

    bundle = {}
    bundle['models'] = models_bundle['models']
    bundle['scaler'] = models_bundle['scaler']
    bundle['model_performance'] = models_bundle['model_performance']
    bundle['X_test_scaled'] = scaled_features
    bundle['X_test_original'] = original_features
    bundle['y_test'] = labels
    bundle['feature_names'] = names
    bundle['shap_values'] = shap_bundle['shap_values']
    bundle['feature_importance'] = importance_table
    bundle['lime_data'] = lime_bundle
    return bundle

# Fetch every artifact through the cached loader
data = load_data()

# Dashboard heading and tagline
st.title("Breast Cancer Diagnosis ML Dashboard")
st.markdown("**Diagnostic Support with Explainable Predictions**")

# Sidebar navigation: the selected label decides which page section below runs
_analysis_pages = [
    "Model Overview",
    "Individual Prediction",
    "Feature Analysis",
    "Model Comparison",
]
st.sidebar.header("Navigation")
page = st.sidebar.selectbox("Select Analysis", _analysis_pages)

# Model Overview Page
# Shows headline metrics, a per-model metric bar chart, the top-10 feature
# importances and two static insight boxes.
if page == "Model Overview":
    y_test = np.asarray(data['y_test'])
    n_cases = len(y_test)
    # Count benign samples directly. The previous int(len * (1 - mean))
    # truncated floating-point error and could under-report by one.
    # Everything that is not labelled 1 is treated as benign, matching the
    # label convention used on the other pages.
    n_benign = int((y_test != 1).sum())
    benign_pct = 100 * n_benign / n_cases if n_cases else 0.0

    # Pick the headline model from the loaded metrics instead of hard-coding
    # a name and accuracy that can drift from the saved results.
    # Metric values are treated as fractions (they are scaled by 100 below).
    best_name, best_metrics = max(
        data['model_performance'].items(),
        key=lambda item: item[1]['accuracy'],
    )

    col1, col2, col3 = st.columns(3)

    # Performance metrics
    with col1:
        st.metric("Best Model", best_name, f"{best_metrics['accuracy']:.2%} Accuracy")
    with col2:
        st.metric("Total Predictions", n_cases, f"{n_cases} Test Cases")
    with col3:
        st.metric("Benign Cases", f"{benign_pct:.1f}%", f"{n_benign} samples")

    # Model performance comparison
    st.subheader("Model Performance Comparison")

    performance_df = pd.DataFrame([
        {
            'Model': model_name,
            'Accuracy': metrics['accuracy'] * 100,
            'Precision': metrics['precision'] * 100,
            'Recall': metrics['recall'] * 100,
            'AUC-ROC': metrics['auc'] * 100,
        }
        for model_name, metrics in data['model_performance'].items()
    ])

    fig = px.bar(
        performance_df.melt(id_vars=['Model'], var_name='Metric', value_name='Score'),
        x='Model', y='Score', color='Metric',
        title="Model Performance Metrics (%)",
        height=400
    )
    # Keep the zoomed 90-100 view when every score fits in it; otherwise
    # lower the floor so a weaker metric is not clipped out of the chart.
    lowest_score = performance_df.drop(columns='Model').to_numpy().min()
    y_floor = 90 if lowest_score > 90 else max(0, float(np.floor(lowest_score)) - 5)
    fig.update_layout(yaxis_range=[y_floor, 100])
    st.plotly_chart(fig, use_container_width=True)

    # Top predictive features
    st.subheader("Most Important Diagnostic Features")

    # First ten rows of the importance table (assumes the CSV is already
    # sorted by importance -- TODO confirm against the export step).
    top_features = data['feature_importance'].head(10)

    fig = px.bar(
        top_features,
        y='feature',
        x='importance',
        orientation='h',
        title="Top 10 Features Driving Malignancy Predictions",
        height=500
    )
    # Largest bar at the top of the horizontal chart
    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
    st.plotly_chart(fig, use_container_width=True)

    # Clinical insights
    st.subheader("Key Clinical Insights")
    col1, col2 = st.columns(2)

    with col1:
        st.info("""
        **Primary Predictors:**
        - Tumor size metrics (area, radius)
        - Surface irregularities (concave points)
        - Texture variations
        """)

    with col2:
        # Report the loaded figures; only claim "no false positives" when
        # precision really is 1.0.
        precision = best_metrics['precision']
        if precision >= 1.0:
            precision_line = "100% precision (no false positives)"
        else:
            precision_line = f"{precision:.2%} precision"
        st.success(f"""
        **Model Reliability:**
        - {best_metrics['accuracy']:.2%} accuracy on test data
        - {precision_line}
        - High consistency across all metrics
        """)

# Individual Prediction Page
elif page == "Individual Prediction":
    st.subheader("Individual Patient Analysis")

    # Sample selection
    sample_idx = st.selectbox(
        "Select Patient Sample",
        range(len(data['y_test'])),
        format_func=lambda x: f"Patient {x+1} ({'Malignant' if data['y_test'][x] == 1 else 'Benign'})"
    )

    # Get predictions for selected sample
    sample_data = data['X_test_scaled'][sample_idx:sample_idx+1]
    actual_label = 'Malignant' if data['y_test'][sample_idx] == 1 else 'Benign'

    # Model predictions
    col1, col2, col3 = st.columns(3)

    predictions = {}
    for model_name, model in data['models'].items():
        pred_proba = model.predict_proba(sample_data)[0][1]
        predictions[model_name] = pred_proba

        with col1 if model_name == 'Logistic Regression' else col2 if model_name == 'Random Forest' else col3:
            confidence = "High" if abs(pred_proba - 0.5) > 0.3 else "Medium" if abs(pred_proba - 0.5) > 0.15 else "Low"
            st.metric(
                model_name,
                f"{pred_proba:.1%}",
                f"Actual: {actual_label}, Confidence: {confidence}"
            )

    # Prediction explanation using SHAP
    st.subheader("Why This Prediction?")

    # Get SHAP values for this sample
    rf_shap_values = data['shap_values']['Random Forest'][sample_idx]

    # Create feature contribution chart
    feature_contributions = []
    for i, (feature, shap_val) in enumerate(zip(data['feature_names'], rf_shap_values)):
        if abs(shap_val) > 0.01:  # Only show significant contributions
            feature_contributions.append({
                'Feature': feature.replace('_', ' ').title(),
                'Contribution': shap_val,
                'Direction': 'Malignant' if shap_val > 0 else 'Benign',
                'Magnitude': abs(shap_val)
            })

    contrib_df = pd.DataFrame(feature_contributions).sort_values('Magnitude', ascending=False).head(10)

    if not contrib_df.empty:
        fig = px.bar(
            contrib_df,
            y='Feature',
            x='Contribution',
            color='Direction',
            orientation='h',
            title=f"Top Feature Contributions for Patient {sample_idx+1}",
            color_discrete_map={'Malignant': '#ff6b6b', 'Benign': '#4ecdc4'}
        )
        fig.update_layout(yaxis={'categoryorder': 'total ascending'})
        st.plotly_chart(fig, use_container_width=True)

    # Treatment recommendations
    if predictions['Neural Network'] > 0.7:
        st.error("**High Risk**: Recommend immediate biopsy and specialist consultation")
    elif predictions['Neural Network'] > 0.3:
        st.warning("**Moderate Risk**: Consider additional imaging and follow-up in 3-6 months")
    else:
        st.success("**Low Risk**: Routine monitoring recommended")

# Feature Analysis Page
elif page == "Feature Analysis":
    st.subheader("Feature Analysis & Patterns")

    # Feature selection
    selected_features = st.multiselect(
        "Select Features to Analyze",
        data['feature_names'],
        default=data['feature_importance']['feature'].head(5).tolist()
    )

    if selected_features:
        # Feature distribution comparison
        fig = make_subplots(
            rows=len(selected_features), cols=1,
            subplot_titles=[f.replace('_', ' ').title() for f in selected_features],
            vertical_spacing=0.1
        )

        for i, feature in enumerate(selected_features):
            feature_idx = list(data['feature_names']).index(feature)

            # Get feature values
            benign_values = data['X_test_original'][data['y_test'] == 0, feature_idx]
            malignant_values = data['X_test_original'][data['y_test'] == 1, feature_idx]

            # Add histograms
            fig.add_trace(
                go.Histogram(x=benign_values, name='Benign', opacity=0.7,
                           marker_color='#4ecdc4', legendgroup='benign',
                           showlegend=(i==0)),
                row=i+1, col=1
            )
            fig.add_trace(
                go.Histogram(x=malignant_values, name='Malignant', opacity=0.7,
                           marker_color='#ff6b6b', legendgroup='malignant',
                           showlegend=(i==0)),
                row=i+1, col=1
            )

        fig.update_layout(
            height=300*len(selected_features),
            title="Feature Distribution: Benign vs Malignant"
        )
        st.plotly_chart(fig, use_container_width=True)

        # Feature correlation heatmap
        if len(selected_features) > 1:
            st.subheader("Feature Correlations")

            # Create correlation matrix
            feature_indices = [list(data['feature_names']).index(f) for f in selected_features]
            selected_data = data['X_test_original'][:, feature_indices]
            correlation_matrix = np.corrcoef(selected_data.T)

            fig = px.imshow(
                correlation_matrix,
                x=selected_features,
                y=selected_features,
                color_continuous_scale='RdBu',
                aspect='auto',
                title="Feature Correlation Matrix"
            )
            st.plotly_chart(fig, use_container_width=True)

# Model Comparison Page
elif page == "Model Comparison":
    st.subheader("Detailed Model Comparison")

    # Prediction confidence comparison
    st.subheader("Prediction Confidence Distribution")

    confidence_data = []
    for model_name, model in data['models'].items():
        probabilities = model.predict_proba(data['X_test_scaled'])[:, 1]
        for i, prob in enumerate(probabilities):
            confidence_data.append({
                'Model': model_name,
                'Probability': prob,
                'Actual': 'Malignant' if data['y_test'][i] == 1 else 'Benign',
                'Correct': (prob > 0.5) == (data['y_test'][i] == 1)
            })

    confidence_df = pd.DataFrame(confidence_data)

    fig = px.box(
        confidence_df,
        x='Model',
        y='Probability',
        color='Actual',
        title="Prediction Confidence by Model and Actual Diagnosis"
    )
    fig.add_hline(y=0.5, line_dash="dash", line_color="red",
                  annotation_text="Decision Threshold")
    st.plotly_chart(fig, use_container_width=True)

    # Model agreement analysis
    st.subheader("Model Agreement Analysis")

    # Get predictions from all models
    all_predictions = {}
    for model_name, model in data['models'].items():
        all_predictions[model_name] = model.predict_proba(data['X_test_scaled'])[:, 1]

    # Find cases where models disagree
    disagreement_cases = []
    for i in range(len(data['y_test'])):
        preds = [all_predictions[model][i] > 0.5 for model in all_predictions.keys()]
        if not all(p == preds[0] for p in preds):  # Not all models agree
            disagreement_cases.append({
                'Sample': i+1,
                'Actual': 'Malignant' if data['y_test'][i] == 1 else 'Benign',
                'LR_Prob': all_predictions['Logistic Regression'][i],
                'RF_Prob': all_predictions['Random Forest'][i],
                'NN_Prob': all_predictions['Neural Network'][i]
            })

    if disagreement_cases:
        st.warning(f"Found {len(disagreement_cases)} cases where models disagree")
        disagreement_df = pd.DataFrame(disagreement_cases)
        st.dataframe(disagreement_df, use_container_width=True)
    else:
        st.success("All models agree on all test cases!")

    # Performance metrics table
    st.subheader("Detailed Performance Metrics")

    metrics_df = pd.DataFrame(data['model_performance']).T
    metrics_df = metrics_df.round(4)
    st.dataframe(metrics_df, use_container_width=True)

2025-09-08 00:54:45.356 No runtime found, using MemoryCacheStorageManager
