In [1]:
#Name:
#Section & Year:
#Date:
#AI-Powered Exploratory Data Analytics (EDA) using Hugging Face API and Gradio in Python:
#From Descriptive to Prescriptive Insights

import os
import re
import warnings
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from huggingface_hub import InferenceClient
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

#---------- CONFIG --------------------------
# Read the access token from the HF_TOKEN environment variable so a real
# credential never has to be pasted into (and shared with) the notebook.
# The placeholder remains the fallback, so nothing changes when the variable
# is unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN") or "YOUR HUGGING FACE TOKEN"
# Model id used both for the connection test and for every LLM request below.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

#Test if Hugging Face connection is live (supports both text-generation & conversational models)
def test_hf_connection(token, model):
    """
    Probe the Hugging Face Inference API once and report whether it answered.

    A 5-token text-generation request is tried first; if that raises for any
    reason, a 5-token chat-completion request is tried instead (some models
    only support the chat endpoint).

    Returns (client, status_message) when one of the probes succeeds, or
    (None, status_message) when the client cannot be created or both probes fail.
    """
    probe_text = "Connection test"
    try:
        hf_client = InferenceClient(model=model, token=token)
        try:
            hf_client.text_generation(probe_text, max_new_tokens=5)
        except Exception:
            # Text generation was rejected: fall back to the conversational endpoint.
            # An error raised here is handled by the outer handlers below.
            hf_client.chat_completion(
                messages=[{"role": "user", "content": probe_text}],
                max_tokens=5,
            )
            return hf_client, "Hugging Face connection successful (chat model)."
        return hf_client, "Hugging Face connection successful (text-generation)."
    except requests.exceptions.RequestException:
        return None, "Network connection to Hugging Face failed."
    except Exception as err:
        return None, f"Hugging Face connection error: {err}"

# Plot defaults shared by every chart generated below
sns.set(style="whitegrid")
plt.rcParams['figure.max_open_warning'] = 0

# Probe the Inference API once at start-up; `client` is None when the probe failed,
# which the LLM helpers treat as "LLM disabled".
client, HF_STATUS = test_hf_connection(HF_TOKEN, MODEL_NAME)

# Show the connection state in the cell output
print(HF_STATUS)



✅ Hugging Face connection successful (chat model).


In [2]:
# -------- Helpers ----------
def fig_to_pil(dpi=150):
    """
    Render the current matplotlib figure to an RGB PIL image and close the figure.

    Parameters:
      dpi: resolution passed to ``plt.savefig``.

    Returns the rendered image. The current figure is closed and the in-memory
    buffer released even when saving or decoding raises, so a failed chart does
    not leave an open figure behind; the exception itself still propagates.
    """
    try:
        with BytesIO() as buf:
            plt.savefig(buf, format="png", bbox_inches='tight', dpi=dpi)
            buf.seek(0)
            # convert() decodes the whole PNG, so the result no longer depends on `buf`
            return Image.open(buf).convert("RGB")
    finally:
        plt.close()


def load_file_to_df(file):
    """
    Load an uploaded CSV / Excel file into a DataFrame without raising.

    Accepts:
      - a path given as ``str`` or as any ``os.PathLike`` (e.g. ``pathlib.Path``);
        names ending in ".csv" are read as CSV, anything else as Excel
      - an upload object whose ``.name`` attribute is a path on disk
      - a plain file-like object (CSV is tried first, then Excel)

    Returns (df, None) on success and (None, error_message) on failure.
    """
    if file is None:
        return None, "No file provided."
    try:
        # pathlib-style paths expose only the basename through `.name`, so they
        # must be turned into a full path string before the checks below.
        if isinstance(file, os.PathLike):
            file = os.fsdecode(file)

        if isinstance(file, str):
            if file.lower().endswith(".csv"):
                return pd.read_csv(file), None
            return pd.read_excel(file), None

        # Upload objects usually carry the temporary file's path in `.name`;
        # the extension of that path selects the reader.
        name = getattr(file, "name", None)
        if isinstance(name, str):
            lowered = name.lower()
            reader = None
            if lowered.endswith(".csv"):
                reader = pd.read_csv
            elif lowered.endswith((".xls", ".xlsx")):
                reader = pd.read_excel
            if reader is not None:
                try:
                    return reader(name), None
                except Exception:
                    # The path was not readable: fall back to the open handle itself
                    file.seek(0)
                    return reader(file), None

        # Unknown extension or no name at all: try CSV first, then Excel
        try:
            file.seek(0)
        except Exception:
            pass
        try:
            return pd.read_csv(file), None
        except Exception:
            try:
                if hasattr(file, "seek"):
                    file.seek(0)
                return pd.read_excel(file), None
            except Exception as e:
                return None, f"Could not parse file as CSV or Excel: {e}"
    except Exception as e:
        return None, f"Error loading file: {e}"

In [3]:
# -------- Target detection ----------
# Words that, when they appear as a whole word in a column name, mark it as a likely target.
TARGET_KEYWORDS = [
    "target", "label", "y", "price", "sales", "amount", "revenue", "score", "churn", "status", "outcome", "rating"
]


def _column_name_tokens(column):
    """Split a column label (of any type) into lowercase alphanumeric word tokens."""
    # Break camelCase first ("customerStatus" -> "customer Status"), then split on
    # every non-alphanumeric separator (underscore, space, dash, dot, ...).
    spaced = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", str(column))
    return [tok for tok in re.split(r"[^a-z0-9]+", spaced.lower()) if tok]


def detect_target_column(df):
    """
    Guess which column of ``df`` is the prediction target.

    Strategies, tried in order:
      1) the first column whose name contains a TARGET_KEYWORDS entry as a whole
         word (simple plurals such as "prices" also count). Matching whole words
         keeps the one-letter keyword "y" from matching names like "city";
      2) the numeric column with the best heuristic score (favours many distinct
         values, few missing values and non-zero variance; columns that are
         unique on more than 90% of rows are penalised);
      3) the first object-dtype column with 2 to 20 distinct values.

    Returns (column, reason), or (None, message) when nothing qualifies.
    """
    cols = list(df.columns)
    # 1) keyword matching on whole-word tokens
    for c in cols:
        tokens = _column_name_tokens(c)
        for kw in TARGET_KEYWORDS:
            if kw in tokens or (len(kw) > 1 and kw + "s" in tokens):
                return c, f"Found keyword '{kw}' in column name."
    # 2) numeric heuristic
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    n_rows = len(df)
    scored = []
    for c in num_cols:
        nunique = int(df[c].nunique(dropna=True))
        miss = float(df[c].isna().mean())
        raw_var = df[c].var()
        var = float(raw_var) if pd.notna(raw_var) else 0.0
        score = (min(1, nunique / max(1, n_rows // 10)) * 0.5) + ((1 - miss) * 0.3) + (min(1, var / (var + 1e-9)) * 0.2)
        if nunique / max(1, n_rows) > 0.9:
            # Distinct on more than 90% of rows: heavily down-weighted
            score *= 0.3
        scored.append((c, score, nunique, miss))
    if scored:
        scored.sort(key=lambda x: x[1], reverse=True)
        cand, best_score, best_unique, best_miss = scored[0]
        return cand, f"Numeric candidate selected (score={best_score:.2f}, unique={best_unique}, missing={best_miss:.2f})."
    # 3) categorical small-cardinality
    cat_cols = df.select_dtypes(include='object').columns.tolist()
    for c in cat_cols:
        nunique = df[c].nunique(dropna=True)
        if 2 <= nunique <= 20:
            return c, f"Categorical candidate with {nunique} classes."
    return None, "No obvious target detected. Please choose one."

In [4]:
# -------- LLM interpret helper ----------
def llm_interpret_short(title, details):
    """
    Ask the LLM for a two-sentence interpretation of one chart.

    Parameters:
      title: chart title placed in the prompt.
      details: short description of what the chart shows.

    Returns the model's reply with surrounding whitespace removed. When no
    client is available a "LLM disabled" placeholder echoing the inputs is
    returned; when the request fails, an "LLM failed" message is returned.
    """
    if not client:
        return f"LLM disabled. {title}: {details}"

    request_text = (
        f"You are a data analyst. Given the chart/title: {title} and details: {details}\n"
        "Provide a concise 2-sentence interpretation highlighting trends, anomalies, or predictive hints."
    )
    chat = [{"role": "user", "content": request_text}]
    try:
        reply = client.chat.completions.create(model=MODEL_NAME, messages=chat, max_tokens=150)
        answer = reply.choices[0].message["content"]
        return answer.strip()
    except Exception as err:
        return f"LLM failed: {err}"


def llm_final_summary(df, task_type, target, brief_visual_interps):
    """
    Ask the LLM for a four-part (descriptive / diagnostic / predictive /
    prescriptive) report on the dataset.

    Parameters:
      df: the dataset; only its first 12 column labels are sent to the model.
      task_type: label describing the analysis (inserted into the prompt as-is).
      target: name of the target column, or a description of it.
      brief_visual_interps: newline-separated per-chart interpretations.

    Returns the model's reply stripped of surrounding whitespace, "LLM disabled."
    when no client is available, or an "LLM failed" message on request errors.
    """
    if not client:
        return "LLM disabled."
    # Column labels are not always strings (e.g. integer headers read from a
    # spreadsheet), so convert each one before joining.
    column_list = ', '.join(str(col) for col in df.columns.tolist()[:12])
    prompt = f"""
You are an experienced data scientist. The dataset columns: {column_list}.
Task type: {task_type}. Target: {target}.
Visual interpretations:
{brief_visual_interps}

Produce a concise, structured 4-part analytics report:
1) Descriptive
2) Diagnostic
3) Predictive
4) Prescriptive
Keep it short and specific.
"""
    try:
        resp = client.chat.completions.create(model=MODEL_NAME,
                                             messages=[{"role": "user", "content": prompt}], max_tokens=400)
        return resp.choices[0].message["content"].strip()
    except Exception as e:
        return f"LLM failed: {e}"

In [5]:
# -------- Auto-visual selection ----------
def select_visuals_for_dataset(df, target=None):
    """
    Render a small set of charts for ``df``, each paired with an LLM caption.

    When ``target`` names a column of ``df``:
      - numeric target: a histogram of the target, then scatter plots against up
        to three numeric predictors with the highest absolute correlation;
      - any other target: a bar chart of its ten most frequent classes, then box
        plots for up to three numeric predictors ranked by mutual information.

    When no target chart was produced, a general fallback is chosen from the
    column types: correlation heatmap plus two histograms (3+ numeric columns),
    box plot plus category counts (numeric and object columns), category counts
    only (object columns), or a trend line (datetime and numeric columns).

    Plotting is best-effort: a failure stops the current group of charts, keeps
    the charts already rendered, closes the dangling figure and emits a
    RuntimeWarning describing the error.

    Returns (imgs, interps): the rendered PIL images and their captions. When
    nothing could be drawn, ``imgs`` is empty and ``interps`` holds one message.
    """
    imgs, interps = [], []
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    cat_cols = df.select_dtypes(include='object').columns.tolist()
    date_cols = df.select_dtypes(include=["datetime64", "datetime64[ns]"]).columns.tolist()

    def _record(title, details):
        # Snapshot (and close) the current figure, then caption it.
        imgs.append(fig_to_pil())
        interps.append(llm_interpret_short(title, details))

    def _draw_hist(col, title):
        # Histogram with KDE overlay of one column, missing values dropped.
        plt.figure(figsize=(5, 3))
        sns.histplot(df[col].dropna(), kde=True)
        plt.title(title)

    def _draw_top_counts(col, title):
        # Horizontal bar chart of the ten most frequent values of one column.
        plt.figure(figsize=(5, 3))
        vc = df[col].value_counts().head(10)
        sns.barplot(x=vc.values, y=vc.index)
        plt.title(title)

    # Target-focused visuals
    if target and target in df.columns:
        try:
            if pd.api.types.is_numeric_dtype(df[target]):
                _draw_hist(target, f"Target distribution: {target}")
                _record(f"Target distribution: {target}", f"Histogram of {target}")
                # The target must be excluded from the predictors: selecting it
                # twice yields duplicate columns and breaks the ranking below.
                predictors = [c for c in num_cols if c != target]
                if predictors:
                    corr = (
                        df[[target] + predictors].corr()[target].abs()
                        .drop(labels=[target], errors='ignore')
                        .sort_values(ascending=False)
                    )
                    for p in corr.head(3).index.tolist():
                        plt.figure(figsize=(5, 4))
                        sns.scatterplot(data=df, x=p, y=target)
                        plt.title(f"{target} vs {p}")
                        _record(f"{target} vs {p}", f"Scatter and linear tendency between {p} and {target}.")
            else:
                _draw_top_counts(target, f"Target class distribution: {target}")
                _record(f"Target class distribution: {target}", "Counts per class")
                num_preds = [c for c in num_cols if c != target]
                if num_preds:
                    df_drop = df[[target] + num_preds].dropna()
                    if not df_drop.empty:
                        X = df_drop[num_preds]
                        y = pd.factorize(df_drop[target])[0]
                        # Fixed seed so the predictor ranking is reproducible between runs
                        mi = mutual_info_classif(X, y, discrete_features=False, random_state=0)
                        mi_series = pd.Series(mi, index=num_preds).sort_values(ascending=False)
                        for p in mi_series.head(3).index.tolist():
                            plt.figure(figsize=(5, 4))
                            sns.boxplot(x=df[target].astype(str), y=df[p])
                            plt.title(f"{p} by {target}")
                            _record(f"{p} by {target}", f"Distribution of {p} across classes of {target}.")
        except Exception as exc:
            # Stay resilient to plotting failures, but do not leak the half-drawn figure
            plt.close()
            warnings.warn(f"Target-focused visuals stopped early: {exc}", RuntimeWarning)

    # General EDA if no target visuals produced
    if not imgs:
        try:
            if len(num_cols) >= 3:
                plt.figure(figsize=(6, 5))
                sns.heatmap(df[num_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm")
                plt.title("Correlation heatmap")
                _record("Correlation heatmap", "Pairwise correlations among numeric variables.")
                for c in num_cols[:2]:
                    _draw_hist(c, f"Distribution: {c}")
                    _record(f"Distribution: {c}", f"Histogram of {c}.")
            elif num_cols and cat_cols:
                plt.figure(figsize=(6, 4))
                sns.boxplot(x=cat_cols[0], y=num_cols[0], data=df)
                plt.title(f"{num_cols[0]} by {cat_cols[0]}")
                _record(f"{num_cols[0]} by {cat_cols[0]}", "Boxplots per category.")
                _draw_top_counts(cat_cols[0], f"Counts: {cat_cols[0]}")
                _record(f"Counts: {cat_cols[0]}", "Category frequency.")
            elif cat_cols:
                _draw_top_counts(cat_cols[0], f"Counts: {cat_cols[0]}")
                _record(f"Counts: {cat_cols[0]}", "Category frequency.")
            elif date_cols and num_cols:
                d = date_cols[0]
                df_sorted = df.sort_values(d)
                plt.figure(figsize=(7, 3))
                plt.plot(df_sorted[d], df_sorted[num_cols[0]])
                plt.title(f"Trend: {num_cols[0]} over {d}")
                _record(f"Trend {num_cols[0]} over {d}", "Time trend line.")
        except Exception as exc:
            plt.close()
            warnings.warn(f"General EDA visuals stopped early: {exc}", RuntimeWarning)

    if not imgs:
        interps.append("No visuals generated — dataset may be empty or unsupported.")
    return imgs, interps

In [6]:
# -------- Quick modeling ----------
def run_quick_regression(df, target):
    """
    Fit a small linear regression as a quick predictive demo.

    The (up to) three numeric columns most correlated with ``target`` in
    absolute value are used as predictors; rows with missing values in those
    columns or in the target are dropped, and 25% of the remaining rows are
    held out for scoring.

    Returns a dict with keys "notes" (why the demo was skipped, else ""),
    "metrics" (r2, mae, n_train, n_test), "images" and "interps" (one scatter
    plot of the target against the strongest predictor, with its caption).
    """
    outcome = {"notes": "", "metrics": {}, "images": [], "interps": []}

    numeric_names = df.select_dtypes(include=np.number).columns.tolist()
    candidates = [name for name in numeric_names if name != target]
    if target not in df.columns or not candidates:
        outcome["notes"] = "Not enough numeric predictors for regression."
        return outcome

    # Rank candidate predictors by absolute correlation with the target
    strength = df[[target] + candidates].corr()[target].abs()
    strength = strength.drop(labels=[target], errors='ignore').sort_values(ascending=False)
    chosen = strength.head(3).index.tolist()

    complete_rows = df[[target] + chosen].dropna()
    if complete_rows.shape[0] < 10:
        outcome["notes"] = "Too few rows to run regression robustly."
        return outcome

    features = complete_rows[chosen]
    response = complete_rows[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, response, test_size=0.25, random_state=0
    )
    fitted = LinearRegression().fit(X_train, y_train)
    predicted = fitted.predict(X_test)
    outcome["metrics"] = {
        "r2": r2_score(y_test, predicted),
        "mae": mean_absolute_error(y_test, predicted),
        "n_train": len(X_train),
        "n_test": len(X_test),
    }

    # The chart is optional: a plotting failure must not discard the metrics
    try:
        strongest = chosen[0]
        plt.figure(figsize=(5, 4))
        sns.scatterplot(x=complete_rows[strongest], y=complete_rows[target])
        plt.title(f"{target} vs {strongest}")
        outcome["images"].append(fig_to_pil())
        outcome["interps"].append(
            llm_interpret_short(f"{target} vs {strongest}", "Scatter used in regression.")
        )
    except Exception:
        pass
    return outcome


def run_quick_classification(df, target):
    """
    Fit a small random-forest classifier as a quick predictive demo.

    Every numeric column other than ``target`` is used as a predictor; rows
    with any missing value in those columns or in the target are dropped, the
    target is integer-encoded, and 25% of the rows are held out for scoring.

    Returns a dict with keys "notes" (why the demo was skipped, else ""),
    "metrics" (accuracy, f1_weighted, n_train, n_test), "images" and "interps"
    (a bar chart of the ten largest feature importances, with its caption).
    """
    outcome = {"notes": "", "metrics": {}, "images": [], "interps": []}

    if target not in df.columns:
        outcome["notes"] = "Target not in dataframe."
        return outcome

    numeric_names = df.select_dtypes(include=np.number).columns.tolist()
    features_used = [name for name in numeric_names if name != target]

    complete_rows = df[[target] + features_used].dropna()
    if complete_rows.shape[0] < 30 or not features_used:
        outcome["notes"] = "Too few rows or predictors for classification demo."
        return outcome

    features = complete_rows[features_used]
    # factorize() returns (codes, uniques); only the integer codes are needed
    encoded = pd.factorize(complete_rows[target])[0]
    X_train, X_test, y_train, y_test = train_test_split(
        features, encoded, test_size=0.25, random_state=0
    )
    forest = RandomForestClassifier(n_estimators=50, random_state=0, max_depth=6)
    forest = forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)
    outcome["metrics"] = {
        "accuracy": accuracy_score(y_test, predicted),
        "f1_weighted": f1_score(y_test, predicted, average='weighted'),
        "n_train": len(X_train),
        "n_test": len(X_test),
    }

    # The chart is optional: a plotting failure must not discard the metrics
    try:
        ranking = pd.Series(forest.feature_importances_, index=features_used)
        ranking = ranking.sort_values(ascending=False).head(10)
        plt.figure(figsize=(6, 4))
        sns.barplot(x=ranking.values, y=ranking.index)
        plt.title("Top feature importances")
        outcome["images"].append(fig_to_pil())
        outcome["interps"].append(
            llm_interpret_short("Feature importances", "Relative importance of numeric predictors.")
        )
    except Exception:
        pass
    return outcome

In [7]:
# --- Handlers (fixed to refresh dropdown and clear previous outputs)
def handle_upload(file):
    """
    Load the uploaded file, guess its target column and reset the result widgets.

    Returns a 6-tuple matching the outputs wired to the Detect Target button:
      - preview_text (str): first five rows as CSV, or the loader's error message
      - dropdown update (gr.update): "All" plus every column, with the detected
        target preselected; hidden and emptied when loading failed
      - detection message (str)
      - gallery clear (gr.update)
      - interp_md clear (str)
      - final_md clear (str)
    The last three entries clear visuals/text left over from a previous run.
    """
    df, err = load_file_to_df(file)
    if err:
        # Clear dropdown + gallery + texts to avoid stale references causing errors later
        return (
            f"{err}",
            gr.update(choices=[], value=None, visible=False),
            "No data loaded.",
            gr.update(value=[]),  # clear gallery
            "",  # interp_md
            ""   # final_md
        )

    cand, reason = detect_target_column(df)
    cols = ["All"] + list(df.columns)
    # Compare against None explicitly: a valid column label can be falsy (0 or "").
    has_candidate = cand is not None
    msg = f"Detected target: {cand} — {reason}" if has_candidate else "No target automatically detected. Please select one."
    preview_text = df.head(5).to_csv(index=False)

    # Replace the dropdown choices and preselect the candidate if there is one, else "All".
    selected_val = cand if (has_candidate and cand in cols) else "All"
    return (
        preview_text,
        gr.update(choices=cols, value=selected_val, visible=True),
        msg,
        gr.update(value=[]),  # clear gallery
        "",  # clear interp_md
        ""   # clear final_md
    )

In [8]:
def run_targeted_eda(file, selected_target, progress=gr.Progress()):
    """
    Main runner behind the "Run Targeted Analysis" button.

    Parameters:
      file: the uploaded file, as accepted by ``load_file_to_df``.
      selected_target: a column name, or "All" / None for a general EDA.
      progress: Gradio progress tracker, updated at each stage.

    Returns (images, interp_md, final_md): the charts for the gallery, the
    per-visual interpretations (plus quick-model metrics and notes when a target
    is selected) as Markdown, and the LLM's final summary. On a loading error
    or an unknown target, ``images`` is None, ``interp_md`` explains the
    problem and ``final_md`` is empty.
    """
    progress(0, desc="Reading dataset...")
    df, err = load_file_to_df(file)
    if err:
        # Pass the loader's explanation on so the user can see what went wrong
        return None, f"File loading failed: {err}", ""
    progress(0.25, desc="Selecting visuals...")
    # If "All" or no selection -> general EDA
    if selected_target == "All" or selected_target is None:
        imgs, interp_list = select_visuals_for_dataset(df, target=None)
        progress(0.6, desc="Summarizing insights...")
        summary = llm_final_summary(df, "General EDA", "All columns", "\n".join(interp_list[:8]))
        md_text = "\n\n".join(f"**Visual {i+1}:** {t}" for i, t in enumerate(interp_list))
        progress(1.0, desc="Done")
        return imgs, md_text, summary

    # A stale selection (e.g. a new file uploaded without re-running detection)
    # cannot be analysed. This handler has no dropdown output, so the user is
    # pointed at the button that rebuilds the choices.
    if selected_target not in df.columns:
        return None, (
            f"Selected target '{selected_target}' not found in dataset. "
            "Click 'Detect Target' to refresh the dropdown, then select again."
        ), ""

    imgs, interp_list = select_visuals_for_dataset(df, target=selected_target)
    # determine task type safely
    try:
        is_numeric = pd.api.types.is_numeric_dtype(df[selected_target])
    except Exception:
        is_numeric = False
    task_type = "Regression" if is_numeric else "Classification"
    progress(0.6, desc=f"Running quick {task_type} demo...")
    model_res = run_quick_regression(df, selected_target) if task_type == "Regression" else run_quick_classification(df, selected_target)
    brief_visual_interps = "\n".join(interp_list[:8])
    progress(0.9, desc="Generating final summary...")
    final_summary = llm_final_summary(df, task_type, selected_target, brief_visual_interps)
    md_parts = [f"**Visual {i+1}:** {t}" for i, t in enumerate(interp_list)]
    if model_res.get("metrics"):
        md_parts.append("\n**Quick model metrics:**")
        md_parts += [f"- {k}: {v}" for k, v in model_res["metrics"].items()]
    if model_res.get("notes"):
        md_parts.append(f"\n**Notes:** {model_res['notes']}")
    progress(1.0, desc="Done")
    return imgs + model_res.get("images", []), "\n\n".join(md_parts), final_summary

In [9]:
# -------- Gradio UI ----------
# Layout: upload + preview row, target detection controls, then the result widgets.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Auto-EDA v4 — Target Detection & Task-aware Analytics")
    # `client` is falsy when the start-up connection test failed
    gr.Markdown("Hugging Face LLM: " + ("connected" if client else "disabled"))

    with gr.Row():
        file_input = gr.File(label="Upload CSV / Excel")
        preview = gr.Textbox(label="CSV preview (first rows)", interactive=False)

    detect_btn = gr.Button("Detect Target")
    # Hidden until a file has been loaded and its columns are known
    target_dropdown = gr.Dropdown(label="Select target column", choices=[], visible=False)
    detect_msg = gr.Markdown(label="Detection message")
    run_btn = gr.Button("Run Targeted Analysis", variant="primary")

    gallery = gr.Gallery(label="Generated Visuals", columns=2)
    interp_md = gr.Markdown(label="Per-visual auto-interpretations")
    final_md = gr.Markdown(label="Final analytics summary")

    # ---------- Functions for interactivity ----------
    def update_target_caption(selected_target):
        """Return the caption shown after the user picks a target in the dropdown."""
        if selected_target == "All":
            return "General dataset analysis mode (All columns selected)."
        elif selected_target:
            return f"Target manually selected: **{selected_target}**"
        else:
            return "No target selected."

    # Detect target + refresh dropdown + clear visuals each time Detect Target is clicked
    detect_btn.click(
        fn=handle_upload,
        inputs=[file_input],
        outputs=[preview, target_dropdown, detect_msg, gallery, interp_md, final_md]
    )

    # Refresh the caption only when the *user* changes the target. `.change` also
    # fires when handle_upload sets the dropdown value, which would immediately
    # replace the detection message with "Target manually selected".
    target_dropdown.input(
        fn=update_target_caption,
        inputs=target_dropdown,
        outputs=detect_msg
    )

    # Run analysis button — runs EDA, visuals, interpretation, and summary
    run_btn.click(
        fn=run_targeted_eda,
        inputs=[file_input, target_dropdown],
        outputs=[gallery, interp_md, final_md]
    )

# share=True also serves the app through a temporary public URL
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0f16fe709da182a49b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


