# 📊 Multi-Agent EDA System with Gemini and AutoGen
This notebook demonstrates a multi-agent exploratory data analysis (EDA) system built with Google Gemini and AutoGen agents.

In [None]:
%pip install -q pyautogen google-generativeai pandas matplotlib seaborn autogen


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from autogen import AssistantAgent, UserProxyAgent
import google.generativeai as genai


In [None]:
class GeminiLLM:
    """Gemini credentials plus the model name, exposed as an LLM config entry."""

    def __init__(self, api_key, model="gemini-1.5-flash"):
        """Configure the ``genai`` module and remember the key and model.

        Args:
            api_key: API key passed to ``genai.configure`` and echoed by
                :meth:`config`.
            model: Model identifier reported by :meth:`config`.
        """
        genai.configure(api_key=api_key)
        self.api_key, self.model = api_key, model

    def config(self):
        """Return a dict with ``model``, ``api_key`` and ``api_type`` keys.

        ``api_type`` is always the string ``"google"``.
        """
        return dict(model=self.model, api_key=self.api_key, api_type="google")


In [None]:
class EDAAgentSystem:
    """Build the EDA agents and hold the dataset they work on."""

    def __init__(self, api_key):
        """Create the LLM wrapper and the agents.

        Args:
            api_key: API key forwarded to ``GeminiLLM``.
        """
        self.llm = GeminiLLM(api_key)
        # Filled in by generate_data(); stays None until then.
        self.dataset = None
        self._init_agents()

    def _init_agents(self):
        """Create the agents and store them in ``self.agents`` keyed by role."""
        # One shared llm_config for every assistant agent.
        config = {"config_list": [self.llm.config()], "temperature": 0.2}

        self.agents = {
            "data_cleaner": AssistantAgent(
                name="DataCleanerAgent",
                llm_config=config,
                system_message="Clean data, handle NaNs and outliers."
            ),
            "eda_analyst": AssistantAgent(
                name="EDAAnalystAgent",
                llm_config=config,
                system_message="Perform statistical summaries and distributions."
            ),
            "report_writer": AssistantAgent(
                name="ReportWriterAgent",
                llm_config=config,
                system_message="Compile the EDA results into a summary report."
            ),
            "user_proxy": UserProxyAgent(
                name="UserAgent",
                human_input_mode="NEVER",
                system_message="Represent the user and coordinate EDA agents."
            )
        }

    def generate_data(self, seed=42):
        """Create a 200-row synthetic dataset and store it in ``self.dataset``.

        Args:
            seed: Value passed to ``np.random.seed`` so the data is
                reproducible. Note this reseeds NumPy's global generator.

        Returns:
            The generated ``DataFrame`` with ``age``, ``income`` and
            ``satisfaction`` columns.
        """
        np.random.seed(seed)
        df = pd.DataFrame({
            "age": np.random.randint(20, 60, 200),
            "income": np.random.normal(50000, 12000, 200),
            "satisfaction": np.random.uniform(1, 10, 200)
        })
        # Inject missing incomes. The row labels are drawn with replacement
        # (np.random.choice default), so fewer than 20 distinct rows may be hit.
        df.loc[np.random.choice(df.index, 20), "income"] = np.nan
        # Inject outliers: 15 lies outside the 1-10 range sampled above.
        df.loc[np.random.choice(df.index, 10), "satisfaction"] = 15
        self.dataset = df
        print("✅ Dataset created with missing values and outliers.")
        return df

    # Defined inside the class body so that it is bound as a method; an
    # indented ``def`` in a cell of its own is not attached to the class.
    def summarize_data(self):
        """Summarize the current dataset.

        Returns:
            A dict with three entries: ``"Shape"`` (the DataFrame shape),
            ``"Missing Values"`` (null count per column) and
            ``"Description"`` (``DataFrame.describe()`` as a nested dict).

        Raises:
            ValueError: If no dataset has been stored yet.
        """
        if self.dataset is None:
            raise ValueError("Dataset not loaded.")
        summary = {
            "Shape": self.dataset.shape,
            "Missing Values": self.dataset.isnull().sum().to_dict(),
            "Description": self.dataset.describe().to_dict()
        }
        return summary


In [None]:
# Instantiate the system and summarize a freshly generated dataset.
# The key is taken from the GOOGLE_API_KEY environment variable when it is set,
# so a real secret does not have to be pasted into the notebook. The original
# placeholder is kept as the fallback, so behavior is unchanged when the
# variable is unset.
api_key = os.environ.get("GOOGLE_API_KEY", "your_gemini_api_key_here")
eda = EDAAgentSystem(api_key)
df = eda.generate_data()
eda_summary = eda.summarize_data()
# Bare expression on the last line so the notebook displays the summary.
eda_summary


In [None]:
# Histogram with a KDE overlay of the non-missing income values.
# histplot returns the Axes it drew on, so the labels are set on that object.
ax = sns.histplot(df['income'].dropna(), kde=True, color='teal', bins=30)
ax.set(title="Income Distribution", xlabel="Income", ylabel="Frequency")
ax.grid(True)
plt.show()
