<a href="https://colab.research.google.com/github/colinmcnamara/austin_langchain/blob/main/labs/LangChain_107/langgraph_ai_data_scientist_report_writer/langgraph_ai_data_scientist_report_writer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LangGraph AI Data Scientist Report Writer

In [None]:
%%writefile requirements.txt
# LangChain stack
langchain
langchain_experimental
langchain-openai
langgraph
typing-extensions
cohere
# Console colouring and async file writes used by the utility cell
colorama
aiofiles
# Report export: Markdown -> PDF / HTML -> DOCX
md2pdf
mistune
# `from docx import Document` is provided by python-docx; the PyPI package
# named `docx` is a different, legacy project that installs into the same
# import name.
python-docx
htmldocx

Writing requirements.txt


In [None]:
# First, we install langchain and our other dependencies/libraries.
# The -q flag is used to quiet install the packages (not show logs as packages are being installed)
# The -r flag is used to specify the path to the requirements.txt file.
!pip install -q -r requirements.txt

In [None]:
import os
from getpass import getpass

# Load the OpenAI API key into the environment.
# In Colab the key is read from the notebook's secret store under the name
# OPENAI_API_KEY. When that is not possible (not running in Colab, secret
# missing, access not granted) fall back to an interactive prompt so the key
# never has to be written into the notebook.
try:
    from google.colab import userdata
    openai_api_key = userdata.get('OPENAI_API_KEY')
except Exception as exc:
    print(f"Could not read OPENAI_API_KEY from Colab secrets ({exc!r}); prompting instead.")
    openai_api_key = None

os.environ["OPENAI_API_KEY"] = openai_api_key or getpass("Enter your OpenAI API key: ")

## Utils

In [None]:
%%writefile pdf_styles.css

body {
    font-family: 'Libre Baskerville', serif;
    font-size: 12pt; /* standard size for academic papers */
    line-height: 1.6; /* for readability */
    color: #333; /* softer on the eyes than black */
    background-color: #fff; /* white background */
    margin: 0;
    padding: 0;
}

h1, h2, h3, h4, h5, h6 {
    font-family: 'Libre Baskerville', serif;
    color: #000; /* darker than the body text */
    margin-top: 1em; /* space above headers */
}

h1 {
    font-size: 2em; /* make h1 twice the size of the body text */
}

h2 {
    font-size: 1.5em;
}

/* Add some space between paragraphs */
p {
    margin-bottom: 1em;
}

/* Style for blockquotes, often used in academic papers */
blockquote {
    font-style: italic;
    margin: 1em 0;
    padding: 1em;
    background-color: #f9f9f9; /* a light grey background */
}

/* You might want to style tables, figures, etc. too */
table {
    border-collapse: collapse;
    width: 100%;
}

table, th, td {
    border: 1px solid #ddd;
    text-align: left;
    padding: 8px;
}

th {
    background-color: #f2f2f2;
    color: black;
}

Writing pdf_styles.css


In [None]:
import aiofiles
import urllib
import shutil
import uuid
import mistune
import datetime
from md2pdf.core import md2pdf
from docx import Document
from htmldocx import HtmlToDocx
from colorama import Fore, Style
from enum import Enum
from IPython.display import HTML, display
from langchain.adapters.openai import convert_openai_messages
from langchain_openai import ChatOpenAI


def set_css():
  """Emit a <style> element that makes <pre> blocks wrap long lines."""
  wrap_pre_markup = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  styled = HTML(wrap_pre_markup)
  display(styled)

# Run set_css on IPython's 'pre_run_cell' event so the style is emitted for
# every executed cell.
get_ipython().events.register('pre_run_cell', set_css)

def call_model(prompt: list, model: str, max_retries: int = 2, response_format: str = None) -> str:
    """Send OpenAI-style chat messages to a ChatOpenAI model and return the reply text.

    Args:
        prompt: Messages in OpenAI dict form; converted with
            ``convert_openai_messages`` before the call.
        model: Model name handed to ``ChatOpenAI``.
        max_retries: Retry count handed to ``ChatOpenAI``.
        response_format: ``'json'`` requests a ``json_object`` response;
            any other value sends no extra model kwargs.

    Returns:
        The ``content`` attribute of the model response.
    """
    # Only the literal 'json' switches on JSON mode.
    if response_format == 'json':
        extra_kwargs = {"response_format": {"type": "json_object"}}
    else:
        extra_kwargs = {}

    messages = convert_openai_messages(prompt)
    chat = ChatOpenAI(model=model, max_retries=max_retries, model_kwargs=extra_kwargs)
    reply = chat.invoke(messages)
    return reply.content

async def write_to_file(filename: str, text: str) -> None:
    """Write ``text`` to ``filename`` as UTF-8 using aiofiles.

    Args:
        filename (str): Destination path; opened in "w" mode, so an existing
            file is overwritten.
        text (str): Content to write.
    """
    # Round-trip through bytes so anything that cannot be encoded as UTF-8
    # is substituted (errors='replace') instead of raising during the write.
    encoded = text.encode('utf-8', errors='replace')
    safe_text = str(encoded, 'utf-8')

    async with aiofiles.open(filename, "w", encoding='utf-8') as handle:
        await handle.write(safe_text)


async def write_text_to_md(text: str, path: str) -> str:
    """Save text as a Markdown file with a random name inside ``path``.

    Args:
        text (str): Content of the Markdown file.
        path (str): Directory the file is written into.

    Returns:
        str: Path of the written file, ``<path>/<uuid4 hex>.md``.
    """
    report_id = uuid.uuid4().hex
    md_path = f"{path}/{report_id}.md"
    await write_to_file(md_path, text)
    print(f"Report written to {md_path}")
    return md_path


async def write_md_to_pdf(text: str, path: str, css_file_path: str = "/content/pdf_styles.css") -> str:
    """Convert Markdown text to a PDF file and return its URL-quoted path.

    Args:
        text (str): Markdown text to convert.
        path (str): Directory the PDF is written into.
        css_file_path (str): Stylesheet handed to ``md2pdf``. Defaults to the
            location where the notebook writes ``pdf_styles.css`` in Colab.

    Returns:
        str: ``urllib.parse.quote`` of ``<path>/<uuid4 hex>.pdf``, or an empty
        string when the conversion raised.
    """
    # Imported here because a bare ``import urllib`` does not by itself load
    # the ``urllib.parse`` submodule.
    from urllib.parse import quote

    task = uuid.uuid4().hex
    file_path = f"{path}/{task}.pdf"

    try:
        md2pdf(file_path,
               md_content=text,
               css_file_path=css_file_path,
               base_url=None)
    except Exception as e:
        # Best effort: report the failure and signal it with an empty path.
        print(f"Error in converting Markdown to PDF: {e}")
        return ""

    print(f"Report written to {file_path}")
    return quote(file_path)


async def write_md_to_word(text: str, path: str) -> str:
    """Convert Markdown text to a DOCX file and return its URL-quoted path.

    Args:
        text (str): Markdown text to convert.
        path (str): Directory the DOCX is written into.

    Returns:
        str: ``urllib.parse.quote`` of ``<path>/<uuid4 hex>.docx``, or an
        empty string when the conversion raised.
    """
    # Imported here because a bare ``import urllib`` does not by itself load
    # the ``urllib.parse`` submodule.
    from urllib.parse import quote

    task = uuid.uuid4().hex
    file_path = f"{path}/{task}.docx"

    try:
        # Markdown -> HTML -> python-docx document
        html = mistune.html(text)
        doc = Document()
        HtmlToDocx().add_html_to_document(html, doc)
        doc.save(file_path)
    except Exception as e:
        # Best effort: report the failure and signal it with an empty path.
        print(f"Error in converting Markdown to DOCX: {e}")
        return ""

    print(f"Report written to {file_path}")
    # file_path already ends in ".docx"; return the path of the file that was
    # actually saved rather than appending the extension a second time.
    return quote(file_path)

class AgentColor(Enum):
    """Colorama foreground colour used for each agent's console output.

    Members are looked up by name (``AgentColor[agent]``) in
    ``print_agent_output``; the value is the colour prefix that is printed.
    """
    BRAIN = Fore.LIGHTBLUE_EX
    DATA_ANALYST = Fore.CYAN
    WRITING_PLANNER = Fore.LIGHTGREEN_EX
    WRITER = Fore.MAGENTA
    # Same value as DATA_ANALYST: Enum treats a repeated value as an alias, so
    # AgentColor.PUBLISHER is AgentColor.DATA_ANALYST. Lookup by name still
    # works and yields Fore.CYAN.
    PUBLISHER = Fore.CYAN
    QUALITY_PLANNER = Fore.LIGHTWHITE_EX
    DATA_REVIEWER = Fore.LIGHTYELLOW_EX

def print_agent_output(output:str, agent: str="BRAIN"):
    """Print ``output`` prefixed with the agent name, in that agent's colour.

    Args:
        output: Text to print.
        agent: Name of an ``AgentColor`` member; an unknown name raises
            ``KeyError`` from the enum lookup.
    """
    color_prefix = AgentColor[agent].value
    line = f"{color_prefix}{agent}: {output}{Style.RESET_ALL}"
    print(line)

In [None]:
# Test the function with different values
print_agent_output("Hello, world!")
print_agent_output("This is a test.", agent="DATA_ANALYST")

[94mBRAIN: Hello, world![0m
[36mDATA_ANALYST: This is a test.[0m


## Memory

In [None]:
from typing import TypedDict, List

class BrainState(TypedDict, total=False):
    """Shared state passed between the agents.

    Declared with ``total=False`` because each stage only fills in its own
    key: the notebook builds states holding just ``data_analyst_instructions``
    or just ``data_points``.
    """
    # Plain-text instructions, one analysis request per entry.
    data_analyst_instructions: List[str]
    # One dict per (instruction, file) result produced by the data analyst.
    data_points: List[dict]
    # Final report produced by the writer.
    report_content: dict


## Data Analyst Agent

In [None]:
import os
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, Any, List
from langchain_core.runnables import RunnableLambda, RunnableParallel
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain_core.output_parsers import JsonOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel

# Schema handed to JsonOutputParser (pydantic_object=AnalysisResult) for one
# analysis result. The field names mirror the keys of the JSON template built
# in DataAnalystAgent._format_prompt.
class AnalysisResult(BaseModel):
    # NOTE(review): the prompt template describes key_metrics as a string,
    # while it is declared as a dict here — confirm which is intended.
    key_metrics: dict
    description: str
    # File path of the analysed CSV, per the prompt template.
    source: str
    generated_insight: str
    data_type: str
    # Per the prompt template: {"image_path": ..., "image_caption": ...};
    # both are empty strings when no visualization was requested.
    image: dict
    instructions_given: str
    code_used: str

class DataAnalystAgent:
    """Runs analysis instructions against CSV files through pandas DataFrame agents.

    One DataFrame agent is created per input file. Every instruction is run
    against every file, and each answer is converted into a JSON "data point".
    """

    def __init__(self, task: dict, agent_dir: str):
        """Store the task settings and build the LLM and JSON parser.

        Args:
            task: Task configuration; reads ``file_inputs``, ``user_query``
                and ``model`` (default ``"gpt-3.5-turbo-0125"``).
            agent_dir: Directory used when building image paths for
                visualization instructions.
        """
        self.task = task
        self.file_inputs = task.get("file_inputs", [])
        self.user_query = task.get("user_query", "")
        self.output_dir = agent_dir
        self.llm = ChatOpenAI(temperature=0, model=task.get("model", "gpt-3.5-turbo-0125"))
        self.parser = JsonOutputParser(pydantic_object=AnalysisResult)
        # Filled by run_parallel(); defined here so the attribute always exists.
        self.df_agents = {}

    def _create_agents(self) -> Dict[str, Any]:
        """Load every input CSV and return a DataFrame agent per file path."""
        df_agents = {}
        for file_path in self.file_inputs:
            df = pd.read_csv(file_path)
            # NOTE(review): newer langchain_experimental releases may require
            # allow_dangerous_code=True here — confirm against the installed version.
            df_agent = create_pandas_dataframe_agent(
                self.llm,
                df,
                verbose=True,
                agent_executor_kwargs={"handle_parsing_errors": True},
                agent_type="tool-calling",
            )
            df_agents[file_path] = df_agent

        print_agent_output(f"Created {len(df_agents)} agents for files: {', '.join(df_agents.keys())}\n",
                           "DATA_ANALYST")
        return df_agents

    def _format_prompt(self, instruction: str, file_path: str) -> str:
        """Build the prompt asking the DataFrame agent for a JSON-formatted answer.

        Instructions containing "visualize", "plot" or "chart" (case-insensitive)
        additionally ask the agent to save an image under ``self.output_dir``.

        Args:
            instruction: The analysis instruction in plain text.
            file_path: Path of the CSV being analysed; reported as ``source``.

        Returns:
            The complete prompt text.
        """
        # Build the image file name from the instruction, mapping every
        # character that is not a word character, dot or hyphen to "_" so
        # that e.g. a "/" in the instruction cannot point into a
        # non-existent sub-directory. Spaces still map 1:1 to "_".
        safe_instruction = re.sub(r"[^\w.-]", "_", instruction)
        image_path = os.path.join(self.output_dir, f"{os.path.basename(file_path)}_{safe_instruction}.png")
        image_caption = f"Generated image for instruction: {instruction}"

        lowered = instruction.lower()
        if any(keyword in lowered for keyword in ("visualize", "plot", "chart")):
            visualization_instructions = (
                f"Generate visualizations as needed, save the image at the path: {image_path} "
                f"and use the caption: {image_caption}. Make sure the code to save the image is included in the 'code_used' key as a string.\n"
            )
            image_key_content = f'{{"image_path": "{image_path}", "image_caption": "{image_caption}"}}'
        else:
            visualization_instructions = (
                "Do not create or save any images. Ensure the image key returns empty strings.\n"
            )
            image_key_content = '{"image_path": "", "image_caption": ""}'

        prompt_text = (
            f"Please analyze the data and provide the output in the following JSON format:\n"
            f"{{\n"
            f'  "key_metrics": "Extracted key metrics from the data analysis.",\n'
            f'  "description": "Description of the data point extracted from the output.",\n'
            # The path is quoted so the example shown to the model is valid JSON.
            f'  "source": "{file_path}",\n'
            f'  "generated_insight": "Insight generated from the data analysis.",\n'
            f'  "data_type": "Type of the data analyzed (e.g., numerical, categorical).",\n'
            f'  "image": {image_key_content},\n'
            f'  "instructions_given": "Instructions given for the data analysis.",\n'
            f'  "code_used": "Python code used to generate the data analysis results as a string."\n'
            f"}}\n\n"
            f"Follow these instructions:\n"
            f"{instruction}\n\n"
            f"{visualization_instructions}\n"
            f"Ensure the 'code_used' key in the JSON output contains the exact Python code used as a string.\n"
            f"ONLY return valid JSON as your output. Make sure your JSON output is properly formatted and valid."
        )

        return prompt_text

    def _run_instruction(self, file_path: str, instruction: str) -> Dict[str, Any]:
        """Run one instruction on one file and return the parsed JSON result.

        The DataFrame agent's response goes through a second LLM call that
        converts it to JSON, which ``self.parser`` then parses.
        """
        df_agent = self.df_agents[file_path]
        formatted_prompt = self._format_prompt(instruction, file_path)

        dataframe_response = df_agent.invoke(formatted_prompt)

        json_conversion_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "Convert the following DataFrame description to JSON format without any additional text."),
                ("human", "{query}")
            ]
        )

        # NOTE(review): the whole agent response object is interpolated into
        # the prompt as-is; presumably it is a dict holding both the input and
        # the output — confirm whether only the output should be sent.
        conversion_query = {"query": dataframe_response}
        json_chain = json_conversion_prompt | self.llm | self.parser
        return json_chain.invoke(conversion_query)

    def run_parallel(self, brain_state: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
        """Run every instruction against every input file and collect data points.

        Args:
            brain_state: Must contain ``data_analyst_instructions``.

        Returns:
            ``{"data_points": [...]}`` where each entry is the parsed result
            with an added ``id`` of the form ``data_point_<n>``.
        """
        print_agent_output(
            f"Starting the data analysis process with model: {self.task.get('model')} and file inputs: {self.file_inputs}...\n",
            "DATA_ANALYST",
        )
        self.df_agents = self._create_agents()

        instructions = brain_state["data_analyst_instructions"]
        formatted_instructions = "\n".join(f"{i+1}. {inst}" for i, inst in enumerate(instructions))
        print_agent_output(
            f"Got these instructions:\n{formatted_instructions}",
            "DATA_ANALYST",
        )

        runnables = {}
        for instruction in instructions:
            for file_path in self.file_inputs:
                # Keyed by the full path so two files sharing a basename do
                # not overwrite each other. The default arguments bind the
                # current loop values to each lambda.
                runnables[f"{file_path}_{instruction}"] = RunnableLambda(
                    lambda _, file_path=file_path, instruction=instruction: self._run_instruction(file_path, instruction)
                )

        parallel_executor = RunnableParallel(runnables)
        aggregated_runnable_results = parallel_executor.invoke(None)

        # Number the results and log each one.
        data_points = []
        for point, result in enumerate(aggregated_runnable_results.values(), start=1):
            data_point = {"id": f"data_point_{point}"}
            data_point.update(result)

            # Build one readable string holding every key/value pair.
            data_point_str = f"Extracted data_point {point}:\n"
            for key, value in data_point.items():
                data_point_str += f"  {key}: {value}\n"

            print_agent_output(data_point_str, "DATA_ANALYST")

            data_points.append(data_point)

        print(f"Final data points: {data_points}\n")
        return {"data_points": data_points}

### Let's test it!

In [None]:
task = {
    "task": "data_analysis",
    "model": "gpt-4o",
    "file_inputs": [
        "/content/sample_data/california_housing_test.csv",
        "/content/sample_data/california_housing_train.csv",
    ],
    "user_query": "Analyze the housing data to uncover significant trends",
    "publish_formats": {
        "markdown": True,
        "pdf": False,
        "docx": False
    },
    "follow_guidelines": True,
    "guidelines": [
        "Use APA format.",
        "Ensure all sources are cited inline as markdown hyperlinks.",
        "Do not include any external sources or references.",
        "Only use the data provided.",
        "Clearly state if the data is insufficient for detailed analysis."
    ],
    "verbose": True
}

output_dir = "/content/data_analyst_outputs/"

# Start from an empty output directory: remove whatever a previous run left
# behind, then (re)create the directory.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)


# Define the BrainState with additional instructions
brain_state = BrainState(
    data_analyst_instructions=[
        "Get the head of the data",
        "Generate summary statistics for the data",
        "Visualize the distribution of the median house value",
    ]
)

# Initialize the DataAnalystAgent
data_analyst_agent = DataAnalystAgent(task, output_dir)

# Generate multiple data points
data_points = data_analyst_agent.run_parallel(brain_state)

# Output the results
print("Output the results")
print("_______________________________")
print("\n")
print(data_points)

[36mDATA_ANALYST: Starting the data analysis process with model: gpt-4o and file inputs: ['/content/sample_data/california_housing_test.csv', '/content/sample_data/california_housing_train.csv']'...
[0m
[36mDATA_ANALYST: Created 2 agents for files: /content/sample_data/california_housing_test.csv, /content/sample_data/california_housing_train.csv
[0m
[36mDATA_ANALYST: Got these instructions:
1. Get the head of the data
2. Generate summary statistics for the data
3. Visualize the distribution of the median house value[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df.head().to_dict()'}`


[0m[36;1m[1;3m{'longitude': {0: -122.05, 1: -118.3, 2: -117.81, 3: -118.36, 4: -119.67}, 'lat

## Writer Agent

In [None]:
# Example of the JSON object the writer model must return for the main
# (non-section) prompt. Kept as valid JSON so the model is not shown a
# malformed example.
sample_json = """
{
  "table_of_contents": "A table of contents in markdown syntax (using '-') based on the research headers and subheaders.",
  "title": "A well formed title for this report that clearly conveys what is discussed",
  "introduction": "An in-depth introduction to the topic in markdown syntax, clearly stating whether the provided data is sufficient to draw meaningful conclusions.",
  "conclusion": "A conclusion to the entire research based on all provided data in markdown syntax, explicitly mentioning if the data is insufficient for detailed analysis and conclusions.",
  "sources": "A list with strings of all used source links in the provided data points in markdown syntax. For example: ['-[filename or source name](source url or filepath)', ...]"
}
"""

class WriterAgent:
    """Turns analysed data points into a structured report using an LLM.

    The report is produced in three steps: the layout (title, table of
    contents, introduction, conclusion, sources), one LLM call per middle
    section, and an optional revision of the headers against the guidelines.
    """

    # Table-of-contents entries that are not middle sections of the report.
    _BOILERPLATE_HEADERS = frozenset(
        {"table of contents", "introduction", "conclusion", "sources", "references"}
    )

    def __init__(self, task: dict):
        """Read the query, guideline settings, model name and verbosity from ``task``."""
        self.task = task
        self.query = task.get("user_query")
        self.follow_guidelines = task.get("follow_guidelines")
        self.guidelines = task.get("guidelines")
        self.llm = task.get("model")  # model name, passed on to call_model
        self.verbose = task.get("verbose")

    def get_headers(self, middle_sections: List[str]):
        """Return the default header labels of the report.

        Args:
            middle_sections: Titles of the generated middle sections.
        """
        return {
            "title": "Title",
            "date": "Date",
            "introduction": "Introduction",
            "table_of_contents": "Table of Contents",
            "middle_sections": middle_sections,
            "conclusion": "Conclusion",
            "sources": "Sources"
        }

    def format_prompt(self, query, data, follow_guidelines, guidelines, section="main", section_title=None):
        """Build the chat messages for the layout prompt or a single-section prompt.

        Args:
            query: The user's query or topic.
            data: List of data point dicts; ``None`` is treated as empty.
            follow_guidelines: Whether ``guidelines`` are added to the prompt.
            guidelines: Guidelines to follow.
            section: ``"section"`` together with ``section_title`` requests a
                single section; anything else requests the main layout.
            section_title: Title of the section to write.

        Returns:
            A two-item list of OpenAI-style messages (system, user).
        """
        # Imported locally so the method does not depend on which of
        # ``import datetime`` / ``from datetime import datetime`` was executed
        # last in the notebook.
        from datetime import datetime

        data = data or []
        # Sorted so the prompt is identical across runs; data points without
        # a source are skipped.
        sources = json.dumps(sorted({dp["source"] for dp in data if dp.get("source")}))
        formatted_guidelines = f"Guidelines: {guidelines}" if follow_guidelines else ""

        if section == "section" and section_title:
            section_instruction = f"Section title:{section_title}\nPlease write a detailed section on the above title using the provided data."
            return_format = '{"section_content": "The content for the specified section in markdown syntax (do NOT include the section title or use ## to write a title)"}'
        else:
            section_instruction = ""
            return_format = sample_json

        prompt_content = (
            f"Today's date is {datetime.now().strftime('%d/%m/%Y')}.\n"
            f"Query or Topic: {query}\n"
            f"Research data: {json.dumps(data, indent=2)}\n"
            f"Sources: {sources}\n"
            f"You MUST include any relevant sources as markdown hyperlinks.\n"
            f"Do not include headers in the results.\n"
            f"Only use the data provided; do not include any external sources or references.\n"
            f"Do not fabricate data or conclusions.\n"
            f"Example format for references: 'This point is supported by... ([source name](source url or filepath))'\n\n"
            f"If the data points do not contain quantifiable data, clearly state that the data is insufficient for a detailed report or meaningful conclusions.\n\n"
            f"{formatted_guidelines}\n"
            f"{section_instruction}\n"
            f"Return only a JSON in the following format (without json markdown):\n"
            f"{return_format}"
        )

        return [
            {
                "role": "system",
                "content": (
                    f"You are an expert technical report writer specializing in analyzing and presenting\n"
                    f"quantifiable data to inform decision-making processes. Your sole purpose is to generate\n"
                    f"well-written, data-driven reports based on the provided sources using simple language."
                )
            },
            {
                "role": "user",
                "content": prompt_content
            }
        ]

    def write_main_sections(self, brain_state: "BrainState"):
        """Ask the model for the report layout and return the parsed JSON."""
        print_agent_output("Generating main sections of the report...", agent="WRITER")
        data = brain_state.get("data_points")

        prompt = self.format_prompt(self.query, data, self.follow_guidelines, self.guidelines, section="main")
        print_agent_output(f"Main section prompt: {prompt}", agent="WRITER")

        response = call_model(prompt, self.llm, max_retries=2, response_format='json')
        print_agent_output(f"Main section response: {response}", agent="WRITER")

        return json.loads(response)

    def revise_headers(self, headers: dict):
        """Ask the model to revise the header labels according to the guidelines.

        Returns:
            ``{"headers": <parsed model response>}``.
        """
        print_agent_output("Revising headers based on guidelines...", agent="WRITER")
        # The headers are serialised with json.dumps so the model is shown
        # real JSON, matching the "same format" requirement in the prompt.
        prompt = [
            {
                "role": "system",
                "content": """You are a research writer.
            Your sole purpose is to revise the headers data based on the given guidelines."""
            },
            {
                "role": "user",
                "content": f"""Your task is to revise the given headers JSON based on the guidelines given.
            You are to follow the guidelines but the values should be in simple strings, ignoring all markdown syntax.
            You must return nothing but a JSON in the same format as given in headers data.
            Guidelines: {self.guidelines}\n
            Headers Data: {json.dumps(headers)}\n
            """
            }
        ]

        response = call_model(prompt, self.llm, response_format='json')
        print_agent_output(f"Revised headers response: {response}", agent="WRITER")

        return {"headers": json.loads(response)}

    @staticmethod
    def _extract_middle_sections(table_of_contents):
        """Return the table-of-contents entries that need generated content.

        Blank lines are ignored, and entries named like the fixed parts of the
        report (introduction, conclusion, sources, ...) are dropped by name,
        so the result does not depend on their position or on a trailing
        newline in the model output.
        """
        titles = []
        for line in (table_of_contents or "").splitlines():
            title = line.strip("-* \t\r")
            if not title:
                continue
            # For the comparison only: unwrap [text](link) and drop numbering.
            plain = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", title)
            plain = re.sub(r"^\d+[.)]\s*", "", plain).strip().lower()
            if plain in WriterAgent._BOILERPLATE_HEADERS:
                continue
            titles.append(title)
        return titles

    def fill_sections(self, brain_state: "BrainState", table_of_contents: str):
        """Generate content for every middle section of the table of contents.

        Returns:
            ``(filled_sections, main_sections)``: a dict mapping each section
            title to its generated content, and the ordered list of titles.
        """
        print_agent_output("Filling in the middle sections of the report...", agent="WRITER")
        data = brain_state.get("data_points")

        main_sections = self._extract_middle_sections(table_of_contents)
        print_agent_output(f"sections: {main_sections}", agent="WRITER")

        filled_sections = {}
        for section_title in main_sections:
            print_agent_output(f"Generating content for section: {section_title}", agent="WRITER")
            prompt = self.format_prompt(self.query, data, self.follow_guidelines, self.guidelines,
                                        section="section", section_title=section_title)
            print_agent_output(f"Section prompt for {section_title}: {prompt}", agent="WRITER")

            response = call_model(prompt, self.llm, max_retries=2, response_format='json')
            print_agent_output(f"Section response for {section_title}: {response}", agent="WRITER")
            filled_sections[section_title] = json.loads(response).get("section_content")

        return filled_sections, main_sections

    def write_report(self, brain_state: "BrainState"):
        """Produce the full report and return it as ``{"report_content": {...}}``."""
        print_agent_output("Starting report generation...", agent="WRITER")
        report_layout_content = self.write_main_sections(brain_state)

        if self.verbose:
            print_agent_output(f"Initial report layout content: {report_layout_content}", agent="WRITER")

        table_of_contents = report_layout_content.get("table_of_contents")
        middle_sections_content, middle_sections_headers = self.fill_sections(brain_state, table_of_contents)

        headers = self.get_headers(middle_sections_headers)
        if self.follow_guidelines:
            print_agent_output("Revising layout based on guidelines...", agent="WRITER")
            headers = self.revise_headers(headers=headers).get("headers")

        final_report = {**report_layout_content, "middle_sections": middle_sections_content, "headers": headers}
        print_agent_output(f"Final report content: {final_report}", agent="WRITER")
        return {"report_content": final_report}

### Let's test it!

In [None]:
data_points = [
      {
            "id": "data_point_1",
            "key_metrics": "The dataset includes key metrics such as longitude, latitude, housing median age, total rooms, total bedrooms, population, households, median income, and median house value.",
            "description": "The data points represent various attributes of housing in California, including geographical coordinates, housing characteristics, and economic factors.",
            "source": "/content/sample_data/california_housing_test.csv",
            "generated_insight": "The dataset provides a comprehensive overview of housing conditions in different regions of California, which can be used for further analysis such as predicting house prices or understanding demographic distributions.",
            "data_type": "Numerical",
            "image": {
                  "image_path": "",
                  "image_caption": ""
            },
            "instructions_given": "Get the head of the data",
            "code_used": "df.head()"
      },
      {
            "id": "data_point_2",
            "key_metrics": "The dataset includes key metrics such as longitude, latitude, housing median age, total rooms, total bedrooms, population, households, median income, and median house value.",
            "description": "The data points represent various attributes of housing in California, including geographical coordinates, housing characteristics, and economic factors.",
            "source": "/content/sample_data/california_housing_train.csv",
            "generated_insight": "The dataset provides a comprehensive overview of housing conditions in California, which can be used for further analysis such as predicting house prices or understanding demographic distributions.",
            "data_type": "Numerical",
            "image": {
                  "image_path": "",
                  "image_caption": ""
            },
            "instructions_given": "Get the head of the data",
            "code_used": "df.head()"
      },
      {
            "id": "data_point_3",
            "key_metrics": "Summary statistics including count, mean, standard deviation, min, 25th percentile, median, 75th percentile, and max for each column.",
            "description": "The dataset contains information about housing in California, including geographical coordinates, housing age, number of rooms, population, households, median income, and median house value.",
            "source": "/content/sample_data/california_housing_test.csv",
            "generated_insight": "The average median house value is approximately $205,846.27, with a standard deviation of $113,119.69. The median income ranges from $0.50 to $15.00, with an average of $3.81.",
            "data_type": "Numerical",
            "image": {
                  "image_path": "",
                  "image_caption": ""
            },
            "instructions_given": "Generate summary statistics for the data",
            "code_used": "summary_stats = df.describe()\nsummary_stats"
      },
      {
            "id": "data_point_4",
            "key_metrics": {
                  "longitude": {
                        "count": 17000,
                        "mean": -119.562108,
                        "std": 2.005166,
                        "min": -124.35,
                        "25%": -121.79,
                        "50%": -118.49,
                        "75%": -118.0,
                        "max": -114.31
                  },
                  "latitude": {
                        "count": 17000,
                        "mean": 35.625225,
                        "std": 2.13734,
                        "min": 32.54,
                        "25%": 33.93,
                        "50%": 34.25,
                        "75%": 37.72,
                        "max": 41.95
                  },
                  "housing_median_age": {
                        "count": 17000,
                        "mean": 28.589353,
                        "std": 12.586937,
                        "min": 1.0,
                        "25%": 18.0,
                        "50%": 29.0,
                        "75%": 37.0,
                        "max": 52.0
                  },
                  "total_rooms": {
                        "count": 17000,
                        "mean": 2643.664412,
                        "std": 2179.947071,
                        "min": 2.0,
                        "25%": 1462.0,
                        "50%": 2127.0,
                        "75%": 3151.25,
                        "max": 37937.0
                  },
                  "total_bedrooms": {
                        "count": 17000,
                        "mean": 539.410824,
                        "std": 421.499452,
                        "min": 1.0,
                        "25%": 297.0,
                        "50%": 434.0,
                        "75%": 648.25,
                        "max": 6445.0
                  },
                  "population": {
                        "count": 17000,
                        "mean": 1429.573941,
                        "std": 1147.852959,
                        "min": 3.0,
                        "25%": 790.0,
                        "50%": 1167.0,
                        "75%": 1721.0,
                        "max": 35682.0
                  },
                  "households": {
                        "count": 17000,
                        "mean": 501.221941,
                        "std": 384.520841,
                        "min": 1.0,
                        "25%": 282.0,
                        "50%": 409.0,
                        "75%": 605.25,
                        "max": 6082.0
                  },
                  "median_income": {
                        "count": 17000,
                        "mean": 3.883578,
                        "std": 1.908157,
                        "min": 0.4999,
                        "25%": 2.566375,
                        "50%": 3.5446,
                        "75%": 4.767,
                        "max": 15.0001
                  },
                  "median_house_value": {
                        "count": 17000,
                        "mean": 207300.912353,
                        "std": 115983.764387,
                        "min": 14999.0,
                        "25%": 119400.0,
                        "50%": 180400.0,
                        "75%": 265000.0,
                        "max": 500001.0
                  }
            },
            "description": "The dataset contains information about housing in California, including geographical coordinates, housing age, number of rooms, population, households, median income, and median house value.",
            "source": "/content/sample_data/california_housing_train.csv",
            "generated_insight": "The median house value in the dataset ranges from $14,999 to $500,001, with a mean value of approximately $207,301. The median income varies significantly, with a mean of approximately $3.88.",
            "data_type": "Numerical",
            "image": {
                  "image_path": "",
                  "image_caption": ""
            },
            "instructions_given": "Generate summary statistics for the data",
            "code_used": "summary_stats = df.describe()\nsummary_stats"
      },
      {
            "id": "data_point_5",
            "key_metrics": {
                  "mean_median_house_value": 205846.275,
                  "median_median_house_value": 177650.0,
                  "std_median_house_value": 113119.68746964628,
                  "min_median_house_value": 22500.0,
                  "max_median_house_value": 500001.0
            },
            "description": "The dataset contains information about housing in California, including median house values, which range from $22,500 to $500,001. The average median house value is approximately $205,846.28, with a standard deviation of $113,119.69.",
            "source": "/content/sample_data/california_housing_test.csv",
            "generated_insight": "The distribution of median house values shows a significant spread, with a notable peak around the lower end of the value range, indicating a higher frequency of lower-valued houses.",
            "data_type": "Numerical",
            "image": {
                  "image_path": "/content/data_analyst_outputs/california_housing_test.csv_Visualize_the_distribution_of_the_median_house_value.png",
                  "image_caption": "Generated image for instruction: Visualize the distribution of the median house value"
            },
            "instructions_given": "Visualize the distribution of the median house value",
            "code_used": "import matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Set the style of the visualization\nsns.set(style=\"whitegrid\")\n\n# Create the plot\nplt.figure(figsize=(10, 6))\nsns.histplot(df['median_house_value'], bins=30, kde=True)\nplt.title('Distribution of Median House Value')\nplt.xlabel('Median House Value')\nplt.ylabel('Frequency')\n\n# Save the plot\nimage_path = '/content/data_analyst_outputs/california_housing_test.csv_Visualize_the_distribution_of_the_median_house_value.png'\nplt.savefig(image_path)\nplt.close()"
      },
      {
            "id": "data_point_6",
            "key_metrics": {
                  "mean_median_house_value": 207300.91235294117,
                  "median_median_house_value": 180400.0,
                  "std_median_house_value": 115983.76438720885,
                  "min_median_house_value": 14999.0,
                  "max_median_house_value": 500001.0
            },
            "description": "The dataset contains information about housing in California, including the median house value. Key metrics such as mean, median, standard deviation, minimum, and maximum values of the median house value have been extracted.",
            "source": "/content/sample_data/california_housing_train.csv",
            "generated_insight": "The median house value in California varies significantly, with a mean value of approximately $207,301 and a standard deviation of about $115,984. The values range from $14,999 to $500,001.",
            "data_type": "Numerical",
            "image": {
                  "image_path": "/content/data_analyst_outputs/california_housing_train.csv_Visualize_the_distribution_of_the_median_house_value.png",
                  "image_caption": "Generated image for instruction: Visualize the distribution of the median house value"
            },
            "instructions_given": "Visualize the distribution of the median house value",
            "code_used": "import matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Visualize the distribution of the median house value\nplt.figure(figsize=(10, 6))\nsns.histplot(df['median_house_value'], bins=50, kde=True)\nplt.title('Distribution of Median House Value')\nplt.xlabel('Median House Value')\nplt.ylabel('Frequency')\n\n# Save the image\nimage_path = '/content/data_analyst_outputs/california_housing_train.csv_Visualize_the_distribution_of_the_median_house_value.png'\nplt.savefig(image_path)\nplt.close()"
      }
]


In [None]:
import json
from datetime import datetime

# Task configuration handed to WriterAgent for this smoke test.
task = dict(
    task="data_analysis",
    model="gpt-4o",
    file_inputs=[
        "/content/sample_data/california_housing_test.csv",
        "/content/sample_data/california_housing_train.csv",
    ],
    user_query="Analyze the housing data to uncover significant trends",
    publish_formats=dict(markdown=True, pdf=False, docx=False),
    follow_guidelines=True,
    guidelines=[
        "Use APA format.",
        "Ensure all sources are cited inline as markdown hyperlinks.",
        "Do not include any external sources or references.",
        "Only use the data provided.",
        "Clearly state if the data is insufficient for detailed analysis.",
    ],
    verbose=True,
)

# State carrying only the `data_points` list defined earlier in the notebook.
brain_state = BrainState(data_points=data_points)

print("Test with actual quantifiable data points:\n\n")

# Build the writer from the task, generate the report and dump it as JSON.
writer_agent = WriterAgent(task)
result = writer_agent.write_report(brain_state)
print(json.dumps(result, indent=4))

Test with actual quantifiable data points:


[35mWRITER: Starting report generation...[0m
[35mWRITER: Generating main sections of the report...[0m
[35mWRITER: Main section prompt: [{'role': 'system', 'content': 'You are an expert technical report writer specializing in analyzing and presenting\nquantifiable data to inform decision-making processes. Your sole purpose is to generate\nwell-written, data-driven reports based on the provided sources using simple language.'}, {'role': 'user', 'content': 'Today\'s date is 29/05/2024.\nQuery or Topic: Analyze the housing data to uncover significant trends\nResearch data: [\n  {\n    "id": "data_point_1",\n    "key_metrics": "The dataset includes key metrics such as longitude, latitude, housing median age, total rooms, total bedrooms, population, households, median income, and median house value.",\n    "description": "The data points represent various attributes of housing in California, including geographical coordinates, housing charact

## Publisher Agent

In [None]:
class PublisherAgent:
    """Assembles the final markdown report and writes it in the requested formats.

    Args:
        task: Task configuration. ``publish_formats`` (a mapping whose ``pdf``,
            ``docx`` and ``markdown`` entries are checked for truthiness) and
            ``verbose`` are read from it.
        output_dir: Directory passed through to the format writers.
    """

    def __init__(self, task: dict, output_dir: str):
        self.output_dir = output_dir
        # A task without "publish_formats" publishes nothing instead of failing
        # later with AttributeError on None.
        self.publish_formats = task.get("publish_formats") or {}
        self.verbose = task.get("verbose")

    async def publish_research_report(self, report_content: dict):
        """Build the markdown layout, write it in every enabled format, return it."""
        layout = self.generate_layout(report_content)
        await self.write_report_by_formats(layout)
        return layout

    def generate_layout(self, report_content: dict):
        """Render ``report_content`` as a single markdown string.

        Args:
            report_content: Dict with ``title``, ``introduction``,
                ``table_of_contents``, ``conclusion``, ``sources`` (list of
                strings), ``middle_sections`` (section name -> body) and
                ``headers`` (heading labels plus the ordered ``middle_sections``
                name list).

        Returns:
            The markdown document. No line is prefixed with indentation: markdown
            treats lines indented by four or more spaces as an indented code
            block, which would stop the headings from rendering.

        Raises:
            KeyError: If ``headers["middle_sections"]`` names a section that has
                no body in ``middle_sections``.
        """
        # Missing containers are treated as empty so a partial report still renders.
        headers = report_content.get("headers") or {}
        middle_sections = report_content.get("middle_sections") or {}

        sections = "\n\n".join(
            middle_sections[section] for section in headers.get("middle_sections") or []
        )
        sources = "\n".join(report_content.get("sources") or [])

        # NOTE(review): the date line is filled from headers["date"]; in the sample
        # fixture that value is the label "Date" -- confirm where the real date lives.
        blocks = [
            f"# {report_content.get('title')}\n#### Date: {headers.get('date')}",
            f"## {headers.get('introduction')}\n{report_content.get('introduction')}",
            f"## {headers.get('table_of_contents')}\n{report_content.get('table_of_contents')}",
            sections,
            f"## {headers.get('conclusion')}\n{report_content.get('conclusion')}",
            f"## {headers.get('sources')}\n{sources}",
        ]
        # Blocks are separated by one blank line, matching the original template.
        return "\n\n".join(blocks) + "\n"

    async def write_report_by_formats(self, layout: str):
        """Write ``layout`` to ``output_dir`` once per enabled publish format.

        The writer helpers are defined elsewhere in the notebook.
        """
        if self.publish_formats.get("pdf"):
            await write_md_to_pdf(layout, self.output_dir)
        if self.publish_formats.get("docx"):
            await write_md_to_word(layout, self.output_dir)
        if self.publish_formats.get("markdown"):
            await write_text_to_md(layout, self.output_dir)

    # The annotation is quoted so defining this class does not require BrainState
    # to exist yet (it is declared in another cell).
    async def run(self, brain_state: "BrainState"):
        """Publish ``brain_state["report_content"]`` and return ``{"report": layout}``."""
        report_content = brain_state.get("report_content")
        print_agent_output(output="Publishing final research report based on retrieved data...", agent="PUBLISHER")
        final_research_report = await self.publish_research_report(report_content)
        return {"report": final_research_report}

In [None]:
report_content = {
    "table_of_contents": "- Introduction\n- Analysis of Key Metrics\n  - Longitude and Latitude\n  - Housing Median Age\n  - Total Rooms and Total Bedrooms\n  - Population and Households\n  - Median Income and Median House Value\n- Conclusion\n- Sources",
    "title": "Analysis of Housing Trends in California",
    "introduction": "This report aims to analyze the housing data from California to uncover significant trends. The provided datasets contain various numerical metrics such as longitude, latitude, housing median age, total rooms, total bedrooms, population, households, median income, and median house value. Through examining these datasets, we can gain insights into the housing characteristics and trends across different locations in California. This data is sufficient to draw meaningful conclusions about the housing market trends in this region.",
    "conclusion": "The analysis of the provided housing data reveals several significant trends in California's housing market. The geographical metrics indicate a concentration of housing around specific latitudes and longitudes. Housing median ages show a relatively balanced distribution, while the total number of rooms and bedrooms varies widely, indicating diversity in housing sizes. The population and household metrics suggest significant variation, likely reflecting different types of communities. Median income and house values also display wide ranges, with notable skewness towards higher values. Overall, the data is sufficient to provide a comprehensive overview of the housing trends in California.",
    "sources": [
        "-[california_housing_test.csv](/content/sample_data/california_housing_test.csv)",
        "-[california_housing_train.csv](/content/sample_data/california_housing_train.csv)"
    ],
    "middle_sections": {
        "Analysis of Key Metrics": "The provided datasets offer a comprehensive view of the housing characteristics in California. The analysis of key metrics reveals several significant trends and insights.\n\nThe dataset from [california_housing_test.csv](source:/content/sample_data/california_housing_test.csv) includes summary statistics for the following metrics: longitude, latitude, housing median age, total rooms, total bedrooms, population, households, median income, and median house value. Key insights from this data show that the median house value ranges from $22,500 to $500,001 with a mean value of $205,846. This suggests a right-skewed distribution, further supported by the median value of $177,650 and a standard deviation of $113,119. Additionally, the median income in this dataset has a mean of 3.807272, indicating a moderate income level across the regions sampled.\n\nSimilarly, the dataset from [california_housing_train.csv](source:/content/sample_data/california_housing_train.csv) provides an extended view, with data counts of up to 17,000 entries. The median house value in this dataset exhibits a mean of $207,301 and a median of $180,400, with values ranging from $14,999 to $500,001. These statistics suggest a significant variation in housing prices across different regions. The median income in this dataset shows a mean of 3.883578, slightly higher than the test dataset, indicating a consistent but slightly varied income level.\n\nThe summary statistics for both datasets indicate that the housing median age has a mean of approximately 28.7 years, with most houses being between 18 to 37 years old. The total number of rooms and bedrooms also varies widely, with mean values around 2,600 rooms and 530 bedrooms per entry.\n\nVisualizations of the median house values from both datasets confirm the skewness observed in the numerical summaries. 
The histograms generated show a concentration of house values in the lower to mid-range, with fewer houses in the higher price brackets.\n\nOverall, the housing data from these datasets suggest that California's housing market displays significant variability in terms of house values, income levels, and housing characteristics. The data hint at underlying trends such as a right-skewed distribution of house prices and a moderately aged housing stock. These insights can be crucial for stakeholders looking to understand or predict housing market behaviors in California.",
        "Longitude and Latitude": "The datasets from the California housing data provide detailed insights into the geographical distribution of housing characteristics through longitude and latitude coordinates. The summary statistics for longitude and latitude from both the test and train datasets reveal significant trends and variations across different regions of California.\n\nThe test dataset comprises 3,000 entries, with longitude values ranging from -124.18 to -114.49 and latitude values ranging from 32.56 to 41.92 ([source]( /content/sample_data/california_housing_test.csv)). Key statistics include a mean longitude of -119.5892 and a mean latitude of 35.63539. The standard deviation for longitude is 1.994936, and for latitude, it is 2.12967. This indicates a broad geographical spread, covering coastal and inland areas.\n\nSimilarly, the train dataset, with 17,000 entries, shows longitude values ranging from -124.35 to -114.31 and latitude values from 32.54 to 41.95 ([source]( /content/sample_data/california_housing_train.csv)). The mean longitude is -119.562108, and the mean latitude is 35.625225, with standard deviations of 2.005166 for longitude and 2.13734 for latitude. These figures are consistent with the test dataset, reinforcing the diverse geographical coverage.\n\nBoth datasets show that the 25th percentile of longitude is approximately -121.79, and the 75th percentile is around -118.0, suggesting that a significant portion of the data falls within these longitudes. For latitude, the 25th percentile is close to 33.93, and the 75th percentile is around 37.72, indicating that the majority of the data points are concentrated within these latitudinal ranges.\n\nOverall, the geographical coordinates in the datasets highlight the extensive coverage across California's various regions, capturing both coastal and inland areas, which is crucial for understanding housing trends and patterns.",
        "Housing Median Age": "The dataset provides comprehensive details on the housing median age across various locations in California. Analyzing the summary statistics from the datasets, we can observe significant trends in the age of the housing stock. The housing median age, which represents the age of the buildings, is a key metric indicating the period during which most of the housing structures were built.\n\nFrom the [test dataset](/content/sample_data/california_housing_test.csv), the housing median age has a mean value of approximately 28.85 years, with a standard deviation of around 12.56 years. The minimum age recorded is 1 year, while the maximum age is 52 years. The data is distributed with a 25th percentile (Q1) of 18 years, a median (Q2) of 29 years, and a 75th percentile (Q3) of 37 years.\n\nSimilarly, the [train dataset](/content/sample_data/california_housing_train.csv) shows a mean housing median age of about 28.59 years, with a standard deviation of 12.59 years. The minimum and maximum ages are identical to those in the test dataset, being 1 year and 52 years, respectively. The 25th percentile (Q1) is 18 years, the median (Q2) is 29 years, and the 75th percentile (Q3) is 37 years.\n\nThese statistics indicate that the majority of the housing stock in California is relatively old, with a substantial portion of homes being over 30 years old. This trend can have several implications, including the need for renovations, updates to meet modern building codes, and potential historical value. The consistency between the test and train datasets reinforces the reliability of these insights.\n\nThis point is supported by the detailed summary statistics from the datasets ([test dataset](/content/sample_data/california_housing_test.csv), [train dataset](/content/sample_data/california_housing_train.csv)).",
        "Total Rooms and Total Bedrooms": "The datasets provide a comprehensive overview of housing characteristics in California, including the total number of rooms and total bedrooms across various locations. The analysis of these metrics shows significant trends and distributions that can inform housing policies and market strategies.\n\nIn the [California Housing Test dataset](source), the total number of rooms ranges from 6 to 30,450 with a mean of approximately 2,600 rooms. The distribution is highly variable, with a standard deviation of around 2,156 rooms. The median number of rooms is 2,106, indicating that half of the observations have fewer rooms than this value. Similarly, the total number of bedrooms varies widely, from 2 to 5,419 bedrooms, with a mean of about 530 bedrooms and a standard deviation of approximately 416 bedrooms. The median number of bedrooms is 437.\n\nThe [California Housing Train dataset](source) shows a similar pattern. The total number of rooms ranges from 2 to 37,937, with a mean of around 2,644 rooms and a standard deviation of about 2,180 rooms. The median number of rooms is 2,127, suggesting a slightly higher central tendency compared to the test dataset. The total number of bedrooms in this dataset ranges from 1 to 6,445, with a mean of approximately 539 bedrooms and a standard deviation of around 421 bedrooms. The median number of bedrooms is 434.\n\nThe variability in the number of total rooms and bedrooms across the datasets indicates a diverse housing stock in California, with some areas having significantly larger homes than others. This diversity is essential for understanding the housing market dynamics and can help in tailoring housing policies to meet the needs of different communities.",
        "Population and Households": "The analysis of the housing data from California reveals significant insights into population and households characteristics. The datasets ([california_housing_test.csv](source url)) and ([california_housing_train.csv](source url)) provide detailed statistics across multiple entries.\n\nThe population data shows a wide range of values. In the test dataset, the population count spans from a minimum of 5 to a maximum of 11,935, with a mean of approximately 1,402.80 and a standard deviation of 1,030.54. Similarly, in the training dataset, the population ranges from 3 to 35,682, with a mean of about 1,429.57 and a higher standard deviation of 1,147.85. This suggests that there are areas with both very low and extremely high populations, indicating a significant variation in population density across different regions of California.\n\nRegarding households, the test dataset shows a distribution from a minimum of 2 to a maximum of 4,930 households, with a mean of approximately 489.91 and a standard deviation of 365.42. In the training dataset, the household count ranges from 1 to 6,082, with a mean of about 501.22 and a standard deviation of 384.52. This indicates that household sizes and counts also vary widely, reflecting diverse living arrangements and possibly varying levels of urbanization in different regions.\n\nThese findings are supported by the descriptive statistics from the datasets ([source](source url)). The significant standard deviations in both population and households data highlight the diversity in housing and living conditions across California. These variations can have important implications for policy-making, urban planning, and resource allocation.",
        "Median Income and Median House Value": "The dataset provides comprehensive insights into housing characteristics in California, specifically focusing on median income and median house values. The data includes both a test set and a training set, each offering valuable metrics for analysis ([/content/sample_data/california_housing_test.csv](source url or filepath), [/content/sample_data/california_housing_train.csv](source url or filepath)):\n\n**Median Income:**\n\n- In the test set, the median income has a mean of approximately 3.807, with a standard deviation of 1.855. The minimum observed median income is 0.4999, and the maximum is 15.0001. The 25th percentile of median income is 2.544, the median (50th percentile) is 3.487, and the 75th percentile is 4.656 ([/content/sample_data/california_housing_test.csv](source url or filepath)).\n- In the training set, the mean median income is slightly higher at 3.884, with a standard deviation of 1.908. The minimum and maximum are consistent with the test set (0.4999 and 15.0001, respectively). The 25th percentile is 2.566, the median is 3.545, and the 75th percentile is 4.767 ([/content/sample_data/california_housing_train.csv](source url or filepath)).\n\n**Median House Value:**\n\n- For the test set, the median house value ranges from $22,500 to $500,001, with a mean value of approximately $205,846 and a median value of $177,650. The standard deviation is $113,120, suggesting significant variability in house prices. The distribution is right-skewed, with the 25th percentile at $121,200 and the 75th percentile at $263,975 ([/content/sample_data/california_housing_test.csv](source url or filepath)).\n- The training set shows a similar range for median house values, from $14,999 to $500,001, with a mean of approximately $207,301 and a median of $180,400. The standard deviation is $115,984. 
The 25th and 75th percentiles are $119,400 and $265,000, respectively ([/content/sample_data/california_housing_train.csv](source url or filepath)).\n\n**Visual Representation:**\n\n- To better understand the distribution of median house values, histograms were created for both datasets. These visualizations confirm the right-skewed nature of the data, with a significant number of houses valued at the upper end of the range. The histograms further illustrate the variability and central tendency of house prices in California ([/content/sample_data/california_housing_test.csv](source url or filepath), [/content/sample_data/california_housing_train.csv](source url or filepath)).\n\nIn summary, the analysis of median income and median house values reveals a broad spectrum of economic conditions in California's housing market. The substantial range and variability in house values suggest diverse housing affordability and market conditions across the state."
    },
    "headers": {
        "title": "Title",
        "date": "Date",
        "introduction": "Introduction",
        "table_of_contents": "Table of Contents",
        "middle_sections": [
            "Analysis of Key Metrics",
            "Longitude and Latitude",
            "Housing Median Age",
            "Total Rooms and Total Bedrooms",
            "Population and Households",
            "Median Income and Median House Value"
        ],
        "conclusion": "Conclusion",
        "sources": "Sources"
    }
}


## Let's test it!

In [None]:
import asyncio
import nest_asyncio
import json

# This cell touches the filesystem, so import what it needs instead of relying on
# an earlier cell having done so.
import os
import shutil

# Allow asyncio.run() to be called while the notebook's own event loop is running.
nest_asyncio.apply()

# Sample task dictionary
task = {
    "task": "data_analysis",
    "model": "gpt-4o",
    "file_inputs": [
        "/content/sample_data/california_housing_test.csv",
        "/content/sample_data/california_housing_train.csv",
    ],
    "user_query": "Analyze the housing data to uncover significant trends",
    "publish_formats": {
        "markdown": True,
        "pdf": False,
        "docx": False
    },
    "follow_guidelines": True,
    "guidelines": [
        "Use APA format.",
        "Ensure all sources are cited inline as markdown hyperlinks.",
        "Do not include any external sources or references.",
        "Only use the data provided.",
        "Clearly state if the data is insufficient for detailed analysis."
    ],
    "verbose": True
}

# Creating a brain_state with the provided report_content
brain_state = BrainState(
    data_points=[],
    report_content=report_content,
)

output_dir = "/content/output"

# Start from an empty output directory: remove leftovers from a previous run,
# then create it. exist_ok makes the creation idempotent without a second check.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)


async def test_publisher_agent(output_dir: str = output_dir):
    """Run PublisherAgent on the sample ``brain_state`` and print its result as JSON.

    Args:
        output_dir: Directory passed to PublisherAgent; defaults to the directory
            prepared above.
    """
    print("Test PublisherAgent with provided report content:")
    publisher_agent = PublisherAgent(task, output_dir)
    result = await publisher_agent.run(brain_state)
    print(json.dumps(result, indent=4))

# Run the test
asyncio.run(test_publisher_agent())

## The Brain

In [None]:
import os
import time
from langgraph.graph import StateGraph, END

class BrainState(TypedDict):
    """State dictionary used as the schema of the Brain's StateGraph workflow."""
    # Instructions for the data analyst step (Brain.run_task seeds this list).
    data_analyst_instructions: List[str]
    # Data-point dicts; starts as an empty list in Brain.run_task.
    data_points: List[dict]
    # Report content dict; PublisherAgent.run reads it via .get("report_content").
    report_content: dict

class Brain:
    """Orchestrates the data analyst, writer and publisher agents as one workflow.

    Args:
        task: Task configuration. ``user_query`` is used for the run directory
            names and the start-up message; the optional
            ``data_analyst_instructions`` list overrides the default instructions
            seeded into the initial state. The whole dict is passed to each agent.
    """

    # Used by run_task when the task does not supply "data_analyst_instructions".
    DEFAULT_ANALYST_INSTRUCTIONS = (
        "Get the head of the data",
        "Generate summary statistics for the data",
        "Visualize the distribution of the median house value",
    )

    def __init__(self, task: dict):
        # Second-resolution timestamp keeps run directories distinct.
        self.task_id = int(time.time())

        run_name = f"run_{self.task_id}_{self._path_fragment(task.get('user_query'))}"
        self.output_dir = f"/content/outputs/{run_name}"
        self.agent_dir = f"/content/agent_files/{run_name}"

        self.task = task

        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.agent_dir, exist_ok=True)

    @staticmethod
    def _path_fragment(text, max_len: int = 40) -> str:
        """Return the first ``max_len`` characters of ``text`` as a safe path component.

        Letters, digits, spaces and ``._-`` are kept; every other character
        (notably path separators) becomes ``_`` so the query cannot create nested
        directories. ``None`` yields an empty fragment.
        """
        fragment = str(text or "")[:max_len]
        return "".join(ch if ch.isalnum() or ch in " ._-" else "_" for ch in fragment)

    def init_team(self):
        """Create the three agents and return the uncompiled StateGraph.

        The graph is linear: "data analyst" -> "writer" -> "publisher" -> END.
        """
        data_analyst = DataAnalystAgent(self.task, self.agent_dir)
        writer = WriterAgent(self.task)
        publisher = PublisherAgent(self.task, self.output_dir)

        workflow = StateGraph(BrainState)

        workflow.add_node("data analyst", data_analyst.run_parallel)
        workflow.add_node("writer", writer.write_report)
        workflow.add_node("publisher", publisher.run)

        workflow.set_entry_point("data analyst")

        # Add the edges that always occur
        workflow.add_edge("data analyst", "writer")
        workflow.add_edge("writer", "publisher")
        workflow.add_edge("publisher", END)

        return workflow

    async def run_task(self):
        """Compile the workflow, run it from a fresh state and return the result."""
        analysis_team = self.init_team()
        chain = analysis_team.compile()

        print_agent_output(
            f"Starting the data analysis process on....'{self.task.get('user_query')}'...",
            "BRAIN",
        )

        instructions = (
            self.task.get("data_analyst_instructions")
            or self.DEFAULT_ANALYST_INSTRUCTIONS
        )
        initial_state = {
            # Copied so the run never shares a list with the task or the defaults.
            "data_analyst_instructions": list(instructions),
            "data_points": [],
            "report_content": {},
        }

        return await chain.ainvoke(initial_state)


In [None]:
import asyncio
import nest_asyncio
import json

# Task configuration for the end-to-end run.
task = dict(
    task="data_analysis",
    model="gpt-4o",
    file_inputs=[
        "/content/sample_data/california_housing_test.csv",
        "/content/sample_data/california_housing_train.csv",
    ],
    user_query="Analyze the housing data to uncover significant trends",
    publish_formats=dict(markdown=True, pdf=False, docx=False),
    follow_guidelines=True,
    guidelines=[
        "Use APA format.",
        "Ensure all sources are cited inline as markdown hyperlinks.",
        "Do not include any external sources or references.",
        "Only use the data provided.",
        "Clearly state if the data is insufficient for detailed analysis.",
    ],
    verbose=True,
)


async def main():
    """Create a Brain for `task`, run the workflow, and return its result."""
    return await Brain(task).run_task()


# Allow asyncio.run() to be called while the notebook's own event loop is running.
nest_asyncio.apply()
result = asyncio.run(main())
print(json.dumps(result, indent=4))

[94mBRAIN: Starting the data analysis process on....'Analyze the housing data to uncover significant trends'...[0m
[36mDATA_ANALYST: Starting the data analysis process with model: gpt-4o and file inputs: ['/content/sample_data/california_housing_test.csv', '/content/sample_data/california_housing_train.csv']'...
[0m
[36mDATA_ANALYST: Created 2 agents for files: /content/sample_data/california_housing_test.csv, /content/sample_data/california_housing_train.csv
[0m
[36mDATA_ANALYST: Got these instructions:
1. Get the head of the data
2. Generate summary statistics for the data
3. Visualize the distribution of the median house value[0m


[1m> Entering new AgentExecutor chain...[0m

[1m> Entering new AgentExecutor chain...[0m



[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m

[1m> Entering new AgentExecutor chain...[0m

[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'summ