In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# mypy: disable-error-code="import-not-found"

# The notebook should be executed from the project root directory
import os
import sys
from pathlib import Path

# One-time working-directory switch. The `_correct_path` flag is set after the
# first run, so re-executing this cell in the same kernel does not climb
# another directory level or append "." to sys.path again.
if "_correct_path" not in locals():
    # Step up one level (assumes the notebook sits one directory below the
    # project root) and make modules relative to the new directory importable.
    os.chdir("..")
    sys.path.append(".")
    print(f"changed dir to {Path('.').resolve()}")
    _correct_path = True

In [None]:
import os

import polars as pl

from utils.schema import AnalystDataset

In [None]:
# Remote CSV that serves as the notebook's working dataset.
dataset_url = "https://s3.amazonaws.com/datarobot_public_datasets/10k_diabetes_20.csv"

# Read the CSV straight from the URL into a polars DataFrame.
df = pl.read_csv(dataset_url, infer_schema_length=10000)

# Derive the dataset name from the URL's file name, minus its extension.
file_name = os.path.basename(dataset_url)
dataset_name = os.path.splitext(file_name)[0]

# Wrap the frame and its name in the project's dataset model.
dataset = AnalystDataset(name=dataset_name, data=df)

In [None]:
from datetime import datetime

from utils.api import cleanse_dataframe

start = datetime.now()

cleansed_data = await cleanse_dataframe(dataset)

end = datetime.now()
print(f"Time taken: {end - start}")
analysis_data = cleansed_data

In [None]:
# Display the cleaning report attached to the cleansed dataset.
cleansed_data.cleaning_report

In [None]:
from utils.api import suggest_questions

# Request suggested questions; the API is given a list, here holding only the
# cleansed dataset.
suggested_questions = await suggest_questions([analysis_data])

In [None]:
from utils.api import rephrase_message
from utils.schema import ChatRequest

question = "What is the relationship between length of stay and readmission?"
chat_response = await rephrase_message(
    messages=ChatRequest(
        messages=[
            {
                "role": "user",
                "content": question,
            },
            {
                "role": "user",
                "content": question + "Please order the chart by readmission rate",
            },
        ],
    )
)

In [None]:
from utils.api import get_dictionary

# Request a dictionary for the cleansed dataset; it is registered with the
# analyst database in a later cell.
dictionary = await get_dictionary(analysis_data)

In [None]:
from utils.analyst_db import AnalystDB

# Create the analyst database handle. The arguments are positional and their
# meanings are not visible here — presumably a user id, a base path, and the
# chat / dataset store names (TODO confirm against AnalystDB.create).
analyst_db = await AnalystDB.create(
    "user_123",
    ".",
    "chats",
    "datasets",
)

# Register the cleansed dataset; the analysis request in a later cell refers
# to it by name and is run against this database handle.
await analyst_db.register_dataset(analysis_data)

# Register the dictionary obtained earlier for that dataset.
await analyst_db.register_data_dictionary(dictionary)

In [None]:
from utils.api import run_analysis
from utils.schema import RunAnalysisRequest

# Build the analysis request: the dataset is referenced by name, and the
# question is the rephrased message produced earlier.
analysis_request = RunAnalysisRequest(
    dataset_names=[analysis_data.name],
    question=chat_response,
)
# Run the analysis against the analyst database the dataset was registered in.
analysis_result = await run_analysis(analysis_request, analyst_db=analyst_db)

In [None]:
import asyncio

from utils.api import get_business_analysis, run_charts
from utils.schema import (
    DataDictionary,
    GetBusinessAnalysisRequest,
    RunChartsRequest,
)

# Both requests are built from the dataset returned by the analysis step and
# share the same rephrased question.
result_dataset = analysis_result.dataset

chart_request = RunChartsRequest(dataset=result_dataset, question=chat_response)

# The business request additionally needs a dictionary derived from that dataset.
result_dictionary = DataDictionary.from_analyst_df(result_dataset.to_df())
business_request = GetBusinessAnalysisRequest(
    dataset=result_dataset,
    dictionary=result_dictionary,
    question=chat_response,
)

# Schedule both coroutines right away; their results are awaited in a later cell.
charts_task = asyncio.create_task(run_charts(chart_request))
business_task = asyncio.create_task(get_business_analysis(business_request))

In [None]:
import plotly.offline as pyo

from utils.schema import GetBusinessAnalysisResult, RunChartsResult

pyo.init_notebook_mode()

tasks = [charts_task, business_task]

# Wait for each task to complete
for coro in asyncio.as_completed(tasks):
    result = await coro

    # Determine which task completed by checking the result structure
    if isinstance(result, RunChartsResult) and (result.fig1 or result.fig2):
        if result.fig1:
            pyo.iplot(result.fig1)
        if result.fig2:
            pyo.iplot(result.fig2)

    elif isinstance(result, GetBusinessAnalysisResult):
        print(f"Bottom Line:\n{(result.bottom_line or '')}")

        print(f"Additional Insights:\n{result.additional_insights}")

        print("Follow-up Questions:")
        for q in result.follow_up_questions:
            print(f"- {q}")

In [None]:
# Persist the three results as JSON files under tests/models.
# The results are resolved up front, so a failed task raises before any
# existing file has been truncated. Both tasks were already awaited in the
# display loop, so .result() returns without blocking.
fixtures = {
    "tests/models/run_analysis_result.json": analysis_result,
    "tests/models/run_charts_result.json": charts_task.result(),
    "tests/models/run_business_result.json": business_task.result(),
}
for fixture_path, model in fixtures.items():
    # Explicit UTF-8 keeps the output independent of the platform's locale
    # encoding when the serialized text contains non-ASCII characters.
    with open(fixture_path, "w", encoding="utf-8") as f:
        f.write(model.model_dump_json(indent=4))

In [None]:
import asyncio

from utils.api import get_dictionary
from utils.database_helpers import Database

db_tables = Database.get_tables()
db_dataset_names = await Database.get_data(
    *db_tables, analyst_db=analyst_db, sample_size=5000
)

db_datasets = await asyncio.gather(
    *[analyst_db.get_dataset(name) for name in db_dataset_names]
)
db_dictionaries = await asyncio.gather(
    *[get_dictionary(db_dataset) for db_dataset in db_datasets]
)

In [None]:
from utils.api import suggest_questions

# Request suggested questions for the database-backed datasets. This rebinds
# `suggested_questions`, replacing the value from the earlier CSV-based cell.
suggested_questions = await suggest_questions(db_datasets)

In [None]:
# Display the suggestions returned for the database datasets.
suggested_questions

In [None]:
from utils.api import run_database_analysis
from utils.schema import RunDatabaseAnalysisRequest

# Run a database analysis against the first loaded dataset only, with a
# fixed example question.
db_run_analysis = await run_database_analysis(
    RunDatabaseAnalysisRequest(
        dataset_names=[db_datasets[0].name],
        question="How does loan default rate relate to type of loan?",
    ),
    analyst_db=analyst_db,
)

In [None]:
# Display the `code` attribute of the database analysis result.
db_run_analysis.code