In [1]:
# system
import os
import json

# langchain
from langchain_openai import AzureChatOpenAI

# utils
from utils.templates import (
    get_analytics_code_generator_chain,
    analytics_table_data_template,
)
from utils.models import GeneratedCodeAnalytics



In [2]:
# Mirror the generic OPENAI_* settings into the AZURE_OPENAI_* variables read by
# the client construction in the next cell. A missing source variable still
# raises KeyError so misconfiguration fails loudly.
os.environ["AZURE_OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY"]

# Move (rather than copy) the base URL: OPENAI_API_BASE is removed from the
# environment once its value has been transferred.
# NOTE(review): presumably the removal is needed because the Azure client
# objects to OPENAI_API_BASE being set alongside an Azure endpoint - confirm.
# Using pop() behind a membership check keeps this cell re-runnable: on a second
# execution OPENAI_API_BASE is already gone and the endpoint set by the first
# run is kept, instead of failing with KeyError.
if "OPENAI_API_BASE" in os.environ:
    os.environ["AZURE_OPENAI_ENDPOINT"] = os.environ.pop("OPENAI_API_BASE")
elif "AZURE_OPENAI_ENDPOINT" not in os.environ:
    # Neither the source nor an already-migrated endpoint exists.
    raise KeyError("OPENAI_API_BASE")

os.environ["AZURE_OPENAI_API_VERSION"] = os.environ["OPENAI_API_VERSION"]
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = "firstcontact-gpt4-turbo"

In [3]:
# Chat model shared by the code-generation chain below.
# The API version and deployment name come from the AZURE_OPENAI_* variables set
# in the previous cell; the API key and endpoint are not passed explicitly
# (presumably the client reads them from the environment - TODO confirm).
# `model_kwargs` requests the "json_object" response format on every call.
llm_json = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
    model="gpt-4-128k",
    model_kwargs={"response_format": {"type": "json_object"}}
)

## Extractors

In [4]:
def generate_analytics_table_data(table_names, table_info):
    """Render one template section per table and concatenate them.

    Args:
        table_names: Iterable of table names.
        table_info: Iterable of ``(cols, types)`` pairs, positionally aligned
            with ``table_names``.

    Returns:
        The concatenation, in input order, of
        ``analytics_table_data_template`` formatted with ``name``, ``cols``
        and ``types`` for each table. An empty string when there are no
        tables. Pairing uses ``zip``, so surplus entries in the longer
        iterable are ignored.
    """
    rendered_sections = [
        analytics_table_data_template.format(name=table, cols=columns, types=dtypes)
        for table, (columns, dtypes) in zip(table_names, table_info)
    ]
    return "".join(rendered_sections)

def _format_comment_section(title, items, fallback):
    """Render ``items`` as an indented ``#`` comment list under a ``# title`` line.

    Args:
        title: Heading text, emitted as ``# <title>``.
        items: Iterable of entries to list; may be empty or ``None``.
        fallback: Text shown as the single entry when ``items`` is falsy.

    Returns:
        The section as a string ending in a newline.
    """
    lines = [f"# {title}"]
    if items:
        for item in items:
            # An entry may span several lines; prefix every line so the whole
            # section remains a comment in the generated script.
            lines.extend(f"#     {part}" for part in (str(item).splitlines() or [""]))
    else:
        lines.append(f"#     {fallback}")
    return "\n".join(lines) + "\n"


def generate_analysis_code(query, metadata_path="metadata.json"):
    """Generate analysis code for ``query`` and annotate it with trailing comments.

    Table metadata is read from a JSON file, rendered with
    ``generate_analytics_table_data`` and passed, together with the query, to
    the chain returned by ``get_analytics_code_generator_chain(llm_json)``.

    Args:
        query: Natural-language description of the analysis to perform.
        metadata_path: Path to a JSON file containing the keys
            ``"table_names"`` and ``"table_metadata"``. Defaults to
            ``"metadata.json"`` in the current working directory.

    Returns:
        A string made of the generated code, a ``files = [...]`` line listing
        the files reported by the chain result, and ``# Assumptions`` /
        ``# Feedback`` comment sections (with a placeholder entry when the
        result carries none).

    Raises:
        FileNotFoundError: If ``metadata_path`` does not exist.
        KeyError: If the metadata lacks one of the expected keys.
    """
    # Context manager so the file handle is closed deterministically.
    with open(metadata_path, "r", encoding="utf-8") as metadata_file:
        metadata = json.load(metadata_file)

    analytics_code_chain = get_analytics_code_generator_chain(llm_json)
    data = generate_analytics_table_data(
        metadata["table_names"], metadata["table_metadata"]
    )

    result: GeneratedCodeAnalytics = analytics_code_chain.invoke(
        {"query": query, "data": data}
    )

    header = (
        f"{result.code}\n\n# Following files are saved\n"
        f"files = {result.files!s}\n\n"
    )
    assumptions = _format_comment_section(
        "Assumptions", result.assumptions, "None made"
    )
    feedback = _format_comment_section("Feedback", result.feedback, "None given")

    # A blank line separates the two comment sections.
    return header + assumptions + "\n" + feedback

In [5]:
# Run the generator on a sample request; this invokes the LLM-backed chain and
# reads metadata.json from the current working directory. The two adjacent
# string literals are concatenated into a single query.
execution_code = generate_analysis_code(
    "Plot the frequency of karyotypic sex in a pie chart."
    " Create a CSV file of individual's id, ethnicity and sex."
)

In [6]:
# print() rather than a bare expression: the value is a multi-line string, and
# the bare repr would show escaped "\n" sequences instead of line breaks.
print(execution_code)

# Define the path for the output CSV file
output_csv_path = '/tmp/individuals_ethnicity_sex.csv'

# Extract individual's id, ethnicity, and sex from the dataframe
extracted_data = data[['id', 'ethnicity', 'sex']]

# Save the extracted data to a CSV file
extracted_data.to_csv(output_csv_path, index=False)

# Plot the frequency of karyotypic sex in a pie chart
karyotypic_sex_counts = data['karyotypicSex'].value_counts()
plt.figure(figsize=(8, 6))
plt.pie(karyotypic_sex_counts, labels=karyotypic_sex_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Frequency of Karyotypic Sex')

# Save the pie chart
pie_chart_path = '/tmp/karyotypic_sex_pie_chart.png'
plt.savefig(pie_chart_path, bbox_inches='tight')


# Following files are saved
files = ['/tmp/individuals_ethnicity_sex.csv', '/tmp/karyotypic_sex_pie_chart.png']

# Assumptions
#     Assuming 'ethnicity' and 'sex' columns contain string representations of the respective attributes.
#     Assuming 'sex' column does not require tran