In [1]:
# system
import os
import json

# langchain
from langchain_openai import AzureChatOpenAI
from langchain_ollama import ChatOllama

# data science
import pandas as pd


# utils
from utils.templates import (
    scope_extractor_template,
    variant_extractor_template,
    filter_extractor_template,
    granularity_extractor_template,
)


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_ollama.embeddings import OllamaEmbeddings


In [2]:
# Mirror the generic OPENAI_* settings onto the AZURE_OPENAI_* names.
# Each copy is guarded so this cell can be re-run: OPENAI_API_BASE is removed
# at the bottom of the cell, so an unconditional read of it raises KeyError on
# the second execution.
for azure_var, openai_var in (
    ("AZURE_OPENAI_API_KEY", "OPENAI_API_KEY"),
    ("AZURE_OPENAI_ENDPOINT", "OPENAI_API_BASE"),
    ("AZURE_OPENAI_API_VERSION", "OPENAI_API_VERSION"),
):
    if openai_var in os.environ:
        os.environ[azure_var] = os.environ[openai_var]
    elif azure_var not in os.environ:
        # Neither the source variable nor a previously mirrored value exists:
        # fail loudly with the name of the missing source variable.
        raise KeyError(openai_var)
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = "firstcontact-gpt4-turbo"

# NOTE(review): presumably removed so the Azure client does not see both a
# generic base URL and an Azure endpoint — confirm. `pop` with a default keeps
# the removal safe when the variable is already gone.
os.environ.pop("OPENAI_API_BASE", None)

In [3]:
# Ollama tags of the open-weight models that are compared against GPT-4.
open_models = [
    "mistral-nemo:12b-instruct-2407-q6_K",
    "llama3.1:8b-instruct-q6_K",
    "gemma2:9b-instruct-q6_K",
    "qwen2:7b-instruct",
]

# Registry of chat clients keyed by display name. The Azure-hosted GPT-4
# deployment is registered first; it is asked for a JSON object response.
models = {
    "gpt4": AzureChatOpenAI(
        model="gpt-4-128k",
        azure_deployment="firstcontact-gpt4-turbo",
        openai_api_version=os.environ["OPENAI_API_VERSION"],
        model_kwargs={"response_format": {"type": "json_object"}},
    ),
}

# One Ollama client per tag, keyed by the tag itself, with JSON output,
# temperature 0 and a 1024-token prediction cap.
models.update(
    {
        tag: ChatOllama(model=tag, format="json", temperature=0, num_predict=1024)
        for tag in open_models
    }
)

## Extract Scope

In [4]:
# Smoke-test scope extraction: send the first question of sheet 0 to every
# model and print the expected scope next to the model's parsed JSON answer.
# The workbook is read once, outside the model loop, because its content does
# not depend on the model being evaluated.
test_data = pd.read_excel("./Queries-and-answers.xlsx", sheet_name=0, index_col=None)

for name, model in models.items():
    for index, series in test_data.iterrows():
        entry = dict(series)
        query = entry["question"]
        answer = entry["scope"]
        template = scope_extractor_template.format(query=query)
        prediction = model.invoke(template).content
        print(
            json.dumps(
                {
                    "name": name,
                    "query": query,
                    "answer": answer,
                    # Key spelling is kept unchanged so newly printed records
                    # have the same shape as the ones already recorded below.
                    "predition": json.loads(prediction),
                },
                indent=4,
            )
        )
        # Only the first row is evaluated; remove this `break` to run the
        # whole sheet.
        break


{
    "name": "gpt4",
    "query": "What are the genomic variations found at position 123456 on chromosome 7?",
    "answer": "genomic_variants",
    "predition": {
        "scope": "g_variants"
    }
}
{
    "name": "mistral-nemo:12b-instruct-2407-q6_K",
    "query": "What are the genomic variations found at position 123456 on chromosome 7?",
    "answer": "genomic_variants",
    "predition": {
        "scope": "g_variants"
    }
}
{
    "name": "llama3.1:8b-instruct-q6_K",
    "query": "What are the genomic variations found at position 123456 on chromosome 7?",
    "answer": "genomic_variants",
    "predition": {
        "scope": "g_variants"
    }
}
{
    "name": "gemma2:9b-instruct-q6_K",
    "query": "What are the genomic variations found at position 123456 on chromosome 7?",
    "answer": "genomic_variants",
    "predition": {
        "scope": "g_variants"
    }
}
{
    "name": "qwen2:7b-instruct",
    "query": "What are the genomic variations found at position 123456 on chromoso

## Extract Filters

In [5]:
# Smoke-test scope and filter extraction: send the first question of sheet 2
# to every model and print the expected values next to both parsed answers.
# The workbook is read once, outside the model loop, because its content does
# not depend on the model being evaluated.
test_data = pd.read_excel(
    "./Queries-and-answers.xlsx", sheet_name=2, index_col=None
)

for name, model in models.items():
    for index, series in test_data.iterrows():
        entry = dict(series)
        query = entry["question"]
        answer_scope = entry["scope"]
        answer_filters = entry["filters_and_scope"]
        # Two independent calls per model: one per extraction template.
        scope_prediction = model.invoke(
            scope_extractor_template.format(query=query)
        ).content
        filters_prediction = model.invoke(
            filter_extractor_template.format(query=query)
        ).content
        print(
            json.dumps(
                {
                    "name": name,
                    "query": query,
                    "answer_scope": answer_scope,
                    "answer_filters": answer_filters,
                    "scope_prediction": json.loads(scope_prediction),
                    "filters_prediction": json.loads(filters_prediction),
                },
                indent=4,
            )
        )
        # Only the first row is evaluated; remove this `break` to run the
        # whole sheet.
        break

{
    "name": "gpt4",
    "query": "How many patients with breast cancer are in the repository?",
    "answer_scope": "Individuals",
    "answer_filters": "breast cancer (Individuals)",
    "scope_prediction": {
        "scope": "cohorts"
    },
    "filters_prediction": {
        "filters": [
            {
                "term": "breast cancer",
                "scope": "individuals"
            }
        ]
    }
}
{
    "name": "mistral-nemo:12b-instruct-2407-q6_K",
    "query": "How many patients with breast cancer are in the repository?",
    "answer_scope": "Individuals",
    "answer_filters": "breast cancer (Individuals)",
    "scope_prediction": {
        "scope": "individuals"
    },
    "filters_prediction": {
        "filters": [
            {
                "term": "disease=breast cancer",
                "scope": "individuals"
            }
        ]
    }
}
{
    "name": "llama3.1:8b-instruct-q6_K",
    "query": "How many patients with breast cancer are in the repository

## Extract Granularity

In [6]:
# Smoke-test granularity extraction: send the first question of sheet 3 to
# every model and print the expected granularity next to the parsed answer.
# The workbook is read once, outside the model loop, because its content does
# not depend on the model being evaluated.
test_data = pd.read_excel(
    "./Queries-and-answers.xlsx", sheet_name=3, index_col=None
)

for name, model in models.items():
    for index, series in test_data.iterrows():
        entry = dict(series)
        query = entry["question"]
        answer = entry["granularity"]
        prediction = model.invoke(
            granularity_extractor_template.format(query=query)
        ).content
        print(
            json.dumps(
                {
                    "name": name,
                    "query": query,
                    "answer": answer,
                    "prediction": json.loads(prediction),
                },
                indent=4,
            )
        )
        # Only the first row is evaluated; remove this `break` to run the
        # whole sheet.
        break

{
    "name": "gpt4",
    "query": "How many patients with breast cancer are in the repository?",
    "answer": "count",
    "prediction": {
        "granularity": "count"
    }
}
{
    "name": "mistral-nemo:12b-instruct-2407-q6_K",
    "query": "How many patients with breast cancer are in the repository?",
    "answer": "count",
    "prediction": {
        "granularity": "count"
    }
}
{
    "name": "llama3.1:8b-instruct-q6_K",
    "query": "How many patients with breast cancer are in the repository?",
    "answer": "count",
    "prediction": {
        "granularity": "count"
    }
}
{
    "name": "gemma2:9b-instruct-q6_K",
    "query": "How many patients with breast cancer are in the repository?",
    "answer": "count",
    "prediction": {
        "granularity": "count"
    }
}
{
    "name": "qwen2:7b-instruct",
    "query": "How many patients with breast cancer are in the repository?",
    "answer": "count",
    "prediction": {
        "granularity": "count"
    }
}


## Extract Variants and Filters

In [7]:
# Smoke-test variant and filter extraction: send the first question of sheet 1
# to every model and print the expected values next to both parsed answers.
# The workbook is read once, outside the model loop, because its content does
# not depend on the model being evaluated.
test_data = pd.read_excel(
    "./Queries-and-answers.xlsx", sheet_name=1, index_col=None
)

for name, model in models.items():
    for index, series in test_data.iterrows():
        entry = dict(series)
        query = entry["question"]
        # Expected region rendered as "<chromosome>:<start>-<end>".
        answer_var = f'{entry["chromosome"]}:{entry["start"]}-{entry["end"]}'
        # Take the expected scope from this sheet's own row. This name used to
        # be printed without ever being assigned in this cell, so it reused
        # whatever value an earlier cell had left in the kernel (or raised
        # NameError on a fresh kernel).
        # NOTE(review): yields null when sheet 1 has no "scope" column —
        # confirm the sheet schema.
        answer_scope = entry.get("scope")
        answer_filters = entry["filters"]
        # Two independent calls per model: one per extraction template.
        filters_prediction = model.invoke(
            filter_extractor_template.format(query=query)
        ).content
        variant_prediction = model.invoke(
            variant_extractor_template.format(query=query)
        ).content
        print(
            json.dumps(
                {
                    "name": name,
                    "query": query,
                    "answer_var": answer_var,
                    "answer_scope": answer_scope,
                    "answer_filters": answer_filters,
                    "filters_prediction": json.loads(filters_prediction),
                    "variant_prediction": json.loads(variant_prediction),
                },
                indent=4,
            )
        )
        # Only the first row is evaluated; remove this `break` to run the
        # whole sheet.
        break

{
    "name": "gpt4",
    "query": "I want genomic variations from 550k to 650k from second chromosome, from patients having renal failure",
    "answer_var": "2:550000.0-650000.0",
    "answer_scope": "Individuals",
    "answer_filters": "renal failure",
    "filters_prediction": {
        "filters": [
            {
                "term": "renal failure",
                "scope": "individuals"
            }
        ]
    },
    "variant_prediction": {
        "success": true,
        "assembly_id": "unknown",
        "chromosome": "2",
        "start": [
            550000
        ],
        "end": [
            650000
        ]
    }
}
{
    "name": "mistral-nemo:12b-instruct-2407-q6_K",
    "query": "I want genomic variations from 550k to 650k from second chromosome, from patients having renal failure",
    "answer_var": "2:550000.0-650000.0",
    "answer_scope": "Individuals",
    "answer_filters": "renal failure",
    "filters_prediction": {
        "filters": [
            {
   