# Test Azure GPT-4 based model with RAG store of Top 500 Bioconductor packages

### Get relevant keys from Azure

In [2]:
import os, openai, requests
import pandas as pd
    
## Read the Azure credentials and endpoint from the environment
# (each value is None when its variable is not set)
api_key = os.environ.get("OPENAI_API_KEY")
search_key = os.environ.get("OPENAI_SEARCH_KEY")
endpoint = os.environ.get("OPENAI_ENDPOINT")

Declare the deployment name, endpoint and the name of the search index.

In [3]:

# Name of the Azure OpenAI deployment to query
deployment_id = "gpt-4-test"

# Azure AI Search setup
search_endpoint = "https://hmsaisearch.search.windows.net"  # Add your Azure AI Search endpoint here
search_index_name = "bioc-top-500"  # Add your Azure AI Search index name here

# Azure OpenAI setup (module-level SDK settings)
openai.api_type = "azure"
openai.api_base = endpoint
openai.api_key = api_key
# Azure OpenAI on your own data is only supported by the 2023-08-01-preview API version
openai.api_version = "2023-08-01-preview"

# Example of the full extensions URL for this deployment
# NOTE(review): this example carries api-version=2024-02-15-preview, not the version set above — confirm which is intended.
# https://hms-it-openai.openai.azure.com/openai/deployments/gpt-4-test/extensions/chat/completions?api-version=2024-02-15-preview

Create a `client` object with `openai.AzureOpenAI`, pointed at the deployment's extensions endpoint and using the correct API key.

In [4]:
# Example of the extensions chat-completions URL for this deployment:
# https://hms-it-openai.openai.azure.com/openai/deployments/gpt-4-test/extensions/chat/completions?api-version=2024-02-15-preview

# The base URL targets the deployment's "extensions" route, which is the one
# that accepts the `dataSources` payload sent with each RAG request.
client = openai.AzureOpenAI(
    api_version="2023-08-01-preview",
    api_key=api_key,
    base_url=f"{endpoint}/openai/deployments/{deployment_id}/extensions",
)

Note: This step sets up the RAG store, i.e., the "bring your own data" search index, which has to be registered with the deployment at hand (`gpt-4-test`).

In [5]:
def setup_byod(deployment_id: str) -> None:
    """Route SDK traffic for one deployment through its "extensions" chat endpoint.

    Creates a `requests.Session`, mounts a custom transport adapter on the
    deployment's URL prefix, and stores the session on `openai.requestssession`.

    :param deployment_id: The deployment ID for the model to use with your own data.

    To undo this configuration, set `openai.requestssession` to None.

    NOTE(review): `openai.requestssession` looks like the session hook of the
    pre-1.0 SDK; a client built with `openai.AzureOpenAI` presumably does not
    consult it — confirm whether this step is still needed.
    """
    # The mount prefix is computed once, when setup runs.
    deployment_prefix = f"{openai.api_base}/openai/deployments/{deployment_id}"

    class _ExtensionsEndpointAdapter(requests.adapters.HTTPAdapter):
        """Transport adapter that rewrites every request URL to the extensions endpoint."""

        def send(self, request, **kwargs):
            # api_base / api_version are read at send time, so later changes to
            # the module-level settings are picked up.
            request.url = (
                f"{openai.api_base}/openai/deployments/{deployment_id}"
                f"/extensions/chat/completions?api-version={openai.api_version}"
            )
            return super().send(request, **kwargs)

    # Any call whose URL starts with the deployment prefix goes through the adapter.
    byod_session = requests.Session()
    byod_session.mount(prefix=deployment_prefix, adapter=_ExtensionsEndpointAdapter())

    openai.requestssession = byod_session


setup_byod(deployment_id)

### Generic function to ask a question from our BYOD model with GPT-4

In [6]:
def ask_rag(question: str, verbose: bool = False):
    """Ask the RAG-backed GPT-4 deployment a single question.

    Sends `question` as one user message through the module-level `client`,
    attaching the Azure AI Search index (`search_endpoint`, `search_key`,
    `search_index_name`) as a data source together with the role information
    that frames the model as a Bioconductor expert.

    :param question: Text of the user's question.
    :param verbose: When truthy, also print the reply prefixed with its role.
    :return: The content of the first choice's message.
    """
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": question}],
        model=deployment_id,
        # `dataSources` is not a standard chat-completions argument, so it is
        # passed through `extra_body`.
        extra_body={
            "dataSources": [
                {
                    "type": "AzureCognitiveSearch",
                    "parameters": {
                        "endpoint": search_endpoint,
                        "key": search_key,
                        "indexName": search_index_name,
                        "roleInformation": "Act as an expert in the R programming language and the Bioconductor suite of packages.  ​\n\nYour job is to advise users on the usage of the various Bioconductor packages considering the documents you have in the data store.  ​\n\nTo complete this task, you can use the data you have stored that contain the vignettes of all the packages in Bioconductor and all the reference files of every function in every package of Bioconductor. ​You may also answer some general R, general programming, or Biomedical information.\n\nIf you do not know the answer ask the user to refer to https://bioconductor.org. \n\nAdd a disclaimer at the end of each response saying this model works only on the top 500 most used Bioconductor packages."
                    }
                }
            ]
        }
    )

    # Only the first choice is used.
    reply = completion.choices[0].message
    if verbose:
        print(f"{reply.role}: {reply.content}")

    return reply.content

In [7]:
# `context` is in the model_extra for Azure
# print(f"\nContext: {completion.choices[0].message.model_extra['context']['messages'][0]['content']}")

### Questions for SummarizedExperiment

In [8]:
## SummarizedExperiment: number of classes
question = (
    "How many classes are there in the Summarized Experiment package? "
    "Just give me a number."
)

# verbose=True prints the reply; the returned string is also echoed as the cell output.
ask_rag(question, verbose=True)

assistant: The SummarizedExperiment package contains two classes: SummarizedExperiment and RangedSummarizedExperiment [doc1].


'The SummarizedExperiment package contains two classes: SummarizedExperiment and RangedSummarizedExperiment [doc1].'

### Questions for DESeq2

In [9]:
## DESeq2: number of size factors

question = (
    "DESeq2 performs normalization by estimating size factors for each sample. "
    "If your experiment has 5 samples, how many size factors will DESeq2 estimate?"
)

# verbose=True prints the reply; the returned string is also echoed as the cell output.
ask_rag(question, verbose=True)

assistant: DESeq2 estimates size factors for each sample in your experiment. Therefore, if your experiment has 5 samples, DESeq2 will estimate 5 size factors, one for each sample [doc1]. 

Please note that this model works only on the top 500 most used Bioconductor packages. For more specific or advanced usage, please refer to the official Bioconductor documentation at https://bioconductor.org.


'DESeq2 estimates size factors for each sample in your experiment. Therefore, if your experiment has 5 samples, DESeq2 will estimate 5 size factors, one for each sample [doc1]. \n\nPlease note that this model works only on the top 500 most used Bioconductor packages. For more specific or advanced usage, please refer to the official Bioconductor documentation at https://bioconductor.org.'

### Questions for limma

In [10]:
## limma: number of coefficients in a two-group design
question = (
    "You use limma to analyze RNA-seq data from a case-control study with "
    "30 control samples and 30 case samples. After fitting the linear model, "
    "how many coefficients will be estimated by limma for the gene expression "
    "data (assuming no additional covariates are included in the model)?"
)

# verbose=True prints the reply; the returned string is also echoed as the cell output.
ask_rag(question, verbose=True)

assistant: In the scenario you described, where you have a case-control study with 30 control samples and 30 case samples, and you're using the limma package to analyze RNA-seq data, the number of coefficients estimated by limma for the gene expression data would be 2. 

This is because in a basic case-control study design, the linear model in limma would typically include an intercept term and a term for the case-control status. The intercept term represents the baseline level of gene expression (usually corresponding to the control group), and the case-control term represents the difference in gene expression between the case group and the control group. 

So, in this scenario, limma would estimate one coefficient for the intercept and one coefficient for the case-control status, resulting in a total of 2 coefficients [doc2][doc4].

Please note that this model works only on the top 500 most used Bioconductor packages. For more complex study designs or additional information, please r

"In the scenario you described, where you have a case-control study with 30 control samples and 30 case samples, and you're using the limma package to analyze RNA-seq data, the number of coefficients estimated by limma for the gene expression data would be 2. \n\nThis is because in a basic case-control study design, the linear model in limma would typically include an intercept term and a term for the case-control status. The intercept term represents the baseline level of gene expression (usually corresponding to the control group), and the case-control term represents the difference in gene expression between the case group and the control group. \n\nSo, in this scenario, limma would estimate one coefficient for the intercept and one coefficient for the case-control status, resulting in a total of 2 coefficients [doc2][doc4].\n\nPlease note that this model works only on the top 500 most used Bioconductor packages. For more complex study designs or additional information, please refer

### Questions for SingleCellExperiment

This question doesn't give a good answer.

In [11]:
## SingleCellExperiment: number of slots
question = (
    "how many slots does the SingleCellExperiment object have?"
)

# verbose=True prints the reply; the returned string is also echoed as the cell output.
ask_rag(question, verbose=True)

assistant: The SingleCellExperiment object is a complex object with multiple slots. However, the exact number of slots is not specified in the retrieved documents. The SingleCellExperiment object is designed to store single-cell experiment data, including assay data, feature data, and cell metadata. It is a part of the Bioconductor project and is used extensively in R packages for single-cell data analysis.

For detailed information about the structure and slots of the SingleCellExperiment object, please refer to the official Bioconductor documentation at https://bioconductor.org.

Please note that this model works only on the top 500 most used Bioconductor packages.


'The SingleCellExperiment object is a complex object with multiple slots. However, the exact number of slots is not specified in the retrieved documents. The SingleCellExperiment object is designed to store single-cell experiment data, including assay data, feature data, and cell metadata. It is a part of the Bioconductor project and is used extensively in R packages for single-cell data analysis.\n\nFor detailed information about the structure and slots of the SingleCellExperiment object, please refer to the official Bioconductor documentation at https://bioconductor.org.\n\nPlease note that this model works only on the top 500 most used Bioconductor packages.'

## Read in Bioc QA top 10 

Read in with Pandas

In [12]:
# Load the top-10 Bioconductor Q&A table (path is relative to the working directory).
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was saved with its index. Consider `index_col=0`, but a later
# `insert(4, ...)` depends on the current column count — confirm before changing.
top_10_qs = pd.read_csv("bioc_qa_top10.csv")

In [13]:
# Display the loaded question/answer table as the cell output.
top_10_qs

Unnamed: 0,AID,QID,Question,Response
0,answer1,question1,I am a bit confused about the concepts of the ...,The thing to understand is that terms like FDR...
1,answer2,question2,I am working on RNA-Seq data. I'm using DESeq2...,"Just to be clear, there's an important differe..."
2,answer3,question3,I am new in this kind of analysis and I have a...,There is no good way to do a DE analysis of RN...
3,answer4,question4,I am testing salmon and kallisto for RNA-seq. ...,To answer your questions:1) scaledTPM is TPM's...
4,answer5,question5,In all RNA-seq analysis applications they talk...,The most complete explanation of what the disp...
5,answer6,question6,I know findOverlaps() from GenomicRanges packa...,"From the discussion below, an efficient starti..."
6,answer7,question7,I have just downloaded CNV level 3 files from ...,"I wrote two helper functions, explained belowg..."
7,answer8,question8,How can I filter out the genes with low read c...,If you want to filter out genes with low expre...
8,answer9,question9,I am analysing my RNA-Seq data with DESeq2. At...,You can use the ensembldb package to do the ma...
9,answer10,question10,How do I merge a list of GRanges? What I want ...,Merge is a pretty vague term. My understanding...


Ask the RAG model (backed by the top 500 Bioconductor packages) each question from `bioc_qa_top10.csv` (loaded above as `top_10_qs`) and append the responses to `azure_rag_responses`.

In [14]:
# Query the RAG-backed model once per question; answers stay in the same
# order as the rows of `top_10_qs`.
azure_rag_responses = [ask_rag(q) for q in top_10_qs["Question"]]


## We now run the same model without RAG, i.e., just a generic GPT-4 model

In [15]:
# Plain (non-RAG) client. It is built from the resource endpoint, so requests
# go to the deployment's standard chat-completions route rather than the
# "extensions" route, and no search index is attached.
# It has its own name so the `client` used by `ask_rag` is left untouched.
gpt4_client = openai.AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version="2023-08-01-preview",
)


def ask_gpt4(question: str, verbose: bool = False):
    """Ask the generic GPT-4 deployment a single question, without RAG.

    Sends `question` as one user message through `gpt4_client`. No
    `dataSources` payload is attached, so the reply comes from the model
    alone and serves as the baseline for comparison with `ask_rag`.

    :param question: Text of the user's question.
    :param verbose: When truthy, also print the reply prefixed with its role.
    :return: The content of the first choice's message.
    """
    completion = gpt4_client.chat.completions.create(
        messages=[{"role": "user", "content": question}],
        model=deployment_id,
        temperature=0.0,
    )

    # Only the first choice is used.
    reply = completion.choices[0].message
    if verbose:
        print(f"{reply.role}: {reply.content}")

    return reply.content


ask_gpt4("Hello world", verbose=True)

assistant: Hello! How can I assist you today?


'Hello! How can I assist you today?'

In [18]:
# Ask the `ask_gpt4` model whether it was trained on Bioconductor vignettes.
ask_gpt4(
    "Are you trained on any data like specifc vignettes from Bioconductor",
    verbose=True,
)


assistant: As an AI model developed by OpenAI, I don't have access to my training data, but I was trained on a mixture of licensed data, data created by human trainers, and publicly available data. This corpus was used to pre-train me on a range of language tasks, such as translation and summarization. However, OpenAI has not publicly disclosed the specifics of the individual datasets used, including whether data from Bioconductor vignettes was used in my training. I should note that while I strive to provide accurate and timely information based on my training, I don't have the ability to access or retrieve personal data unless it has been shared with me in the course of our conversation. I am designed to respect user privacy and confidentiality.


"As an AI model developed by OpenAI, I don't have access to my training data, but I was trained on a mixture of licensed data, data created by human trainers, and publicly available data. This corpus was used to pre-train me on a range of language tasks, such as translation and summarization. However, OpenAI has not publicly disclosed the specifics of the individual datasets used, including whether data from Bioconductor vignettes was used in my training. I should note that while I strive to provide accurate and timely information based on my training, I don't have the ability to access or retrieve personal data unless it has been shared with me in the course of our conversation. I am designed to respect user privacy and confidentiality."

In [32]:
# Attach the RAG answers collected in `azure_rag_responses` as a new column at
# position 4, then save the combined table without the index.
# NOTE: `insert` mutates `top_10_qs` in place; with allow_duplicates=True,
# re-running this cell adds the column a second time.
top_10_qs.insert(
    4, "Response_Azure_Bioc_RAG", azure_rag_responses, allow_duplicates=True
)
top_10_qs.to_csv("top_10_qs_with_azure_RAG.csv", index=False)
