# Running Experiments using Azure Endpoints

In [1]:
import numpy as np
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("ChanceFocus/flare-fiqasa")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'query', 'answer', 'text', 'choices', 'gold'],
        num_rows: 750
    })
    test: Dataset({
        features: ['id', 'query', 'answer', 'text', 'choices', 'gold'],
        num_rows: 235
    })
    valid: Dataset({
        features: ['id', 'query', 'answer', 'text', 'choices', 'gold'],
        num_rows: 188
    })
})

In [4]:
print(dataset["train"]["query"][1])
print(dataset["train"]["answer"][1])

What is the sentiment of the following financial headline: Positive, Negative, or Neutral?
Text: Greene King's third quarter sales boosted by festive season
Answer:
positive


We can use AzureMLChatOnlineEndPoint to send structured prompt to Azure LLM Endpoint and get a formatted response using langchain community library.

In [5]:
import os
from langchain_community.chat_models.azureml_endpoint import AzureMLChatOnlineEndpoint, AzureMLEndpointApiType, LlamaChatContentFormatter
from langchain_core.messages import HumanMessage

In [13]:
chat = AzureMLChatOnlineEndpoint(
    endpoint_url= os.environ.get("MISTRAL_LARGE_ENDPOINT_URL"),
    endpoint_api_type=AzureMLEndpointApiType.serverless, 
    endpoint_api_key= os.environ.get("MISTRAL_LARGE_ENDPOINT_API_KEY"),
    content_formatter=LlamaChatContentFormatter(),
)

In [14]:
query = ''.join([dataset["train"]["query"][0], " Explain your reasoning."])
response = chat.invoke([HumanMessage(content=query)])
print(response)

content='The sentiment of the post is Neutral. The author acknowledges that the numbers for $LULU looked good, but not great, showing a balanced perspective. They also express optimism about the conference call, which might seem positive. However, their overall tone is more speculative and uncertain, rather than definitively positive or negative. Therefore, the sentiment can be best described as neutral.' type='assistant'


The below reasoning task took about 90 minutes to be completed.

In [15]:
!mkdir ./data/mistral-large/flare_fiqasa_reasoned_answers
!mkdir ./data/mistral-large/flare_fiqasa_oneword_answers

In [16]:
import pandas as pd

# Create a dictionary with query and response data
data = {'query': [""], 'response': [""]}

# Create the DataFrame object
df = pd.DataFrame(data)

for i in range(len(dataset["train"]["query"])):
    # Genereate response for the given query
    try:
        q = dataset["train"]["query"][i]
        chat_query = ''.join([q, " Explain your reasoning."])
        response = chat.invoke([HumanMessage(content=chat_query)])
        # Add the response to a dataframe
        df.loc[i] = [q, response.content]
        if i> 0 and i % 20 == 0:
            print(f"Processed {i} queries")
            # Saving these responses to a CSV file
            df.to_csv("./data/mistral-large/flare_fiqasa_reasoned_answers/flare_fiqasa_chatbot_responses_with_reasoning_{}.csv".format(i), index=False)
            del df
            df = pd.DataFrame(data)
    except OSError:
        continue
        # If a response is not generated/could not be allocated to memory, continue to the next query

Processed 20 queries
Processed 40 queries
Processed 60 queries
Processed 80 queries
Processed 100 queries
Processed 120 queries
Processed 140 queries
Processed 160 queries
Processed 180 queries
Processed 200 queries
Processed 220 queries
Processed 240 queries
Processed 260 queries
Processed 280 queries
Processed 300 queries
Processed 320 queries
Processed 340 queries
Processed 360 queries
Processed 380 queries
Processed 400 queries
Processed 420 queries
Processed 440 queries
Processed 460 queries
Processed 480 queries
Processed 500 queries
Processed 520 queries
Processed 540 queries
Processed 560 queries
Processed 580 queries
Processed 600 queries
Processed 620 queries
Processed 640 queries
Processed 660 queries
Processed 680 queries
Processed 700 queries
Processed 720 queries
Processed 740 queries


The below classifcation task took about 10 minutes to be completed.

In [17]:
# Create a dictionary with query and response data
data = {'query': [""], 'response': [""]}

# Create the DataFrame object
df = pd.DataFrame(data)

for i in range(len(dataset["train"]["query"])):
    try:
        # Genereate response for the given query
        q = dataset["train"]["query"][i]
        chat_query = ''.join([q, " Return only one word answer from this list: ['negative', 'positive', 'neutral']."])
        response = chat.invoke([HumanMessage(content=chat_query)])
        # Add the response to a dataframe
        df.loc[i] = [q, response.content]
        if i> 0 and i % 20 == 0:
            print(f"Processed {i} queries")
            # Saving these responses to a CSV file
            df.to_csv("./data/mistral-large/flare_fiqasa_oneword_answers/flare_fiqasa_chatbot_responses_with_one_word_{}.csv".format(i), index=False)
            del df
            df = pd.DataFrame(data)
    except OSError:
        continue

Processed 20 queries
Processed 40 queries
Processed 60 queries
Processed 80 queries
Processed 100 queries
Processed 120 queries
Processed 140 queries
Processed 160 queries
Processed 180 queries
Processed 200 queries
Processed 240 queries
Processed 260 queries
Processed 280 queries
Processed 300 queries
Processed 320 queries
Processed 340 queries
Processed 360 queries
Processed 380 queries
Processed 400 queries
Processed 420 queries
Processed 440 queries
Processed 460 queries
Processed 480 queries
Processed 500 queries
Processed 520 queries
Processed 540 queries
Processed 560 queries
Processed 580 queries
Processed 600 queries
Processed 620 queries
Processed 640 queries
Processed 660 queries
Processed 680 queries
Processed 700 queries
Processed 720 queries
Processed 740 queries
