# Try sending answers directly to the OpenAI API on Azure

Using GPT-4o, which has a context limit of 128k tokens.

Working in my conda environment, created as follows:
```
conda create --name openai-wsl python=3.10 numpy pandas jupyter matplotlib python-dotenv
conda activate openai-wsl
pip install --upgrade openai 
```

In [None]:
import os
import pandas as pd

from openai import AzureOpenAI
from dotenv import load_dotenv

In [None]:
# Read the API key and endpoint from environment variables (load_dotenv() is
# called first so values kept in a local .env file are picked up), then set
# the remaining connection settings.
load_dotenv()
my_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
my_azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")

# API version and the name of the model deployment requested in each call.
deployment_name = "test_GPT_4o"
my_api_version = "2024-02-15-preview"

In [None]:
# File roots (CSV file names without the ".csv" extension), one per module
# M0-M5.  Note that only the M0 file name uses the plural "Questions".
_froot_prefix = "INCLU1x_IF_Responses_-_ALL_RUNS_041924_"
_froot_suffixes = (
    "M0_IF_Reflection_Questions_cleaned",
    "M1_IF_Reflection_Question_cleaned",
    "M2_IF_Reflection_Question_cleaned",
    "M3_IF_Reflection_Question_cleaned",
    "M4_IF_Reflection_Question_cleaned",
    "M5_IF_Reflection_Question_cleaned",
)
froots = [_froot_prefix + suffix for suffix in _froot_suffixes]

In [None]:
# Estimate the number of tokens in each file from its word count.
# from OpenAI: https://platform.openai.com/tokenizer "A helpful rule of thumb is that one token generally corresponds to ~4 characters of text for common English text. This translates to roughly ¾ of a word (so 100 tokens ~= 75 words)."
# The largest GPT model has a 128K-token context limit (so it won't be able to ingest everything).
for f in froots:
    df = pd.read_csv("../../data/" + f + ".csv")
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces do not produce empty "words" and newlines/tabs inside a response
    # still separate words.  This matches the word counting used when
    # budgeting tokens for the API call (get_LLM_input_max_index).
    all_answers = df['student_responses'].str.cat(sep=' ').split()
    # words * 4/3 ~= tokens (100 tokens ~= 75 words)
    print(f, len(all_answers)*4/3.)

In [None]:
# Prompt text sent before and after the list of answers.  The literals are
# wrapped for readability only; the resulting strings (including the double
# spaces between sentences in instructions_after) are unchanged.
instructions_before = (
    "Below is a list of items each starting with [item]. "
    "I will want you to summarize this list."
)
instructions_after = (
    "That was the last item in the list.  "
    "Now summarize these items with a new list of 10 or less unique themes.  "
    "Do not repeat any item from above verbatim in your themes.  "
    "Each theme should be only one short sentence.  "
    "Only return the short one-sentence themes."
)

# Create the Azure OpenAI client from the endpoint, key and API version
# configured above; it is reused for every request made in this notebook.
client = AzureOpenAI(
    api_key=my_api_key,
    api_version=my_api_version,
    azure_endpoint=my_azure_endpoint,
)

In [None]:
def get_LLM_input_max_index(input_list, max_tokens = 1048, start_index = 0, system_text = '', extra_tokens_per_item = 0):
    """Return the exclusive end index of the items that fit in a token budget.

    Tokens are estimated from whitespace-separated word counts using the
    rule of thumb of 4/3 tokens per word.  The budget is first charged for
    ``system_text``; items are then added one at a time, beginning with the
    item at ``start_index``, until adding another item would push the
    estimate above ``max_tokens``.

    Parameters
    ----------
    input_list : list of str
        Candidate items, in the order they would be sent.
    max_tokens : int or float
        Estimated-token budget for ``system_text`` plus the selected items.
    start_index : int
        Index of the first item to include.  Items before it are ignored.
    system_text : str
        Fixed text sent with every request; its estimated size is counted
        against the budget before any item.
    extra_tokens_per_item : int or float
        Fixed per-item overhead (e.g. for a prefix added to each item).

    Returns
    -------
    int
        ``end`` such that ``input_list[start_index:end]`` is the longest run
        of items starting at ``start_index`` whose estimated size fits in the
        budget.  Equals ``start_index`` when not even the first item fits,
        and ``len(input_list)`` when everything from ``start_index`` on fits
        (including when the list is empty or ``start_index`` is past the end,
        in which case the slice is empty).
    """
    n_tokens = len(system_text.split())*4/3.
    for index, item in enumerate(input_list):
        if index < start_index:
            continue
        # The item at start_index is part of the slice the caller sends, so
        # it has to be charged against the budget as well.
        n_tokens += len(item.split())*4/3. + extra_tokens_per_item
        if n_tokens > max_tokens:
            # This item is the first one that does not fit; as an exclusive
            # slice end it keeps every item before it.
            return index

    # Everything fit: the exclusive end is the list length, so the last item
    # is included (this also covers an empty list).
    return len(input_list)

def run_openai_api(froot, max_tokens, start_index = 0):
    """Send one batch of student responses to the model and return its summary.

    Reads the ``student_responses`` column of ``../../data/<froot>.csv``,
    asks ``get_LLM_input_max_index`` for the end index of the batch that
    starts at ``start_index``, and sends ``responses[start_index:end_index]``
    (each prefixed with ``[item] ``) between the two instruction messages.

    Parameters
    ----------
    froot : str
        File name of the CSV without the ``.csv`` extension.
    max_tokens : int
        Estimated-token budget passed to ``get_LLM_input_max_index``.
    start_index : int
        Index of the first response to include in this batch.

    Returns
    -------
    tuple
        ``(n_responses, end_index, input_list, out_message)``: the total
        number of responses in the file, the end index used for the slice,
        the text block sent as the user message, and the content of the
        first choice in the model's reply.
    """
    answers = pd.read_csv("../../data/" + froot + ".csv")['student_responses'].to_list()

    # Work out where this batch has to stop.  The budget covers both
    # instruction strings; the one extra token per item presumably accounts
    # for the '[item]' prefix added below.
    end_index = get_LLM_input_max_index(
        answers,
        start_index = start_index,
        max_tokens = max_tokens,
        system_text = instructions_before + " " + instructions_after,
        extra_tokens_per_item = 1,
    )

    # One line per response, each marked with the '[item]' prefix the
    # instructions refer to.
    input_list = ''.join('[item] ' + answer + '\n' for answer in answers[start_index:end_index])

    # Instructions go before and after the list of responses.
    completion = client.chat.completions.create(
        model = deployment_name,
        messages = [
            {"role":"system", "content":instructions_before},
            {"role":"user", "content": input_list},
            {"role":"system", "content":instructions_after},
        ],
    )

    return len(answers), end_index, input_list, completion.choices[0].message.content

def write_summary_file(froot, start_index, end_index, n_answers, output):
    """Write the model's summary for one batch to a text file.

    The file is ``tables/openAI/<froot>_GPT4o_startindex<start_index>.txt``
    (relative to the current working directory).  The directory is created
    if it does not exist, so a finished API call is not lost to a missing
    output folder, and the file is written as UTF-8 so non-ASCII characters
    in the model's reply cannot fail on a platform-dependent default
    encoding.

    Parameters
    ----------
    froot : str
        File root of the input CSV; also used in the output file name.
    start_index : int
        Index of the first answer included in the batch.
    end_index : int
        Exclusive end index of the batch: the last answer included is
        reported as ``end_index - 1`` and the count as
        ``end_index - start_index``.
    n_answers : int
        Total number of answers in the input file.
    output : str
        The model's reply, written verbatim at the end of the file.
    """
    out_dir = os.path.join('tables', 'openAI')
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, froot + "_GPT4o_startindex" + str(start_index) + ".txt")

    header = ''.join([
        'file root : ' + froot + '\n\n',
        'total number of answers in file : ' + str(n_answers) + '\n',
        'start index of answers included : ' + str(start_index) + '\n',
        'end index of answers included : ' + str(end_index - 1) + '\n',
        'number of answers included : ' + str(end_index - start_index) + '\n\n',
        'OpenAI GPT-4o response :\n',
    ])
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(header)
        f.write(output)


In [None]:
# Estimated-token budget per request (GPT-4o context limit).
max_tokens = 128000

# Run history: each run's end indices become the next run's start indices.
# first run (start every file from the beginning)
# start_indices = [0] * len(froots)

# after first run
# start_indices = [1195, 1751, 831, 1391, 1174, 1442]

# after second run
start_indices = [2489, -1, 1726, -1, -1, -1]

# close enough to all data after second run for all

# need to rerun, M1, M3, M4, M5
for i,froot in enumerate(froots):
    # if (i in [1,3,4,5]):
    # A negative start index (-1) marks a file to skip.  0 is a valid start
    # index (the beginning of the file), so it must not be skipped.
    if (start_indices[i] >= 0):
        print(froot)

        # The text sent to the model is returned too, but is not needed here;
        # it is not bound to `input` so the builtin is left intact.
        n_answers, end_index, llm_input, output = run_openai_api(froot, max_tokens, start_index = start_indices[i])
        write_summary_file(froot, start_indices[i], end_index, n_answers, output)
        print(output)

        print('')
