In [None]:
# Arguments
# f     : path of the CSV table handed to load_data()
# host  : address of the API server the client connects to
# port  : port of that server (kept as a string)
# model : model identifier passed with every chat completion request
f = "./depmap/Model.csv.gz"
host = "127.0.0.1"
port = "9999"
model = "meta-llama/Meta-Llama-3-70B-Instruct"

# Alternative server/model pair (disabled):
# port = "8999"
# model = "mistral-community/Mixtral-8x22B-v0.1"


In [None]:
from openai import OpenAI

# Configure the OpenAI client so that requests go to vLLM's API server
# instead of the default OpenAI endpoint.
openai_api_key = "cmsc-35360"
# openai_api_key = "EMPTY"

# Base URL is assembled from the host and port set in the arguments cell.
openai_api_base = f"http://{host}:{port}/v1"

client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

In [None]:
import pandas as pd

def load_data(f):
    '''
    Read the CSV file at f into a pandas DataFrame.

    Prints a one-line confirmation containing the (rows, columns)
    shape of the table and returns the loaded DataFrame.
    '''
    table = pd.read_csv(f)
    print(f'done importing dataframe (rows, columns) = {table.shape}.')
    return table

In [None]:
import random

def select_random_element_not_equal_to(array, correct):
    '''
    Pick one entry of array at random, skipping every entry
    that equals correct.

    Raises ValueError when array is empty, or when no entry
    differs from correct.
    '''
    if not len(array):
        raise ValueError("The array is empty")

    # Collect the candidates that differ from the correct answer.
    candidates = []
    for item in array:
        if item != correct:
            candidates.append(item)

    if len(candidates) == 0:
        raise ValueError("No valid elements to choose from")

    # One random draw from the remaining candidates.
    return random.choice(candidates)

In [None]:
def construct_question(df, index, array):
    '''
    Construct a multiple choice question from df using row at index.

    Parameters:
        df: DataFrame providing the 'CellLineName' and
            'OncotreePrimaryDisease' columns.
        index: Zero-based row position in df.
        array: Collection of candidate answers from which the three wrong
            answers are drawn. Entries must be hashable; duplicates and
            entries equal to the correct answer are ignored.

    Returns:
        A tuple (question, correct_choice, correct_answer, cell_line_name).
        question is the prompt text listing options a) to d), and
        correct_choice is the letter under which correct_answer is listed.

    Raises:
        IndexError: If index is outside the rows of df.
        ValueError: If array holds fewer than three distinct entries that
            differ from the correct answer.
    '''
    if index < 0 or index >= len(df):
        raise IndexError("Index out of bounds")

    row = df.iloc[index]
    cell_line_name = row['CellLineName']
    correct_answer = row['OncotreePrimaryDisease']

    # Distinct candidates other than the correct answer, in first-seen order.
    distractor_pool = list(dict.fromkeys(
        element for element in array if element != correct_answer))
    if len(distractor_pool) < 3:
        raise ValueError(
            "Need at least 3 distinct wrong answers, "
            f"found {len(distractor_pool)}")

    # Sampling without replacement yields three different wrong answers in a
    # single draw. Redrawing until the picks differ would never terminate
    # when fewer than three distinct wrong answers exist.
    wrong_answers = random.sample(distractor_pool, 3)

    # Shuffle (is_correct, answer) pairs so the correct letter is known from
    # its position rather than from comparing values; this keeps
    # correct_choice defined even for a value that is not equal to itself.
    labelled = [(True, correct_answer)]
    labelled.extend((False, wrong) for wrong in wrong_answers)
    random.shuffle(labelled)

    d = {}
    correct_choice = None
    for choice, (is_correct, answer) in zip("abcd", labelled):
        d[choice] = answer
        if is_correct:
            correct_choice = choice

    question = f'''The cell line named {cell_line_name} is a biological model for which primary disease?

a) {d['a']}
b) {d['b']}
c) {d['c']}
d) {d['d']}'''

    return question, correct_choice, correct_answer, cell_line_name

In [None]:
import json
from tqdm import tqdm
from datetime import datetime

# Load the table and collect the distinct disease labels that serve as the
# answer pool for every question.
df = load_data(f)
array = df['OncotreePrimaryDisease'].unique()


# System prompt sent with every question.
# NOTE(review): "antibiodies" looks like a typo for "antibodies"; the text is
# left untouched here because editing it changes what the model is asked.
sys_reg_prompt = '''You are a cancer research scientist studying the potential effects of various small molecules, peptides, and 
antibiodies on tumor cell growth. You will be presented with a series of multiple choice questions. Please select the correct
choice. Return the answer in json format {"CHOICE": choice, "ANSWER": answer} where choice is only the alphabetic character 
associated with the full answer.'''


# A single evaluation pass; each pass asks one question per row of df and
# writes its own timestamped results file.
for run_index in range(0, 1):
    num_correct = 0
    total = 0
    responses = []

    for i in tqdm(range(df.shape[0])):
        # get question, correct choice and answer
        question, correct_choice, correct_answer, cell_line_name = construct_question(df, i, array)

        # construct message
        messages = [
            {"role": "system",
             "content": sys_reg_prompt
            },
            {"role": "user",
             "content": question
            },
        ]
        print(f'{sys_reg_prompt}')
        print(f'{question}')

        chat_response = client.chat.completions.create(
            model=model,
            # logprobs=1,
            # top_logprobs=1,
            messages=messages,
            #temperature=0.0,
            #max_tokens=2000,
            stream=False,
        )
        print(f'{chat_response}')

        content = chat_response.choices[0].message.content
        try:
            response = json.loads(content)
            # Valid JSON that is not an object carrying a CHOICE field cannot
            # be scored either, so treat it like a decoding failure.
            if not isinstance(response, dict) or 'CHOICE' not in response:
                raise ValueError("reply is not a JSON object with a CHOICE field")
            parsed = True
        except (TypeError, ValueError) as e:
            # json.JSONDecodeError is a ValueError subclass; TypeError covers
            # a reply whose content is not text (e.g. None).
            print(f"Error decoding JSON: {e}")
            print(f"{question}")
            print(f"{content}")

            # Start from a fresh record. Reusing the name bound by an earlier
            # iteration would overwrite a record that is already stored in
            # responses (or fail outright on the very first question).
            response = {"CHOICE": "e", "ANSWER": content}
            parsed = False

        response['CORRECT CHOICE'] = correct_choice
        response['CORRECT ANSWER'] = correct_answer
        response['CELL_LINE_NAME'] = cell_line_name

        if parsed and response['CHOICE'] == correct_choice:
            response['SCORE'] = 1
            num_correct = num_correct + 1
        else:
            response['SCORE'] = 0

        # NOTE(review): total counts only replies that could be parsed;
        # unparsable replies are stored with SCORE 0 but are not part of the
        # printed denominator. Confirm this is the intended accuracy metric.
        if parsed:
            total = total + 1

        responses.append(response)

    print(f'{num_correct} correct responses out of {total}')
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    # One row per question; written as a tab-separated file without the index.
    response_df = pd.DataFrame(responses)
    response_df.to_csv(f'model_name_eval_{timestamp}.tsv', index=False, sep="\t")

    #with open('model_name_eval_summary.txt', 'a') as f:
    #    print(f'{timestamp}\t{num_correct} correct responses out of {total}', file=f)

In [None]:
# Display the per-question results table produced by the evaluation loop.
response_df