In [1]:
import pandas as pd
from openai import OpenAI
import random

In [2]:
# Arguments
# Input table read by load_data(); the path points at a gzip-compressed CSV.
infile = '../depmap/Model.csv.gz'

# Address of the inference server the OpenAI client talks to.
host = 'localhost'  # previously tried: '127.0.0.1'
port = '9999'       # previously tried: '8000'

# Model name sent with every chat completion request.
# Previously used names:
#   'meta-llama/Meta-Llama-3-70B-Instruct'
#   'meta-llama/Meta-Llama-3-8B-Instruct'
#   'meta-llama/Meta-Llama-3.1-70B-Instruct'
model = 'llama31-405b-fp8'  # first time server was started with --served-model-name

# NOTE(review): the API key is hard-coded in the notebook; consider reading it
# from the environment before sharing this file.
openai_api_key = 'cmsc-35360'
openai_api_base = f"http://{host}:{port}/v1"

In [3]:
# Prefix shared by every output file this notebook writes.
# A Hugging Face style id ("org/name") contains a "/", which would end up as a
# directory separator inside the output file names, so keep only the last
# path component. A name set with --served-model-name has no "/" and passes
# through unchanged (indexing with [1] raised IndexError for such names).
f_prefix = model.split("/")[-1] + "_no_lora"
f_prefix




'llama31-405b-fp8_no_lora'

In [4]:
# Client bound to the base URL and API key defined in the arguments cell.
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

In [5]:
def load_data(f):
    '''
    Read the CSV at f (a path or file-like object accepted by pandas.read_csv)
    and return it as a DataFrame.
    '''
    return pd.read_csv(f)

In [6]:
def select_random_element_not_equal_to(array, correct):
    '''
    Pick one element of array at random, never returning `correct`.

    Raises ValueError when array is empty, or when every element of array
    equals `correct` so that nothing is left to choose from.
    '''
    if len(array) == 0:
        raise ValueError("The array is empty")

    # Collect every element that differs from the correct answer.
    candidates = []
    for item in array:
        if item != correct:
            candidates.append(item)

    if len(candidates) == 0:
        raise ValueError("No valid elements to choose from")

    return random.choice(candidates)

In [7]:
def remove_correct_cell_line_answers(disease, df):
    '''
    Return the unique 'CellLineName' values of all rows of df whose
    'OncotreePrimaryDisease' differs from disease.
    '''
    is_other_disease = df['OncotreePrimaryDisease'] != disease
    return df.loc[is_other_disease, 'CellLineName'].unique()

In [8]:
def construct_question(df, index):
    '''
    Construct a multiple choice question from df using the row at index.

    The row at index supplies the disease ('OncotreePrimaryDisease') and the
    correct answer ('CellLineName'). Three distinct incorrect answers are drawn
    at random from the cell lines of rows with a different disease, and the
    four answers are shuffled into the choices a-d.

    Returns:
        (question, correct_choice, correct_answer, disease) where question is
        the full question text and correct_choice is the letter ('a'-'d') under
        which correct_answer is listed.

    Raises:
        IndexError: if index is not a valid row position of df.
        ValueError: if fewer than three distinct incorrect answers exist.
    '''
    if index < 0 or index >= len(df):
        raise IndexError("Index out of bounds")

    row = df.iloc[index]
    disease = row['OncotreePrimaryDisease']
    correct_answer = row['CellLineName']

    # Distinct wrong answers, never equal to the correct one. dict.fromkeys
    # removes duplicates while keeping the original order.
    candidates = list(dict.fromkeys(
        name for name in remove_correct_cell_line_answers(disease, df)
        if name != correct_answer
    ))

    # Drawing without replacement guarantees three different distractors in a
    # single step. The earlier redraw-until-distinct loop never terminated
    # when fewer than three candidates were available.
    if len(candidates) < 3:
        raise ValueError(
            f"Need at least 3 distinct wrong answers, found {len(candidates)}"
        )
    answers = [correct_answer] + random.sample(candidates, 3)
    random.shuffle(answers)

    # Position of the correct answer after shuffling maps directly to a letter.
    correct_choice = "abcd"[answers.index(correct_answer)]

    question = f'''The primary disease {disease} can be best studied using which cell line?
a) {answers[0]}
b) {answers[1]}
c) {answers[2]}
d) {answers[3]}'''

    return question, correct_choice, correct_answer, disease
    

In [9]:
def debug(index, disease, df, answer):
    '''
    Return the cell line names recorded for disease that are equal to answer.

    A non-empty result means answer names a cell line that df lists under
    disease. The index argument is accepted but not used.
    '''
    disease_cell_lines = df[df['OncotreePrimaryDisease'] == disease]['CellLineName'].values

    matches = []
    for name in disease_cell_lines:
        if name == answer:
            matches.append(name)
    return matches

In [10]:
import json
from tqdm import tqdm
from datetime import datetime

# Load the input table once; the evaluation loop asks one question per row.
df = load_data(infile)


# System prompt used by the evaluation loop. It asks for a bare JSON object
# with "CHOICE" and "ANSWER" keys so the reply can be parsed with json.loads.
sys_reg_prompt = '''You are a cancer research scientist studying the potential effects of various small molecules, peptides, and 
antibiodies on tumor cell growth. You will be presented with a series of multiple choice questions. Please select the correct
choice. Return the answer in json format {"CHOICE": "choice", "ANSWER": "answer"} where choice is only the alphabetic character 
associated with the full answer.'''
# Note: "choice" and "answer" in the JSON template were put in quotes when the
# model was upgraded to llama31-405b-fp8.

# Multi-expert discussion prompt; it is not used by the evaluation loop.
# This is not working yet because the experts talk to each other, which makes
# parsing the final answer different from the sys_reg_prompt case.
sys_tot_prompt = '''Imagine three experts are answering this question.
They will brainstorm the answer step by step, reasoning carefully and taking all facts into consideration.
All experts will write down one step of their thinking, then share it with the group.
They will each critique their response and all the responses of others.
They will check their answer based on science, laws of physics and logic.
Then all experts will go on to the next step and write down this step of their thinking.
They will keep going through steps until they reach their conclusions taking into account the thoughts of the other experts.
If at anytime they realize that there is a flaw in their logic they will backtrack to where the flaw occurred.
If any expert realizes they are wrong at any point they acknowledge this and start another train of thought.
Each expert will assign a likelihood of their current assertion being correct.
Continue until the experts agree on the single most likely choice. Return the response in json format FINAL_ANSWER={"DISCUSSION": discussion, "CHOICE": choice, "ANSWER": answer} where choice is only the alphabetic character 
associated with the full answer.'''
 
class MyException(Exception):
    '''Raised when the model's "wrong" answer is in fact a cell line listed for the asked disease.'''


# Each pass asks one question per row of df, scores the replies, writes the
# per-question records to a timestamped TSV and appends a one-line summary.
# The pass counter has its own name so the row index `i` below no longer
# shadows it.
for _run in range(0, 1): # 9):
    num_correct = 0
    # Counts only replies that could be parsed. Unparseable replies are still
    # recorded (SCORE 0) but are not included in this denominator.
    total = 0
    responses = []

    for i in tqdm( range(df.shape[0]) ):
        # get question, correct choice and answer
        question, correct_choice, correct_answer, disease = construct_question(df, i)

        # construct message
        messages = [
            {"role": "system", "content": sys_reg_prompt},
            {"role": "user", "content": question},
        ]

        chat_response = client.chat.completions.create(
            model=model,
            # logprobs=1,
            # top_logprobs=1,
            messages=messages,
            temperature=0.0,
            max_tokens=2560,
        )
        content = chat_response.choices[0].message.content

        # A reply is usable only if it is a JSON object carrying a "CHOICE" key.
        parse_error = None
        try:
            response = json.loads(content)
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers a reply without any text content (None).
            parse_error = e
        else:
            if not isinstance(response, dict) or "CHOICE" not in response:
                parse_error = 'reply is not a JSON object with a "CHOICE" key'

        if parse_error is not None:
            print(f"Error decoding JSON: {parse_error}")
            print(f"{question}")
            print(f"{content}")

            # Build a fresh record for the failure. Reusing `response` here
            # would modify the previous question's record (already stored in
            # `responses`), or fail with NameError on the very first question.
            responses.append({
                "CHOICE": "e",
                "ANSWER": content,
                'CORRECT CHOICE': correct_choice,
                'CORRECT ANSWER': correct_answer,
                'PRIMARY_DISEASE': disease,
                'SCORE': 0,
            })
            continue

        response['CORRECT CHOICE'] = correct_choice
        response['CORRECT ANSWER'] = correct_answer
        response['PRIMARY_DISEASE'] = disease

        if response['CHOICE'] == correct_choice:
            response['SCORE'] = 1
            num_correct = num_correct + 1
        else:
            response['SCORE'] = 0
            # Sanity check: a reply scored as wrong must not name a cell line
            # that df lists for this disease; if it does, the question had
            # more than one valid answer and the run is aborted.
            filtered_array = debug(i, disease, df, response.get("ANSWER"))
            if len(filtered_array) > 0:
                print(f'{question}\n')
                print(f'choice: {response["CHOICE"]} answer: {response["ANSWER"]}')
                print(f'correct choice: {response["CORRECT CHOICE"]} correct answer: {response["CORRECT ANSWER"]}')
                print(f'index: {i}')
                print(f'wrong answer found in filtered array {filtered_array}')
                raise MyException("My hovercraft is full of eels")

        total = total + 1
        responses.append(response)

    print(f'{num_correct} correct responses out of {total}')
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    # One row per question, including the unparseable replies.
    response_df = pd.DataFrame(responses)
    response_df.to_csv(f'{f_prefix}_disease_eval_{timestamp}.tsv', index=None, sep="\t")

    # Append, so the summary file accumulates one line per pass.
    with open(f_prefix + '_disease_eval_summary.txt', 'a') as f:
        print(f'{timestamp}\t{num_correct} correct responses out of {total}', file=f)

  8%|█████                                                          | 153/1921 [02:26<22:17,  1.32it/s]

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
The primary disease Non-Cancerous can be best studied using which cell line?
a) Hs 888.T
b) TGBC24TKB
c) OCI-P5x
d) LC-1/sq-SF
I can't answer that.


 16%|█████████▊                                                     | 301/1921 [04:44<36:55,  1.37s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 183 (char 182)
The primary disease Endometrial Carcinoma can be best studied using which cell line?
a) NCI-H596
b) SNU-1077
c) BC-3C
d) HUTU80
{"CHOICE": "d", "ANSWER": "HUTU80 is not the best answer, however the best cell line is HEC-1-A or Ishikawa but they are not listed, however of the ones that are HUTU80 is the best."


 45%|████████████████████████████                                   | 855/1921 [13:34<43:38,  2.46s/it]

Error decoding JSON: Extra data: line 3 column 1 (char 316)
The primary disease Non-Small Cell Lung Cancer can be best studied using which cell line?
a) UPMM3
b) KNS-62
c) Mero-84
d) SU-DHL-16
{"CHOICE": "c", "ANSWER": "Mero-84 is not the best answer, however,  Mero-84 isn't listed as a cell line for Non-Small Cell Lung Cancer but NCI-H1975 is. Of the options, NCI-H1975 wasn't listed but Mero-84 was listed as a NSCLC but not the best cell line. Of the options, none are the best cell lines for NSCLC. "}

I made an error in the response, I will provide the correct cell line for Non-Small Cell Lung Cancer. 

Indeed, the best cell lines for NSCLC are NCI-H1975, A549, and HCC827.


 59%|████████████████████████████████████▎                         | 1126/1921 [18:35<19:22,  1.46s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 199 (char 198)
The primary disease Rhabdoid Cancer can be best studied using which cell line?
a) MEL-JUSO
b) HCC1599BL
c) MHH-CALL-4
d) TTC-1240
{"CHOICE": "a", "ANSWER": "MEL-JUSO is a cell line often used to study Rhabdoid cancer, however more information would be needed to confirm this is the best cell line to use for a particular study."


 59%|████████████████████████████████████▍                         | 1128/1921 [18:40<27:44,  2.10s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 185 (char 184)
The primary disease Rhabdoid Cancer can be best studied using which cell line?
a) TUHR14TKB
b) TTC-642
c) TASK1
d) NOS-1
{"CHOICE": "b", "ANSWER": "TTC-642 is not well known, however G401 and A204 are well known cell lines for Rhabdoid Cancer. However, another cell line is indeed TTC642 so b is correct."


 63%|███████████████████████████████████████                       | 1209/1921 [20:30<19:27,  1.64s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 38 (char 37)
The primary disease Lung Neuroendocrine Tumor can be best studied using which cell line?
a) LK-2
b) SCLC-22H
c) LS
d) PANFR0233
{"CHOICE": "d", "ANSWER": "PANFR0233"


 66%|███████████████████████████████████████▉                    | 1277/1921 [23:56<6:44:24, 37.68s/it]

Error decoding JSON: Unterminated string starting at: line 1 column 27 (char 26)
The primary disease Cervical Adenocarcinoma can be best studied using which cell line?
a) HuT 78
b) SW 900
c) PGA-1
d) HCSC-1
{"CHOICE": "d", "ANSWER": "HCSC-1 is not the best answer, HeLa is the most commonly used cell line to study cervical cancer, however HeLa was not an option, another cell line used is HCSC-1's alternative HeLa contaminant  HCS-2 and ME-180 and MS751 and  one of the best cell line to study Cervical Adenocarcinoma is HeLa however an alternative is  HCS-2 and ME-180 and MS751 and  HCSC-1's alternative HeLa contaminant  HCS-2 and ME-180 and MS751 is HeLa however an alternative is  HCS-2 and ME-180 and MS751 and  HeLa however an alternative is  HCS-2 and ME-180 and MS751 is HeLa however an alternative is  HCS-2 and ME-180 and MS751 and HeLa however an alternative is  HCS-2 and ME-180 and MS751 is HeLa however an alternative is  HCS-2 and ME-180 and MS751 and HeLa however an alternative is

 69%|██████████████████████████████████████████▉                   | 1331/1921 [25:00<10:58,  1.12s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 33 (char 32)
The primary disease Diffuse Glioma can be best studied using which cell line?
a) ES4
b) no.11
c) MZ7B
d) BT-474
{"CHOICE": "c", "ANSWER": "MZ7B"


 69%|███████████████████████████████████████████                   | 1334/1921 [25:03<10:10,  1.04s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 38 (char 37)
The primary disease Diffuse Glioma can be best studied using which cell line?
a) NP 3
b) huH-1
c) NCIBL2122
d) NCI-H1930
{"CHOICE": "c", "ANSWER": "NCIBL2122"


 71%|███████████████████████████████████████████▊                  | 1359/1921 [25:31<12:41,  1.36s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 182 (char 181)
The primary disease Prostate Adenocarcinoma can be best studied using which cell line?
a) BHT-101
b) NCI-H920
c) PA-1 [PA1]
d) Shmac 4
{"CHOICE": "d", "ANSWER": "Shmac 4 is not the best answer, however of the options provided it is the best. A more suitable cell line would be LNCaP, however that was not an option."


 75%|██████████████████████████████████████████████▍               | 1440/1921 [27:11<09:52,  1.23s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 39 (char 38)
The primary disease Intraductal Papillary Neoplasm of the Bile Duct can be best studied using which cell line?
a) CHL-1
b) ICC7
c) COG-AR-359
d) KYSE-270
{"CHOICE": "c", "ANSWER": "COG-AR-359"


 76%|██████████████████████████████████████████████▉               | 1455/1921 [27:27<07:46,  1.00s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 34 (char 33)
The primary disease Lung Neuroendocrine Tumor can be best studied using which cell line?
a) MS1
b) ICC10
c) SNU-254
d) BIN-67
{"CHOICE": "b", "ANSWER": "ICC10"


 76%|███████████████████████████████████████████████▎              | 1465/1921 [27:40<09:06,  1.20s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 38 (char 37)
The primary disease Melanoma can be best studied using which cell line?
a) HKA-1
b) PANFR0069
c) NZM42
d) YD-15
{"CHOICE": "b", "ANSWER": "PANFR0069"


 77%|███████████████████████████████████████████████▉              | 1484/1921 [28:08<20:51,  2.86s/it]

Error decoding JSON: Extra data: line 3 column 1 (char 423)
The primary disease Glassy Cell Carcinoma of the Cervix can be best studied using which cell line?
a) HOKUG
b) LU99
c) LN-18
d) Hs 860.T
{"CHOICE": "a", "ANSWER": "HOKUG is not available however,  HOKUG's alternative would be Hs 636.T, unfortunately that is not an option, another alternative would be Hs 860.T's alternative which is HOKUG's alternative, HeLa is not the best answer however, Hs 636.T is not an option, so the best answer is Hs 860.T's alternative which is a) not available so the best answer is a) HOKUG's alternative which is d) Hs 860.T."}

However since HOKUG is not available 
{"CHOICE": "d", "ANSWER": "Hs 860.T"}


 88%|██████████████████████████████████████████████████████▊       | 1698/1921 [32:43<04:15,  1.15s/it]

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
The primary disease Non-Cancerous can be best studied using which cell line?
a) MC-CAR
b) PCI-38
c) HNT-34
d) NCI-H2228
I can't answer that.


 93%|█████████████████████████████████████████████████████████▋    | 1789/1921 [34:47<04:18,  1.95s/it]

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
The primary disease Non-Cancerous can be best studied using which cell line?
a) JVE-127
b) NCIBL2052
c) HCC2935
d) NCI-H187
I'm unable to determine the correct cell line to study Non-Cancerous disease as the options provided do not seem to match well-known cell lines for studying non-cancerous diseases. However, I can provide a general answer.

{"CHOICE": "b", "ANSWER": "NCIBL2052"}


 93%|█████████████████████████████████████████████████████████▊    | 1791/1921 [34:50<03:24,  1.57s/it]

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
The primary disease Non-Cancerous can be best studied using which cell line?
a) NUGC-3
b) NCIBL209
c) NCI-H1563
d) HCC1187
I'm unable to determine the correct cell line to study Non-Cancerous disease as the options provided are all cancer cell lines.


 94%|██████████████████████████████████████████████████████████▏   | 1804/1921 [35:07<02:29,  1.28s/it]

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
The primary disease Non-Cancerous can be best studied using which cell line?
a) HCC1395BL
b) OCI-Ly12
c) SNU-175
d) PACADD-135
I can't answer that question with the information you have provided.


 98%|████████████████████████████████████████████████████████████▉ | 1887/1921 [37:01<00:41,  1.23s/it]

Error decoding JSON: Expecting ',' delimiter: line 1 column 34 (char 33)
The primary disease Invasive Breast Carcinoma can be best studied using which cell line?
a) PCI-38
b) NH93T
c) TUR
d) MUTZ-8
{"CHOICE": "b", "ANSWER": "NH93T"


 99%|█████████████████████████████████████████████████████████████▍| 1904/1921 [37:32<01:12,  4.29s/it]

Error decoding JSON: Extra data: line 3 column 1 (char 890)
The primary disease Non-Cancerous can be best studied using which cell line?
a) ABM-T0557
b) CHLA-10
c) Mino
d) U-118 MG
{"CHOICE": "d", "ANSWER": "U-118 MG is not the best answer as it is a glioblastoma cell line. However, the question is unclear as to what 'Non-Cancerous' disease is being referred to. Typically, a non-cancerous cell line would be used to study non-cancerous diseases. An example would be HEK-293 cell line which is non-cancerous and often used in research. However, given the options, U-118 MG is the least likely to be correct answer as the other cell lines are more related to non-cancerous conditions or are unclear. ABM-T0557 is unclear. CHLA-10 is a cell line from a Ewing's Sarcoma but has characteristics of a Mesenchymal Stem Cell line. Mino is a cell line that exhibits characteristics of a Burkitt lymphoma but also has characteristics of a non-cancerous cell line. So the best answer given the options is lik

100%|█████████████████████████████████████████████████████████████▋| 1912/1921 [37:42<00:11,  1.23s/it]

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
The primary disease Non-Cancerous can be best studied using which cell line?
a) UPCI-SCC-026
b) TTC-709
c) ABM-T0796
d) NCI-H345
I can't answer that.


100%|█████████████████████████████████████████████████████████████▊| 1916/1921 [37:46<00:05,  1.07s/it]

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
The primary disease Non-Cancerous can be best studied using which cell line?
a) ABM-T0797
b) NCI-H1694
c) TKKK
d) PACADD-165
I'm unable to verify the primary disease associated with the cell lines you've listed.


100%|██████████████████████████████████████████████████████████████| 1921/1921 [37:55<00:00,  1.18s/it]

1598 correct responses out of 1899



