### This notebook aims to create a train/test split of Karolina's LEK dataset.

In [2]:
import pandas as pd
import re

In [16]:
# Data loading
file_path = '/code/self-explanation/thesis_answers.csv'

# Read the CSV file using pandas with semicolon as delimiter
df = pd.read_csv(file_path, delimiter=';')

# 975 datapoints
df.head()

Unnamed: 0,Question,Version,Year,Category,Answer,%A,%B,%C,%D,%E,...,Prompt_1_Llama_13b,GPT 4,GPT 3,Bard,Vicuna 7B,Vicuna 13B,Vicuna 33B,Llama 7B,Llama 13B,Llama 70B
0,A 66-year-old male who has been treated only f...,Spring,2021,Internal Dieseases,C,9.7,18.7,38.6,25.7,7.5,...,0,C,D,C,A,B,B,0,C,C
1,An absolute contraindication to fibrynolysis i...,Spring,2021,Internal Dieseases,A,61.2,6.2,1.5,9.4,20.9,...,0,E,E,E,E,E,E,0,E,C
2,Hypercalcemia related to malignancy: A. is mos...,Spring,2021,Internal Dieseases,B,2.4,52.4,1.3,39.1,4.8,...,0,B,D,D,D,D,D,0,D,D
3,Lung cancer in Poland: A. is the most common m...,Spring,2021,Internal Dieseases,D,1.7,1.1,3.4,37.4,56.4,...,0,D,D,E,D,D,D,0,D,D
4,Cancer of unknown primary site: A. is found in...,Spring,2021,Internal Dieseases,D,6.3,19.0,5.0,38.1,31.6,...,0,D,D,D,D,D,E,D,D,E


### Data processing

In [3]:
# Create option columns 'A', 'B', 'C', 'D' in the df
def split_question_options(row):
    text = row['Question']
    pattern = r"^(.*?)\s*A\.\s*(.*?)\s*B\.\s*(.*?)\s*C\.\s*(.*?)\s*D\.\s*(.*?)\s*(?:E\.\s*(.*?))?\s*$"
    match = re.match(pattern, text)
    if match:
        question, option_a, option_b, option_c, option_d, *option_e = match.groups()
        option_e = option_e[0] if option_e else ""  # Handle optional option E
        return pd.Series([question, option_a, option_b, option_c, option_d, option_e, row['Answer']])
    else:
        return pd.Series([text, "", "", "", "", "", row['Answer']])

# Apply the function to each row
split_df = df.apply(split_question_options, axis=1)

# Rename columns and reset index
split_df.columns = ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
split_df = split_df.reset_index().rename(columns={'index': 'index'})

split_df.head()

Unnamed: 0,index,prompt,A,B,C,D,E,answer
0,0,A 66-year-old male who has been treated only f...,stop atrial fibrillation by electrical cardiov...,"stop atrial fibrillation by pharmacotherapy, r...",slow down HR to the safe and well-tolerated le...,slow down HR to the safe and well-tolerated le...,refer the patient to cardiology dept. for urge...,C
1,1,An absolute contraindication to fibrynolysis is:,ischaemic stroke within the last six months.,pregnancy.,infectious endocarditis.,active peptic ulcer disease.,all of the above.,A
2,2,Hypercalcemia related to malignancy:,is most frequent in patients with chronic myel...,may be related to parathormone secretion.,should always be treated with oral biphosphona...,answers A and B are correct.,"answers A, B, and C are correct.",B
3,3,Lung cancer in Poland:,is the most common malignant disease in men.,is the most common cause of cancer-related dea...,is the most common cause of cancer-related dea...,answers B and C are correct.,"answers A, B, C are correct.",D
4,4,Cancer of unknown primary site:,is found in 30% of all cases of malignant dise...,at least half of the cases have morphology of ...,should be diagnosed using immunochemistry.,answers B and C are correct.,"answers A, B, and C are correct.",D


In [4]:
from sklearn.model_selection import train_test_split

# Splitting the DataFrame into 80% train and 20% test
train_df, test_df = train_test_split(split_df, test_size=0.20, random_state=42)

# Checking the size of the resulting train and test DataFrames
train_size = len(train_df)
test_size = len(test_df)

train_size, test_size

(780, 195)

In [5]:
train_df.head()

Unnamed: 0,index,prompt,A,B,C,D,E,answer
969,969,A potential relationship between a given disea...,cross-sectional study.,case-control study.,ecological study.,correlational study.,descriptive study.,B
678,678,Which of the following is not an indication fo...,post-inflammatory liver cirrhosis.,Budd-Chiari syndrome.,Wilson’s disease.,primary and secondary biliary liver cirrhosis.,disseminated hepatocellular carcinoma.,E
894,894,A patient presents to the gynaecological outpa...,overactive bladder.,exercise-induced urinary incontinence caused b...,mixed urinary incontinence.,evacuation of accumulated lymph fluid through ...,vesicovaginal fistula.,E
33,33,Superior vena cava syndrome:,most commonly occurs in the course of lung can...,may be associated with vessel thrombosis.,is a contraindication to the use of corticoste...,A and B are correct.,"A, B and C are correct.",D
31,31,The treatment of acute hyperkalemia includes t...,156.,245.,356.,236.,136.,D


In [14]:
# Saving the train_df and test_df to local CSV files
train_df.to_csv('/code/llm-fine-tuning/llm_finetuning/thesis_questions_train.csv', index=False)
test_df.to_csv('/code/llm-fine-tuning/llm_finetuning/thesis_questions_test.csv', index=False)

# Check alignment between pre-trained answers using perplexity ranking with Karo results

In [3]:
test_df = pd.read_csv('/code/llm-fine-tuning/llm_finetuning/thesis_questions_test.csv')


In [6]:
test_df.head()

Unnamed: 0,index,prompt,A,B,C,D,E,answer
0,199,The HLY indicator means:,human development index.,healthy life years.,quality-adjusted life years.,metric area index.,humanitarian list of the year.,B
1,789,The symptoms of sarcoidosis include: 1) skin l...,1236.,49.,239.,3689.,all the above.,E
2,174,CURB-65 score facilitates making decision on t...,urinary incontinence.,pneumonitis.,acute pancreatitis.,ischemic brain stroke.,myocarditis.,B
3,467,The absolute indications for the surgical trea...,lack of improvement after conservative treatment.,infected pancreatic necrosis.,bleeding to the abdominal cavity.,gastrointestinal perforation.,abdominal compartment syndrome.,A
4,66,Fever and small itchy blisters that form crust...,erythema infectiosum.,impetigo.,HSV infection.,chickenpox.,measles.,D


In [12]:
# Load my results on LLMs using perplexity ranking
results_df = pd.read_csv('/code/llm-fine-tuning/llm_finetuning/test_results.csv')
results_df = results_df[['index', 'llama2_7B_pt_top3', 'llama2_7B_chat_pt_top3', 'vicuna_7B_pt_top3']]

In [14]:
results_df['llama2_7B_pt_top1'] = results_df['llama2_7B_pt_top3'].str[0]
results_df['llama2_7B_chat_pt_top1'] = results_df['llama2_7B_chat_pt_top3'].str[0]
results_df['vicuna_7B_pt_top1'] = results_df['vicuna_7B_pt_top3'].str[0]

In [15]:
results_df.head()

Unnamed: 0,index,llama2_7B_pt_top3,llama2_7B_chat_pt_top3,vicuna_7B_pt_top3,llama2_7B_pt_top1,llama2_7B_chat_pt_top1,vicuna_7B_pt_top1
0,199,B A D,B E C,B C A,B,B,B
1,789,E D B,E A D,E D C,E,E,E
2,174,E B C,C B E,C A D,E,C,C
3,467,B E D,B A E,B D A,B,B,B
4,66,B E D,B A E,B A C,B,B,B


In [20]:
# Load Karo's results on LLMs for directly generating response
file_path = '/code/self-explanation/thesis_answers.csv'

# Read the CSV file using pandas with semicolon as delimiter
karo_df = pd.read_csv(file_path, delimiter=';')
karo_df = karo_df[['Llama 7B', 'Vicuna 7B']]
karo_df.reset_index(inplace=True) # Add an 'index' column
karo_df = karo_df.rename(columns={'Llama 7B': 'Karo_llama2_7B', 'Vicuna 7B': 'Karo_vicuna_7B'})

# 975 datapoints
karo_df.head()

Unnamed: 0,index,Karo_llama2_7B,Karo_vicuna_7B
0,0,0,A
1,1,0,E
2,2,0,D
3,3,0,D
4,4,D,D


In [22]:
# Merge two dfs on the test set
merged_df = results_df.merge(karo_df, on='index', how='inner')
merged_df.head()

Unnamed: 0,index,llama2_7B_pt_top3,llama2_7B_chat_pt_top3,vicuna_7B_pt_top3,llama2_7B_pt_top1,llama2_7B_chat_pt_top1,vicuna_7B_pt_top1,Karo_llama2_7B,Karo_vicuna_7B
0,199,B A D,B E C,B C A,B,B,B,0,B
1,789,E D B,E A D,E D C,E,E,E,0,E
2,174,E B C,C B E,C A D,E,C,C,0,D
3,467,B E D,B A E,B D A,B,B,B,B,B
4,66,B E D,B A E,B A C,B,B,B,0,0


In [32]:
# Compute percentage of matches
matches = sum((merged_df['Karo_llama2_7B'] == merged_df['llama2_7B_chat_pt_top1']) & (merged_df['Karo_llama2_7B'] != '0'))
non_zero_karo = sum(merged_df['Karo_llama2_7B'] != '0')
percentage = (matches / non_zero_karo) * 100
print(f"The percentage of matched entries in LLaMA-2-chat-7B is: {percentage}")

The percentage of matched entries in LLaMA-2-chat-7B is: 47.88732394366197


In [31]:
matches = sum((merged_df['Karo_vicuna_7B'] == merged_df['vicuna_7B_pt_top1']) & (merged_df['Karo_llama2_7B'] != '0'))
non_zero_karo = sum(merged_df['Karo_vicuna_7B'] != '0')
percentage = (matches / non_zero_karo) * 100
print(f"The percentage of matched entries in Vicuna-7B is: {percentage}")

The percentage of matched entries in Vicuna-7B is: 13.661202185792352
