#### Create small train and validation set from Med-QA dataset

In [1]:
from datasets import load_dataset

# Load the med_qa dataset and specifically the 'med_qa_tw_en_bigbio_qa' subset
dataset = load_dataset("bigbio/med_qa", "med_qa_tw_en_bigbio_qa")

# Shuffle the dataset and sample random entries to make up train and validation sets
subset = dataset["train"].shuffle(seed=42).select(range(250))

#### View the dataset

In [2]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [3]:
show_random_elements(subset)

Unnamed: 0,id,question_id,document_id,question,type,choices,context,answer
0,10311,10311,10311,"39 year old woman suffering from mediastinal Hodgkin's disease, a series of neurological disorders, cerebral spinal fluid of normal, magnetic resonance imaging found demyelination (demyelination) area, after getting into a coma, four months after the onset of death. Her brain changes are:",multiple_choice,"[Progressive multifocal leukoencephalopathy (progressive multifocal leukoencephalopathy), Aspergillus disease (aspergillosis), Hodgkin's disease transfer, Cerebral infarction (cerebral infarction)]",,[Progressive multifocal leukoencephalopathy (progressive multifocal leukoencephalopathy)]
1,1533,1533,1533,Stroke is the most common type in what?,multiple_choice,"[Cerebral hemorrhage, Subarachnoid hemorrhage, Arteriovenous malformation rupture, Ischemic Stroke]",,[Ischemic Stroke]
2,4539,4539,4539,Bactericidal action of penicillin (penicillin) because it can inhibit the activity of these enzymes is what?,multiple_choice,"[Lactam enzymes (β-lactamase), Peptidase enzyme group (transpeptidase), Ligase (ligase), Aldolase (Aldolase)]",,[Peptidase enzyme group (transpeptidase)]
3,6076,6076,6076,Which of the following is more suitable for patients with movement training for strength after a heart transplant?,multiple_choice,"[Anaerobic threshold, Eighty-five percent of maximum heart rate, Sixty percent of maximum heart rate, (Maximum heart rate during exercise tolerance tests - resting heart rate) × (40 - 60)% + resting heart rate]",,[Anaerobic threshold]
4,2154,2154,2154,"Related to cell cycle (cell cycle) of the narrative, Which statement is correct?",multiple_choice,"[In turn can be divided into M phase, Gl phase, G2 phase, S phase, and then back to the M phase, May be subjected to various MAP kinase (mitogen-activated protein kinase) Regulation, The growth was inhibited by an outer cytokines (growth factor), As long as cell survival, cell cycle will continue to be]",,[May be subjected to various MAP kinase (mitogen-activated protein kinase) Regulation]
5,4031,4031,4031,"Affective disorders in the so-called shield type depressive (masked depression), in which of the following occur up to?",multiple_choice,"[teens, Adults, Old people, child]",,[Old people]
6,9242,9242,9242,"38 A 45 year old male, 7 years ago, found a lymphoma, remission after treatment has been no recurrence, and no other abnormalities. The most recent blood test results are as follows: Hb 12.1 g / dL, the number of erythrocytes 5.41 × 106 / mm3, MCV 69.5 fL, white blood cell count 4,640 / mm3, normal classification, platelet count 174,000 / mm3; hemosiderin (ferritin) 277 ng / mL, hemoglobin electrophoresis display HbA2 2.1% (normal <3.5%), HbF 1.1% (normal <2.0%). The man most likely have which of the following diseases?\n",multiple_choice,"[a-Thalassemia, b-Thalassemia, anemia of chronic disease, Sideroblastic anemia]",,[a-Thalassemia]
7,5227,5227,5227,"A full-term baby born after crying loudly, good limbs motility, general ruddy complexion, heart rate about 140 / min, 40 breaths / min, suction will stimulate sneezing reaction, his Apgar Score is a bit?",multiple_choice,"[4, 6, 8, 10]",,[10]
8,6405,6405,6405,"Help ⾏ device (ambulation aids) are often ⾒ the harness, which of the following description of an error relating to help ⾏'s?",multiple_choice,"[Using the filter aid ⾏ destination time include improving energy balance by Use care has redistributed lower limb weight bearing areas and reduce the pain of lower limbs, ⾏ aid is only available care has ⽀ amount of support, unable to provide sensory feedback, To select Using that kind of help ⼀ ⾏ is decided by those who need to Using balance can help load-bearing and care has been taken, Can help ⾏ as an upper extremity extending]",,"[⾏ aid is only available care has ⽀ amount of support, unable to provide sensory feedback]"
9,9685,9685,9685,Which of the following is not a non-immune type water births (hydrops fetalis) reasons?,multiple_choice,"[anemia, Ventricular septal defect, The first 13 pairs of chromosome abnormalities, Cytomegalovirus infection]",,[Ventricular septal defect]


#### Process the data structure to be compatible for Autotrain - to have the 'text' column

In [4]:
# Convert the subset to a Pandas DataFrame 
df = subset.to_pandas()

# Remove some columns 
df = df[['question', 'choices', 'answer']]

# Modify the content of some columns
df['answer'] = df['answer'].str[-1] # Remove the square brackets around the string
df = df.fillna("")

df.head()


Unnamed: 0,question,choices,answer
0,"In order to deal with the SARS epidemic, epide...","[Low false positive, Low false negative, High ...",High sensitivity
1,Related to cell cycle (cell cycle) of the narr...,"[In turn can be divided into M phase, Gl phase...",May be subjected to various MAP kinase (mitoge...
2,"For the diagnosis of pelvic hemorrhage, punctu...",[The posterior vaginal fornix (posterior forni...,The posterior vaginal fornix (posterior fornix...
3,Account of the tracheal stenosis surgery anest...,[Must first dilation tube may be less than 5 m...,Anesthesia should first plug the endotracheal ...
4,The following groups of amino acids that direc...,"[tyrosine, glycine, glutamine, glutamate, cyst...","glycine, arginine, methionine"


In [5]:
# Create a 'text' column in the df in the required format
text_col = []
for _, row in df.iterrows():
    prompt = str(row["question"])
    instruction = "Choose one choice from the following: \n" + str(row["choices"]) + "\n\n"
    # input_query = str(row["question"])
    response = str(row["answer"])

    text = (
        prompt
        + "### Instruction:\n"
        + instruction
        + "\n### Response:\n"
        + response
    )

    text_col.append(text) 

df.loc[:, "text"] = text_col

print(df.head())

                                            question  \
0  In order to deal with the SARS epidemic, epide...   
1  Related to cell cycle (cell cycle) of the narr...   
2  For the diagnosis of pelvic hemorrhage, punctu...   
3  Account of the tracheal stenosis surgery anest...   
4  The following groups of amino acids that direc...   

                                             choices  \
0  [Low false positive, Low false negative, High ...   
1  [In turn can be divided into M phase, Gl phase...   
2  [The posterior vaginal fornix (posterior forni...   
3  [Must first dilation tube may be less than 5 m...   
4  [tyrosine, glycine, glutamine, glutamate, cyst...   

                                              answer  \
0                                   High sensitivity   
1  May be subjected to various MAP kinase (mitoge...   
2  The posterior vaginal fornix (posterior fornix...   
3  Anesthesia should first plug the endotracheal ...   
4                      glycine, arginine, meth

In [59]:
# # Create a 'text' column in the df in the required format
# text_col = []
# for _, row in df.iterrows():
#     prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Follow the instruction to provide an answer based on the input. \n\n"
#     instruction = "For the question given in the input, choose one choice from the following: \n" + str(row["choices"]) + "\n\n"
#     input_query = str(row["question"])
#     response = str(row["answer"])

#     if len(input_query.strip()) == 0:
#         text = prompt + "### Instruction:\n" + instruction + "\n### Response:\n" + response
#     else:
#         text = (
#             prompt
#             + "### Instruction:\n"
#             + instruction
#             + "\n### Input:\n"
#             + input_query
#             + "\n### Response:\n"
#             + response
#         )

#     text_col.append(text) 

# df.loc[:, "text"] = text_col

# print(df.head())

                                            question  \
0  In order to deal with the SARS epidemic, epide...   
1  Related to cell cycle (cell cycle) of the narr...   
2  For the diagnosis of pelvic hemorrhage, punctu...   
3  Account of the tracheal stenosis surgery anest...   
4  The following groups of amino acids that direc...   

                                             choices  \
0  [Low false positive, Low false negative, High ...   
1  [In turn can be divided into M phase, Gl phase...   
2  [The posterior vaginal fornix (posterior forni...   
3  [Must first dilation tube may be less than 5 m...   
4  [tyrosine, glycine, glutamine, glutamate, cyst...   

                                              answer  \
0                                   High sensitivity   
1  May be subjected to various MAP kinase (mitoge...   
2  The posterior vaginal fornix (posterior fornix...   
3  Anesthesia should first plug the endotracheal ...   
4                      glycine, arginine, meth

In [6]:
df['text'][2]

"For the diagnosis of pelvic hemorrhage, puncture fluid pouch typically performed surgery (culdocentesis) via the following where?### Instruction:\nChoose one choice from the following: \n['The posterior vaginal fornix (posterior fornix of vagina)'\n 'Vaginal vault (anterior fornix of vagina)'\n 'Bladder wall (posterior wall of urinary bladder)'\n 'Rectal wall (anterior wall of rectum)']\n\n\n### Response:\nThe posterior vaginal fornix (posterior fornix of vagina)"

#### (Optional) Create the train-val split for autotrain

In [54]:
# If only give a train.csv to autotrain, it will automatically split it for evaluation
df.to_csv('train.csv', index=False)

# from sklearn.model_selection import train_test_split

# train_df, val_df = train_test_split(df, train_size=2000, test_size=500, random_state=42)

# # Save the dataframes to csv files
# train_df.to_csv('train.csv', index=False)
# val_df.to_csv('val.csv', index=False)