In [40]:
# Mount Google Drive into the Colab filesystem (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

# Drive folder that the parquet files are written to at the end of the notebook.
DIRECTORY = "/content/drive/MyDrive/AIN413"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
!pip install -q datasets

In [42]:
from datasets import load_dataset, concatenate_datasets, Dataset
from pathlib import Path

import warnings
# No category or module filter is given, so this silences every warning
# for the rest of the session.
warnings.filterwarnings('ignore')

In [43]:
# Source datasets, all loaded from the Hugging Face Hub ('train' split).
# The size notes look like the author's approximate row counts (TODO confirm);
# commented-out loads are sources that are currently left out of the mix.

# ~112k rows (disabled)
#hlmagic = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k", split='train')

# ~34k rows
mm_flashcards = load_dataset("medalpaca/medical_meadow_medical_flashcards", split='train')

# ~10k rows
mm_wikidoc = load_dataset("medalpaca/medical_meadow_wikidoc", split='train')

# ~6k rows
mm_wikidoc_patient = load_dataset("medalpaca/medical_meadow_wikidoc_patient_information", split='train')

# ~10k rows
mm_medqa = load_dataset("medalpaca/medical_meadow_medqa", split='train')

# ~2k rows, replicate (disabled)
#mm_mediqa = load_dataset("medalpaca/medical_meadow_mediqa", split='train')

# ~47k rows
medquad = load_dataset("lavita/MedQuAD", split='train')

# ~1.4k rows
dermatology = load_dataset("Mreeb/Dermatology-Question-Answer-Dataset-For-Fine-Tuning", split='train')

# ~3.5k rows, replicate (disabled)
#mental_health = load_dataset("Amod/mental_health_counseling_conversations", split='train')

# ~200k rows (disabled)
#pubmed_qa = load_dataset("lavita/medical-qa-datasets", "pubmed-qa", split='train')

In [44]:
def remove_redundant_columns(dataset, columns_to_keep):
    """Drop every column of `dataset` whose name is not in `columns_to_keep`.

    Args:
        dataset: Object exposing `column_names` and `remove_columns(names)`.
        columns_to_keep: Container of column names that must survive.

    Returns:
        Whatever `dataset.remove_columns` returns for the unwanted names.
    """
    unwanted = []
    for name in dataset.column_names:
        if name not in columns_to_keep:
            unwanted.append(name)
    return dataset.remove_columns(unwanted)

def rename_columns(dataset, columns, new_columns):
    """Rename several columns, pairing `columns[i]` with `new_columns[i]`.

    The renames are applied one after another through `rename_column`;
    pairing stops at the shorter of the two sequences.

    Args:
        dataset: Object exposing `rename_column(old, new)`.
        columns: Current column names.
        new_columns: Replacement names, in the same order as `columns`.

    Returns:
        The result of the last rename, or `dataset` itself if there was
        nothing to rename.
    """
    renamed = dataset
    for current_name, target_name in zip(columns, new_columns):
        renamed = renamed.rename_column(current_name, target_name)
    return renamed

def append_instruction(example):
    """Append the medical-assistant role suffix to a row's instruction.

    The row is modified in place and the same object is returned.

    Args:
        example: Mapping with an 'instruction' string entry.

    Returns:
        `example`, with the suffix appended to its 'instruction' value.
    """
    role_suffix = ", you are a medical assistant."
    example['instruction'] += role_suffix
    return example

def add_instruction(dataset, instruction):
    """Add an 'instruction' column holding the same value on every row.

    Args:
        dataset: Object supporting `len()` and `add_column(name, column)`.
        instruction: Value to repeat once per row.

    Returns:
        Whatever `dataset.add_column` returns.
    """
    repeated = [instruction for _ in range(len(dataset))]
    return dataset.add_column('instruction', repeated)

def alter_instruction(dataset, instruction):
    """Replace the 'instruction' column of `dataset`.

    Args:
        dataset: Dataset exposing `column_names`, item access by column name,
            and (via the helpers used here) `remove_columns` / `add_column`.
        instruction: Either a `str`, which is prepended to the string form of
            each existing instruction value, or any other object, which is
            used as the new column as given.

    Returns:
        The dataset with its old 'instruction' column removed and the new
        one added.
    """
    if type(instruction) is not str:
        # Caller supplied the finished column.
        replacement = instruction
    else:
        replacement = [instruction + str(existing) for existing in dataset['instruction']]

    # Keep every column except the one being replaced.
    surviving = list(set(dataset.column_names).difference({'instruction'}))
    stripped = remove_redundant_columns(dataset, surviving)
    return stripped.add_column('instruction', replacement)

def filter_incomplete(example, column_name, min_words=20):
    """Tell whether a row's text is long enough to keep.

    Intended as a predicate for `Dataset.filter`.

    Args:
        example: Mapping from column name to value for one row.
        column_name: Column whose text is measured.
        min_words: Minimum number of whitespace-separated words required.
            Defaults to 20, the previously hard-coded threshold.

    Returns:
        True if the text has at least `min_words` words, otherwise False.
        A missing (None) value counts as incomplete.
    """
    text = example[column_name]
    # Null cells occur in the source data; treat them as incomplete
    # instead of failing on `None.split()`.
    if text is None:
        return False
    return len(text.split()) >= min_words

#hlmagic = hlmagic.filter(lambda x: filter_incomplete(x, "input"))
#hlmagic = hlmagic.filter(lambda x: filter_incomplete(x, "output"))

# Medical Meadow sources: append the assistant-role suffix to every row's
# instruction, then keep a fixed-seed random sample of each source.
mm_flashcards = (
    mm_flashcards.map(append_instruction)
    .shuffle(seed=1)
    .select(indices=range(3500))
)

mm_wikidoc = (
    mm_wikidoc.map(append_instruction)
    .shuffle(seed=1)
    .select(indices=range(2500))
)

mm_wikidoc_patient = (
    mm_wikidoc_patient.map(append_instruction)
    .shuffle(seed=1)
    .select(indices=range(2500))
)

mm_medqa = (
    mm_medqa.map(append_instruction)
    .shuffle(seed=1)
    .select(indices=range(1500))
)

#mm_mediqa = rename_columns(mm_mediqa, ['input', 'instruction'], ['temp_instruction', 'temp_input'])
#mm_mediqa = rename_columns(mm_mediqa, ['temp_instruction', 'temp_input'], ['instruction', 'input'])
#mm_mediqa = alter_instruction(mm_mediqa, "Answer this question truthfully according to this context: ")

# MedQuAD: keep only the question/answer pair, rename it to the shared
# input/output schema, attach a constant instruction, then sample 3500 rows.
medquad = rename_columns(
    remove_redundant_columns(medquad, ['question', 'answer']),
    ['question', 'answer'],
    ['input', 'output'],
)
medquad = (
    add_instruction(medquad, "Answer this question truthfully, you are a medical assistant.")
    .shuffle(seed=1)
    .select(indices=range(3500))
)

# Dermatology Q&A: keep only the prompt/response pair, rename it to the shared
# input/output schema and attach a constant instruction.
dermatology = remove_redundant_columns(dermatology, ['prompt', 'response'])
dermatology = rename_columns(dermatology, ['prompt', 'response'], ['input', 'output'])
dermatology = add_instruction(dermatology, "Answer this question truthfully, you are a medical assistant.")
# Sample from the dermatology set itself. This line previously sampled from
# mm_medqa, which threw the prepared dermatology rows away and substituted
# 500 rows that are already part of the MedQA subset.
dermatology = dermatology.shuffle(seed=1).select(indices=range(500))

#mental_health = rename_columns(mental_health, ['Context', 'Response'], ['input', 'output'])
#mental_health = add_instruction(mental_health, "Answer this question truthfully")

#pubmed_qa = remove_redundant_columns(pubmed_qa, ['QUESTION', 'CONTEXTS', 'LONG_ANSWER'])
#pubmed_qa = rename_columns(pubmed_qa, ['QUESTION', 'CONTEXTS', 'LONG_ANSWER'], ['input', 'instruction', 'output'])
#pubmed_qa_inst = ["Answer this question truthfully according to this context: " + " ".join(i) for i in pubmed_qa['instruction']]
#pubmed_qa = alter_instruction(pubmed_qa, pubmed_qa_inst)

In [45]:
# Merge the per-source subsets into a single dataset.
dataset = concatenate_datasets([mm_flashcards, mm_wikidoc, mm_wikidoc_patient, mm_medqa, medquad, dermatology])
# Seeded like the per-source shuffles so the row order is the same on every run.
dataset = dataset.shuffle(seed=1)

In [46]:
# Convert to a pandas DataFrame; prompt formatting and cleaning are done on it.
df = dataset.to_pandas()

In [47]:
def fit_prompt_format(sample):
    """Render one row as a single chat-formatted training string.

    The string consists of a begin-of-text marker followed by three turns
    (system, user, assistant), each introduced by a role header and closed
    with an end-of-turn marker.

    Args:
        sample: Mapping with 'instruction', 'input' and 'output' entries,
            used as the system, user and assistant turn respectively.

    Returns:
        The formatted prompt string.
    """
    system_turn = f"<|start_header_id|>system<|end_header_id|>\n\n{sample['instruction']}<|eot_id|>"
    user_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|>"
    assistant_turn = f"<|start_header_id|>assistant<|end_header_id|>\n\n{sample['output']}<|eot_id|>"
    return "<|begin_of_text|>" + system_turn + user_turn + assistant_turn

# axis=1 passes each row to fit_prompt_format, giving one prompt string per row.
df['prompt'] = df.apply(fit_prompt_format, axis=1)

In [48]:
# Display the frame, now including the prompt column.
df

Unnamed: 0,input,output,instruction,prompt
0,What medical condition is characterized by def...,Hirschsprung disease is a medical condition th...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
1,What are the brand names of Cefotetan Injection ?,,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
2,What stage is the meningioma in?,There is no established system for the staging...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
3,Q:A 24-year-old man is brought to the emergenc...,"D: Opening pressure: 15 cm H2O, color: clear, ...",Please answer with one of the option in the br...,<|begin_of_text|><|start_header_id|>system<|en...
4,What is the name of the type of anemia that ca...,The type of anemia that can result from the us...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
...,...,...,...,...
13995,Is echocardiography or ultrasound essential fo...,There are no electrocardiogram findings associ...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
13996,What are the symptoms of upper gastrointestina...,What are the typical symptoms of upper gastroi...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
13997,What are the treatments for Chronic pancreatit...,,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
13998,Do I need to see a doctor for Depression - ove...,,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...


In [49]:
# Count missing values per column.
df.isnull().sum()

input             0
output         2274
instruction       0
prompt            0
dtype: int64

In [50]:
# Cleaning: drop rows with any missing value, drop exact duplicate rows, then
# drop rows whose input or output is an empty string.
df = (
    df.dropna()
    .drop_duplicates()
    .loc[lambda frame: frame['input'].str.len().ne(0)]
    .loc[lambda frame: frame['output'].str.len().ne(0)]
)

# Single-column frame holding only the formatted prompts.
prompts = df.loc[:, ['prompt']]

In [51]:
# Display the frame after cleaning.
df

Unnamed: 0,input,output,instruction,prompt
0,What medical condition is characterized by def...,Hirschsprung disease is a medical condition th...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
2,What stage is the meningioma in?,There is no established system for the staging...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
3,Q:A 24-year-old man is brought to the emergenc...,"D: Opening pressure: 15 cm H2O, color: clear, ...",Please answer with one of the option in the br...,<|begin_of_text|><|start_header_id|>system<|en...
4,What is the name of the type of anemia that ca...,The type of anemia that can result from the us...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
6,Could you provide more context or information ...,"Prognosis, complications, and outcome depend o...","Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
...,...,...,...,...
13993,What are the symptoms of Head and neck cancer?,Symptoms include:\nA lump or sore that does no...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
13994,What are the symptoms of Sciatica ?,Sciatica pain can vary widely. It may feel lik...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
13995,Is echocardiography or ultrasound essential fo...,There are no electrocardiogram findings associ...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...
13996,What are the symptoms of upper gastrointestina...,What are the typical symptoms of upper gastroi...,"Answer this question truthfully, you are a med...",<|begin_of_text|><|start_header_id|>system<|en...


In [52]:
# Display the prompt-only frame.
prompts

Unnamed: 0,prompt
0,<|begin_of_text|><|start_header_id|>system<|en...
2,<|begin_of_text|><|start_header_id|>system<|en...
3,<|begin_of_text|><|start_header_id|>system<|en...
4,<|begin_of_text|><|start_header_id|>system<|en...
6,<|begin_of_text|><|start_header_id|>system<|en...
...,...
13993,<|begin_of_text|><|start_header_id|>system<|en...
13994,<|begin_of_text|><|start_header_id|>system<|en...
13995,<|begin_of_text|><|start_header_id|>system<|en...
13996,<|begin_of_text|><|start_header_id|>system<|en...


In [53]:
# Re-count missing values per column after cleaning.
df.isnull().sum()

input          0
output         0
instruction    0
prompt         0
dtype: int64

In [54]:
# Write both frames under DIRECTORY. The folder is created first so the
# writes do not fail when it does not exist yet.
output_dir = Path(DIRECTORY)
output_dir.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_dir / "full_dataset.parquet")
prompts.to_parquet(output_dir / "prompts.parquet")