# Instruct drug dataset

Based on the FDA-approved medications listed on drugs.com.

In [1]:
from datasets import load_dataset

In [2]:
def format_instruction(sample):
    """Render all twelve question/answer prompts for one drug record as a single text.

    Each section has the form ``### Instruction: / ### Input: / ### Response:``;
    the question mentions ``sample['name']`` and the response is the matching
    field of ``sample``.

    :param sample: Mapping with 'name' plus the answer fields 'prescribed',
        'usage', 'side_effects', 'other_uses', 'notice', 'precautions',
        'dietary', 'forgotten', 'storage_disposal', 'overdose', 'other_info'
        and 'brand_names'.
    :return: The formatted multi-section prompt string.
    :raises KeyError: If ``sample`` lacks any of the fields above.
    """
    return f"""### Instruction:
Formulate an instruction to answer inquiries truthfully and accurately related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
What is the clinical purpose of {sample['name']}?
### Response:
{sample['prescribed']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
What is the appropriate dosage and frequency for a patient to take {sample['name']} medication?
### Response:
{sample['usage']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
What are the potential adverse effects or side effects of {sample['name']} medication?
### Response:
{sample['side_effects']}


### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
Can the {sample['name']} medication be prescribed for other usages?
### Response:
{sample['other_uses']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
Are there notifications patients need to be aware of about the prescription drug {sample['name']}?
### Response:
{sample['notice']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
What are the contraindications or precautions associated with {sample['name']} medication?
### Response:
{sample['precautions']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
Should the patient avoid any specific foods, beverages, or activities while taking {sample['name']} medication?
### Response:
{sample['dietary']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
What should the patient do if they miss a dose of {sample['name']} medication?
### Response:
{sample['forgotten']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
How should the {sample['name']} medication be stored to maintain its efficacy?
### Response:
{sample['storage_disposal']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
What are the signs and recommended actions to take in the event of an overdose on {sample['name']} medication?
### Response:
{sample['overdose']}

### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
Are there any additional info regarding {sample['name']} medication?
### Response:
{sample['other_info']}


### Instruction:
Formulate an instruction to address inquiries related to prescription medication, tailored for doctors, pharmacists, or patients.
### Input:
What alternative treatment options including over-the-counter alternatives or supplements are available to supplement {sample['name']} medication?
### Response:
{sample['brand_names']}
"""

In [180]:
import re
def remove_chars(input_string, chars_to_remove):
    """Return ``input_string`` with every item found in ``chars_to_remove`` dropped.

    :param input_string: Text to filter; it is walked item by item.
    :param chars_to_remove: Container of characters to leave out.
    :return: The surviving items joined back into one string.
    """
    kept = []
    for piece in input_string:
        if piece in chars_to_remove:
            continue
        kept.append(piece)
    return ''.join(kept)

def format_instruction_jsonl(sample):
    """Build one training record per question/answer field of a drug record.

    For every answer field a question mentioning ``sample['name']`` is
    generated, square brackets and parentheses are stripped from the question
    with ``remove_chars``, and the question is paired with ``sample[field]``
    in a single ``"text"`` prompt.

    :param sample: Mapping with 'name' plus the answer fields 'prescribed',
        'usage', 'side_effects', 'other_uses', 'notice', 'precautions',
        'dietary', 'forgotten', 'storage_disposal', 'overdose', 'other_info'
        and 'brand_names'.
    :return: List of ``{"text": ...}`` dicts, one per answer field, in the
        order listed above.
    :raises KeyError: If ``sample`` lacks 'name' or any answer field.
    """
    name = sample['name']

    # Question asked for each answer field. Every value must be a plain string
    # (no trailing commas) so that the character filter below applies to all of
    # them; dict insertion order fixes the order of the output records.
    inputs = {
        'prescribed': f"What is the clinical purpose of {name} and how does it help patients?",
        'usage': f"What is the appropriate dosage and frequency for a patient to take {name} medication?",
        'side_effects': f"What are the potential adverse effects or side effects of {name} medication?",
        'other_uses': f"Can the {name} medication be prescribed for other usages?",
        'notice': f"Are there notifications patients need to be aware of about the prescription drug {name}?",
        'precautions': f"What are the contraindications or precautions associated with {name} medication?",
        'dietary': f"Should the patient avoid any specific foods, beverages, or activities while taking {name} medication?",
        'forgotten': f"What should the patient do if they miss a dose of {name} medication?",
        'storage_disposal': f"How should the {name} medication be stored to maintain its efficacy?",
        'overdose': f"What are the signs and recommended actions to take in the event of an overdose on {name} medication?",
        'other_info': f"Are there any additional info regarding {name} medication?",
        'brand_names': f"What alternative treatment options including over-the-counter alternatives or supplements are available to supplement {name} medication?",
    }

    chars_to_remove = ['[', ']', '(', ')']
    instructs = []
    for key, value in inputs.items():
        # Drug names may carry brackets/parentheses; drop them from the question.
        value = remove_chars(value, chars_to_remove)
        instruct_text = {"text": f"###Instruction Formulate an instruction to answer inquiries truthfully and accurately for medicare providers or patients. ###Input Analyze the question on {value}. ###Response {sample[key]}"}
        instructs.append(instruct_text)

    return instructs

import json

def format_instruction(sample):
    """Render a single Instruction/Input/Response prompt for one drug record.

    The Input section asks about ``sample['name']`` and the Response section
    is ``sample['usage']``.

    :param sample: Mapping providing 'name' and 'usage'.
    :return: The three-section prompt, sections separated by a blank line.
    """
    drug_name = sample['name']
    answer = sample['usage']
    sections = (
        "### Instruction:\n"
        "Use the Input below to create an instruction, which could have been used to generate the input using an LLM. \n",
        "### Input:\n"
        f"What is the clinical purpose of {drug_name} and how should the patient take this medication?\n",
        "### Response:\n"
        f"{answer}\n",
    )
    # A single newline between sections yields the blank separator line.
    return "\n".join(sections)


def parse_jsonl(input_file, output_file):
    """
    Parses a JSONL file of records and saves one instruction per record name in a new JSONL file.

    For every record that has a 'name' key, an instruction asking about that
    name is written as ``{"instruction": ...}`` on its own line.

    :param input_file: Path to the input JSONL file.
    :param output_file: Path to the output JSONL file.
    :raises ValueError: If input_file does not end with '.jsonl'.
    """
    # Ensure the input file has a .jsonl extension
    if not input_file.endswith('.jsonl'):
        raise ValueError('Input file must have a .jsonl extension')

    # Prepare to save the instructions
    instructions = []

    # Open and read the input JSONL file
    with open(input_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            record = json.loads(line)

            # Only the name feeds the instruction; records without one
            # contribute nothing.
            if 'name' in record:
                instructions.append({
                    'instruction': f"What is the clinical purpose of '{record['name']}' and how should the patient take this medication?",
                })

    # Save the instructions in a new JSONL file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for instruction in instructions:
            outfile.write(json.dumps(instruction) + '\n')

    print(f"Instructions saved in {output_file}")

In [183]:
# Paths shared by the cells below: input_file is the JSONL that gets loaded,
# output_file is where the generated instruction records are written.
input_file = './raw/drugs_output.jsonl'
output_file = './output/drugs_instruct_text.jsonl'
# Example usage of parse_jsonl (left disabled):
#parse_jsonl(input_file, output_file)

In [184]:
# Load the JSONL file at input_file into a Dataset, requesting the 'train' split.
dataset = load_dataset('json', data_files=input_file, split='train')

In [181]:
from random import randrange
# Spot check: generate the prompt records for one randomly chosen row of dataset.
m_list = format_instruction_jsonl(dataset[randrange(len(dataset))])
# Alternative kept for reference: join the records into one string instead.
#drug_str = ''.join(map(str,format_instruction_jsonl(dataset[randrange(len(dataset))])))
# JSON-encoded copy of the same sampled records.
m_json = json.dumps(m_list)

In [182]:
# Display the prompt records generated for the sampled row.
m_list

[{'text': '###Instruction Formulate an instruction to answer inquiries truthfully and accurately for medicare providers or patients. ###Iuput Analyze the question on What is the clinical purpose of Neomycin, Polymyxin, and Bacitracin Topical and how does it help patients?. ###Response Neomycin, polymyxin, and bacitracin combination is used to prevent minor skin injuries such as cuts, scrapes, and burns from becoming infected. Neomycin, polymyxin, and bacitracin are in a class of medications called antibiotics. Neomycin, polymyxin, and bacitracin combination works by stopping the growth of bacteria. '},
 {'text': "###Instruction Formulate an instruction to answer inquiries truthfully and accurately for medicare providers or patients. ###Iuput Analyze the question on What is the appropriate dosage and frequency for a patient to take Neomycin, Polymyxin, and Bacitracin Topical medication?. ###Response Neomycin, polymyxin, and bacitracin combination comes as an ointment to apply to the ski

In [185]:
# Write the prompt records of EVERY dataset row to output_file, one JSON
# object per line.
with open(output_file, "w") as jsonl_file:
    for sample in dataset:
        my_list = format_instruction_jsonl(sample)
        # Iterate the records built for this row (my_list), not the m_list
        # spot-check sample from the earlier cell.
        for item in my_list:
            json.dump(item, jsonl_file)
            jsonl_file.write("\n")

In [186]:
# Load the generated instruction dataset back from output_file into a Dataset
# ('train' split) to check what was written.
dataset_2 = load_dataset('json', data_files=output_file, split='train')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [187]:
# Display the summary (features and row count) of the reloaded dataset.
dataset_2

Dataset({
    features: ['text'],
    num_rows: 16500
})