If you are using your own data, it must be formatted as one directory with 6 files:
----------------------------------------------------------------------------

test.source   
test.target   
train.source   
train.target   
val.source   
val.target   


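The `.source` files are the input; the `.target` files are the desired output. Each file holds one example per line, and line *i* of a `.source` file must correspond to line *i* of the matching `.target` file.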

In [1]:
import pickle
from nltk.tokenize import wordpunct_tokenize

In [2]:
# Training split: a pickled array of Leaflet objects.
# NOTE: pickle.load can execute arbitrary code - only open dataset files from a trusted source.
with open("datasets/LEAFLET_TRAIN_DATASET.pickle", "rb") as train_pickle:
    train_dataset = pickle.load(train_pickle)

In [3]:
# Validation split: a pickled array of Leaflet objects.
# NOTE: pickle.load can execute arbitrary code - only open dataset files from a trusted source.
with open("datasets/LEAFLET_VALID_DATASET.pickle", "rb") as valid_pickle:
    valid_dataset = pickle.load(valid_pickle)

In [4]:
# Test split: a pickled array of Leaflet objects.
# NOTE: pickle.load can execute arbitrary code - only open dataset files from a trusted source.
with open("datasets/LEAFLET_TEST_DATASET.pickle", "rb") as test_pickle:
    test_dataset = pickle.load(test_pickle)

In [5]:
# Lower-cased section titles used as a prefix when conditioning on the section
# type; position i belongs to leaflet.section<i+1>.
_SECTION_TITLE_PREFIXES = (
    'what the medicine is and what it is used for: ',
    'what you need to know before you take the medicine: ',
    'how to take the medicine: ',
    'possible side effects: ',
    'how to store the medicine: ',
    'contents of the pack and other information: ',
)


def create_source_target(dataset, section_conditiong=False):
    """
    Create input files for T5 fine-tuning.

    The .source files are the input, the .target files are the desired output.

    Every section of every leaflet that has both a text (``section_content``)
    and NER results (``entity_recognition``) yields one source line and one
    target line:

    * source: the recognised entities rendered as
      ``<TYPE> entity_text </TYPE>`` and separated by a single space; spaces
      inside an entity text are replaced by ``_``. The tag is the entity's
      ``'Type'`` or, when that is None/empty, its ``'Category'``.
    * target: the section text tokenized with ``wordpunct_tokenize`` and
      re-joined with single spaces, so punctuation becomes separate tokens.

    Parameters
    ----------
    dataset : iterable
        Leaflet objects exposing ``section1`` ... ``section6``; each section
        exposes ``section_content`` and ``entity_recognition`` (an iterable of
        dicts with the keys ``'Text'``, ``'Type'`` and ``'Category'``).
    section_conditiong : bool, optional
        If True, prefix every source line with the lower-cased title of the
        section it was built from. (The spelling of the parameter name is kept
        for backward compatibility.)

    Returns
    -------
    tuple of (list of str, list of str)
        ``(source_data_array, target_data_array)``. Both lists have the same
        length, are aligned by index, and every string ends with ``"\\n"``.
    """

    # array to store source data of each leaflet
    source_data_array = []

    # array to store target data of each leaflet
    target_data_array = []

    for leaflet in dataset:

        current_leaflet_sections = [leaflet.section1, leaflet.section2,
                                    leaflet.section3, leaflet.section4,
                                    leaflet.section5, leaflet.section6]

        for section_index, current_section in enumerate(current_leaflet_sections):

            section_content = current_section.section_content
            section_entity_recognition = current_section.entity_recognition

            # skip if either input or output is None
            if section_content is None or section_entity_recognition is None:
                continue

            # optional conditioning on the section type (section title)
            if section_conditiong:
                source_leaflet_str = _SECTION_TITLE_PREFIXES[section_index]
            else:
                source_leaflet_str = ''

            tagged_entities = []
            for entity in section_entity_recognition:
                # multi-word entities become a single underscore-joined token
                entity_value = entity['Text'].replace(" ", "_")
                entity_type = entity['Type'] if entity['Type'] is not None and len(entity['Type']) > 0 else entity['Category']

                # create source data in special format
                tagged_entities.append("<" + str(entity_type) + "> " + str(entity_value) + " </" + str(entity_type) + ">")

            # Terminate every record with exactly one "\n". Looking up the
            # position of the current entity with list.index() returned the
            # FIRST equal element, so a section whose last entity duplicated an
            # earlier one (or a section with no entities at all) lost its
            # newline and got glued to the next record, which shifted the
            # .source file against the .target file.
            source_leaflet_str += " ".join(tagged_entities) + "\n"

            source_data_array.append(source_leaflet_str)

            # make sure to have punctuations as a separate token,
            # then go back to a single string terminated by "\n"
            target_data_array.append(" ".join(wordpunct_tokenize(section_content)) + "\n")

    return (source_data_array, target_data_array)

In [6]:
# Preview of the generated (source, target) pairs for the first two training leaflets.
create_source_target(dataset=train_dataset[:2])

(["<PRODUCT_NAME> cystagon </PRODUCT_NAME> <DX_NAME> cystinosis </DX_NAME> <PROBLEM> a_metabolic_disease </PROBLEM> <PROBLEM> 'nephropathic_cystinosis' </PROBLEM> <PROBLEM> an_abnormal_accumulation_of_the_amino_acid_cystine_in_various_organs_of_the_body_such_as_the_kidney,_eye,_muscle,_pancreas,_and_brain </PROBLEM> <PROBLEM> cystine_build </PROBLEM> <DX_NAME> kidney_damage </DX_NAME> <PROBLEM> excess_amounts_of_glucose </PROBLEM> <TEST_NAME> proteins </TEST_NAME> <TEST_NAME> electrolytes </TEST_NAME> <TREATMENT> cystagon </TREATMENT> <PROBLEM> this_rare_inherited_disorder </PROBLEM> <TREATMENT> a_medicine </TREATMENT> <TREATMENT> cystine </TREATMENT>\n",
  "<PRODUCT_NAME> cystagon </PRODUCT_NAME> <GENERIC_NAME> cystagon </GENERIC_NAME> <PROBLEM> allergic </PROBLEM> <DX_NAME> hypersensitive </DX_NAME> <GENERIC_NAME> cysteamine_bitartrate </GENERIC_NAME> <GENERIC_NAME> penicillamine </GENERIC_NAME> <GENERIC_NAME> cystagon </GENERIC_NAME> <DX_NAME> pregnant </DX_NAME> <TREATMENT> cystago

In [7]:
# Training split, without the section-title prefix.
source_train, target_train = create_source_target(train_dataset, section_conditiong=False)

In [8]:
# Validation split, without the section-title prefix.
source_valid, target_valid = create_source_target(valid_dataset, section_conditiong=False)

In [9]:
# Test split, without the section-title prefix.
source_test, target_test = create_source_target(test_dataset, section_conditiong=False)

### Write to files

In [10]:
def save_data_file(data, filename):
    """
    Save data to the file.

    Parameters
    ----------
    data : iterable of str
        Strings written to the file one after another, exactly as given; no
        separator is added, so each item must carry its own trailing newline.
    filename : str
        Path of the file to create (an existing file is overwritten).
    """

    # The context manager closes the file even if a write fails, and the
    # explicit UTF-8 encoding keeps the output independent of the platform's
    # default locale encoding.
    with open(filename, 'w', encoding='utf-8') as output_file:
        output_file.writelines(data)

    print("Data saved successfully to ", filename)
    print("=====================================")

In [11]:

# Destination files: one .source/.target pair per split.
# NOTE(review): these are absolute, machine-specific paths - adjust them before running elsewhere.

# Train
TRAIN_SOURCE='/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/train.source'
TRAIN_TARGET='/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/train.target'

# Validation
VAL_SOURCE='/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/val.source'
VAL_TARGET='/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/val.target'

# Test
TEST_SOURCE='/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/test.source'
TEST_TARGET='/home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/test.target'

# Write the splits in the order train -> validation -> test, source before target.
for split_lines, split_path in (
    (source_train, TRAIN_SOURCE),
    (target_train, TRAIN_TARGET),
    (source_valid, VAL_SOURCE),
    (target_valid, VAL_TARGET),
    (source_test, TEST_SOURCE),
    (target_test, TEST_TARGET),
):
    save_data_file(split_lines, split_path)

Data saved successfully to  /home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/train.source
Data saved successfully to  /home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/train.target
Data saved successfully to  /home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/val.source
Data saved successfully to  /home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/val.target
Data saved successfully to  /home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/test.source
Data saved successfully to  /home/ruslan_yermakov/nlg-ra/T5_experiments/T5_plain/input_data/test.target
