In [1]:
import pickle
import numpy as np
import codecs
import random

from nltk.tokenize import wordpunct_tokenize

In [2]:
# Load the training split: a pickled array of Leaflet objects.
# NOTE(review): absolute, user-specific path - it will not resolve on another machine.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open("/home/ruslan_yermakov/nlg-ra/datasets/LEAFLET_TRAIN_DATASET.pickle", "rb") as f:
    train_dataset = pickle.load(f)

In [3]:
# Load the validation split: a pickled array of Leaflet objects.
# NOTE(review): absolute, user-specific path - it will not resolve on another machine.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open("/home/ruslan_yermakov/nlg-ra/datasets/LEAFLET_VALID_DATASET.pickle", "rb") as f:
    valid_dataset = pickle.load(f)

In [4]:
# Load the test split: a pickled array of Leaflet objects.
# NOTE(review): absolute, user-specific path - it will not resolve on another machine.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open("/home/ruslan_yermakov/nlg-ra/datasets/LEAFLET_TEST_DATASET.pickle", "rb") as f:
    test_dataset = pickle.load(f)

In [5]:
# Number of leaflets in the training split.
len(train_dataset)

1068

In [6]:
# Number of leaflets in the validation split.
len(valid_dataset)

134

In [7]:
# Number of leaflets in the test split.
len(test_dataset)

134

------------------------------------------------------------------------------------------------------

## Produce the same output as the *create_dataset* script from data2text-plan

### Plan

section_content -------- the true (target) text   

entity_recognition  ------- the actual model input: a set of records   

The set of records, in the order they appear in section_content    -------- the content plan 

In [8]:
# Separator between records on one line.
# NOTE(review): not referenced in the visible cells - they use the literal " " instead.
RECORD_DELIM = " "
# Separator between the four fields of a single record.
DELIM = u"￨"

# Fourth field of every record; create_summary_contentplan picks one of the two at random.
HOME = "HOME"
AWAY = "AWAY"

# Second field of every record. Alternative value left commented out:
# ENTITY = "Indication"
ENTITY = "Section6"

# Special tokens used by add_special_records to build the four leading records of each src line.
PAD_WORD = '<blank>'
UNK_WORD = '<unk>'
UNK = 0  # NOTE(review): not referenced in the visible cells
BOS_WORD = '<s>'
EOS_WORD = '</s>'

In [9]:
# make sure - sorted entities
def _sort_key(entity):
    return entity['BeginOffset']

def test_order_entities(section_entities):
    
    sorted_entities = sorted(section_entities, key=_sort_key)
    
    if sorted_entities == section_entities: return True
    
    return False


def test_order_dataset(dataset):
    """
    Assert that the NER output of every section of every leaflet is ordered.

    For each leaflet in ``dataset`` the six sections (section1 .. section6) are
    inspected. Sections whose ``entity_recognition`` is None are ignored; for
    the others ``test_order_entities`` must return True, otherwise an
    AssertionError is raised.
    """
    for leaflet in dataset:
        sections = (leaflet.section1, leaflet.section2, leaflet.section3,
                    leaflet.section4, leaflet.section5, leaflet.section6)

        for section in sections:
            entities = section.entity_recognition
            if entities is not None:
                assert test_order_entities(entities) == True

            
# Run the ordering check on all three splits. An AssertionError here means that some
# section's entities are not sorted by 'BeginOffset'.
test_order_dataset(train_dataset)
test_order_dataset(valid_dataset)
test_order_dataset(test_dataset)

In [10]:
def create_summary_contentplan(dataset):
    """
    Transform dataset to be a suitable format for model data2text-plan.

    For every leaflet, section 6 is converted into two lines:

    * a content plan line: one record per recognised entity, in the order the
      entities are stored. A record is the four fields value, ENTITY, type and
      HOME/AWAY joined with DELIM. Spaces inside the entity text become "_",
      the type falls back to the entity's 'Category' when 'Type' is None or
      empty, and HOME/AWAY is drawn at random. Records are separated by a
      single space and the line always ends with a space plus a newline.
    * a summary line: the section text tokenized with wordpunct_tokenize (so
      punctuation becomes separate tokens), re-joined with single spaces and
      terminated by a newline.

    Leaflets whose section text or entity list is None are skipped. A leaflet
    with an empty entity list still produces a (record-less) content plan
    line, so that content plans and summaries stay aligned line by line when
    they are written to files.

    NOTE(review): the HOME/AWAY draw uses the global ``random`` state and no
    seed is set in the visible cells, so the output differs between runs.

    Returns:
        (content_plan_leaflets, summaries_leaflets): two lists of strings of
        equal length, one entry per kept leaflet.
    """
    summaries_leaflets = []
    content_plan_leaflets = []

    for leaflet in dataset:
        section_content = leaflet.section6.section_content
        section_entities = leaflet.section6.entity_recognition

        # skip if either input or output is None
        if section_content is None or section_entities is None:
            continue

        records = []
        for entity in section_entities:
            # a record must not contain spaces, they separate records
            entity_value = entity['Text'].replace(" ", "_")
            entity_type = entity['Type'] if entity['Type'] else entity['Category']
            # randomly choose HOME or AWAY
            side = HOME if random.randint(1, 2) == 1 else AWAY
            records.append(DELIM.join([entity_value, ENTITY, entity_type, side]))

        # Terminate by position rather than by looking the entity up in the list:
        # every plan, including an empty one, ends with the same " " + newline.
        content_plan_leaflets.append(" ".join(records) + " \n")

        # make sure to have punctuations as a separate token, then back to one string
        summaries_leaflets.append(" ".join(wordpunct_tokenize(section_content)) + "\n")

    return (content_plan_leaflets, summaries_leaflets)

In [11]:
def add_special_records(records):
    """
    Append the four special records required by the data2text-plan project.

    The records are, in this order, UNK_WORD, PAD_WORD, BOS_WORD and EOS_WORD,
    each followed by three PAD_WORD fields and joined with DELIM. They are
    appended to ``records`` in place and the same list is returned.
    """
    for leading_word in (UNK_WORD, PAD_WORD, BOS_WORD, EOS_WORD):
        fields = [leading_word, PAD_WORD, PAD_WORD, PAD_WORD]
        records.append(DELIM.join(fields))

    return records

In [12]:
# get the src_train - training data to be input to the model

def create_src_table(content_plan_leaflets):
    """
    Create src_train - input "table" to the model.

    There is no real table for a leaflet, so the "table" is simply the content
    plan itself, in unchanged order (the records are deliberately not shuffled,
    to make the task easier for the model), preceded by the four special
    records produced by add_special_records.

    Args:
        content_plan_leaflets: list of content plan lines; each is expected to
            end with a space followed by a newline.

    Returns:
        A list with one newline-terminated string per content plan: the special
        records, a space, then the content plan records.
    """
    # The special records are the same for every leaflet, so build the prefix once.
    special_prefix = " ".join(add_special_records([]))

    src_leaflets = []

    for leaflet_content_plan in content_plan_leaflets:
        # Remove the " " + newline terminator. Only strip what is really there:
        # a blind [:-2] would cut into the last record of a plan that lacks it.
        if leaflet_content_plan.endswith(" \n"):
            records_text = leaflet_content_plan[:-2]
        else:
            records_text = leaflet_content_plan.rstrip("\n")

        src_leaflets.append(special_prefix + " " + records_text + "\n")

    return src_leaflets

In [13]:
def create_training_sets(dataset):
    """
    Build and write the four training files used by data2text-plan.

    Files written, relative to the current working directory:
      - inter/train_content_plan.txt: content plan records, one leaflet per line
      - tgt_train.txt: tokenized section text, one leaflet per line
      - src_train.txt: special records followed by the content plan records
      - train_content_plan.txt: for every content plan record, the position of
        its first occurrence in the matching src_train.txt line

    All files are written and read back as UTF-8, because the records contain
    the non-ASCII DELIM character. The inter/ directory is created if missing.

    Returns:
        (src_leaflets, content_plan_leaflets, summaries_leaflets): the lines of
        src_train.txt, inter/train_content_plan.txt and tgt_train.txt.

    Raises:
        ValueError: a content plan record is missing from its src line.
        IndexError: the content plan file has fewer lines than the src file.
    """
    import os  # local import: only needed to create the output directory

    # Output files
    INTER_CONTENT_PLAN = 'inter/train_content_plan.txt'  # intermediate content plan input to second stage
    SRC_FILE = 'src_train.txt'  # src file input to first stage
    TRAIN_TGT_FILE = "tgt_train.txt"  # tgt file of second stage
    CONTENT_PLAN_OUT = 'train_content_plan.txt'  # content plan output of first stage

    # Create src - content_plan - summary
    content_plan_leaflets, summaries_leaflets = create_summary_contentplan(dataset)
    src_leaflets = create_src_table(content_plan_leaflets)

    os.makedirs(os.path.dirname(INTER_CONTENT_PLAN), exist_ok=True)

    # save to corresponding files; every entry already carries its own newline
    for path, lines in ((INTER_CONTENT_PLAN, content_plan_leaflets),
                        (TRAIN_TGT_FILE, summaries_leaflets),
                        (SRC_FILE, src_leaflets)):
        with open(path, 'w', encoding='utf-8') as output_file:
            output_file.writelines(lines)

    ### create last file needed - e.g (rotowire/train_content_plan.txt)
    with open(INTER_CONTENT_PLAN, 'r', encoding='utf-8') as corpus_file:
        content_plans = [line.split() for line in corpus_file]

    with open(SRC_FILE, 'r', encoding='utf-8') as corpus_file:
        inputs = [line.split() for line in corpus_file]

    # The content plan is rewritten as pointers: each record is replaced by the
    # index of its first occurrence in the src line of the same leaflet.
    outputs = []
    for i, training_sample in enumerate(inputs):
        pointers = [str(training_sample.index(record)) for record in content_plans[i]]
        outputs.append(" ".join(pointers))

    # one pointer line per leaflet, plus a final newline
    with open(CONTENT_PLAN_OUT, 'w', encoding='utf-8') as output_file:
        output_file.write("\n".join(outputs))
        output_file.write("\n")

    return src_leaflets, content_plan_leaflets, summaries_leaflets

In [14]:
# Build the training files (written to disk as a side effect) and keep the in-memory lines.
leaflets_src_train, leaflets_inter_contentplan_train, leaflets_tgt_train = create_training_sets(train_dataset)

In [15]:
# Read src_train.txt back as a list of lines (trailing newlines are kept).
# The records contain the non-ASCII DELIM character, so decode explicitly as UTF-8
# instead of relying on the locale's default encoding.
with open('src_train.txt', encoding='utf-8') as reader:
    src_train = reader.readlines()

In [16]:
# Read tgt_train.txt back as a list of lines (trailing newlines are kept).
# Decode explicitly as UTF-8 instead of relying on the locale's default encoding.
with open('tgt_train.txt', encoding='utf-8') as reader:
    tgt_train = reader.readlines()

In [17]:
# Load the pointer file: one line of space-separated indices per leaflet.
with open('train_content_plan.txt') as pointer_file:
    train_content_plan = list(pointer_file)

In [18]:
# Read the intermediate content plan back as a list of lines (trailing newlines are kept).
# The records contain the non-ASCII DELIM character, so decode explicitly as UTF-8
# instead of relying on the locale's default encoding.
with open('inter/train_content_plan.txt', encoding='utf-8') as reader:
    inter_train_content_plan = reader.readlines()

In [26]:
tgt_train[0]

'what cystagon contains - the active substance is cysteamine bitartrate ( mercaptamine bitartrate ). each hard capsule of cystagon 50 mg contains 50 mg of cysteamine ( as mercaptamine bitartrate ) each hard capsule of cystagon 150 mg contains 150 mg of cysteamine ( as mercaptamine bitartrate ) - the other ingredients are microcrystalline cellulose , starch , pregelatinized , magnesium stearate / sodium lauryl sulphate , colloidal silicon dioxide , croscarmellose sodium , capsule shells : gelatin , titanium dioxide , black ink on hard capsules ( e172 ). what cystagon looks like and contents of the pack hard capsules - cystagon 50 mg : white , opaque hard capsules with cysta 50 on the body and mylan on the cap . bottles of 100 or 500 hard capsules . all pack sizes may be not marketed . - cystagon 150 mg : white , opaque hard capsules with cystagon 150 on the body and mylan on the cap . bottles of 100 or 500 hard capsules . all pack sizes may be not marketed .\n'

======================================================================================================================
## Validation dataset 

In [29]:
def create_validation_sets(dataset):
    """
    Build and write the four validation files used by data2text-plan.

    Files written, relative to the current working directory:
      - inter/valid_content_plan.txt: content plan records, one leaflet per line
      - tgt_valid.txt: tokenized section text, one leaflet per line
      - src_valid.txt: special records followed by the content plan records
      - valid_content_plan.txt: for every content plan record, the position of
        its first occurrence in the matching src_valid.txt line

    All files are written and read back as UTF-8, because the records contain
    the non-ASCII DELIM character. The inter/ directory is created if missing.

    Returns:
        (src_leaflets, content_plan_leaflets, summaries_leaflets): the lines of
        src_valid.txt, inter/valid_content_plan.txt and tgt_valid.txt.

    Raises:
        ValueError: a content plan record is missing from its src line.
        IndexError: the content plan file has fewer lines than the src file.
    """
    import os  # local import: only needed to create the output directory

    # Output files
    INTER_CONTENT_PLAN_VALID = 'inter/valid_content_plan.txt'  # intermediate content plan input to second stage
    SRC_FILE_VALID = 'src_valid.txt'  # src file input to first stage
    TRAIN_TGT_FILE_VALID = "tgt_valid.txt"  # tgt file of second stage
    CONTENT_PLAN_OUT_VALID = 'valid_content_plan.txt'  # content plan output of first stage

    # Create src - content_plan - summary
    content_plan_leaflets, summaries_leaflets = create_summary_contentplan(dataset)
    src_leaflets = create_src_table(content_plan_leaflets)

    os.makedirs(os.path.dirname(INTER_CONTENT_PLAN_VALID), exist_ok=True)

    # save to corresponding files; every entry already carries its own newline
    for path, lines in ((INTER_CONTENT_PLAN_VALID, content_plan_leaflets),
                        (TRAIN_TGT_FILE_VALID, summaries_leaflets),
                        (SRC_FILE_VALID, src_leaflets)):
        with open(path, 'w', encoding='utf-8') as output_file:
            output_file.writelines(lines)

    ### create last file needed - e.g (rotowire/valid_content_plan.txt)
    with open(INTER_CONTENT_PLAN_VALID, 'r', encoding='utf-8') as corpus_file:
        content_plans = [line.split() for line in corpus_file]

    with open(SRC_FILE_VALID, 'r', encoding='utf-8') as corpus_file:
        inputs = [line.split() for line in corpus_file]

    # The content plan is rewritten as pointers: each record is replaced by the
    # index of its first occurrence in the src line of the same leaflet.
    outputs = []
    for i, validation_sample in enumerate(inputs):
        pointers = [str(validation_sample.index(record)) for record in content_plans[i]]
        outputs.append(" ".join(pointers))

    # one pointer line per leaflet, plus a final newline
    with open(CONTENT_PLAN_OUT_VALID, 'w', encoding='utf-8') as output_file:
        output_file.write("\n".join(outputs))
        output_file.write("\n")

    return src_leaflets, content_plan_leaflets, summaries_leaflets

In [30]:
# Build the validation files (written to disk as a side effect) and keep the in-memory lines.
leaflets_src_valid, leaflets_inter_contentplan_valid, leaflets_tgt_valid = create_validation_sets(valid_dataset)

In [37]:
# Read the intermediate validation content plan back as a list of lines.
# The records contain the non-ASCII DELIM character, so decode explicitly as UTF-8
# instead of relying on the locale's default encoding.
with open('inter/valid_content_plan.txt', encoding='utf-8') as reader:
    contentplan_valid = reader.readlines()

INTER_CONTENT_PLAN_VALID = 'inter/valid_content_plan.txt'  # intermediate content plan input to second stage
    SRC_FILE_VALID = 'src_valid.txt'  # src file input to first stage
    TRAIN_TGT_FILE_VALID = "tgt_valid.txt"  # tgt file of second stage
    CONTENT_PLAN_OUT_VALID = 'valid_content_plan.txt'  # content plan output of first stage

================================================================================================================================
## Test dataset 

In [31]:
def create_test_sets(dataset):
    """
    Build and write the two test files: the src "table" and the target text.

    Files written, relative to the current working directory:
      - test/src_test.txt: special records followed by the content plan records
      - test/tgt_test.txt: tokenized section text, one leaflet per line

    No content plan files are written for the test split. Both files are
    written as UTF-8, because the records contain the non-ASCII DELIM
    character. The test/ directory is created if missing.

    Returns:
        (src_leaflets, summaries_leaflets): the lines of the two files.
    """
    import os  # local import: only needed to create the output directory

    # Output files
    SRC_FILE_TEST = 'test/src_test.txt'  # src file input to first stage
    TRAIN_TGT_FILE_TEST = "test/tgt_test.txt"  # tgt file of second stage

    # Create src - content_plan - summary
    content_plan_leaflets, summaries_leaflets = create_summary_contentplan(dataset)
    src_leaflets = create_src_table(content_plan_leaflets)

    os.makedirs(os.path.dirname(SRC_FILE_TEST), exist_ok=True)

    # save just summary and src data; every entry already carries its own newline
    with open(TRAIN_TGT_FILE_TEST, 'w', encoding='utf-8') as summary_file:
        summary_file.writelines(summaries_leaflets)

    with open(SRC_FILE_TEST, 'w', encoding='utf-8') as src_file:
        src_file.writelines(src_leaflets)

    return src_leaflets, summaries_leaflets

In [32]:
# Build the test files (written to disk as a side effect) and keep the in-memory lines.
leaflets_src_test, leaflets_tgt_test = create_test_sets(test_dataset)

=======================================================================================================================
## Creating *train-roto-ptrs.txt*

In [33]:
# Files used to build the pointer file. The first two are inputs here: they carry the
# same paths that create_training_sets writes to. OUTPUT_FILE is the file produced below.
INTER_CONTENT_PLAN = 'inter/train_content_plan.txt'  # intermediate content plan input to second stage
TRAIN_TGT_FILE = "tgt_train.txt"  # tgt file of second stage
OUTPUT_FILE = "train-roto-ptrs.txt"

In [34]:
# Target summaries, one leaflet per line (trailing newlines are kept).
# Decode explicitly as UTF-8 instead of relying on the locale's default encoding.
with open(TRAIN_TGT_FILE, encoding='utf-8') as reader:
    leaflet_tgt_train = reader.readlines()

In [35]:
# Content plans, one leaflet per line (trailing newlines are kept).
# The records contain the non-ASCII DELIM character, so decode explicitly as UTF-8
# instead of relying on the locale's default encoding.
with open(INTER_CONTENT_PLAN, encoding='utf-8') as reader:
    leaflets_inter_content_plan = reader.readlines()

For example, the last entry `245,39` in train_roto_ptrs[1] indicates that the 245th token in the summary matches the 39th content plan entry.  

Phoenix ----> Phoenix￨Suns￨TEAM-CITY￨HOME  
Suns ----> Suns￨Suns￨TEAM-NAME￨HOME  
39 ----> 39￨Suns￨TEAM-WINS￨HOME  
38 ----> 38￨Suns￨TEAM-LOSSES￨HOME  
87 ----> 87￨Suns￨TEAM-PTS￨HOME  
85 ----> 85￨Jazz￨TEAM-PTS￨AWAY  
Utah ----> Utah￨Jazz￨TEAM-CITY￨AWAY  

In [36]:
leaflet_tgt_train[0]

'what cystagon contains - the active substance is cysteamine bitartrate ( mercaptamine bitartrate ). each hard capsule of cystagon 50 mg contains 50 mg of cysteamine ( as mercaptamine bitartrate ) each hard capsule of cystagon 150 mg contains 150 mg of cysteamine ( as mercaptamine bitartrate ) - the other ingredients are microcrystalline cellulose , starch , pregelatinized , magnesium stearate / sodium lauryl sulphate , colloidal silicon dioxide , croscarmellose sodium , capsule shells : gelatin , titanium dioxide , black ink on hard capsules ( e172 ). what cystagon looks like and contents of the pack hard capsules - cystagon 50 mg : white , opaque hard capsules with cysta 50 on the body and mylan on the cap . bottles of 100 or 500 hard capsules . all pack sizes may be not marketed . - cystagon 150 mg : white , opaque hard capsules with cystagon 150 on the body and mylan on the cap . bottles of 100 or 500 hard capsules . all pack sizes may be not marketed .\n'

In [37]:
leaflets_inter_content_plan[0]

'cystagon￨Section6￨PRODUCT_NAME￨HOME cysteamine_bitartrate_(mercaptamine_bitartrate￨Section6￨TREATMENT￨HOME cystagon￨Section6￨GENERIC_NAME￨AWAY 50￨Section6￨NUMBER￨HOME 50￨Section6￨NUMBER￨AWAY cysteamine_(as_mercaptamine_bitartrate)￨Section6￨TREATMENT￨AWAY cystagon￨Section6￨GENERIC_NAME￨HOME 150￨Section6￨NUMBER￨AWAY 150￨Section6￨NUMBER￨AWAY cysteamine_(as_mercaptamine_bitartrate)￨Section6￨TREATMENT￨AWAY microcrystalline_cellulose￨Section6￨GENERIC_NAME￨HOME starch￨Section6￨GENERIC_NAME￨HOME pregelatinized￨Section6￨TREATMENT￨AWAY magnesium_stearate￨Section6￨GENERIC_NAME￨AWAY sodium_lauryl_sulphate￨Section6￨GENERIC_NAME￨HOME colloidal_silicon_dioxide￨Section6￨GENERIC_NAME￨AWAY croscarmellose_sodium￨Section6￨GENERIC_NAME￨HOME capsule_shells￨Section6￨TREATMENT￨AWAY gelatin,_titanium_dioxide￨Section6￨TREATMENT￨HOME black_ink￨Section6￨GENERIC_NAME￨HOME cystagon￨Section6￨GENERIC_NAME￨HOME the_pack_hard_capsules￨Section6￨TREATMENT￨HOME cystagon￨Section6￨GENERIC_NAME￨HOME 50￨Section6￨NUMBER￨HOME 

In [38]:
def _match_tokens_to_plan(tokens, plan_values, max_ngram=4):
    """
    Align summary tokens with content plan values.

    For every token position, the n-grams starting there (1 to ``max_ngram``
    tokens joined with "_", as multi-word entity values are written in the
    content plan) are compared with the content plan values. The first not yet
    used value equal to any of these n-grams is paired with the position and
    is then marked as used, so each value is matched at most once and each
    position yields at most one pair.

    Args:
        tokens: list of summary tokens.
        plan_values: list of content plan values (first field of each record).
        max_ngram: longest n-gram, in tokens, that is tried.

    Returns:
        List of "token_pos,plan_index" strings in token order.
    """
    # None marks a used value; unlike a string marker it can never equal a real token.
    remaining = list(plan_values)
    pairs = []

    for token_pos in range(len(tokens)):
        # only n-grams that fit completely inside the summary are candidates
        candidates = set()
        for width in range(1, max_ngram + 1):
            if token_pos + width <= len(tokens):
                candidates.add("_".join(tokens[token_pos:token_pos + width]))

        for plan_index, value in enumerate(remaining):
            if value is not None and value in candidates:
                remaining[plan_index] = None
                pairs.append(str(token_pos) + "," + str(plan_index))
                # find just one match
                break

    return pairs


roto_pts_content = []

# for each leaflet
for leaflet_num in range(len(leaflet_tgt_train)):

    # get current leaflet and content plan
    current_leaflet = leaflet_tgt_train[leaflet_num].split()
    current_content_plan = leaflets_inter_content_plan[leaflet_num].split()

    # get the values of content plan (first field of every record)
    instances = [entry.split(DELIM)[0] for entry in current_content_plan]

    # pairs (index_tgt, index_contentplan) for this leaflet, " " between pairs, newline at the end
    current_pairs = _match_tokens_to_plan(current_leaflet, instances)
    roto_pts_content.append(" ".join(current_pairs) + "\n")

In [39]:
OUTPUT_FILE = "train-roto-ptrs.txt"

# Every entry of roto_pts_content already ends with a newline. The with-block
# closes the file even if writing fails.
with open(OUTPUT_FILE, 'w') as ptrs_file:
    ptrs_file.writelines(roto_pts_content)

In [40]:
len(roto_pts_content)

1042

In [41]:
# Position of the leaflet that the cells below inspect by hand.
index = 500

In [42]:
roto_pts_content[index]

'1,0 8,2 18,4 20,5 22,6 33,8 40,9 43,10 46,11 49,12 52,13 56,14 59,15 64,23 70,17 73,18 76,19 79,20 85,21 87,1 92,24 95,25 98,26 101,27 105,22 115,28 123,31 141,33 143,34 145,35\n'

In [43]:
# Number of (token, record) pairs found for the inspected leaflet.
len(roto_pts_content[index].split())

30

In [44]:
# Number of content plan records of the inspected leaflet; the difference from the
# number of pairs above is the number of records that got no pointer.
len(leaflets_inter_content_plan[index].split())

37

### Check whether correct

In [45]:
# Indices of the content plan records that received a pointer (used by the next cell).
content_plan_indeces = []

# Show, for every pair, the four summary tokens starting at the matched position
# next to the content plan record they were aligned with.
for pointer in roto_pts_content[index].split():
    parts = pointer.split(",")
    tgt_pos = int(parts[0])
    plan_pos = int(parts[1])

    content_plan_indeces.append(plan_pos)

    print(leaflet_tgt_train[index].split()[tgt_pos:tgt_pos + 4],
          leaflets_inter_content_plan[index].split()[plan_pos],
          sep=" ----> ")
    

['zyclara', 'contains', '-', 'the'] ----> zyclara￨Section6￨PRODUCT_NAME￨AWAY
['imiquimod', '.', 'each', 'sachet'] ----> imiquimod￨Section6￨GENERIC_NAME￨HOME
['imiquimod', 'in', '250', 'mg'] ----> imiquimod￨Section6￨GENERIC_NAME￨AWAY
['250', 'mg', 'cream', '('] ----> 250￨Section6￨NUMBER￨AWAY
['cream', '(', '100', 'mg'] ----> cream￨Section6￨TREATMENT￨AWAY
['imiquimod', ').', '-', 'the'] ----> imiquimod￨Section6￨GENERIC_NAME￨AWAY
['isostearic', 'acid', ',', 'benzyl'] ----> isostearic_acid￨Section6￨GENERIC_NAME￨AWAY
['benzyl', 'alcohol', ',', 'cetyl'] ----> benzyl_alcohol￨Section6￨GENERIC_NAME￨HOME
['cetyl', 'alcohol', ',', 'stearyl'] ----> cetyl_alcohol￨Section6￨GENERIC_NAME￨HOME
['stearyl', 'alcohol', ',', 'white'] ----> stearyl_alcohol￨Section6￨GENERIC_NAME￨HOME
['white', 'soft', 'paraffin', ','] ----> white_soft_paraffin￨Section6￨GENERIC_NAME￨AWAY
['polysorbate', '60', ',', 'sorbitan'] ----> polysorbate_60￨Section6￨GENERIC_NAME￨AWAY
['sorbitan', 'stearate', ',', 'glycerol'] ----> sorbi

In [46]:
# List the content plan records that got no pointer
for i, record in enumerate(leaflets_inter_content_plan[index].split()):
    if i not in content_plan_indeces:
        print(record)

# explanation - 3-word long token
# explanation - hiv_infection ---> because NER outputs both 'hiv' and 'hiv_infection', the content plan has 2 tokens starting with 'hiv'

9.375￨Section6￨NUMBER￨HOME
3.75￨Section6￨NUMBER￨HOME
glycerol,_methyl_parahydroxybenzoate_(e￨Section6￨TREATMENT￨HOME
the_pack_-_each_zyclara￨Section6￨TREATMENT￨AWAY
3.75￨Section6￨NUMBER￨HOME
a_white_to_slightly_yellow_cream￨Section6￨TREATMENT￨AWAY
polyester/_white_low_density_polyethylene/aluminium_foil_sachets￨Section6￨TREATMENT￨AWAY


In [47]:
leaflet_tgt_train[500]

'what zyclara contains - the active substance is imiquimod . each sachet contains 9 . 375 mg of imiquimod in 250 mg cream ( 100 mg of cream contains 3 . 75 mg imiquimod ). - the other ingredients are isostearic acid , benzyl alcohol , cetyl alcohol , stearyl alcohol , white soft paraffin , polysorbate 60 , sorbitan stearate , glycerol , methyl parahydroxybenzoate ( e 218 ), propyl parahydroxybenzoate ( e 216 ), xanthan gum , purified water ( see also section 2 " zyclara contains methyl parahydroxybenzoate , propyl parahydroxybenzoate , cetyl alcohol , stearyl alcohol and benzyl alcohol "). what zyclara looks like and contents of the pack - each zyclara 3 . 75 % cream sachet contains 250 mg of a white to slightly yellow cream with a uniform appearance . - each box contains 14 , 28 or 56 single - use polyester / white low density polyethylene / aluminium foil sachets . not all pack sizes may be marketed .\n'

In [48]:
leaflets_inter_content_plan[500]

'zyclara￨Section6￨PRODUCT_NAME￨AWAY zyclara￨Section6￨BRAND_NAME￨AWAY imiquimod￨Section6￨GENERIC_NAME￨HOME 9.375￨Section6￨NUMBER￨HOME imiquimod￨Section6￨GENERIC_NAME￨AWAY 250￨Section6￨NUMBER￨AWAY cream￨Section6￨TREATMENT￨AWAY 3.75￨Section6￨NUMBER￨HOME imiquimod￨Section6￨GENERIC_NAME￨AWAY isostearic_acid￨Section6￨GENERIC_NAME￨AWAY benzyl_alcohol￨Section6￨GENERIC_NAME￨HOME cetyl_alcohol￨Section6￨GENERIC_NAME￨HOME stearyl_alcohol￨Section6￨GENERIC_NAME￨HOME white_soft_paraffin￨Section6￨GENERIC_NAME￨AWAY polysorbate_60￨Section6￨GENERIC_NAME￨AWAY sorbitan_stearate￨Section6￨GENERIC_NAME￨HOME glycerol,_methyl_parahydroxybenzoate_(e￨Section6￨TREATMENT￨HOME propyl_parahydroxybenzoate￨Section6￨GENERIC_NAME￨AWAY e_216￨Section6￨GENERIC_NAME￨HOME xanthan_gum￨Section6￨GENERIC_NAME￨AWAY purified_water￨Section6￨GENERIC_NAME￨AWAY 2￨Section6￨NUMBER￨HOME zyclara￨Section6￨BRAND_NAME￨HOME methyl_parahydroxybenzoate￨Section6￨GENERIC_NAME￨HOME propyl_parahydroxybenzoate￨Section6￨GENERIC_NAME￨AWAY cetyl_alcohol