## Main goal - split dataset into train-val-test

- Additionally, remove outlier sections (sections whose content is too long).   
- Additionally, add PRODUCT_NAME as the first entity of each section.

In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import random

# split into train-dev-test
from sklearn.model_selection import train_test_split

In [2]:
# load array of objects, where object - class Leaflet
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load a dataset file that you produced yourself or otherwise trust
with open("datasets/LEAFLET_DATASET_PROCESSED_NER_COMBINED.pickle", "rb") as f:
    package_leaflets = pickle.load(f)

In [3]:
# number of leaflets loaded from the pickle
len(package_leaflets)

1336

### Clean Leaflets dataset    

- replace empty sections (empty section_content or empty entity_recognition) with None
- set duplicate section_content to None 
- set duplicate entity_recognition to None 
- remove outliers

In [4]:
# Clean-up pass over every section of every leaflet:
#   * a section whose text or NER output is missing (None or empty) gets both fields set to None
#   * a NER output already seen before (same entity texts, same order) is set to None
#   * a section text already seen before is set to None


# how many times each NER output (joined entity texts) has been observed so far
unique_NER_outputs = dict()

# how many times each section text has been observed so far
unique_section_content = dict()

COUNT_DUPLICATE_NER_OUTPUTS = 0
COUNT_DUPLICATE_SECTION_CONTENT = 0


for leaflet in package_leaflets:

    for section in (leaflet.section1, leaflet.section2, leaflet.section3,
                    leaflet.section4, leaflet.section5, leaflet.section6):

        # text and NER output are only usable as a pair:
        # when one of them is missing, the other one is dropped as well
        if section.section_content is None:
            section.entity_recognition = None
            continue

        if section.entity_recognition is None:
            section.section_content = None
            continue

        if len(section.section_content) == 0 or len(section.entity_recognition) == 0:
            section.section_content = None
            section.entity_recognition = None
            continue

        # a NER output is identified by the 'Text' of its entities, in order
        ner_key = ''.join(entity['Text'] + ' ' for entity in section.entity_recognition)
        ner_seen_before = ner_key in unique_NER_outputs
        unique_NER_outputs[ner_key] = unique_NER_outputs.get(ner_key, 0) + 1

        # a section text is identified by the text itself
        text = section.section_content
        text_seen_before = text in unique_section_content
        unique_section_content[text] = unique_section_content.get(text, 0) + 1

        # `section` is the object stored on the leaflet (the guards above already
        # rely on that), so duplicates can be cleared through it directly
        if ner_seen_before:
            COUNT_DUPLICATE_NER_OUTPUTS += 1
            section.entity_recognition = None

        if text_seen_before:
            COUNT_DUPLICATE_SECTION_CONTENT += 1
            section.section_content = None

In [5]:
# number of sections whose NER output repeated one that was already observed
COUNT_DUPLICATE_NER_OUTPUTS

475

In [6]:
# number of sections whose text repeated one that was already observed
COUNT_DUPLICATE_SECTION_CONTENT

143

In [7]:
# remove outliers

# length of every section text that is not None, one list per section number
section1_content_length = [
    len(leaflet.section1.section_content)
    for leaflet in package_leaflets
    if leaflet.section1.section_content is not None
]
section2_content_length = [
    len(leaflet.section2.section_content)
    for leaflet in package_leaflets
    if leaflet.section2.section_content is not None
]
section3_content_length = [
    len(leaflet.section3.section_content)
    for leaflet in package_leaflets
    if leaflet.section3.section_content is not None
]
section4_content_length = [
    len(leaflet.section4.section_content)
    for leaflet in package_leaflets
    if leaflet.section4.section_content is not None
]
section5_content_length = [
    len(leaflet.section5.section_content)
    for leaflet in package_leaflets
    if leaflet.section5.section_content is not None
]
section6_content_length = [
    len(leaflet.section6.section_content)
    for leaflet in package_leaflets
    if leaflet.section6.section_content is not None
]

In [8]:
# mean text length per section
for label, lengths in (
    ('Section 1: ', section1_content_length),
    ('Section 2: ', section2_content_length),
    ('Section 3: ', section3_content_length),
    ('Section 4: ', section4_content_length),
    ('Section 5: ', section5_content_length),
    ('Section 6: ', section6_content_length),
):
    print(label, np.mean(lengths))

Section 1:  1005.2061933534743
Section 2:  4654.529144587434
Section 3:  2397.9171686746986
Section 4:  3595.102056359482
Section 5:  644.16
Section 6:  1033.1423164269493


In [9]:
def find_outliers_threshold(data, name='', m=3.5):
    """
    Find the length from which a section is treated as an outlier.

    Only the right side of the distribution is considered: a value is an
    outlier when it exceeds the mean by more than `m` standard deviations.
    Unusually small values are never reported, so they can not drag the
    threshold down.

    Parameters:
        data: 1-D array (or any sequence) of section lengths
        name: label printed in front of the summary line
        m:    how many standard deviations above the mean a value has to be
              to count as an outlier

    Returns:
        The smallest outlier. When there are no outliers (or no data at all)
        float('inf') is returned, so a `length >= threshold` check never matches.
    """

    # accept plain lists as well as numpy arrays
    data = np.asarray(data)

    if data.size:
        # one mask for both subsets, so every value lands in exactly one of them
        is_outlier = (data - np.mean(data)) > m * np.std(data)
    else:
        # nothing to measure: avoid mean/std of an empty array
        is_outlier = np.zeros(0, dtype=bool)

    # filtered data without outliers
    filtered_data = data[~is_outlier]

    # outliers
    outliers = data[is_outlier]

    # section content with length >= threshold ---> outlier
    threshold = outliers.min() if outliers.size else float('inf')

    # print - number of outliers
    print(name, len(data), '-', len(filtered_data), "=", len(outliers), '\tThreshold:', threshold)

    return threshold

In [10]:
# outlier threshold of every section, keyed by the section number as a string
outliers_threshold = {}

for section_key, section_label, section_lengths in (
    ('1', 'Section1:', section1_content_length),
    ('2', 'Section2:', section2_content_length),
    ('3', 'Section3:', section3_content_length),
    ('4', 'Section4:', section4_content_length),
    ('5', 'Section5:', section5_content_length),
    ('6', 'Section6:', section6_content_length),
):
    outliers_threshold[section_key] = find_outliers_threshold(np.array(section_lengths), name=section_label)

Section1: 1324 - 1314 = 10 	Threshold: 3858
Section2: 1321 - 1309 = 12 	Threshold: 12766
Section3: 1328 - 1313 = 15 	Threshold: 8618
Section4: 1313 - 1295 = 18 	Threshold: 12048
Section5: 1175 - 1172 = 3 	Threshold: 4594
Section6: 1321 - 1311 = 10 	Threshold: 4233


**Note**: Previously, I used hard-coded thresholds chosen from an analysis of the length distributions:  

{'1': 4000, '2': 14000, '3': 9000, '4': 14000, '5': 2000, '6': 5000}     

Section1: 1324 - 1316 = 8 	Threshold: 4000     
Section2: 1321 - 1316 = 5 	Threshold: 14000     
Section3: 1328 - 1319 = 9 	Threshold: 9000     
Section4: 1313 - 1307 = 6 	Threshold: 14000     
Section5: 1231 - 1228 = 3 	Threshold: 2000     
Section6: 1321 - 1316 = 5 	Threshold: 5000       

In [None]:
# plt.hist(section1_content_length, bins=300)
# plt.show()

In [11]:
# set outliers section content to None
# NOTE(review): only section_content is cleared here, entity_recognition of an
# outlier section stays as it is - confirm that this is intended

for leaflet in package_leaflets:

    # every section of the leaflet together with the threshold of its section number
    sections_with_threshold = (
        (leaflet.section1, outliers_threshold['1']),
        (leaflet.section2, outliers_threshold['2']),
        (leaflet.section3, outliers_threshold['3']),
        (leaflet.section4, outliers_threshold['4']),
        (leaflet.section5, outliers_threshold['5']),
        (leaflet.section6, outliers_threshold['6']),
    )

    for section, threshold in sections_with_threshold:
        text = section.section_content
        if text is not None and len(text) >= threshold:
            section.section_content = None

In [12]:
# check the mean length of each section after removing outliers

# length of every section text that is still not None, one list per section number
section1_content_length = [
    len(leaflet.section1.section_content)
    for leaflet in package_leaflets
    if leaflet.section1.section_content is not None
]
section2_content_length = [
    len(leaflet.section2.section_content)
    for leaflet in package_leaflets
    if leaflet.section2.section_content is not None
]
section3_content_length = [
    len(leaflet.section3.section_content)
    for leaflet in package_leaflets
    if leaflet.section3.section_content is not None
]
section4_content_length = [
    len(leaflet.section4.section_content)
    for leaflet in package_leaflets
    if leaflet.section4.section_content is not None
]
section5_content_length = [
    len(leaflet.section5.section_content)
    for leaflet in package_leaflets
    if leaflet.section5.section_content is not None
]
section6_content_length = [
    len(leaflet.section6.section_content)
    for leaflet in package_leaflets
    if leaflet.section6.section_content is not None
]

In [13]:
# mean text length per section, outliers excluded
for label, lengths in (
    ('Section 1: ', section1_content_length),
    ('Section 2: ', section2_content_length),
    ('Section 3: ', section3_content_length),
    ('Section 4: ', section4_content_length),
    ('Section 5: ', section5_content_length),
    ('Section 6: ', section6_content_length),
):
    print(label, np.mean(lengths))

Section 1:  962.7945205479452
Section 2:  4559.597402597403
Section 3:  2300.4912414318355
Section 4:  3452.67722007722
Section 5:  630.4846416382253
Section 6:  981.7040427154843


In [14]:
print("Total number of documents --- ", len(package_leaflets))

# how many leaflets still have a text for each section
for label, lengths in (
    ('Num. of section 1 not None: ', section1_content_length),
    ('Num. of section 2 not None: ', section2_content_length),
    ('Num. of section 3 not None: ', section3_content_length),
    ('Num. of section 4 not None: ', section4_content_length),
    ('Num. of section 5 not None: ', section5_content_length),
    ('Num. of section 6 not None: ', section6_content_length),
):
    print(label, len(lengths))

Total number of documents ---  1336
Num. of section 1 not None:  1314
Num. of section 2 not None:  1309
Num. of section 3 not None:  1313
Num. of section 4 not None:  1295
Num. of section 5 not None:  1172
Num. of section 6 not None:  1311


## Make sure the entities of each section are sorted by "BeginOffset"

In [15]:
def _sort_key(entity):
    return entity['BeginOffset']

In [16]:
def test_order_entities(section_entities):
    """
    Check that the entities of a section are ordered by 'BeginOffset'.

    Returns True when sorting the entities by `_sort_key` leaves the list
    unchanged, False otherwise.
    """
    return sorted(section_entities, key=_sort_key) == section_entities

In [17]:
# every section that still has NER output must have it ordered by 'BeginOffset'
for leaflet in package_leaflets:

    for section in (leaflet.section1, leaflet.section2, leaflet.section3,
                    leaflet.section4, leaflet.section5, leaflet.section6):

        entities = section.entity_recognition

        if entities is not None:
            assert test_order_entities(entities)

## Check number of sections with section_content != None and entities != None

#### [Before splitting] Number of sections that are not None

In [18]:
def count_sections(dataset):
    """
    Count the usable sections of a dataset, per section number.

    A section is counted when neither its section_content nor its
    entity_recognition is None.

    Parameters:
        dataset: iterable of leaflets exposing section1 ... section6

    Returns:
        dict mapping '1' ... '6' to the number of counted sections
    """
    totals = {str(number): 0 for number in range(1, 7)}

    for leaflet in dataset:

        sections = (leaflet.section1, leaflet.section2, leaflet.section3,
                    leaflet.section4, leaflet.section5, leaflet.section6)

        for number, section in enumerate(sections, start=1):

            # sections without text or without NER output (e.g. duplicates) are not counted
            if section.section_content is not None and section.entity_recognition is not None:
                totals[str(number)] += 1

    return totals

In [19]:
# usable sections (text and NER output both not None) in the whole dataset
count_sections(package_leaflets)

{'1': 1311, '2': 1309, '3': 1310, '4': 1287, '5': 861, '6': 1304}

## Add <product_name> as first entity

In [20]:
# Prepend a PRODUCT_NAME entity (the lower-cased product name) to the NER
# output of every section that has one.
for leaflet in package_leaflets:

    current_leaflet_sections = [leaflet.section1, leaflet.section2,
                                leaflet.section3, leaflet.section4,
                                leaflet.section5, leaflet.section6]

    for current_section in current_leaflet_sections:

        # skip None (an empty NER output is normalised to None as well)
        if current_section.entity_recognition is None or len(current_section.entity_recognition) == 0:
            current_section.entity_recognition = None
            continue

        # offsets are 0/0 because the product name is not a span of the section text
        product_name_entity = {'Text': leaflet.product_name.lower(), 'Type': 'PRODUCT_NAME',
                               'BeginOffset': 0, 'EndOffset': 0}

        # the cell may be executed more than once in a notebook session:
        # do not prepend the very same entity a second time
        if current_section.entity_recognition[0] == product_name_entity:
            continue

        # add product_name as 1st Entity (the list is modified in place)
        current_section.entity_recognition.insert(0, product_name_entity)

#### Test adding <product_name> as 1st entity

In [21]:
# every section that has NER output must start with the PRODUCT_NAME entity
for leaflet in package_leaflets:

    for section in (leaflet.section1, leaflet.section2, leaflet.section3,
                    leaflet.section4, leaflet.section5, leaflet.section6):

        entities = section.entity_recognition

        if entities is not None:
            assert entities[0]['Type'] == 'PRODUCT_NAME'

## Split dataset into train-dev-test

In [22]:
# split dataset into train-valid-test (0.8-0.1-0.1) and shuffle

# train - test: hold out 10% of all leaflets for testing
train_leaflets, test_leaflets = train_test_split(package_leaflets, test_size=0.1, random_state=42, shuffle=True)

# train - valid: hold out as many leaflets for validation as for testing, so the
# validation split stays at ~10% of the full dataset whatever the dataset size is
train_leaflets, valid_leaflets = train_test_split(train_leaflets, test_size=len(test_leaflets), random_state=42, shuffle=True)

### Check the distribution of sections across train-dev-test

In [23]:
# usable sections per section number in the training split
count_sections(train_leaflets)

{'1': 1049, '2': 1046, '3': 1044, '4': 1028, '5': 682, '6': 1042}

In [24]:
# usable sections per section number in the validation split
count_sections(valid_leaflets)

{'1': 129, '2': 130, '3': 133, '4': 130, '5': 98, '6': 129}

In [25]:
# usable sections per section number in the test split
count_sections(test_leaflets)

{'1': 133, '2': 133, '3': 133, '4': 129, '5': 81, '6': 133}

In [26]:
# number of leaflets in the training split
len(train_leaflets)

1068

In [27]:
# number of leaflets in the test split
len(test_leaflets)

134

In [28]:
# number of leaflets in the validation split
len(valid_leaflets)

134

In [29]:
# save results: pickle the training split
with open("datasets/LEAFLET_TRAIN_DATASET.pickle", "wb") as f:
    pickle.dump(train_leaflets, f)

In [30]:
# save results: pickle the validation split
with open("datasets/LEAFLET_VALID_DATASET.pickle", "wb") as f:
    pickle.dump(valid_leaflets, f)

In [31]:
# save results: pickle the test split
with open("datasets/LEAFLET_TEST_DATASET.pickle", "wb") as f:
    pickle.dump(test_leaflets, f)

-------------------------------------------------------------------------------------------