# Extraction of Names, Scenes and Descriptions

In [56]:
import cv2
import numpy as np
import os
import pandas as pd
import json
import collections
import string

In [2]:
def define_range(path, attribute):
    """Return the horizontal pixel ranges of the first and second pages.

    The image at `path` is read only to obtain its width; the fractional
    page limits configured for `attribute` in `dict_attribute` are then
    scaled by that width.

    Args:
        path: Path of the scanned image.
        attribute: Key of `dict_attribute` whose page limits are used.

    Returns:
        Tuple `(x_0_lower, x_0_upper, x_1_lower, x_1_upper)` in pixels.

    Raises:
        FileNotFoundError: If the image cannot be read.
        KeyError: If `attribute` is not configured in `dict_attribute`.
    """
    image = cv2.imread(path, 0)
    if image is None:
        # cv2.imread does not raise on a missing/unreadable file, it returns
        # None; fail with an explicit message instead of an AttributeError.
        raise FileNotFoundError("Could not read image: " + str(path))

    # shape[1] is the number of columns, i.e. the image width
    width = image.shape[1]
    limits = dict_attribute[attribute]

    return (width * limits['page_0_lower'],
            width * limits['page_0_upper'],
            width * limits['page_1_lower'],
            width * limits['page_1_upper'])

In [3]:
def change_x(coord, x_0_lower, x_0_upper, x_1_lower, x_1_upper):
    """Return the page on which an x coordinate falls.

    Returns:
        0 if `coord` lies in the first page range (bounds included),
        1 if it lies in the second page range (bounds included),
        -1 if it lies in neither. The first page is checked first, so a
        coordinate belonging to both ranges is reported as page 0.
    """
    page_ranges = (
        (0, x_0_lower, x_0_upper),
        (1, x_1_lower, x_1_upper),
    )
    for page, lower, upper in page_ranges:
        if lower <= coord <= upper:
            return page
    return -1

In [4]:
def find_bounds(text, dict_bounds, mask, x_0_lower, x_0_upper, x_1_lower, x_1_upper, attribute):
    """Record an OCR box in `dict_bounds` if it is classified as `attribute`.

    The box is shrunk by a configurable margin on every side, the values of
    `mask` inside the shrunk box are averaged, and the box is kept when that
    mean exceeds the attribute's 'mean_proba_threshold' and its left edge
    falls inside one of the two page ranges.

    Args:
        text: One OCR result (mapping/row) with 'Top_Left_X', 'Top_Left_Y',
            'Bottom_Right_X', 'Bottom_Right_Y' and 'Text' entries.
        dict_bounds: Dict updated in place; maps a page index (0 or 1) to a
            list of `(Top_Left_Y, attribute, Text)` tuples.
        mask: 2-D array indexed as `mask[y][x]`.
        x_0_lower, x_0_upper: Pixel range of the first page.
        x_1_lower, x_1_upper: Pixel range of the second page.
        attribute: Key of `dict_attribute` providing the parameters.

    Returns:
        None. The result is communicated through `dict_bounds`.
    """
    params = dict_attribute[attribute]
    top, bottom = text['Top_Left_Y'], text['Bottom_Right_Y']
    left, right = text['Top_Left_X'], text['Bottom_Right_X']

    # Margin removed at the top and bottom: a fraction of the box HEIGHT.
    # (The 'width_box'/'height_box' keys used to be swapped between the two
    # axes; both values are equal for every attribute configured in this
    # file, so results are unchanged.)
    margin_y = int((bottom - top) * params['height_box'])
    # Margin removed at the left and right: a fraction of the box WIDTH.
    margin_x = int((right - left) * params['width_box'])

    # One slice replaces the per-pixel np.append loop. Bounds are clamped at 0
    # so negative coordinates cannot wrap around to the far side of the mask;
    # slicing already truncates at the mask's lower/right edges.
    region = np.asarray(mask)[
        max(int(top) + margin_y, 0):max(int(bottom) - margin_y, 0),
        max(int(left) + margin_x, 0):max(int(right) - margin_x, 0),
    ]
    if region.size == 0:
        # Nothing to average (degenerate box): not a match, and avoids the
        # NaN mean / RuntimeWarning of an empty array.
        return

    if region.mean() <= params['mean_proba_threshold']:
        return

    # Depending on the page, append the text and its vertical position to
    # page 0 (left) or page 1 (right); boxes outside both ranges are dropped.
    page = change_x(left, x_0_lower, x_0_upper, x_1_lower, x_1_upper)
    if page != -1:
        dict_bounds.setdefault(page, []).append((top, attribute, text['Text']))

In [5]:
def find_attributes_one_image(page):
    """Return the attributes found in one image, grouped by page side.

    Loads the segmentation result and the OCR annotations of `page`, then
    for every attribute configured in `dict_attribute` thresholds the
    matching segmentation channel and tests each OCR box against it.

    Args:
        page: Image file name without extension.

    Returns:
        Dict mapping a page index (0 or 1) to the list of
        `(Top_Left_Y, attribute, Text)` tuples collected by `find_bounds`.
    """
    # Data from segmentation
    segmentation = np.load("./data/Antigone/1_Segmentation_results/" + page + ".npy")
    image_path = "./data/Antigone/0_Images/" + page + ".jpg"

    # The OCR results do not depend on the attribute: load them once instead
    # of re-reading the CSV on every loop iteration.
    image_df = pd.read_csv('./data/Antigone/2_OCR_results/annotations_' + page + '.csv', index_col=0)

    dict_bounds = dict()
    # Attribute number i (in dict_attribute order) uses segmentation channel
    # i + 1; channel 0 is never read here (presumably background - TODO confirm).
    for channel, attribute in enumerate(dict_attribute, start=1):
        # Create x ranges
        x_ranges = define_range(image_path, attribute)

        # Binarise the attribute's segmentation channel
        threshold = dict_attribute[attribute]['ocr_proba_threshold']
        mask = np.where(segmentation[channel] > threshold, 1, 0).astype(np.uint8)

        # find_bounds works by side effect on dict_bounds, so use a plain
        # loop: DataFrame.apply builds a throw-away result and older pandas
        # versions could call the function twice for the first row, which
        # would duplicate that entry.
        for _, row in image_df.iterrows():
            find_bounds(row, dict_bounds, mask, *x_ranges, attribute)

    return dict_bounds
                

In [6]:
def order_dict(dictionnary):
    """Return the pages in order, each with its bounds sorted top to bottom.

    Args:
        dictionnary: Dict mapping a page name (one prefix character followed
            by an integer, e.g. 'p12') to a dict of page side (0 / 1) ->
            list of tuples whose first element is the vertical coordinate.
            The per-side lists are sorted IN PLACE.

    Returns:
        List of `(page_name, bounds)` pairs sorted by the integer part of the
        page name, then by the page name itself.

    Raises:
        ValueError: If a page name does not end in an integer after its
            first character.
    """
    for sides in dictionnary.values():
        for side in (0, 1):
            if side in sides:
                sides[side].sort(key=lambda entry: entry[0])

    # Tie-break on the page name rather than on the bounds dict: dicts are
    # not orderable, so two names sharing a number used to raise TypeError.
    return sorted(dictionnary.items(), key=lambda kv: (int(kv[0][1:]), kv[0]))

In [7]:
def save_dict_in_json(dictionnary, path):
    """Write the ordered bounds (per page and per coordinate) to `path` as JSON.

    Any existing file at `path` is overwritten.
    """
    with open(path, mode="w") as json_file:
        json.dump(dictionnary, fp=json_file)

In [8]:
def load_json_in_dict(path):
    """Read the JSON file at `path` and return the decoded Python object.

    Used to reload the ordered bounds (per page and per coordinate) that were
    written by `save_dict_in_json`.
    """
    with open(path) as source:
        raw = source.read()
    return json.loads(raw)

In [11]:
# Attributes to extract from the images, with their parameters:
#   page_0_* / page_1_*  : lower/upper limits of the first/second page,
#                          as fractions of the image width (see define_range)
#   width_box/height_box : fraction of the OCR box trimmed from its sides
#                          before the mask is averaged (see find_bounds)
#   ocr_proba_threshold  : threshold applied to the segmentation channel
#   mean_proba_threshold : minimum mean mask value for a box to be kept
dict_attribute = {
    'Name': {
        'page_0_lower': 0,
        'page_0_upper': 4/10,
        'page_1_lower': 4/10,
        'page_1_upper': 7/10,
        'width_box': 0.4,
        'height_box': 0.4,
        'ocr_proba_threshold': 0.2,
        'mean_proba_threshold': 0.7,
    },
    'Scene': {
        'page_0_lower': 2/10,
        'page_0_upper': 4/10,
        'page_1_lower': 6/10,
        'page_1_upper': 8/10,
        'width_box': 0.4,
        'height_box': 0.4,
        'ocr_proba_threshold': 0.2,
        'mean_proba_threshold': 0.7,
    },
    'Description': {
        'page_0_lower': 0,
        'page_0_upper': 1/2,
        'page_1_lower': 1/2,
        'page_1_upper': 1,
        'width_box': 0,
        'height_box': 0,
        'ocr_proba_threshold': 0.1,
        'mean_proba_threshold': 0.5,
    },
}

def find_attributes():
    """Extract the attributes of every .jpg image of the libretto.

    Each image in "./data/Antigone/0_Images/" is processed with
    `find_attributes_one_image`; the results, keyed by file name without
    extension, are then ordered with `order_dict`.

    Returns:
        The ordered list of `(page_name, bounds)` pairs built by `order_dict`.
    """
    bounds_per_page = {}
    # Going through all the images, ignoring anything that is not a .jpg
    for filename in os.listdir("./data/Antigone/0_Images/"):
        if not filename.endswith(".jpg"):
            continue
        page_name, _extension = os.path.splitext(filename)
        # Find the attributes in the image
        bounds_per_page[page_name] = find_attributes_one_image(page_name)
    return order_dict(bounds_per_page)

In [12]:
# Run the extraction on every page image. Despite its name, the result is the
# sorted list of (page_name, bounds) pairs returned by order_dict.
dictionnary = find_attributes()

In [13]:
# Persist the ordered extraction results as JSON.
save_dict_in_json(dictionnary, "./data/Antigone/2_OCR_results/Antigone.json")

# Extracting attributes

In [14]:
# Reload the saved results. NOTE: after the JSON round trip the page-side keys
# are strings ("0" / "1") instead of ints, and tuples come back as lists.
data = load_json_in_dict("./data/Antigone/2_OCR_results/Antigone.json")

In [97]:
def extract_all_attributes(data):
    """Extract all attributes found by OCR, in order, without coordinates or pages.

    Args:
        data: Sequence of `(page_name, sides)` pairs, where `sides` maps a
            page side to a list of `(coordinate, attribute, text)` entries
            (the structure reloaded from the JSON file).

    Returns:
        2-D NumPy array with one `[attribute, text]` row of strings per entry,
        in the order the entries appear. An empty array of shape (0, 2) is
        returned when there is no entry at all.
    """
    rows = []
    # for each page of the libretto
    for page_entry in data:
        # for each left and/or right page
        for entries in page_entry[1].values():
            # keep everything but the leading coordinate
            rows.extend(entry[1:] for entry in entries)

    if not rows:
        return np.empty(shape=(0, 2))
    # Build the array once instead of concatenating inside the loop, and ask
    # for strings explicitly rather than relying on float -> str promotion
    # of an empty seed array.
    return np.array(rows, dtype=str)

def extract_attribute(elements, attribute):
    """Build one cleaned string from the texts of a specific attribute.

    Args:
        elements: Iterable of `[attribute_name, text]` rows.
        attribute: Rows are kept when this string is contained in their
            attribute name.

    Returns:
        The kept texts joined by single spaces, in order, with every digit
        and every ASCII punctuation character removed.
    """
    # Texts of the rows belonging to the requested attribute
    selected = (row[1] for row in elements if attribute in row[0])
    joined = " ".join(selected)
    # Drop digits
    letters_only = "".join(filter(lambda char: not char.isdigit(), joined))
    # Drop punctuation
    punctuation_table = str.maketrans("", "", string.punctuation)
    return letters_only.translate(punctuation_table)

In [98]:
# Flatten the reloaded results into [attribute, text] rows, then build one
# cleaned string (digits and punctuation removed) per attribute.
all_attributes = extract_all_attributes(data)
description = extract_attribute(all_attributes, 'Description')
names = extract_attribute(all_attributes, 'Name')
scenes = extract_attribute(all_attributes, 'Scene')

In [99]:
# Display the raw extracted names (notebook cell output follows).
names

'Padre Il detto Il Il Il Il La Macfiro Inventoree Inventere Sartore Luogo Creonte altri Cre Eur Cre Se AT Alib  Eur Cre Lea Ear Cre Lea Cre Per Da Non delle Se Won fa Node Il La pace man Che E Eur Lea Cre Eur Cre Eur Lea  Alc Cre Eur Alc Cre Ale Lea Eur Cre Eur Cre Eur Per Cre Lea Eur Ear  Lea Eur Lea Enrifce Eur Veggio Erm Ant Ear Erm Ant Eur Erm Eur Erm Ant Eur Ant Eur Ant Erm Eur  Ant Eur Erm Ant Erm Senza Ant Eur Ant Bur Ant Eur Ant Exr Ant Eur Ant Eur Ant Exr Ant Eur Per  Aut Corone Cuflodi Antigona Scenda Alc Erm EurGià Cre Ant Cre  Ant Erm Eur Ant Cre Ant Erm Ak Lea Cre Ant Cre Ant Cre Ant Cre  Erm Ant Erm parte Lea Alc parte Ant Eur Ant Eur Ant Penfa Alib  Eur Miferi Fine Ale Eur Alc Eur  Erm Alc Erm Eur Erm Alc Erm Eur Alc Erm Cre Lea Alc Eur Cre Erm Cre Erm Cre Lea Eur Alc Erm  Lea Cre Erm Cre Lea Ale Forfe Eur Ant Cre Eur Cre Lea Ant Alc Cre  Io Eur Ant Eur Ant Eur Deggio Ant Eur Ant Lea Forfe Alc Alib Sento Lea Più Sen  Ant Ale Lea Alc Lea Alc  Eur Creon Eur Ajc Ant Alc Ant

# Clean Names

In [110]:
import io 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

def remove_stopwords(text):
    """Remove Italian stop words from `text`, whatever their casing style.

    A token is dropped when it equals an NLTK Italian stop word as listed
    in the corpus, in Title Case or in UPPER CASE.

    Args:
        text: String to clean.

    Returns:
        The remaining tokens (as produced by `word_tokenize`) joined by
        single spaces.
    """
    # Load the corpus list once and keep the variants in a set so that each
    # token is checked in constant time instead of scanning a list.
    base_words = stopwords.words('italian')
    stop_words = set(base_words)
    stop_words.update(word.title() for word in base_words)
    stop_words.update(word.upper() for word in base_words)

    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(kept_tokens)

In [111]:
# Display the names with the Italian stop words removed (cell output follows).
remove_stopwords(names)

'Padre detto Macfiro Inventoree Inventere Sartore Luogo Creonte altri Cre Eur Cre AT Alib Eur Cre Lea Ear Cre Lea Cre Won fa Node pace man Eur Lea Cre Eur Cre Eur Lea Alc Cre Eur Alc Cre Ale Lea Eur Cre Eur Cre Eur Cre Lea Eur Ear Lea Eur Lea Enrifce Eur Veggio Erm Ant Ear Erm Ant Eur Erm Eur Erm Ant Eur Ant Eur Ant Erm Eur Ant Eur Erm Ant Erm Senza Ant Eur Ant Bur Ant Eur Ant Exr Ant Eur Ant Eur Ant Exr Ant Eur Aut Corone Cuflodi Antigona Scenda Alc Erm EurGià Cre Ant Cre Ant Erm Eur Ant Cre Ant Erm Ak Lea Cre Ant Cre Ant Cre Ant Cre Erm Ant Erm parte Lea Alc parte Ant Eur Ant Eur Ant Penfa Alib Eur Miferi Fine Ale Eur Alc Eur Erm Alc Erm Eur Erm Alc Erm Eur Alc Erm Cre Lea Alc Eur Cre Erm Cre Erm Cre Lea Eur Alc Erm Lea Cre Erm Cre Lea Ale Forfe Eur Ant Cre Eur Cre Lea Ant Alc Cre Eur Ant Eur Ant Eur Deggio Ant Eur Ant Lea Forfe Alc Alib Sento Lea Sen Ant Ale Lea Alc Lea Alc Eur Creon Eur Ajc Ant Alc Ant Crean Eur Creon Ant Alc Eur Creon Eur Creon Eur Aut Creon Sem Ant Eur Learco Tem