In [1]:
import cv2
import numpy as np
import os
import pandas as pd
import json
import collections
import string
import re

# Load data

We will first start by loading the json data of our libretto which has the following structure:
* A list of all scans of the libretto (one scan consists of 2 pages)
* A list per scan containing 2 dictionaries: one for the left page, whose index is 0, and one for the right page, whose index is 1.
* For each index of the dictionary, a list of the elements appearing on the page is stored in the following way:
    * `[Num:Int, Label:String, Text:String]`
    * Num is the y-coordinate of the text in the page (if close to 0, then it is on top of the page)
    * Label is one of the following label extracted: Name, Description or Scene
    * Text is the written text figuring in the page

In [2]:
def load_json_in_dict(path):
    '''
    Loads the libretto data (elements per page and per coordinate) from a json file
    :param string path: where all data of libretto is stored, as explained above
    :return: the decoded content of the json file
    '''
    # JSON text is UTF-8 by specification. Opening with an explicit encoding keeps
    # accented Italian characters intact on platforms whose default is not UTF-8.
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)

In [3]:
# Load the OCR output of the Antigone libretto (path is relative to the working directory)
data = load_json_in_dict("./data/Antigone/2_OCR_results/Antigone.json")

# Clean Text elements

We now implement the methods which will be useful to move from the structure which was specified above to the following one:
* Remove the pages and coordinates of each word appearing in the libretto
* Remove all stopwords from the extracted text
* For each label/attribute ('Scene', 'Description', 'Name'):
    * Extract `['Label', 'Text']` in the order they appear in the libretto and store it in `all_attributes`
    * Create list of names, scenes and descriptions in the order they appear in the text

In [4]:
import io 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

def remove_stopwords(text):
    '''
    Outputs the tokens of text with stopwords removed, whatever their casing
    (lower case, Title Case or UPPER CASE)
    :param string text: italian string text extracted from libretto
    :return: list of the remaining tokens, in their original order
    '''
    #Read the italian stopword list once and keep every casing variant in a set,
    #so that each token lookup is constant time instead of a scan over a list
    italian_stopwords = stopwords.words('italian')
    stop_words = set(italian_stopwords)
    stop_words.update(word.title() for word in italian_stopwords)
    stop_words.update(word.upper() for word in italian_stopwords)
    #Tokenize text and remove word if it is a stopword
    #NOTE(review): word_tokenize is called with its default language - confirm
    #this is intended for italian text
    return [word for word in word_tokenize(text) if word not in stop_words]

def extract_all_attributes(data):
    '''
    Extract all attributes extracted by OCR in order, without storing coordinates or pages
    :param list data: extracted italian text of libretto separated by pages and coordinates
    :return: numpy.ndarray of shape (n, 2) holding the ['Label', 'Text'] rows in order
    '''
    # Seed with the same empty (0, 2) array as before so that the result keeps
    # its two-column shape (and the same dtype promotion) even when no element
    # is found at all.
    chunks = [np.empty(shape=(0, 2))]
    # for each scan of the libretto
    for scan in data:
        pages = scan[1]
        # for each left and/or right page
        for ind in pages.keys():
            page_elements = pages[ind]
            # A page without any element would yield a 1-D array on which the
            # column slice below raises IndexError, so skip it.
            if len(page_elements) == 0:
                continue
            # drop the first column (coordinate), keep ['Label', 'Text']
            chunks.append(np.array(page_elements)[:, 1:])
    # Concatenate once at the end instead of re-allocating on every page
    return np.concatenate(chunks, axis=0)

def extract_attribute(elements, attribute):
    '''
    Collect, in order, the cleaned tokens of every element carrying a given label
    :param numpy.ndarray elements: ['Label', 'Text'] rows stored in the order they appear
    :param string attribute: the label/attribute to extract ('Name', 'Description', 'Scene')
    :return: list of tokens with digits, punctuation and stopwords removed
    '''
    # Texts of the rows whose label contains the requested attribute
    matching_texts = (row[1] for row in elements if attribute in row[0])
    # One single string, texts separated by a space
    joined = " ".join(matching_texts)
    # Drop every digit character
    without_digits = "".join(filter(lambda char: not char.isdigit(), joined))
    # Drop every ASCII punctuation character
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)
    # Tokenize and drop stopwords
    return remove_stopwords(without_punctuation)

In [5]:
# Flatten the OCR data into ordered ['Label', 'Text'] rows
all_attributes = extract_all_attributes(data)

# Token lists (digits, punctuation and stopwords removed) for each label, in order of appearance
description = extract_attribute(all_attributes, 'Description')
names = extract_attribute(all_attributes, 'Name')
scenes = extract_attribute(all_attributes, 'Scene')

# Extract Names

The goal now is to determine which character appears in which scenes. As seen above, the `names` list contains abbreviations of the characters of the libretto. Their complete names appear in the `description` list, from which we need to extract them based on the abbreviations we have. We will therefore proceed in the following way:
* We extract the top_N most common abbreviations names figuring in the `names` list. We chose top_N = 15 as a libretto contains around 10 characters and as some frequent abbreviations had been misspelled by the OCR.
* Based on the extracted abbreviation names, we now create a list of regex patterns to look for in the description. 
* We will then extract the most frequent complete name for each pattern.
* We output a dictionary whose keys are the most frequent complete names extracted from each pattern and whose values are the lists of all words extracted from the same pattern. These words are deliberately not de-duplicated, as the length of each list is needed to decide which key name is the correct one among similar extracted key names (e.g. Autigona and Antigona are both extracted key names).

In [6]:
def extract_names_abbreviations(names, top_N=15):
    '''
    Extract top-N most common abbreviation names
    :param list names: list of abbreviation names
    :param int top_N, default=15: number of most common abbreviations names to extract
    :return: list of unique abbreviation names, most frequent first
    '''
    #Extract the top_N most common abbreviations names figuring in the list names
    frequent_names = collections.Counter(names).most_common()[:top_N]
    #Counter keys are already unique. Returning them in frequency order (instead of
    #going through a set) keeps the result identical from one kernel run to the next.
    return [name for name, _count in frequent_names]

def list_patterns(names_abbreviations):
    '''
    Create list of regex patterns from abbreviation names
    :param list names_abbreviations: list of unique abbreviation names
    :return: list of patterns, one per abbreviation and in the same order
    '''
    #For an abbreviation such as 'Ant' the pattern is 'An.*t.*': the first two
    #characters must be adjacent, every later character may be preceded by any
    #run of characters, and anything may follow the last one.
    correct_patterns = []
    for abbreviation in names_abbreviations:
        #Escape each character so that one which happens to be a regex
        #metacharacter is matched literally instead of breaking the pattern
        escaped = [re.escape(character) for character in abbreviation]
        correct_patterns.append("".join(escaped[:1]) + ".*".join(escaped[1:]) + ".*")
    return correct_patterns

def filter_pattern(pattern, description):
    '''
    Keep the words in which a regex pattern is found
    :param string pattern: a regex pattern
    :param list description: a list of words to search with the given pattern
    :return: list of the words of description in which the pattern is found (may be empty)
    '''
    #re.search returns a (truthy) match object on success and None otherwise,
    #so it can be used directly as the filtering predicate
    return list(filter(lambda word: re.search(pattern, word), description))

def find_complete_name(pattern, text, abbreviation):
    '''
    Find the most common word matching the pattern of an abbreviation
    :param string pattern: a regex pattern corresponding to the given abbreviation
    :param list text: a list of words in which the pattern is searched
    :param string abbreviation: the abbreviation name for which the complete name is being searched
    :return: (most common matching word, all matching words followed by the abbreviation),
             or (None, None) when no word matches
    '''
    #All words from text which matched the pattern
    matches = filter_pattern(pattern, text)
    #The single most frequent word among them, as a [(word, count)] list
    ranking = collections.Counter(matches).most_common(1)
    #Nothing matched: there is no complete name to report
    if not ranking or not matches:
        return None, None
    best_name, _frequency = ranking[0]
    #The abbreviation travels with the matches so that it can later be mapped
    #back to the complete name
    return best_name, matches + [abbreviation]
    
def extract_complete_names(names, description, top_N=15):
    '''
    Extract dictionary of characters complete names and their respective similar names
    :param list names: list of abbreviation names
    :param list description: a list of text from which we need to extract words with the given pattern
    :param int top_N, default=15: number of most common abbreviation names to consider
    :return: (dictionary mapping each complete name to the words matching its pattern(s)
             plus the abbreviation(s) that produced it, list of abbreviation names used)
    '''
    #Extract abbreviation names
    names_abbreviations = extract_names_abbreviations(names, top_N)
    #Extract patterns for each of the abbreviation names
    patterns = list_patterns(names_abbreviations)

    #A dictionary which will store for each most common complete name all occurrences
    #of words which come from the same pattern.
    dic = {}
    #For each abbreviation and pattern, we are going to find its most common complete name
    for abbreviation, pattern in zip(names_abbreviations, patterns):
        name, name_mappings = find_complete_name(pattern, description, abbreviation)
        #No complete name was found in the description for this abbreviation
        if name is None:
            continue
        #If the name is already contained in the dictionary, several abbreviations
        #(probably OCR misspellings of one another) lead to the same complete name:
        #accumulate their matches under that single key.
        if name in dic:
            dic[name] = dic[name] + name_mappings
        else:
            dic[name] = name_mappings
    return dic, names_abbreviations

In [7]:
# dic: complete name -> words matching its pattern (plus the abbreviation); abbreviations: the abbreviation names used
dic, abbreviations = extract_complete_names(names, description)

Now that we have extracted the complete name for each abbreviation, together with the list of names which follow the same pattern, we can focus on extracting the correct true names. Indeed, as mentioned before, one can get *Antigona* and *Autigona*, which both refer to one and the same character, *Antigona*.

To be able to correct the extracted name *Autigona*, we will make use of similarity distances, in our case the Levenshtein distance, a string metric for measuring the difference between two words. The Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other. (Note: the code below computes the similarity with `difflib.SequenceMatcher.ratio()`, a closely related but not identical measure.)

If two words are similar above some threshold, then the correct word/name to keep will be the one which had the longest list of extracted similar words coming from the same pattern.

The goal now is to find for each abbreviation name which has been extracted the corresponding true complete name.

In [8]:
from difflib import SequenceMatcher as sm

def complete_names_to_correct(dic, threshold=0.60):
    '''
    Group complete names which are similar to each other under the most trusted spelling.
    The similarity of two names is SequenceMatcher(None, a, b).ratio(); two names are
    considered the same character when this ratio reaches the threshold. Of two similar
    names, the one with the longest list of words in dic is kept (the first one on a tie).
    The input dictionary is left untouched.
    :param dictionary dic: a dictionary of characters complete names and their respective similar names
    :param float threshold: the similarity ratio needed to confirm that two words are similar
    :return: (dictionary mapping each kept complete name to the sorted unique words of
             all names merged into it, set of all complete names involved in a merge)
    '''
    names_keys = list(dic.keys())
    #Work on copies: merging must never write into the caller's lists, otherwise dic
    #is corrupted and the list lengths compared below grow while we iterate.
    names_values = [list(words) for words in dic.values()]
    #The dictionary which will store the similar complete names and their respective
    #similar wording under the same key.
    dic_new = {}
    #The set of complete names to remove from the dictionary as they are similar.
    keys_to_remove = set()
    #For each pair of complete names, check whether the names are similar
    for i in range(len(names_keys)):
        for j in range(i + 1, len(names_keys)):
            if sm(None, names_keys[i], names_keys[j]).ratio() < threshold:
                continue
            #The name backed by the longest list of words is taken as the correct one
            if len(names_values[i]) < len(names_values[j]):
                kept_name = names_keys[j]
            else:
                kept_name = names_keys[i]
            keys_to_remove.update((names_keys[i], names_keys[j]))
            merged = dic_new.setdefault(kept_name, [])
            merged.extend(names_values[i])
            merged.extend(names_values[j])
    #Remove duplicates; sorting makes the output identical from one run to the next
    return {name: sorted(set(words)) for name, words in dic_new.items()}, keys_to_remove

In [9]:
# Merge similar complete names; names_to_remove holds every complete name involved in a merge
dic_names_corrected, names_to_remove = complete_names_to_correct(dic)

In [10]:
def match_abbrev_with_complete_names(dic, names_to_remove, dic_names_corrected, abbreviation_names=None):
    '''
    Output the dictionary which maps abbreviation names to their correct complete name.
    None of the arguments is modified, so the function can be called repeatedly.
    :param dictionary dic: a dictionary of characters complete names and their respective similar names
    :param set names_to_remove: a set of string complete names to remove as they match other names
    :param dictionary dic_names_corrected: a dictionary of aggregated similar complete names
    :param list abbreviation_names, default=None: the abbreviation names to map; when omitted,
           the notebook-level variable `abbreviations` is used
    :return: dictionary abbreviation name -> complete name
    '''
    if abbreviation_names is None:
        #Backward compatibility: this function used to read the notebook-level
        #`abbreviations` variable directly
        abbreviation_names = abbreviations
    known_abbreviations = set(abbreviation_names)
    #Build a new dictionary without the names to remove instead of popping them from
    #the caller's dictionary (popping made a second call fail with KeyError)
    complete_names = {name: words for name, words in dic.items() if name not in names_to_remove}
    #Add the merged entries which replace the removed names
    complete_names.update(dic_names_corrected)
    #Inverse the dictionary: for each complete name keep only the abbreviation names
    #among its words, and map each of them to that complete name
    dic_inv = {}
    for complete_name, words in complete_names.items():
        for abbreviation in sorted(known_abbreviations.intersection(words)):
            dic_inv[abbreviation] = complete_name
    return dic_inv

In [11]:
# Map every abbreviation name to its corrected complete name
dic_names = match_abbrev_with_complete_names(dic, names_to_remove, dic_names_corrected)

# Correct Scenes and Names

In [12]:
def clean_attributes(attributes, dic_names):
    ''' 
    Adds the acts in the attributes and stores the scenes as numbers 
    :param list attributes: the list containing all attributes
    :param dictionnary dic_names: the dictionnary which maps abbreviations with complete names
    '''
    attributes_clean = attributes.copy()
    count_scene = 0
    count_act = 0
    mask = np.ones((np.shape(all_attributes)))
    # Goes through all the list
    for i, att in enumerate(all_attributes):
        # through all the text that has the 'Scene' tag
        if (att[0]=='Scene'):
            # Remove punctuation and case
            word = att[1].lower().translate(str.maketrans(dict.fromkeys(string.punctuation)))
            # If a scene is the first one, we add the begining of an act
            if (word=='prima'):
                count_act += 1
                count_scene = 0
                attributes_clean[i] = ['Act', count_act]
            # Detects the scene, stores its number
            elif (word=='scena'):
                count_scene += 1
                attributes_clean[i] = ['Scene', count_scene]
            else:
            # Otherwise, we will delete this row
                mask[i] = 0
        if (att[0]=='Name'):
            # Remove punctuation and case
            word = att[1].translate(str.maketrans(dict.fromkeys(string.punctuation)))
            if (word in dic_names.keys()):
                attributes_clean[i] = ['Name', dic_names[word]]
            else: 
                mask[i] = 0
        if (att[0]=='Description'):
            mask[i] = 0
    # Delete rows that we don't need anymore
    attributes_clean = attributes_clean[mask.astype(np.bool)]
    attributes_clean = attributes_clean.reshape(int(np.shape(attributes_clean)[0]/2), 2)
    return attributes_clean

In [13]:
# Keep only the Act / Scene / Name rows, with numbered acts and scenes and complete character names
attributes_clean = clean_attributes(all_attributes, dic_names)

# Create Tree Structure for Network Analysis

We will now store the extracted and cleaned information in a tree structure which will then be accessible in a json format. The tree structure will be created in the following way:
* The root will be the libretto name (level 0).
* The nodes in level 1 will be the Acts.
* The children nodes in level 2 coming from one of the Act parent node will be the Scenes contained in the corresponding Act.
* The children nodes in level 3 coming from one of the Scene parent nodes will be the Character names and the number of occurrences of their name in the corresponding Scene.

In [14]:
def create_tree(attributes_clean):
    '''
    Create the tree structure of our libretto: {act number: {scene number: {name: count}}}
    A 'Scene' row met before any 'Act' row, or a 'Name' row met before any 'Scene' row
    of the current act, has no parent in the tree: it is skipped with a warning
    (it used to raise an IndexError).
    :param numpy.ndarray attributes_clean: the list containing all attributes in the order of appearance
    :return: dictionary of acts, each a dictionary of scenes, each a dictionary name -> occurrences
    '''
    import warnings

    #Dictionary which will store the tree
    final_dic = {}
    #The act and scene dictionaries currently being filled (None until one is opened)
    current_act = None
    current_scene = None
    #Loop through all cleaned attributes
    for att in attributes_clean:
        label, value = att[0], att[1]
        #'Act' tag: open a new empty dictionary to store its Scenes
        if label == 'Act':
            current_act = final_dic[int(value)] = {}
            current_scene = None
        #'Scene' tag: open a new empty dictionary in the current Act to store its Names
        elif label == 'Scene':
            if current_act is None:
                warnings.warn("Scene %s appears before any Act and was skipped" % value)
                continue
            current_scene = current_act[int(value)] = {}
        #'Name' tag: count one more occurrence of the name in the current Scene
        elif label == 'Name':
            if current_scene is None:
                warnings.warn("Name %s appears outside of any Scene and was skipped" % value)
                continue
            name = str(value)
            current_scene[name] = current_scene.get(name, 0) + 1
    return final_dic

In [15]:
# Nested dictionaries: act number -> scene number -> character name -> number of occurrences
tree = create_tree(attributes_clean)

We will now save that tree structure in a json file.

In [16]:
def save_dict_in_json(dictionnary, path):
    '''
    Saves a dictionary in a json file, creating the destination folder if needed.
    Note that json turns non-string keys (e.g. integer act numbers) into strings.
    :param dictionary dictionnary: the json-serializable dictionary to save
    :param string path: the file to write (overwritten if it already exists)
    '''
    #Create the parent folder first: open() does not do it and would raise
    #FileNotFoundError on a fresh checkout
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(dictionnary, outfile)

In [17]:
# Write the tree to the network folder of the Antigone data (path is relative to the working directory)
save_dict_in_json(tree, "./data/Antigone/3_Network/network.json")