In [2]:
import cv2
import numpy as np
import os
import pandas as pd
import json
import collections
import string
import re

# Load data

In [8]:
def load_json_in_dict(path):
    '''Load the content of a JSON file.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        The decoded JSON document, exactly as produced by ``json.load``.
    '''
    # JSON text is UTF-8 by specification; stating it explicitly keeps accented
    # characters intact on platforms whose default locale encoding differs.
    with open(path, encoding="utf-8") as json_file:
        return json.load(json_file)

In [9]:
# Parsed OCR results of the Antigone libretto, as stored in the JSON file.
data = load_json_in_dict("./data/Antigone/2_OCR_results/Antigone.json")

In [7]:
import io 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

def remove_stopwords(text, language='italian'):
    '''Tokenize ``text`` and drop stopwords, whatever their casing.

    A token is discarded when it equals a stopword as listed by NLTK, or the
    title-case or upper-case form of one.

    Parameters
    ----------
    text : str
        Raw text to tokenize and filter.
    language : str, optional
        Name of the NLTK stopword list to use (default ``'italian'``). It only
        selects the stopword list; tokenization still uses ``word_tokenize``
        with its default settings.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    '''
    # Read the corpus once and keep every casing variant in a set, so each
    # token is checked with a constant-time lookup instead of a list scan.
    base_words = stopwords.words(language)
    stop_words = set(base_words)
    stop_words.update(word.title() for word in base_words)
    stop_words.update(word.upper() for word in base_words)
    return [token for token in word_tokenize(text) if token not in stop_words]

def extract_all_attributes(data):
    '''Collect every OCR element of the libretto, in the order it appears.

    Parameters
    ----------
    data : sequence
        One entry per page. ``data[i][1]`` must be a dict (one key per left
        and/or right page) whose values are lists of rows. The first column of
        each row is dropped (presumably the coordinates -- confirm against the
        JSON file) and the two remaining columns are kept.

    Returns
    -------
    numpy.ndarray
        Array with two columns holding, for each element, the attribute tag
        and the text. Shape ``(0, 2)`` when nothing is found.
    '''
    # Start from the same empty (0, 2) array as before so that the dtype and
    # the shape of the result are unchanged, including for an empty libretto.
    chunks = [np.empty(shape=(0, 2))]
    # for each page of the libretto
    for page in range(len(data)):
        # for each left and/or right page
        for rows in data[page][1].values():
            rows = np.array(rows)
            if rows.size == 0:
                # Nothing was detected here; slicing columns of an empty
                # 1-D array would raise an IndexError.
                continue
            # keep the elements in order, without coordinates or pages
            chunks.append(rows[:, 1:])
    # Concatenate once instead of re-copying the growing array at every step.
    return np.concatenate(chunks, axis=0)

def extract_attribute(elements, attribute):
    '''Gather, in order, the cleaned tokens of the elements tagged with ``attribute``.

    The matching texts are joined with spaces, stripped of digits and of ASCII
    punctuation, then handed to ``remove_stopwords`` whose result is returned.
    Punctuation is deleted rather than replaced by a space, so "d'Ermione"
    becomes "dErmione".
    '''
    # Text (second column) of every row whose tag (first column) contains the attribute
    selected = (element[1] for element in elements if attribute in element[0])
    joined = " ".join(selected)
    # Drop every digit character
    without_digits = ''.join(char for char in joined if not char.isdigit())
    # Delete punctuation characters
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = without_digits.translate(punctuation_table)
    # Tokenize and filter out stopwords
    return remove_stopwords(cleaned)

In [12]:
# Flatten the OCR output into (attribute, text) rows, then extract the cleaned
# tokens of each attribute of interest.
all_attributes = extract_all_attributes(data)
description = extract_attribute(all_attributes, 'Description')
names = extract_attribute(all_attributes, 'Name')
scenes = extract_attribute(all_attributes, 'Scene')

# Extract Names

In [10]:
def extract_names_abbreviations(names, top_N):
    '''Extract the ``top_N`` most common name abbreviations.

    Parameters
    ----------
    names : iterable of str
        Name tokens, with repetitions.
    top_N : int
        Maximum number of distinct abbreviations to return.

    Returns
    -------
    list of str
        Distinct abbreviations, most frequent first.
    '''
    # Counter keys are already unique. Keeping the frequency order (instead of
    # going through a set) makes the result identical from one run to the next.
    frequent_names = collections.Counter(names).most_common(top_N)
    return [name for name, _count in frequent_names]

def list_patterns(names_abbreviations):
    '''Create one regular expression per abbreviated name.

    The first two characters of the abbreviation must be adjacent; every later
    character may be preceded by any run of characters, and anything may
    follow the last one. For example ``'Eur'`` gives ``'Eu.*r.*'``.

    Parameters
    ----------
    names_abbreviations : iterable of str
        Abbreviated names.

    Returns
    -------
    list of str
        One pattern per abbreviation, in the same order.
    '''
    correct_patterns = []
    for abbreviation in names_abbreviations:
        # Escape each character so regex metacharacters are matched literally.
        characters = [re.escape(character) for character in abbreviation]
        head = "".join(characters[:2])
        correct_patterns.append(".*".join([head] + characters[2:]) + ".*")
    return correct_patterns

def filter_pattern(pattern, datalist):
    '''Keep the entries of ``datalist`` in which ``pattern`` is found.

    ``re.search`` is used, so a match anywhere inside an entry is enough.
    The order of the entries is preserved; the list is empty when nothing
    matches.
    '''
    matches = []
    for candidate in datalist:
        if re.search(pattern, candidate) is not None:
            matches.append(candidate)
    return matches

def find_complete_name(pattern, text):
    '''Find the most common token of ``text`` that follows ``pattern``.

    Parameters
    ----------
    pattern : str
        Regular expression, as accepted by ``filter_pattern``.
    text : iterable of str
        Tokens to search.

    Returns
    -------
    tuple
        ``(name, variants)`` where ``name`` is the most frequent matching
        token and ``variants`` the list of distinct matching tokens.
        ``(None, [])`` when no token matches, so that the result can always
        be unpacked.
    '''
    occurences = filter_pattern(pattern, text)
    if not occurences:
        return None, []
    counts = collections.Counter(occurences)
    most_common_name = counts.most_common(1)[0][0]
    # Counter keys are the distinct occurrences, in order of first appearance.
    return most_common_name, list(counts)
    
def extract_complete_names(names, description, top_N=10):
    '''Extract the characters' names and, for each, the spellings found for it.

    Parameters
    ----------
    names : iterable of str
        Abbreviated name tokens.
    description : iterable of str
        Tokens in which the complete names are searched.
    top_N : int, optional
        Number of most frequent abbreviations to consider (default 10).

    Returns
    -------
    dict
        Maps each complete name to the sorted list of distinct tokens that
        matched one of the abbreviation patterns leading to that name.
    '''
    names_abbreviations = extract_names_abbreviations(names, top_N)
    patterns = list_patterns(names_abbreviations)

    variants_by_name = {}
    for pattern in patterns:
        match = find_complete_name(pattern, description)
        # A pattern may match nothing: tolerate both a bare None and a
        # (None, ...) pair instead of failing while unpacking.
        if match is None:
            continue
        name, name_mappings = match
        if name is None:
            continue
        variants_by_name.setdefault(name, set()).update(name_mappings)
    # Sorted lists keep the output identical from one run to the next.
    return {name: sorted(variants) for name, variants in variants_by_name.items()}

In [13]:
# Map each complete character name to the spellings found for it in the descriptions.
dic = extract_complete_names(names, description)
dic

{'Eurifteo': ['Euritleo',
  'Eur',
  'Euriftec',
  'Eurifleo',
  'Euriftco',
  'Eurifteo',
  'Eurifico'],
 'Creonte': ['Creonte', 'Creente'],
 'Alcefte': ['Alcefie', 'Alceie', 'Alcefle', 'Alcefte'],
 'Learco': ['Learto', 'Learcoe', 'Learco', 'Learce'],
 'Autigona': ['Autigona', 'Aut', 'Autigo'],
 'Ermione': ['ErmioneudiftiIl', 'Ermione', 'Erm', 'Ermiene', 'dErmione'],
 'Antigona': ['dAntiope',
  'fudettoedAntigont',
  'dAntigona',
  'Antigona',
  'Antigoua',
  'Antigana',
  'Ant']}

In [14]:
from difflib import SequenceMatcher as sm

# Minimum SequenceMatcher ratio for two names to be treated as the same character.
NAME_SIMILARITY_THRESHOLD = 0.60


def merge_similar_names(name_variants, threshold=NAME_SIMILARITY_THRESHOLD):
    '''Group the spellings of names that look alike.

    Every name is compared with the names that come after it in
    ``name_variants``. When the similarity ratio reaches ``threshold``, the
    spellings of both names are gathered under the earlier name. Names with no
    sufficiently similar partner are left out of the result.

    Parameters
    ----------
    name_variants : dict
        Maps a name to the list of its spellings. It is not modified.
    threshold : float, optional
        Minimum ``SequenceMatcher.ratio()`` for two names to be merged.

    Returns
    -------
    dict
        Maps each merged name to the sorted list of distinct spellings.
    '''
    keys = list(name_variants.keys())
    merged = {}
    for i, first in enumerate(keys):
        for second in keys[i + 1:]:
            if sm(None, first, second).ratio() >= threshold:
                # Build a fresh set so the lists stored in the input dict are
                # never aliased or extended.
                group = merged.setdefault(first, set(name_variants[first]))
                group.update(name_variants[second])
    return {name: sorted(group) for name, group in merged.items()}


names_keys = list(dic.keys())
names_values = list(dic.values())

dic_new = merge_similar_names(dic)
dic_new

{'Autigona': ['dAntiope',
  'fudettoedAntigont',
  'Aut',
  'Autigo',
  'Autigona',
  'dAntigona',
  'Antigona',
  'Antigoua',
  'Antigana',
  'Ant']}

# Correct Scenes

In [15]:
def clean_scenes_and_acts(attributes_clean):
    '''Add the acts to the attributes and store the scenes as numbers.

    Rows tagged ``'Scene'`` are rewritten according to their text, compared
    in lower case and without punctuation:

    * ``'prima'`` becomes ``['Act', n]`` with ``n`` the running act number,
      and the scene counter restarts;
    * ``'scena'`` becomes ``['Scene', k]`` with ``k`` the running scene number;
    * any other ``'Scene'`` row is removed.

    Rows with another tag are kept unchanged.

    Parameters
    ----------
    attributes_clean : array-like
        Rows of (attribute, text). The input is not modified.

    Returns
    -------
    numpy.ndarray
        A new array holding the rewritten rows, in their original order.
    '''
    # NOTE(review): when the 'scena' row comes before the 'prima' row, the
    # Scene row precedes its Act row and is numbered before the counter
    # restarts -- confirm this ordering is the intended one.

    # Work on a copy: the caller's array stays intact and re-running gives
    # the same result.
    cleaned = np.array(attributes_clean, copy=True)
    # One flag per row; False marks the rows to delete
    keep_row = np.ones(len(cleaned), dtype=bool)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    count_scene = 0
    count_act = 0
    # Goes through all the rows
    for i, att in enumerate(cleaned):
        # Only the text that has the 'Scene' tag is rewritten
        if att[0] != 'Scene':
            continue
        # Remove punctuation and case
        word = att[1].lower().translate(strip_punctuation)
        if word == 'prima':
            # A first scene marks the beginning of an act
            count_act += 1
            count_scene = 0
            cleaned[i] = ['Act', count_act]
        elif word == 'scena':
            # Detects the scene, stores its number
            count_scene += 1
            cleaned[i] = ['Scene', count_scene]
        else:
            # Otherwise, this row is deleted
            keep_row[i] = False
    # Delete the rows that are not needed anymore
    return cleaned[keep_row]

In [16]:
# Rewrite the 'Scene' rows as numbered Act / Scene markers and drop the other 'Scene' rows.
scenes_clean = clean_scenes_and_acts(all_attributes)