Developed by Sebastian Gampe, Chrisowalandis Deligio

In [1]:
%load_ext autoreload

import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
from cnt.preprocess import Preprocess
import spacy
from cnt.io import  Database_Connection
import warnings
warnings.filterwarnings('ignore')
import re

# Connection settings handed to the project's Database_Connection helper.
# NOTE(review): the credentials below look like placeholders - replace them
# with real values before running the notebook.
params = {
    'host': "localhost",
    'user': 'name',
    'password': 'pass',
    'database': 'database_name',
}
connection_url = "mysql+mysqlconnector://user:password@localhost:3306/database_name"
dc = Database_Connection(connection_url, params)

In [2]:
# Create dataframe and load designs from the database
id_col = "id"
design_col = "design_en"
output_dir = "../cnt/trained_model/ner/english_new_2/"
model_name = "english_cno"

designs = dc.load_designs_from_db("nlp_training_designs", [id_col, design_col])

# Helper columns that the preprocessing / NER cells below fill in.
designs["design_en_changed"] = designs[design_col]
designs["rules_applied"] = ""
designs["ner_results"] = ""
designs["mapped_results"] = ""

# Loading a subset of all designs.
# head() returns a slice of `designs`; take an explicit copy so that the
# cell-wise writes done on `designs2` later neither leak into `designs` nor
# depend on pandas' view-versus-copy behaviour.
designs2 = designs.head(500).copy()

model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

# Last expression of the cell: show the first rows as a styled table.
designs.head().style

In [None]:
# Load the entity lists (persons, objects, animals, plants) from the database
# and stack them into one dataframe.
language = "_en"
add_columns = [column + language for column in ("name", "alternativenames")]

# The person table is read with unsuffixed column names, which are renamed
# to the "_en" names used by the other tables.
person_entities = dc.load_designs_from_db("nlp_list_person", ["name", "alternativenames"])
person_entities.rename(
    columns={"name": "name_en", "alternativenames": "alternativenames_en"},
    inplace=True,
)

object_entities, animal_entities, plant_entities = (
    dc.load_designs_from_db(table_name, add_columns)
    for table_name in ("nlp_list_obj", "nlp_list_animal", "nlp_list_plant")
)

frames = [person_entities, object_entities, animal_entities, plant_entities]
df_entities = pd.concat(frames)

# Optional export of the combined entity list:
#df_entities.to_csv('ents.csv', index=False)
df_entities.style

In [4]:
# Add rules for preprocessing
preprocess = Preprocess()
preprocess.add_rule("horseman", "horse man")
preprocess.add_rule("horsemen", "horse men")
# Disabled rules, kept for reference:
#preprocess.add_rule("\?", "")
#preprocess.add_rule("(", "")
#preprocess.add_rule(")", "")
#preprocess.add_rule(" I.", " I")
#preprocess.add_rule(" II.", " II")
#preprocess.add_rule(" III.", " III")
#preprocess.add_rule(" IV.", " IV")
#preprocess.add_rule(" V.", " V")


# Register every alternative name of an entity as a rule that maps it to the
# entity's standard name.
for index, row in df_entities.iterrows():
    alternative_names = row["alternativenames_en"]
    # Missing values can show up as None or as NaN; only real strings can
    # carry a comma-separated list of names.
    if not isinstance(alternative_names, str):
        continue
    standard_name = row["name_en"]
    alt_names = alternative_names.split(", ")
    for alt_name in alt_names:
        # An empty entry (e.g. from an empty column value) must not become
        # a rule.
        if alt_name.strip():
            preprocess.add_rule(alt_name, standard_name)


In [5]:
# Preprocessing a single design
# 1. Create df with the design

# head() returns a slice of `designs`; copy it so that the sample sentence
# written below cannot overwrite the real first design in `designs`.
new_df = designs.head(1).copy()
new_df["design_en_changed"] = new_df["design_en"]
new_df["rules_applied"] = ""
new_df["ner_results"] = ""
new_df["mapped_results"] = ""

# `index` addresses the single row of `new_df`; the following cells reuse it.
index = new_df.index[0]

# Alternative sample designs, kept for reference:
#new_df.at[index, "design_en_changed"] = "Alexander the Great macht Sachen, Alexander the Great hat andere Probleme."
#new_df.at[index, "design_en"] = "Diademed head of deified Alexander the Great with astragals, right. Border of dots."
#new_df.at[index, "design_en"] = "Nude Apollo advancing right, holding arrow and drawing bow in his left hand."
new_df.at[index, "design_en"] = "Diademed head of deified Alexander the Great with horsemen(?), right. Border of dots."
new_df.style

In [6]:
# Preprocess the design and saving the applied rules

%autoreload 2

new_df.at[index, "design_en_changed"], new_df.at[index, "rules_applied"] = \
preprocess.preprocess_design(new_df.loc[index,'design_en'],new_df.loc[index,'id'])

new_df.style

In [8]:
# Map the preprocessed design back using the rules saved for its id

changed_design = new_df.loc[index, 'design_en_changed']
design_id = new_df.loc[index, 'id']
preprocess.map_back_design(changed_design, design_id)

In [9]:
# Use the NLP model to find the named entities and their positions
changed_design = new_df.loc[index, 'design_en_changed']
ner_result = model.predict_single_sentence(changed_design)
new_df.at[index, "ner_results"] = ner_result
new_df.style

In [10]:
# map the positions of the found entities to the original design
%autoreload 2

new_df.at[index, "mapped_results"] = \
preprocess.map_result_ner(new_df.loc[index,'design_en_changed'], new_df.loc[index,'ner_results'], new_df.loc[index,'id'])
new_df.style

In [11]:
# preprocess all designs
%autoreload 2

for index, row in designs2.iterrows():
    designs2.at[index, "design_en_changed"], designs2.at[index, "rules_applied"] = \
        preprocess.preprocess_design(designs2.loc[index,'design_en'], designs2.loc[index,'id'])
    designs2.at[index, "ner_results"] = model.predict_single_sentence(designs2.loc[index,'design_en_changed'])
    designs2.at[index, "mapped_results"] = \
    preprocess.map_result_ner(designs2.loc[index,'design_en_changed'], designs2.loc[index,'ner_results'], \
                                  designs2.loc[index,'id'])

#designs2.to_csv('out200.csv', index=False)

In [None]:
# Display the subset of designs as a styled table.
designs2.style


In [66]:
# Deleting brackets and question marks

# iterrows() yields a copy of each row, so `row` never reflects a write made
# to `designs` inside the loop. All three characters are therefore removed
# from the text in one step; removing them one by one from the stale `row`
# would let each write overwrite the previous one.
unwanted_characters = str.maketrans("", "", "?()")

for index, row in designs.iterrows():
    designs.at[index, "design_en"] = row["design_en"].translate(unwanted_characters)

In [12]:
# Fixing "Horseman" problem: split the compound word into two words.

horseman_replacements = (
    ("Horseman", "Horse man"),
    ("horseman", "horse man"),
    ("Horsemen", "Horse men"),
    ("horsemen", "horse men"),
)

for index, row in designs.iterrows():
    # `row` is a copy that does not see writes made to `designs`, so the
    # replacements are chained on a local string and written back once.
    # Otherwise a design containing several variants would keep only the
    # last replacement.
    design = row["design_en"]
    for compound, separated in horseman_replacements:
        design = design.replace(compound, separated)
    designs.at[index, "design_en"] = design

In [18]:
# Deleting dots after roman numerals (I to V). A single execution removes
# all of them.

roman_numerals = ("I", "II", "III", "IV", "V")
agrippina_phrase = "Facing heads of Claudius and Agrippina II"

for index, row in designs.iterrows():
    original_design = row["design_en"]

    # `row` is a copy that does not see writes made to `designs`, so the
    # replacements are chained on a local string. Replacing on the stale
    # `row` each time would keep only the last replacement per run.
    design = original_design
    for numeral in roman_numerals:
        design = design.replace(" " + numeral + ".", " " + numeral)

    # Print every design that was changed by the dot removal.
    if design != original_design:
        print(design)

    # The dot directly after this phrase is put back after the removal.
    if agrippina_phrase in design:
        design = design.replace(agrippina_phrase, agrippina_phrase + ".")

    designs.at[index, "design_en"] = design
        

# Old Version

In [5]:
# Change alternative names to the standard name.
#
# For every design in `new_df`, each entity is checked in turn. If the
# entity's standard name is not yet in the text, the longest alternative
# name that occurs as a whole word is replaced by the standard name. Besides
# the spelling from the database, the variant with the first letter switched
# to the other case is tried; it is replaced by the standard name with its
# first letter cased the same way. At most one alternative name is replaced
# per entity. The result is written to 'out.csv'.
#
# Names are escaped before being put into a pattern, so characters such as
# "." or "(" are matched literally.
# NOTE: because of the trailing \b, a name that ends in a non-word character
# (e.g. ".") only matches when a word character follows it directly.
for i, (index, row) in enumerate(new_df.iterrows()):
    print(i)  # progress counter
    design = new_df.at[index, "design_en_changed"]

    for index2, row2 in df_entities.iterrows():
        standard_name = row2["name_en"]
        alternative_names = row2["alternativenames_en"]
        # Missing values can show up as None or NaN; only strings hold names.
        if not isinstance(standard_name, str) or not isinstance(alternative_names, str):
            continue

        # Nothing to do when the standard name is already used in the design.
        if re.search(r"\b%s\b" % re.escape(standard_name), design):
            continue

        # Try the longest alternative names first.
        alt_names = sorted(alternative_names.split(", "), key=len)
        alt_names.reverse()

        replaced = False
        for alt_name in alt_names:
            # An empty name would match at every word boundary.
            if not alt_name:
                continue

            # Spelling as stored, then the variant with the first letter in
            # the other case.
            candidates = [(alt_name, standard_name)]
            if alt_name[0].islower():
                candidates.append((alt_name[0].upper() + alt_name[1:],
                                   standard_name[0].upper() + standard_name[1:]))
            elif alt_name[0].isupper():
                candidates.append((alt_name[0].lower() + alt_name[1:],
                                   standard_name[0].lower() + standard_name[1:]))

            for found_name, replacement in candidates:
                pattern = r"\b%s\b" % re.escape(found_name)
                if re.search(pattern, design):
                    print("Alt found: "+ found_name + ", changed to: " + replacement)
                    # A callable replacement inserts the name verbatim, with
                    # no backslash or group-reference processing.
                    design = re.sub(pattern, lambda _match, text=replacement: text, design)
                    print(design)
                    replaced = True
                    break
            if replaced:
                break

    new_df.at[index, "design_en_changed"] = design

new_df.to_csv('out.csv', index=False)