developed by Patricia Klinger, modified by Sebastian Gampe, Kerim Gencer, Chrisowalandis Deligio

In [1]:
import os
import sys
sys.path.append('../')
import pandas as pd
import random
import numpy as np
import spacy
import swifter
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
#from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer
from cnt.preprocess import Preprocess
from cnt.io import  Database_Connection



import warnings
warnings.filterwarnings('ignore')

### Define the column names for the id and design column 

In [2]:
# Name of the column holding the design identifier.
id_col = "id"
# Name of the column holding the English design text.
design_col = "design_en"
# Switch for the optional stem/lemma annotation cell further down.
use_lemma_stem = False

In [3]:
# Language suffix appended to the language-specific entity columns.
language = "_en"
# Entity-table columns: the id plus the name / alternative-name columns
# for the selected language.
add_columns = ["id"] + [prefix + language for prefix in ("name", "alternativenames")]

In [4]:
# Connection URL with placeholder credentials. Format user:password@IP/Database
connection_url = "mysql+mysqlconnector://[user]:password@localhost"
dc = Database_Connection(connection_url)

In [5]:
# Load the designs through the project DB helper, requesting the id and
# design columns of the "nlp_training_designs" table.
designs = dc.load_designs_from_db("nlp_training_designs", [id_col, design_col])

In [6]:
# Preview the first five loaded designs.
designs.head(5)

In [7]:
# (rows, columns) of the loaded designs.
designs.shape

## Load and annotate designs

In [8]:
# Build the per-label entity lookup: the same helper call is issued once per
# label against "nlp_list_entities", in the order of the label tuple.
entities = {
    label: dc.load_entities_from_db_v2(
        "nlp_list_entities", label, add_columns, [add_columns[1]], ",", True
    )
    for label in ("PERSON", "OBJECT", "ANIMAL", "PLANT")
}

In [9]:
# Display the loaded entity lookup.
entities

In [10]:
# Annotate all designs, then keep only rows with at least one annotation.
annotated_designs = annotate_designs(entities, designs, id_col, design_col)
non_empty = annotated_designs["annotations"].map(len) > 0
annotated_designs = annotated_designs.loc[non_empty]

In [11]:
# Optional stem/lemma annotation pass; skipped unless `use_lemma_stem` is set.
if use_lemma_stem:
    # The top-of-file import of this class is commented out, so import it
    # here; otherwise enabling the flag fails with a NameError.
    from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer

    annotater = Stem_Lemma_Annotatizer() # parameter: method="lemma_stem", language="en", backbone="spacy_snowball"
    annotated_designs = annotater.annotate(annotated_designs, entities, id_col, design_col)

In [12]:
# Preview the first five annotated designs.
annotated_designs.head(5)

In [13]:
# (rows, columns) after dropping designs without annotations.
annotated_designs.shape

## Preprocessing

In [14]:
# Create the column for the preprocessed text; it is filled by the
# rule-application cell below.
annotated_designs["design_en_changed"] = ""

In [15]:
# Load the entity table with the id, name and alternative-name columns.
df_entities = dc.load_from_db("nlp_list_entities", add_columns)

In [16]:
# Add rules for preprocessing
preprocess = Preprocess()
preprocess.add_rule("horseman", "horse man")
preprocess.add_rule("horsemen", "horse men")

# Register every alternative entity name as a rule that maps it onto the
# entity's standard name.
for _, row in df_entities.iterrows():
    alt_names_raw = row["alternativenames_en"]
    # Missing values may arrive as None or NaN; only strings can be split.
    if not isinstance(alt_names_raw, str):
        continue
    standard_name = row["name_en"]
    for alt_name in alt_names_raw.split(", "):
        # An empty or blank alternative name (e.g. from an empty cell) would
        # register a rule keyed on an empty string, so skip it.
        if alt_name.strip():
            preprocess.add_rule(alt_name, standard_name)

#### Roman numerals still seem to cause minor problems, so they are handled manually once more

In [17]:
# Remove every rule whose key contains a dotted Roman numeral (" I." to " V.").
dotted_numerals = (" I.", " II.", " III.", " IV.", " V.")
rules_to_drop = [
    key for key in preprocess.rules
    if any(token in key for token in dotted_numerals)
]
for key in rules_to_drop:
    del preprocess.rules[key]

In [18]:
# Strip the trailing dot from Roman numerals (" I." -> " I", ..., " V." -> " V").
# The result is written to `annotated_designs`, the frame the following
# preprocessing cells read. The replacements are chained, so a design that
# contains several different numerals has all of them cleaned.
for numeral in ("I", "II", "III", "IV", "V"):
    annotated_designs["design_en"] = annotated_designs["design_en"].str.replace(
        " " + numeral + ".", " " + numeral, regex=False
    )

In [19]:
# Apply defined rules
def _apply_rules(row):
    """Return the first element of `preprocess.preprocess_design` for one row."""
    return preprocess.preprocess_design(row.design_en, row.id)[0]


annotated_designs["design_en_changed"] = annotated_designs.swifter.apply(_apply_rules, axis=1)

In [20]:
# Remove question marks and round brackets from the preprocessed text.
_chars_to_drop = str.maketrans("", "", "?()")
annotated_designs["design_en_changed"] = annotated_designs.swifter.apply(
    lambda row: row["design_en_changed"].translate(_chars_to_drop), axis=1
)

In [21]:
# Keep the untouched text and annotations under "*_orig" names and promote
# the preprocessed text to "design_en".
column_renames = {
    "design_en": "design_en_orig",
    "design_en_changed": "design_en",
    "annotations": "annotations_orig",
}
annotated_designs.rename(columns=column_renames, inplace=True)

In [22]:
# Render the first five rows as a styled table.
annotated_designs.head(5).style

In [23]:
# Re-annotate the preprocessed text, keep only designs that still carry
# annotations, and attach the new annotations by id (default inner merge).
preprocessed_input = annotated_designs[["id", "design_en"]]
train_designs = annotate_designs(entities, preprocessed_input, id_col, design_col)
train_designs = train_designs.loc[train_designs["annotations"].map(len) > 0]
annotated_designs = annotated_designs.merge(train_designs[["id", "annotations"]], on="id")

In [24]:
# Preview the merged frame (original and preprocessed text plus annotations).
annotated_designs.head(5)

In [25]:
# (rows, columns) of the training data after re-annotation.
annotated_designs.shape

## Train NER

In [26]:
from sklearn.model_selection import train_test_split

# 75/25 split with a fixed seed. Features keep id + design text, targets keep
# id + annotations; the test targets expose the annotations as column "y".
features = annotated_designs[[id_col, design_col]]
targets = annotated_designs[[id_col, "annotations"]]
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.25, random_state=12
)
y_test = y_test.rename(columns={"annotations": "y"})

In [27]:
# Give both test frames a fresh 0..n-1 index so they align row by row.
for frame in (X_test, y_test):
    frame.index = list(range(frame.shape[0]))

#### Output directory for the trained model

In [28]:
# Directory and model name passed to the estimator and to the model loader below.
output_dir =  "../cnt/trained_model/ner/english_new/"
model_name = "english_cno"

In [29]:
# Configure the estimator with the four entity labels and fit it on the
# training split.
ner_labels = ("PERSON", "OBJECT", "ANIMAL", "PLANT")
my_estimator = DesignEstimator(4, output_dir, model_name, id_col, design_col)
my_estimator.set_labels(*ner_labels)
my_estimator.fit(X_train, y_train["annotations"], "cnt")

## Load and evaluate model

In [30]:
# Load a model from the output directory using the same name and column settings.
model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

In [31]:
# Predict on the test designs; the result's "y" column is read further below.
x_predict = model.predict(X_test,as_doc=False)

In [32]:
# Display the predictions.
x_predict

In [33]:
# Metrics helper used by the scoring cells below.
metrics = Metrics()

In [34]:
# Score table comparing test annotations with predictions.
# NOTE(review): the labels come from `my_estimator`, not from the reloaded
# `model`, so this cell needs the training cell to have been run.
scores_frame = metrics.create_score_frame(y_test, x_predict, my_estimator.get_labels())
scores_frame

In [35]:
# Overall precision and recall of the predictions against the test annotations.
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [36]:
# Harmonic mean of precision and recall. When both are zero the ratio is
# undefined (0/0), so report 0.0 instead of dividing by zero.
F1 = (2*precision*recall) / (precision + recall) if (precision + recall) else 0.0

In [37]:
# Print each score as a percentage rounded to two decimals.
for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(metric_name, round(metric_value*100,2))

## Entity scores

In [38]:
# Preview the test annotations.
y_test.head(5)

In [39]:
# Preview the predictions.
x_predict.head(5)

In [40]:
# Attach annotations and predictions to the test designs. The assignment
# aligns on the index; X_test and y_test were re-indexed to 0..n-1 above.
# NOTE(review): presumably x_predict carries the same 0..n-1 index - confirm.
X_test["annotation"] = y_test["y"]
X_test["prediction"] = x_predict["y"]
X_test.head(2)

In [41]:
def get_text(design, ent_list):
    """Return the substrings of `design` addressed by the entries of `ent_list`.

    Each entry supplies the slice start at position 0 and the slice end at
    position 1; any further items in the entry are ignored. The result has one
    string per entry, in input order.
    """
    return [design[span[0]:span[1]] for span in ent_list]

In [42]:
# Turn the annotation and prediction spans into their covered text.
for span_col in ("annotation", "prediction"):
    X_test[span_col + "_str"] = X_test.apply(
        lambda row, col=span_col: get_text(row.design_en, row[col]), axis=1
    )

In [43]:
# Preview the test frame with the extracted entity strings.
X_test.head(2)

In [44]:
# Attach the training annotations to the training designs (aligned on index).
X_train["annotation"] = y_train["annotations"]

In [45]:
# Turn the training annotation spans into their covered text.
X_train["annotation_str"] = X_train.apply(lambda row: get_text(row.design_en, row.annotation), axis=1)

In [46]:
# One counter triple per distinct annotated string, test frame first:
# [annotated in test, predicted in test, annotated in train].
labels = {
    text: [0, 0, 0]
    for frame in (X_test, X_train)
    for annotated in frame["annotation_str"]
    for text in annotated
}

In [47]:
# Count how often each annotated string occurs in the test set (slot 0) and
# how often it is also among that row's predicted strings (slot 1).
for annotated, predicted in zip(X_test["annotation_str"], X_test["prediction_str"]):
    for text in annotated:
        counters = labels[text]
        counters[0] += 1
        if text in predicted:
            counters[1] += 1

In [48]:
# Count how often each annotated string occurs in the training set (slot 2).
for annotated in X_train["annotation_str"]:
    for text in annotated:
        labels[text][2] += 1


In [49]:
# One row per annotated string, with named counter columns.
score_columns = {0: "Annotation", 1: "Prediction", 2: "Total_in_train"}
label_scores = pd.DataFrame.from_dict(labels, orient="index").rename(columns=score_columns)

In [50]:
# Share of test annotations that were also predicted. Strings annotated only
# in the training set have zero test annotations and yield NaN.
label_scores["Accuracy"] = label_scores["Prediction"] / label_scores["Annotation"]

In [51]:
# Scores for all annotated strings containing "Alexander".
label_scores.loc[label_scores.index.str.contains("Alexander")]

In [52]:
# The ten annotated strings with the lowest accuracy.
label_scores.sort_values("Accuracy").head(10)

## Map result back

In [53]:
# Copy the test annotations into X_test under "y" for the map-back cells below.
X_test["y"] = y_test["y"]

In [54]:
# Display the test frame.
X_test

In [55]:
def _original_design(row):
    """Map the design text back when rules were applied to this id, else keep it."""
    if row.id in preprocess.rules_applied:
        return preprocess.map_back_design(row.design_en, row.id)
    return row.design_en


X_test["design_en_orig"] = X_test.swifter.apply(_original_design, axis=1)

In [56]:
def _original_annotations(row):
    """Map the annotations back when rules were applied to this id, else keep them."""
    if row.id in preprocess.rules_applied:
        return preprocess.map_result_ner(row.design_en,row.y, row.id)
    return row.y


X_test["y_orig"] = X_test.swifter.apply(_original_annotations, axis=1)

In [57]:
# Inspect test designs whose preprocessed text contains "Alexander III".
X_test.loc[X_test.design_en.str.contains("Alexander III")]

In [58]:
# Inspect the first five test designs whose preprocessed text contains "Veil".
X_test.loc[X_test.design_en.str.contains("Veil")].head(5)

## Visualize prediction

In [59]:
# Predict again with as_doc=True; the "y" column of the result is passed to
# displacy in the next cell.
x_predict_as_doc = model.predict(X_test, as_doc=True)

In [60]:
from spacy import displacy

# One highlight colour per entity label ("salmon" was misspelled, which is
# not a valid colour name).
colors = {'PERSON': 'mediumpurple','OBJECT': 'greenyellow', 'ANIMAL' : 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}
# displaCy reads the label filter from the "ents" option; the previous key
# "ent" was not a recognised option and was ignored.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}
# Render the first ten predicted documents inline in the notebook.
displacy.render(x_predict_as_doc.y[:10],
                style='ent', jupyter=True, options=options)

## Upload data to MySQL

In [61]:
# Set to True to run the upload cell below.
upload = False

In [62]:
# Predict on the full design table and store the result in MySQL.
# Runs only when `upload` is truthy.
if upload:
    # Placeholder connection string; replace it with the target database.
    dc = Database_Connection("mysql+mysqlconnector://YourConnection")
    cnt_designs = dc.load_designs_from_db("designs", [id_col, design_col])
    cnt_pred = my_estimator.predict_clear(cnt_designs)
    # Not referenced again in this cell.
    cnt_pred_predictions_only = cnt_pred["y"]

    # Flatten to one output row per (design, relation).
    # NOTE(review): the unpacking assumes cnt_pred has exactly two columns
    # (design id, list of relations) and that each relation has two items
    # (entity, label) - confirm against predict_clear.
    cnt_ner_output = pd.DataFrame(
        [
            (str(designid), *relation)
            for _, (designid, relation_list) in cnt_pred.iterrows()
            for relation in relation_list
        ],
        columns=["DesignID", "Entity", "Label_Entity"],
    )

    # Replaces any existing "cnt_pipeline_ner" table.
    cnt_ner_output.to_sql("cnt_pipeline_ner", dc.mysql_connection,
                          if_exists="replace", index=False)