developed by Patricia Klinger, modified by Sebastian Gampe, Kerim Gencer, Chrisowalandis Deligio

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
import spacy
from cnt.io import  Database_Connection
import warnings
warnings.filterwarnings('ignore')

### Define the column names for the id and design column 

In [2]:
# Column names used throughout the notebook: they are passed to the database
# loader, to annotate_designs, to DesignEstimator and to load_ner_model_v2.
# The "_de" suffix presumably marks the German design text - TODO confirm.
id_col = "id"
design_col = "design_de"

In [3]:
# Open the database connection used by every load_* call below.
# The URL here is a placeholder: replace it with real connection details
# before running, and avoid committing real credentials with the notebook.
dc =  Database_Connection("mysql+mysqlconnector://root:YourConnection") # Format user:password@IP/Database

In [4]:
# Load the id and design columns of the "data_designs" table.
# The result is later annotated (annotate_designs) and passed to model.predict.
designs = dc.load_designs_from_db("data_designs", [id_col, design_col])

## This step is optional - load additional data to save with the model

In [5]:
# Column-name suffix of the language-specific columns in the nlp_list_* tables.
language = "_ger"
# Columns to fetch for the optional entity information (name, alternatives, link).
add_columns = [f"name{language}", f"alternativenames{language}", "link"]

In [6]:
# One frame per entity table, in the order person, object, animal, plant.
# The person table uses different column names than the other three tables.
person_columns = ["name_german", "alternativenames", "link"]
entity_information = [dc.load_designs_from_db("nlp_list_person", person_columns)]
entity_information += [
    dc.load_designs_from_db(table_name, add_columns)
    for table_name in ("nlp_list_obj", "nlp_list_animal", "nlp_list_plant")
]

In [7]:
# Normalise the differing column names of the entity tables to
# "name" / "alternativenames" so the frames can be combined afterwards.
column_aliases = {
    "name_german": "name",
    "name"+language: "name",
    "alternativenames"+language: "alternativenames",
}
for entity_frame in entity_information:
    # Renamed in place: the frames stored in entity_information are modified.
    entity_frame.rename(columns=column_aliases, inplace=True)

In [8]:
# Build one table of optional entity information from all entity frames.
# DataFrame.append was removed in pandas 2.0, so the pieces are collected in a
# list and concatenated once. The empty seed frame keeps "name" and "link" as
# the leading columns and keeps the result defined if there are no frames.
optional_frames = [pd.DataFrame(columns=["name","link"])]
for df in entity_information:
    # Missing values are replaced by a single space before splitting.
    optional_frames.append(split_alternativenames(df.fillna(" ")))
optional_info = pd.concat(optional_frames)
optional_info

In [9]:
# Sanity check: look up the link stored for the entity named "Andromeda".
# .item() raises unless exactly one row matches.
is_andromeda = optional_info["name"] == "Andromeda"
optional_info.loc[is_andromeda, "link"].item()

### Load and annotate designs

In [10]:
# Column-name suffix of the language-specific columns in the nlp_list_* tables.
language = "_ger"
# Columns needed for annotation: the name and its alternative names (no link).
add_columns = [f"name{language}", f"alternativenames{language}"]

In [11]:
# Entity lists per label, loaded from the nlp_list_* tables.
# The person table has its own column names; the other tables share add_columns,
# whose second entry is the alternative-names column.
entities = {
    "PERSON": dc.load_entities_from_db(
        "nlp_list_person", ["name_german", "alternativenames"], ["alternativenames"], ",", True
    ),
}
entities.update({
    label: dc.load_entities_from_db(table_name, add_columns, [add_columns[1]], ",", True)
    for label, table_name in (
        ("OBJECT", "nlp_list_obj"),
        ("ANIMAL", "nlp_list_animal"),
        ("PLANT", "nlp_list_plant"),
    )
})

In [12]:
# Annotate every design with the known entities, then keep only the designs
# for which at least one annotation was found.
annotated_designs = annotate_designs(entities, designs, id_col, design_col)
has_annotations = annotated_designs.annotations.map(len) > 0
annotated_designs = annotated_designs[has_annotations]

In [13]:
# Show (rows, columns) of the designs that have at least one annotation.
annotated_designs.shape

In [14]:
# Preview the first five annotated designs.
annotated_designs.head(5)

## Train NER

In [15]:
from sklearn.model_selection import train_test_split

# Features are the id and design text; targets are the id and the annotations.
features = annotated_designs[[id_col, design_col]]
targets = annotated_designs[[id_col, "annotations"]]

# 75 % / 25 % split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.25,
    random_state=12,
)

# Only the test targets are renamed: the annotation column becomes "y".
y_test = y_test.rename(columns={"annotations": "y"})

In [16]:
# Renumber the test rows 0..n-1 so that X_test and y_test share the same
# positional index after the shuffled split.
X_test.index = list(range(len(X_test)))
y_test.index = list(range(len(y_test)))

#### Output directory for the trained model

In [17]:
# Directory and name of the NER model; both are passed to DesignEstimator for
# training and to load_ner_model_v2 when the model is loaded again.
output_dir =  "../cnt/trained_model/ner/german/"
model_name = "german_cno"

In [18]:
# Configure the estimator on top of the German spaCy model and attach the
# optional entity information so it is stored with the model.
# NOTE(review): the leading positional argument 4 is presumably the number of
# training iterations - confirm against DesignEstimator.
my_estimator = DesignEstimator(
    4,
    output_dir,
    model_name,
    id_col,
    design_col,
    spacy_model="de_core_news_sm",
    save_optional=True,
    optional_info=optional_info,
)

# Register the four entity labels and train on the training split.
ner_labels = ("PERSON", "OBJECT", "ANIMAL", "PLANT")
my_estimator.set_labels(*ner_labels)
my_estimator.fit(X_train, y_train["annotations"], "cnt")

## Load and evaluate model

In [19]:
# Load the model back from output_dir / model_name, using the same id and
# design column names as during training.
model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

In [20]:
# Predict entities for the held-out test designs (as_doc=False here; the
# visualisation further below uses as_doc=True instead).
x_predict = model.predict(X_test,as_doc=False)

In [21]:
# Display the predictions for the test designs.
x_predict

In [22]:
# Helper object used below to build the score frame and precision/recall.
metrics = Metrics()

In [23]:
# Build and display a score frame comparing the test targets with the
# predictions for the estimator's labels.
# NOTE(review): the labels are read from my_estimator (the training cell), not
# from the loaded `model`, so this cell needs the training cell to have run.
scores_frame = metrics.create_score_frame(y_test, x_predict, my_estimator.get_labels())
scores_frame

In [24]:
# Precision and recall of the predictions against the test targets;
# both values are used for the F1 computation and the printout below.
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [25]:
# F1 is the harmonic mean of precision and recall. If both are zero the
# formula would divide by zero, so F1 is reported as 0.0 in that case.
precision_recall_sum = precision + recall
F1 = (2*precision*recall) / precision_recall_sum if precision_recall_sum else 0.0

In [26]:
# Report each score as a percentage rounded to two decimals.
for score_name, score_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(score_name, round(score_value*100,2))

## Visualize prediction

In [27]:
# Predict on all loaded designs (not only the test split) and request the
# as_doc=True output, whose `y` column is rendered with displacy below.
x_predict_as_doc = model.predict(designs, as_doc=True)

In [28]:
from spacy import displacy

# Highlight colour per entity label (CSS colour names).
# Fixed: 'salmom' is not a valid CSS colour name; PLANT now uses 'salmon'.
colors = {'PERSON': 'mediumpurple','OBJECT': 'greenyellow', 'ANIMAL' : 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}

# Fixed: displaCy reads the entity filter from the 'ents' option; the former
# key 'ent' is not a documented option, so the filter had no effect.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}

# Render the predicted docs inline in the notebook.
displacy.render(x_predict_as_doc.y,
                style='ent', jupyter=True, options=options)