developed by Patricia Klinger, modified by Sebastian Gampe, Kerim Gencer, Chrisowalandis Deligio

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
from cnt.model import DesignEstimator, save_ner_model, load_ner_model,save_ner_model_v2, load_ner_model_v2
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, split_alternativenames)
from cnt.evaluate import Metrics
#from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer
from cnt.preprocess import Preprocess
import spacy
from cnt.io import  Database_Connection
import warnings

from database.connector import DatabaseConnector
from database.operations import load_entities_from_db, load_designs_from_db

warnings.filterwarnings('ignore')

### Define the column names for the id and design columns

In [72]:
sys.executable

In [73]:
spacy.__version__

In [74]:
pip list

In [75]:
# Notebook-wide settings: the names of the id and design columns, and the
# switch that enables the optional stem/lemma annotation step further down.
use_lemma_stem = False
id_col, design_col = "id", "design_en"

In [2]:
# Connection string for the database; replace the bracketed placeholders.
# Format user:password@IP/Database
connection_url = "mysql+mysqlconnector://[user]:[password]@localhost/nlp"
dc = Database_Connection(connection_url)

In [3]:
# Columns requested from the training table: the id, the design text and the
# free-text "comment" column.
design_columns = [id_col, design_col, "comment"]
designs = dc.load_designs_from_db("nlp_training_designs", design_columns)

In [78]:
designs

In [79]:
designs.head(5)

## This step is optional - load additional data to save with the model

## Load and annotate designs

In [80]:
# Language suffix appended to the entity-list column names.
language = "_en"
add_columns = [f"name{language}", f"alternativenames{language}"]

In [81]:
# Entity lists per NER label. The person table is read through the
# un-suffixed "name"/"alternativenames" columns; the other tables use the
# language-suffixed columns from add_columns.
entities = {
    "PERSON": dc.load_entities_from_db(
        "nlp_list_person", ["name", "alternativenames"], ["alternativenames"], ",", True
    ),
}
for entity_label, table_name in (
    ("OBJECT", "nlp_list_obj"),
    ("ANIMAL", "nlp_list_animal"),
    ("PLANT", "nlp_list_plant"),
):
    entities[entity_label] = dc.load_entities_from_db(
        table_name, add_columns, [add_columns[1]], ",", True
    )

In [82]:
entities

In [83]:
# Annotate every design with the known entities, then keep only the designs
# for which at least one annotation was found.
annotated_designs = annotate_designs(entities, designs, id_col, design_col)
has_annotations = annotated_designs.annotations.map(len) > 0
annotated_designs = annotated_designs[has_annotations]

In [84]:
if use_lemma_stem:
    # Imported lazily: the top-of-notebook import of this class is commented
    # out, so enabling use_lemma_stem would otherwise fail with a NameError.
    # NOTE(review): module path taken from that commented-out import — confirm
    # that cnt.stem_lemma_annotation is still shipped with the package.
    from cnt.stem_lemma_annotation import Stem_Lemma_Annotatizer

    annotater = Stem_Lemma_Annotatizer()  # parameter: method="lemma_stem", language="en", backbone="spacy_snowball"
    annotated_designs = annotater.annotate(annotated_designs, entities, id_col, design_col)

In [85]:
annotated_designs.head(5)

In [86]:
annotated_designs.shape

## Train NER

In [87]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the annotated designs for evaluation; the fixed
# random_state keeps the split reproducible between runs.
features = annotated_designs[[id_col, design_col]]
targets = annotated_designs[[id_col, "annotations"]]
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.25, random_state=12
)
# Only the test targets get their column renamed to "y"; the training
# targets keep the "annotations" column.
y_test = y_test.rename(columns={"annotations": "y"})

In [88]:
# Renumber the held-out rows 0..n-1 so X_test and y_test share the same
# consecutive index. reset_index(drop=True) replaces the hand-built list of
# integers and discards the old index instead of adding it as a column.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

#### output directory for the trained model

In [89]:
# Directory the trained model is written to and loaded from. Expressed
# relative to the notebook (same location the sys.path entry '../' points at)
# instead of an absolute, user-specific path, so the notebook runs on any
# checkout of the repository.
output_dir = os.path.join("..", "cnt", "trained_model", "ner", "english")
model_name = "english_cno"

In [90]:
annotated_designs[[id_col, design_col]]

In [91]:
annotated_designs[[id_col, 'annotations']]

In [92]:
# Entity labels the NER model is trained to recognise.
ner_labels = ("PERSON", "OBJECT", "ANIMAL", "PLANT")

# NOTE(review): the meaning of the leading 4 and of the trailing "cnt"
# argument is defined by DesignEstimator — confirm against cnt.model.
my_estimator = DesignEstimator(4, output_dir, model_name, id_col, design_col)
my_estimator.set_labels(*ner_labels)
my_estimator.fit(X_train, y_train.annotations, "cnt")

## Load and evaluate model

In [93]:
model = load_ner_model_v2(output_dir, model_name, id_col, design_col)

In [94]:
x_predict = model.predict(X_test,as_doc=False)

In [95]:
x_predict

In [96]:
metrics = Metrics()

In [97]:
scores_frame = metrics.create_score_frame(y_test, x_predict, my_estimator.get_labels())
scores_frame

In [98]:
precision, recall = metrics.score_precision_recall(y_test, x_predict)

In [99]:
# F1 is the harmonic mean of precision and recall. If both are zero the
# denominator vanishes, so F1 is defined as 0.0 instead of dividing by zero.
pr_sum = precision + recall
F1 = (2 * precision * recall) / pr_sum if pr_sum else 0.0

In [100]:
# Report each score as a percentage rounded to two decimals.
for metric_name, metric_value in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(metric_name, round(metric_value * 100, 2))

## Testset

In [101]:
y_test.head(5)

In [102]:
x_predict.head(5)

In [103]:
X_test["annotation"] = y_test["y"]
X_test["prediction"] = x_predict["y"]
X_test.head(2)

In [104]:
def get_text(design, ent_list):
    """Return the substrings of ``design`` selected by the entries of ``ent_list``.

    Each entry is indexed as ``(start, end, ...)``; only its first two items
    are read and used as slice bounds into ``design``. The result keeps the
    order of ``ent_list``.
    """
    return [design[span[0]:span[1]] for span in ent_list]

In [105]:
# Turn the annotated and the predicted spans of every test design into their
# surface strings. The design text is read through design_col rather than a
# hard-coded `design_en` attribute, so the cell keeps working when a different
# design column is configured at the top of the notebook.
X_test["annotation_str"] = X_test.apply(lambda row: get_text(row[design_col], row.annotation), axis=1)
X_test["prediction_str"] = X_test.apply(lambda row: get_text(row[design_col], row.prediction), axis=1)

In [106]:
X_test.head(2)

In [107]:
X_train["annotation"] = y_train["annotations"]

In [108]:
# Surface strings of the annotated spans in the training designs. The design
# text is read through design_col rather than a hard-coded `design_en`
# attribute, so a different configured design column is honoured here too.
X_train["annotation_str"] = X_train.apply(lambda row: get_text(row[design_col], row.annotation), axis=1)

In [109]:
# One counter triple per annotated entity string seen in the test or the
# training data: [annotated in test, predicted in test, annotated in train].
labels = {}
for frame in (X_test, X_train):
    for entity_texts in frame["annotation_str"]:
        for entity_text in entity_texts:
            labels[entity_text] = [0, 0, 0]

In [110]:
# Count, per entity string, how often it is annotated in the test set
# (slot 0) and how often that same string also appears among the predicted
# strings of the same design (slot 1).
for annotated, predicted in zip(X_test["annotation_str"], X_test["prediction_str"]):
    for entity_text in annotated:
        counters = labels[entity_text]
        counters[0] += 1
        if entity_text in predicted:
            counters[1] += 1

In [111]:
# Count how often each entity string is annotated in the training set (slot 2).
for annotated in X_train["annotation_str"]:
    for entity_text in annotated:
        labels[entity_text][2] += 1


In [112]:
# One row per entity string; the three counters become named columns.
# from_dict is a classmethod, so it is called on the class directly instead
# of on a throwaway empty DataFrame instance.
label_scores = pd.DataFrame.from_dict(labels, orient="index").rename(
    columns={0: "Annotation", 1: "Prediction", 2: "Total_in_train"}
)

In [113]:
# Share of the test-set annotations of each entity string that were also
# predicted. Computed as one vectorised column division instead of a row-wise
# apply; strings that occur only in the training data have Annotation == 0
# and Prediction == 0 and therefore yield NaN.
label_scores["Accuracy"] = label_scores["Prediction"] / label_scores["Annotation"]

In [114]:
label_scores.loc[label_scores.index.str.contains("Alexander")]

In [115]:
label_scores.sort_values("Accuracy").head(200).style

# Visualize prediction

In [116]:
x_predict_as_doc = model.predict(designs[:20], as_doc=True)

In [117]:
from spacy import displacy

# Highlight colour per entity label. 'salmon' fixes the misspelled colour
# name 'salmom', which is not a valid CSS colour.
colors = {'PERSON': 'mediumpurple', 'OBJECT': 'greenyellow', 'ANIMAL': 'orange', 'PLANT': 'salmon', 'VERBS': 'skyblue'}
# displaCy's entity visualizer reads the label filter from the 'ents' option;
# the previous key 'ent' is not a recognised option, so the filter was ignored.
options = {'ents': ['PERSON', 'OBJECT', 'ANIMAL', 'PLANT'], 'colors': colors}
# Render the first ten predicted docs inline in the notebook.
displacy.render(x_predict_as_doc.y[:10],
                style='ent', jupyter=True, options=options)

## Upload data to MySQL

In [None]:
upload = False

In [None]:
# Optional export of the predictions to the database; runs only when the
# `upload` flag is set to a truthy value.
if upload:
    dc = Database_Connection("mysql+mysqlconnector://YourConnection")
    cnt_designs = dc.load_designs_from_db("designs", [id_col, design_col])
    cnt_pred = my_estimator.predict_clear(cnt_designs)
    cnt_pred_predictions_only = cnt_pred["y"]

    # One output row per predicted entity of every design.
    # NOTE(review): assumes cnt_pred has exactly two columns (design id, list
    # of predictions) and that every prediction unpacks to (entity, label) so
    # that it matches the three output columns — confirm against predict_clear.
    cnt_ner_output = pd.DataFrame(
        [
            (str(designid), *relation)
            for _, (designid, relation_list) in cnt_pred.iterrows()
            for relation in relation_list
        ],
        columns=["DesignID", "Entity", "Label_Entity"],
    )

    # if_exists="replace" overwrites an existing cnt_pipeline_ner table.
    cnt_ner_output.to_sql("cnt_pipeline_ner", dc.mysql_connection,
                          if_exists="replace", index=False)