In [67]:
import sys
sys.path.append('../')
import pandas as pd
import random
import os
import numpy as np
import swifter

from cnt.model import (DesignEstimator, RelationExtractor, save_pipeline, load_pipeline, predict_re_single_sentence, 
relations_from_adjectives_df, relations_from_adjectives_single, concat_relations, load_ner_model_v2)
from cnt.annotate import (annotate, annotate_single_design, 
                          annotate_designs, 
                          extract_string_from_annotation, labeling_eng)
from cnt.extract_relation import (path, NERTransformer, FeatureExtractor)
from cnt.evaluate import Metrics
from cnt.vectorize import (Doc2Str, Path2Str, Verbs2Str, AveragedPath2Vec, 
                           AveragedRest2Vec)
from cnt.io import (replace_left_right)
from cnt.io import  Database_Connection
from cnt.preprocess import Preprocess


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import MultinomialNB
from itertools import product
import warnings
warnings.filterwarnings('ignore')

In [68]:
# Open the connection to the `nlp_challenge` MySQL database on localhost.
# URL layout: driver://user:password@host:port/database. [user] and [password] are
# placeholders: fill them in locally and do not commit real credentials.
dc =  Database_Connection("mysql+mysqlconnector://[user]:[password]@localhost:3306/nlp_challenge") # Format

In [69]:
# Names of the identifier column and of the English design-text column.
id_col, design_col = "id", "design_en"

In [70]:
# Language suffix of the entity columns, and the entity columns to load with it.
language = "_en"
add_columns = ["id"] + [prefix + language for prefix in ("name", "alternativenames")]

In [71]:
# Load the annotated relations: one row per entry of nlp_relation_extraction_en_v3.
# The design text (nlp_training_designs) and the names/classes of subject, predicate
# and object (nlp_list_entities) are resolved through correlated sub-selects.
train = dc.create_own_query("""select design_id, 
(select design_en from nlp_training_designs as nlp where re.design_id=nlp.id) as design_en,
(select name_en from nlp_list_entities as ner where ner.id=re.subject) as s, 
(select class from nlp_list_entities as ner where ner.id=re.subject) as subject_class, 
(select name_en from nlp_list_entities as ner where ner.id=re.predicate) as p, 
(select name_en from nlp_list_entities as ner where ner.id=re.object) as o, 
(select class from nlp_list_entities as ner where ner.id=re.object) as object_class
from nlp_relation_extraction_en_v3 as re;""")

Engine(mysql+mysqlconnector://saif:***@localhost:3306/nlp_challenge)


In [72]:
# Preview the first five relation rows returned by the query.
train.head(5)

Unnamed: 0,design_id,design_en,s,subject_class,p,o,object_class
0,6652,"Aequitas standing left, wearing chiton and him...",Aequitas,PERSON,wearing,chiton,OBJECT
1,6652,"Aequitas standing left, wearing chiton and him...",Aequitas,PERSON,wearing,himation,OBJECT
2,6652,"Aequitas standing left, wearing chiton and him...",Aequitas,PERSON,holding,scales,OBJECT
3,6652,"Aequitas standing left, wearing chiton and him...",Aequitas,PERSON,holding,cornucopia,OBJECT
4,9,Amphora with ribbed surface and crooked handle...,amphora,OBJECT,holding,poppy,PLANT


In [73]:
# Store each row's relation as a one-element list holding the tuple
# (subject, subject class, predicate, object, object class).
relation_fields = ["s", "subject_class", "p", "o", "object_class"]
train["y"] = train.apply(lambda rel: [tuple(rel[relation_fields])], axis=1)

In [74]:
# Check the new "y" column on the first two rows.
train.head(2)

Unnamed: 0,design_id,design_en,s,subject_class,p,o,object_class,y
0,6652,"Aequitas standing left, wearing chiton and him...",Aequitas,PERSON,wearing,chiton,OBJECT,"[(Aequitas, PERSON, wearing, chiton, OBJECT)]"
1,6652,"Aequitas standing left, wearing chiton and him...",Aequitas,PERSON,wearing,himation,OBJECT,"[(Aequitas, PERSON, wearing, himation, OBJECT)]"


In [75]:
# Collect all relations of a design in one list: "sum" over the one-element
# lists in "y" concatenates them per design_id.
tmp = train.groupby("design_id").agg({"y": "sum"})

In [76]:
# Spot-check the aggregated relations of design 1706; .style renders the cell untruncated.
tmp.loc[tmp.index==1706].style

Unnamed: 0_level_0,y
design_id,Unnamed: 1_level_1
1706,"[('Caracalla', 'PERSON', 'wearing', 'toga', 'OBJECT'), ('Caracalla', 'PERSON', 'holding', 'scroll', 'OBJECT'), ('Geta', 'PERSON', 'wearing', 'toga', 'OBJECT'), ('Geta', 'PERSON', 'holding', 'scroll', 'OBJECT')]"


In [77]:
# Start from the relation-level frame. The de-duplication per design is disabled,
# so X keeps one row per annotated relation.
X = train #.drop_duplicates("design_id",keep="first")

In [78]:
# Attach the per-design relation list to every row. The aggregated column arrives
# as "yy" because the right-hand suffix 'y' is appended to the clashing name "y".
X = X.merge(tmp, left_on="design_id", right_on="design_id", suffixes=('', 'y'))

In [79]:
# Keep only id, design text and the aggregated relation list, and name the latter "y" again.
X = X[["design_id", "design_en", "yy"]].rename(columns={"yy":"y"})

In [80]:
# Placeholder column; it is filled with the preprocessed design text further down.
X["design_en_changed"] = ""

In [81]:
# Display the assembled frame (pandas truncates the output).
X

Unnamed: 0,design_id,design_en,y,design_en_changed
0,6652,"Aequitas standing left, wearing chiton and him...","[(Aequitas, PERSON, wearing, chiton, OBJECT), ...",
1,6652,"Aequitas standing left, wearing chiton and him...","[(Aequitas, PERSON, wearing, chiton, OBJECT), ...",
2,6652,"Aequitas standing left, wearing chiton and him...","[(Aequitas, PERSON, wearing, chiton, OBJECT), ...",
3,6652,"Aequitas standing left, wearing chiton and him...","[(Aequitas, PERSON, wearing, chiton, OBJECT), ...",
4,9,Amphora with ribbed surface and crooked handle...,"[(amphora, OBJECT, holding, poppy, PLANT), (am...",
...,...,...,...,...
2407,27792,Nike holding jar.,"[(Nike, PERSON, holding, jar, OBJECT)]",
2408,27669,Athena resting on panther.,"[(Athena, PERSON, resting_on, panther, ANIMAL)]",
2409,27567,Olybrius receiving crown.,"[(Olybrius, PERSON, receiving, crown, OBJECT)]",
2410,27729,Apollo holding staff.,"[(Apollo, PERSON, holding, staff, OBJECT)]",


In [82]:
# Number of rows and columns of X.
X.shape

(2412, 4)

In [83]:
# Load the columns listed in add_columns from the nlp_list_entities table.
df_entities = dc.load_from_db("nlp_list_entities", add_columns)

In [84]:
# Register the preprocessing rules.
preprocess = Preprocess()
for compound, split_form in (("horseman", "horse man"), ("horsemen", "horse men")):
    preprocess.add_rule(compound, split_form)

# Map every alternative entity name onto the entity's standard name.
for _, entity in df_entities.iterrows():
    alternatives = entity["alternativenames_en"]
    if alternatives is None:
        continue
    canonical = entity["name_en"]
    for alias in alternatives.split(", "):
        preprocess.add_rule(alias, canonical)

#### Roman numerals still seem to cause minor problems, so they are handled manually once more

In [85]:
# Remove every rule whose key contains a dotted Roman numeral (" I." to " V.").
roman_with_dot = (" I.", " II.", " III.", " IV.", " V.")
stale_rules = [rule for rule in preprocess.rules if any(marker in rule for marker in roman_with_dot)]
for rule in stale_rules:
    del preprocess.rules[rule]

In [86]:
# Drop the dot after the Roman numerals I to V (e.g. " II." -> " II") in every design.
# Each pattern is applied to the already-updated column. The previous row-wise version
# always replaced on the original row value, so a design containing two different
# numerals (e.g. " I." and " II.") kept only the last replacement.
for numeral in ("I", "II", "III", "IV", "V"):
    # regex=False: the dot must be matched literally, not as "any character".
    X["design_en"] = X["design_en"].str.replace(" " + numeral + ".", " " + numeral, regex=False)

## Apply Preprocessing

In [87]:
# Apply defined rules
# preprocess_design receives the design text and its id; only the first element
# of its return value is stored.
X["design_en_changed"] = X.swifter.apply(lambda row: preprocess.preprocess_design(row.design_en, row.design_id)[0], axis=1)

Pandas Apply:   0%|          | 0/2412 [00:00<?, ?it/s]

In [88]:
# Strip question marks and round brackets from the preprocessed design text.
unwanted_chars = str.maketrans("", "", "?()")
X["design_en_changed"] = X.swifter.apply(lambda row: row["design_en_changed"].translate(unwanted_chars), axis=1)

Pandas Apply:   0%|          | 0/2412 [00:00<?, ?it/s]

In [89]:
# From here on "design_en" holds the preprocessed text; the raw text and the raw
# annotations stay available under the *_orig names.
X = X.rename(columns={"design_en_changed": "design_en", "design_en": "design_en_orig", "y": "annotations_orig"})

In [90]:
# Mapping GT
# Run the original annotations through preprocess.preprocess_re, together with the design id.
# NOTE(review): presumably this applies the same rules as for the design text so that
# entity names match; confirm in cnt.preprocess.
X["y"] = X.swifter.apply(lambda row: preprocess.preprocess_re(row["annotations_orig"], row.design_id), axis=1)

Pandas Apply:   0%|          | 0/2412 [00:00<?, ?it/s]

### Train the RE model

In [91]:
# Column names for training: id_col now points at "design_id" (it was "id" earlier in the notebook).
id_col, design_col = "design_id", "design_en"

In [92]:
# Feature side: the Path2Str output is vectorized as 1- to 3-gram counts.
string_converter = Path2Str(pos=True)
vectorizer = CountVectorizer(ngram_range=(1, 3))
feature = make_pipeline(string_converter, vectorizer)

# Classifier side (alternative kept for reference: RandomForestClassifier()).
classifier = LogisticRegression(max_iter=1000)


In [93]:
# Show the feature pipeline.
feature

In [94]:
# Hold out 25 % of the rows as test set, with a fixed random_state so the split is repeatable.
# NOTE(review): X still has one row per relation (the drop_duplicates on design_id is
# commented out), so rows of the same design can end up in both splits. TODO confirm
# this is intended, since it may inflate the test scores.
X_train, X_test, y_train, y_test = train_test_split(X[[id_col, design_col]], X[[id_col, "y"]], test_size=0.25, random_state=33)

#### load pretrained NER-Model

In [95]:
# Location and name of the pretrained NER model.
#ner_model_directory = "../cnt/trained_model/ner/english_new/"
ner_model_directory, ner_model_name = "../results/trained_model/ner/", "english_cno"

#### define RE-Model path

In [96]:
# Location and name under which the RE model is stored.
#re_model_directory = "../cnt/trained_model/re/"
re_model_directory, re_model_name = "../results/trained_model/re/", "english_cno"

In [97]:
# Assemble the end-to-end pipeline from the three cnt stages, in this order:
# NERTransformer -> FeatureExtractor -> RelationExtractor (which wraps feature + classifier).
inner_pipeline = make_pipeline(feature, classifier)
ner_stage = NERTransformer(ner_model_directory, ner_model_name, id_col, design_col)
feature_stage = FeatureExtractor(ner_model_directory, ner_model_name, id_col, design_col)
relation_stage = RelationExtractor(inner_pipeline, re_model_directory, re_model_name, id_col)
pipeline = make_pipeline(ner_stage, feature_stage, relation_stage)


In [98]:
# Display the training inputs (id and preprocessed design text).
X_train

Unnamed: 0,design_id,design_en
1511,5047,"Nude Hermes seated_on on a ram advancing left,..."
23,67,Apollo and Artemis in front of large conical t...
1208,1320,Wreath emperor Severus Alexander seated_on rig...
1179,1712,"Wreath bust of bearded Geta, right, wearing cu..."
2259,27577,Rhoemetalces I receiving crown.
...,...,...
102,1711,"Artemis Tauropolos advancing right, wearing a ..."
2243,27604,Apollo feeding panther.
57,2450,"Ares standing facing, head left, wearing cuira..."
578,2507,"Demeter seated_on left on basket, wearing corn..."


In [99]:
# Display the training targets (id and relation list).
y_train

Unnamed: 0,design_id,y
1511,5047,"[(Hermes, PERSON, seated_on, ram, ANIMAL), (He..."
23,67,"[(Apollo, PERSON, holding, spear, OBJECT), (Ap..."
1208,1320,"[(Severus Alexander, PERSON, wearing, cuirass,..."
1179,1712,"[(Geta, PERSON, wearing, cuirass, OBJECT), (Ge..."
2259,27577,"[(Rhoemetalces I, PERSON, receiving, crown, OB..."
...,...,...
102,1711,"[(Artemis, PERSON, wearing, chiton, OBJECT), (..."
2243,27604,"[(Apollo, PERSON, feeding, panther, ANIMAL)]"
57,2450,"[(Ares, PERSON, wearing, cuirass, OBJECT), (Ar..."
578,2507,"[(Demeter, PERSON, seated_on, basket, OBJECT),..."


In [100]:
# Fit the complete pipeline on the training split.
pipeline.fit(X_train, y_train)

## Save and Load model

In [101]:
# Persist the fitted pipeline using re_model_directory and re_model_name.
save_pipeline(pipeline, re_model_directory, re_model_name)

In [102]:
# Reload the pipeline that was just saved; all predictions below use this reloaded object.
model = load_pipeline(re_model_directory, re_model_name)

## Predict

In [103]:
# Predict the relations for the test split.
y_pred = model.predict(X_test)

In [1]:
# Run the relation extractor on one free-text design. The column names are passed via
# id_col / design_col, consistent with the single-sentence call at the end of the notebook.
# NOTE(review): the NameError recorded for this cell apparently comes from a stale run
# (execution count 1) made before the import cell; re-run the notebook in order to refresh it.
example = predict_re_single_sentence(model, "At left, Demeter standing facing, head right, wearing wreath, holding long lit torch in left arm and two corn with right hand over cista mystica on the left with half-open lid, to right, out of which serpent creeps, right; at right, Homonoia with kalathos standing facing, head left, holding a patera in outstretched right hand over lighted and garland altar to left and cornucopia in left arm. Ground line. Border of dots.", id_col, design_col)

NameError: name 'predict_re_single_sentence' is not defined

In [39]:
# Show the relations predicted for the example sentence.
example

In [104]:
# Helper object from cnt.evaluate used for scoring below.
metrics = Metrics()

In [105]:
# Score the predictions against the ground truth of the test split.
precision, recall = metrics.score_precision_recall(y_test, y_pred)
# F1 is the harmonic mean of precision and recall. Define it as 0 when both are 0
# instead of dividing by zero.
F1 = (2*precision*recall) / (precision + recall) if (precision + recall) else 0.0

In [42]:
# Display the test inputs.
X_test

In [43]:
# Display the ground-truth relations of the test split.
y_test

In [44]:
# Display the predicted relations of the test split.
y_pred

In [106]:
# Report the three scores as percentages with two decimals.
for label, score in (("Precision", precision), ("Recall", recall), ("F1", F1)):
    print(label, round(score*100, 2))

Precision 95.44
Recall 81.19
F1 87.74


## Map back

In [46]:
# Copy the ground-truth relations into X_test (pandas aligns the assignment on the index).
X_test["y"] = y_test["y"]

In [47]:
# Map the relations back with preprocess.map_re, row by row.
# NOTE(review): presumably this restores the original entity names that the preprocessing
# rules replaced; confirm in cnt.preprocess.
X_test["y_mapped"] = X_test.swifter.apply(lambda row: preprocess.map_re(row["y"], row.design_id), axis=1)

In [48]:
# Preview the test frame with the added "y" and "y_mapped" columns.
X_test.head(5)

## Auto relations

In [49]:
# Adjective -> (predicate, object, position) table passed to the relations_from_adjectives_* helpers.
# NOTE(review): "before"/"after" presumably gives the adjective's position relative to
# the entity; confirm in cnt.model.
obj_list = dict(
    veiled=("wearing", "Veil", "before"),
    draped=("wearing", "Clothing", "before"),
    helmeted=("wearing", "Helmet", "before"),
    diademed=("wearing", "Diadem", "before"),
    turreted=("wearing", "Mural crown", "before"),
    enthroned=("seated_on", "Throne", "after"),
)

In [50]:
# Add the design text to the prediction frame; the adjective-based step below reads it from "design_en".
y_pred["design_en"] = X_test.design_en

In [51]:
# Run relations_from_adjectives_df over the predictions, considering only PERSON entities.
y_pred = relations_from_adjectives_df(
    y_pred, "design_en", "y",
    ner_model_directory, ner_model_name,
    id_col, design_col,
    obj_list,
    entities_to_consider=["PERSON"],
)

In [52]:
# Display the predictions returned by the adjective-based step.
y_pred

In [53]:
# Single-design demo: derive relations from the adjectives, predict relations with
# the trained model, then combine both results with concat_relations.
design = "Diademed Athena to the left and helmeted Ares to the right, holding swo."
auto_relations = relations_from_adjectives_single(design,ner_model_directory, ner_model_name, id_col, design_col, obj_list)
model_relations = predict_re_single_sentence(model, design, id_col, design_col)
# Last expression of the cell: the combined relations are shown as the cell output.
concat_relations(auto_relations, model_relations)