In [None]:
import numpy as np
np.random.seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

import os
import gc 
import math

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0";  

import glob
import datetime
import zipfile

import numpy as np
import pandas as pd
import re
import spacy
import math
from collections import Counter
from ftfy import fix_text

from gensim.models.phrases import Phrases, Phraser
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText 

from keras.preprocessing.sequence import make_sampling_table, skipgrams
from keras.layers import Input, Dense, Embedding, Activation, Dot, Flatten, GRU, Dropout, LSTM, Bidirectional, GlobalMaxPooling1D, Concatenate, Lambda, CuDNNGRU, CuDNNLSTM, GaussianNoise, GaussianDropout, Conv1D, BatchNormalization, Softmax
from keras.models import Model
from keras.optimizers import RMSprop
from keras.models import load_model
from keras.utils import to_categorical
from keras.activations import softmax

from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

from sklearn.metrics import f1_score


In [None]:
# Load the spaCy English pipeline (handed to sub_entities / task3_listify as `spacy_model`)
# and the pre-trained fastText vectors used to embed tokens further down.
# NOTE(review): "wiki.en.bin" is a relative path, so it is resolved against the kernel's
# working directory. The rest of the notebook pads with np.zeros(300), so the vectors are
# presumably 300-dimensional -- confirm against the model file.
nlp = spacy.load("en_core_web_lg")
fasttext = FastText.load_fasttext_format("wiki.en.bin")

In [None]:
# Matches any "<...>" markup tag; used by remove_tags.
TAG_RE = re.compile(r'<[^>]+>')

# Maps spaCy entity labels to a plain-English placeholder word.
# In the visible part of this notebook it is referenced only by the commented-out
# substitution code inside sub_entities.
entity_map = {"PERSON":"person",
              "NORP":"nationality",
              "FAC":"facility",
              "ORG":"organization",
              "GPE":"government",
              "LOC":"location",
              "PRODUCT":"product",
              "EVENT":"event",
              "WORK_OF_ART":"artwork",
              "LAW":"law",
              "LANGUAGE":"language",
              "DATE":"date",
              "TIME":"time",
              "PERCENT":"percent",
              "MONEY":"money",
              "QUANTITY":"amount",
              "ORDINAL":"first",
              "CARDINAL":"number"}

def remove_tags(text):
    """Replace every markup tag matched by ``TAG_RE`` with a single space."""
    without_tags = TAG_RE.sub(' ', text)
    return without_tags

def strip_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    pieces = text.split()
    return " ".join(pieces)

def clean_text(text):
    """Normalise a raw document string.

    The steps run in this order: markup tags are replaced by spaces, whitespace
    is collapsed, ``fix_text`` (ftfy) is applied, and finally every character
    other than ASCII letters, digits and spaces is removed.
    """
    untagged = remove_tags(text)
    collapsed = strip_whitespace(untagged)
    repaired = fix_text(collapsed)
    return remove_punctuation(repaired)

def remove_punctuation(text):
    """Delete every character that is not an ASCII letter, an ASCII digit or a space."""
    disallowed = re.compile(r"[^A-Za-z0-9 ]")
    return disallowed.sub("", text)

def sub_entities(text, spacy_model):
    """Return ``text`` unchanged.

    Entity substitution is currently switched off: the implementation that
    replaced each spaCy entity span with its ``entity_map`` placeholder is kept
    below as a comment only. ``spacy_model`` is accepted so existing call sites
    keep working, but it is not used.
    """
    # Disabled implementation, kept for reference:
    #     replacements = [(e.text,e.label_) for e in spacy_model(text).ents]
    #     replacements = sorted(replacements, key=lambda x: len(x[0]), reverse=True)
    #     replacements = [(a,entity_map[b]) for a,b in replacements]
    #     for k,v in replacements:
    #         text = re.sub(r"\b"+re.escape(k)+r"\b", v, text)
    #     # text = re.sub(r"[^a-zA-Z0-9 ]","",text).lower()
    return text

def df_to_dic(df):
    """Index the rows of ``df`` by their ``id`` column.

    Returns a dict mapping each row's ``id`` to a dict holding that row's
    ``label``, ``text`` and ``url`` values. Other columns are ignored. If an id
    occurs more than once, the last row with that id wins.
    """
    records = {}
    for _, row in df.iterrows():
        key = row["id"]
        records[key] = {}
        records[key]["label"] = row["label"]
        records[key]["text"] = row["text"]
        records[key]["url"] = row["url"]
    return records

def task3_listify(df,spacy_model):
    """Turn a token-per-row frame into one record per sentence.

    ``df`` needs the columns ``word``, ``label`` and ``sentence``. For every
    sentence index from 0 up to the largest value in ``sentence`` a dict is
    produced with:

    * ``"text"``     -- the sentence's words, in row order;
    * ``"y"``        -- the matching labels, with ``None`` replaced by ``"O"``;
    * ``"entities"`` -- per word, the label of a spaCy entity containing that
      word, or ``"O"``.

    Entity labels are matched by word string, not by position: each entity text
    is split on single spaces (after appending one trailing space, so the empty
    string is also registered) and every resulting piece is mapped to the
    entity's label. When a piece occurs in several entities the last one wins.
    """
    samples = []
    last_sentence = max(df["sentence"].tolist())
    for sentence_id in range(last_sentence + 1):
        rows = df.loc[df.sentence==sentence_id]
        words = rows["word"].tolist()
        labels = ["O" if label is None else label for label in rows["label"].tolist()]

        # Word string -> entity label for every entity spaCy finds in the joined sentence.
        label_by_piece = {}
        for entity in spacy_model(' '.join(words)).ents:
            for piece in (entity.text+" ").split(" "):
                label_by_piece[piece] = entity.label_

        entities = [label_by_piece.get(word, "O") for word in words]
        samples.append({"text":list(words), "y":labels, "entities":entities})
    return samples

In [None]:
## News Data

file_name_train_task1 = "data/Document/train_filled.json"
file_name_dev_task1 = "data/Document/dev_filled.json"
file_name_test_task1 = "data/Document/test_filled.json"
file_name_china_task1 = "data/Document/test_china_filled.json"

file_name_train_task2 = "data/Sentence/train_filled.json"
file_name_dev_task2 = "data/Sentence/dev_filled.json"
file_name_test_task2 = "data/Sentence/test_filled.json"
file_name_china_task2 = "data/Sentence/test_china_filled.json"

## JSON - to - DF

def _load_json_frame(path, source_column, labelled=True):
    """Read a JSON-lines file and (re)write its ``text`` column with cleaned text.

    path          -- JSON-lines file, one record per line.
    source_column -- column holding the raw text ("text" for the document task,
                     "sentence" for the sentence task).
    labelled      -- when False, the ``label`` column is set to -1 for every row
                     (used for the test sets).

    Returns the resulting DataFrame.
    """
    frame = pd.read_json(path, orient="records", lines = True)
    frame["text"] = [sub_entities(clean_text(t), nlp) for t in frame[source_column]]
    if not labelled:
        frame["label"] = -1
    return frame

# Task 1 (documents): the raw text lives in the "text" column.
df_train_task1 = _load_json_frame(file_name_train_task1, "text")
df_dev_task1 = _load_json_frame(file_name_dev_task1, "text")
df_test_task1 = _load_json_frame(file_name_test_task1, "text", labelled=False)

# Task 2 (sentences): the raw text lives in the "sentence" column.
df_train_task2 = _load_json_frame(file_name_train_task2, "sentence")
df_dev_task2 = _load_json_frame(file_name_dev_task2, "sentence")
df_test_task2 = _load_json_frame(file_name_test_task2, "sentence", labelled=False)

# China test sets for both tasks; label is set to -1 like the other test sets.
df_china_task1 = _load_json_frame(file_name_china_task1, "text", labelled=False)
df_china_task2 = _load_json_frame(file_name_china_task2, "sentence", labelled=False)

In [None]:
file_name_train_task3 = "task3public9may/train.txt"
file_name_dev_task3 = "task3public9may/dev.txt"
file_name_test_task3 = "task3public9may/task3_test.data"
file_name_china_task3 = "task3public9may/china_test.data"

def _read_task3_labelled(path):
    """Read a tab-separated "word<TAB>label" file into a (word, label) DataFrame.

    Words are stripped of surrounding whitespace. Labels are passed through
    ``str``, so a line without a label column ends up with the string form of
    the missing value (``"None"`` when pandas fills the gap with ``None``).
    """
    with open(path) as f:
        rows = [line.split("\t") for line in f.read().splitlines()]
    frame = pd.DataFrame(rows, columns=["word","label"])
    frame["word"] = [word.strip() for word in frame["word"].tolist()]
    frame["label"] = [str(label) if label!="" else "" for label in frame["label"].tolist()]
    return frame

def _read_task3_unlabelled(path):
    """Read a one-word-per-line file into a (word, label) DataFrame with label -1."""
    with open(path) as f:
        return pd.DataFrame([(line.strip(),-1) for line in f.readlines()], columns=["word","label"])

df_train_task3 = _read_task3_labelled(file_name_train_task3)
df_dev_task3 = _read_task3_labelled(file_name_dev_task3)
df_test_task3 = _read_task3_unlabelled(file_name_test_task3)
df_china_task3 = _read_task3_unlabelled(file_name_china_task3)

# Every "SAMPLE_START" row opens a new sentence; rows before the first marker get index -1.
df_train_task3["sentence"] = (df_train_task3["word"]=="SAMPLE_START").cumsum() - 1
df_dev_task3["sentence"] = (df_dev_task3["word"]=="SAMPLE_START").cumsum() - 1
df_test_task3["sentence"] = (df_test_task3["word"]=="SAMPLE_START").cumsum() - 1
df_china_task3["sentence"] = (df_china_task3["word"]=="SAMPLE_START").cumsum() - 1

# Label vocabulary from train + dev. The indices are assigned in sorted order: iterating the
# set directly would give a different label -> index mapping on each interpreter run (string
# hashing is randomised), which makes trained weights and saved predictions non-reproducible.
task3_y_values = set([a for a in df_train_task3["label"].tolist() + df_dev_task3["label"].tolist() if a is not None])
task3_y_fwd_dict = {label: index for index, label in enumerate(sorted(task3_y_values))}
task3_y_rev_dict = {index: label for label, index in task3_y_fwd_dict.items()}

train_task3 = task3_listify(df_train_task3,nlp)
test_task3 = task3_listify(df_test_task3,nlp)
dev_task3 = task3_listify(df_dev_task3,nlp)
china_task3 = task3_listify(df_china_task3,nlp)

# Entity-tag vocabulary over all four splits, indexed in sorted order for the same reason.
task3_ent_values = set(item for sublist in
                       [a['entities'] for a in train_task3] +
                       [a['entities'] for a in test_task3] +
                       [a['entities'] for a in dev_task3] +
                       [a['entities'] for a in china_task3]
                       for item in sublist)
task3_ent_fwd_dict = {tag: index for index, tag in enumerate(sorted(task3_ent_values))}
task3_ent_rev_dict = {index: tag for tag, index in task3_ent_fwd_dict.items()}

In [None]:
# Sanity check of the loaded data: first rows of a task-1 frame and of the task-3 token
# frame, the first dev sentence record, and the label -> index mapping.
print(df_china_task1.head())
print(df_train_task3.head())
print(dev_task3[0])
print(task3_y_fwd_dict)

In [None]:
def try_pass(text, model):
    """Look up ``model[text]``; return ``None`` when the lookup fails.

    Only ordinary errors (``Exception`` subclasses such as ``KeyError``) are
    treated as "no vector". ``KeyboardInterrupt`` and ``SystemExit`` propagate,
    so a long embedding loop can still be interrupted.
    """
    try:
        return model[text]
    except Exception:
        return None
def try_zeros(text, model, dim=300):
    """Look up ``model[text]``; return a zero vector of length ``dim`` when it fails.

    Only ordinary errors (``Exception`` subclasses such as ``KeyError``) trigger
    the zero-vector fallback. ``KeyboardInterrupt`` and ``SystemExit`` propagate,
    so a long embedding loop can still be interrupted.
    """
    try:
        return model[text]
    except Exception:
        return np.zeros(dim)

In [None]:
## Dataframe to Dictionary

dict_train_task1 = df_to_dic(df_train_task1)
dict_test_task1 = df_to_dic(df_test_task1)
dict_dev_task1 = df_to_dic(df_dev_task1)
dict_china_task1 = df_to_dic(df_china_task1)

dict_train_task2 = df_to_dic(df_train_task2)
dict_test_task2 = df_to_dic(df_test_task2)
dict_dev_task2 = df_to_dic(df_dev_task2)
dict_china_task2 = df_to_dic(df_china_task2)

## Sub in Word Vectors

def _embed_documents(documents, model):
    """Add a "tokenized" entry to every record of ``documents``, in place.

    The record's "text" is split on whitespace and each token is looked up with
    ``try_pass``; tokens for which the lookup returns ``None`` are dropped, so
    "tokenized" can be shorter than the token list.
    """
    for record in documents.values():
        vectors = [try_pass(token, model) for token in record["text"].split()]
        record["tokenized"] = [vector for vector in vectors if vector is not None]

def _embed_sentences(samples, model):
    """Return copies of the task-3 records with a "tokenized" list of word vectors.

    Every word keeps its position: words whose lookup fails get the zero vector
    returned by ``try_zeros``, so "tokenized" lines up with "text", "y" and
    "entities".
    """
    return [dict(sample, tokenized=[try_zeros(word, model) for word in sample["text"]])
            for sample in samples]

for _documents in (dict_train_task1, dict_test_task1, dict_dev_task1, dict_china_task1,
                   dict_train_task2, dict_test_task2, dict_dev_task2, dict_china_task2):
    _embed_documents(_documents, fasttext)

train_task3 = _embed_sentences(train_task3, fasttext)
test_task3 = _embed_sentences(test_task3, fasttext)
dev_task3 = _embed_sentences(dev_task3, fasttext)
china_task3 = _embed_sentences(china_task3, fasttext)


In [None]:
def _left_pad_vectors(sequences, target_len, dim=300):
    """Left-pad each list of word vectors with zero vectors and stack the result.

    Every sequence is extended at the front with ``np.zeros(dim)`` entries until
    it holds ``target_len`` items, then all sequences are stacked into one array
    (first axis = sequence, second axis = position).
    """
    return np.stack([[np.zeros(dim)]*(target_len-len(seq))+seq for seq in sequences])

def _left_pad_ids(sequences, target_len, lookup, pad_label="O"):
    """Left-pad each label list with ``pad_label``, map labels through ``lookup``, stack.

    ``np.stack`` is given a list on purpose: passing it a generator is
    deprecated since NumPy 1.16 and rejected by later releases.
    """
    return np.stack([np.array([lookup[label] for label in [pad_label]*(target_len-len(seq))+seq])
                     for seq in sequences])

##
## Document-level classifier (task 1)
##
seq_x_train_task1 = [record["tokenized"] for record in dict_train_task1.values()]
seq_y_train_task1 = np.array([record["label"] for record in dict_train_task1.values()])
max_len_train_task1 = max([len(x) for x in seq_x_train_task1])

seq_x_dev_task1 = [record["tokenized"] for record in dict_dev_task1.values()]
seq_y_dev_task1 = np.array([record["label"] for record in dict_dev_task1.values()])
max_len_dev_task1 = max([len(x) for x in seq_x_dev_task1])

seq_x_test_task1 = [record["tokenized"] for record in dict_test_task1.values()]
max_len_test_task1 = max([len(x) for x in seq_x_test_task1])

seq_x_china_task1 = [record["tokenized"] for record in dict_china_task1.values()]
max_len_china_task1 = max([len(x) for x in seq_x_china_task1])

# All four splits are padded to the longest document found in any of them.
max_len_task1 = max(max_len_dev_task1, max_len_train_task1, max_len_test_task1, max_len_china_task1)
seq_x_train_task1 = _left_pad_vectors(seq_x_train_task1, max_len_task1)
seq_x_dev_task1 = _left_pad_vectors(seq_x_dev_task1, max_len_task1)
seq_x_test_task1 = _left_pad_vectors(seq_x_test_task1, max_len_task1)
seq_x_china_task1 = _left_pad_vectors(seq_x_china_task1, max_len_task1)

print(max_len_task1)

##
## Sentence-level classifier (task 2)
##
seq_x_train_task2 = [record["tokenized"] for record in dict_train_task2.values()]
seq_y_train_task2 = np.array([record["label"] for record in dict_train_task2.values()])
max_len_train_task2 = max([len(x) for x in seq_x_train_task2])

seq_x_dev_task2 = [record["tokenized"] for record in dict_dev_task2.values()]
seq_y_dev_task2 = np.array([record["label"] for record in dict_dev_task2.values()])
max_len_dev_task2 = max([len(x) for x in seq_x_dev_task2])

seq_x_test_task2 = [record["tokenized"] for record in dict_test_task2.values()]
max_len_test_task2 = max([len(x) for x in seq_x_test_task2])

seq_x_china_task2 = [record["tokenized"] for record in dict_china_task2.values()]
max_len_china_task2 = max([len(x) for x in seq_x_china_task2])

max_len_task2 = max(max_len_dev_task2, max_len_train_task2, max_len_test_task2, max_len_china_task2)
seq_x_train_task2 = _left_pad_vectors(seq_x_train_task2, max_len_task2)
seq_x_dev_task2 = _left_pad_vectors(seq_x_dev_task2, max_len_task2)
seq_x_test_task2 = _left_pad_vectors(seq_x_test_task2, max_len_task2)
seq_x_china_task2 = _left_pad_vectors(seq_x_china_task2, max_len_task2)

print(max_len_task2)

##
## Word-level classifier
##
seq_x_train_task3 = [v["tokenized"] for v in train_task3]
seq_x_dev_task3 = [v["tokenized"] for v in dev_task3]
seq_x_test_task3 = [v["tokenized"] for v in test_task3]
seq_x_china_task3 = [v["tokenized"] for v in china_task3]
seq_y_train_task3 = [v["y"] for v in train_task3]
seq_y_dev_task3 = [v["y"] for v in dev_task3]

seq_ent_train_task3 = [v["entities"] for v in train_task3]
seq_ent_dev_task3 = [v["entities"] for v in dev_task3]
seq_ent_test_task3 = [v["entities"] for v in test_task3]
seq_ent_china_task3 = [v["entities"] for v in china_task3]

max_len_task3 = max([len(a) for a in seq_x_train_task3 + seq_x_dev_task3 + seq_x_test_task3 + seq_x_china_task3])

seq_x_train_task3 = _left_pad_vectors(seq_x_train_task3, max_len_task3)
seq_x_dev_task3 = _left_pad_vectors(seq_x_dev_task3, max_len_task3)
seq_x_test_task3 = _left_pad_vectors(seq_x_test_task3, max_len_task3)
seq_x_china_task3 = _left_pad_vectors(seq_x_china_task3, max_len_task3)

# Gold labels and entity tags are padded with "O" and converted to integer ids.
seq_y_train_task3 = _left_pad_ids(seq_y_train_task3, max_len_task3, task3_y_fwd_dict)
seq_y_dev_task3 = _left_pad_ids(seq_y_dev_task3, max_len_task3, task3_y_fwd_dict)

seq_ent_train_task3 = _left_pad_ids(seq_ent_train_task3, max_len_task3, task3_ent_fwd_dict)
seq_ent_dev_task3 = _left_pad_ids(seq_ent_dev_task3, max_len_task3, task3_ent_fwd_dict)
seq_ent_test_task3 = _left_pad_ids(seq_ent_test_task3, max_len_task3, task3_ent_fwd_dict)
seq_ent_china_task3 = _left_pad_ids(seq_ent_china_task3, max_len_task3, task3_ent_fwd_dict)

print(max_len_task3)

In [None]:
# One-hot encode the padded task-3 label ids.
# This cell overwrites seq_y_train_task3 / seq_y_dev_task3, so it must run exactly once
# after the padding cell.
_n_task3_labels = len(task3_y_fwd_dict)
seq_y_train_task3 = np.stack([to_categorical(label_ids, _n_task3_labels) for label_ids in seq_y_train_task3])
seq_y_dev_task3 = np.stack([to_categorical(label_ids, _n_task3_labels) for label_ids in seq_y_dev_task3])
for _encoded in (seq_y_train_task3, seq_y_dev_task3, seq_ent_train_task3):
    print(_encoded.shape)

In [None]:
# Drop the intermediate DataFrames that are no longer needed, then force a collection.
# fasttext is kept on purpose (its deletion stays commented out), and the china task-1/2
# frames are not deleted here either.
# del fasttext
del df_train_task2, df_dev_task2, df_test_task2
del df_train_task1, df_dev_task1, df_test_task1
del df_train_task3, df_test_task3, df_dev_task3, df_china_task3

gc.collect()

In [None]:
# Number of training examples for tasks 1 and 2 (not referenced again in the visible part of
# the notebook) and the mini-batch size handed to the batch generators below.
n_train_task1 = seq_x_train_task1.shape[0]
n_train_task2 = seq_x_train_task2.shape[0]
batch_size = 128

def generate_data(x_doc, y_doc, batch_size):
    """Yield ``(x_batch, y_batch)`` slices forever, cycling through the data in order.

    Batches are consecutive slices of at most ``batch_size`` rows. The last batch
    of a pass holds whatever rows remain (it may be shorter), after which the
    generator starts again at row 0. No shuffling is done.
    """
    total = x_doc.shape[0]
    start = 0
    while True:
        stop = min(start + batch_size, total)
        batch = (x_doc[start:stop,:], y_doc[start:stop])
        # Wrap around once the end of the data has been reached.
        start = 0 if stop == total else stop
        yield batch
        
def generate_xxy(x1_doc, x2_doc, y_doc, batch_size):
    """Yield ``([x1_batch, x2_batch], y_batch)`` slices forever, cycling in order.

    Same batching scheme as ``generate_data`` but for a model with two inputs:
    the same row range is cut from ``x1_doc``, ``x2_doc`` and ``y_doc``. The
    number of rows is taken from ``x1_doc``.
    """
    total = x1_doc.shape[0]
    start = 0
    while True:
        stop = min(start + batch_size, total)
        batch = ([x1_doc[start:stop,:], x2_doc[start:stop,:]], y_doc[start:stop])
        # Wrap around once the end of the data has been reached.
        start = 0 if stop == total else stop
        yield batch

In [None]:
# Shared input: variable-length sequences of 300-dimensional word vectors.
seq_in = Input(shape=(None,300))

# Second input for the word-level model: one integer entity id per position, embedded into
# 5 dimensions.
ent_in = Input(shape=(None,))
ent_emb = Embedding(input_dim=len(task3_ent_fwd_dict), output_dim=5)

# A single bidirectional GRU instance. It is applied to seq_in in all three output branches
# below, so the three models share this layer (and therefore its weights).
# NOTE(review): CuDNNGRU is the cuDNN-backed GRU -- presumably requires a GPU; confirm for
# the installed Keras version.
gru_layer = Bidirectional(CuDNNGRU(20, return_sequences=True))

# Document head: dropout -> shared GRU -> max over time -> dropout -> sigmoid unit.
doc_out = Dense(1,activation="sigmoid")(
    GaussianDropout(rate=0.25)(
        GlobalMaxPooling1D()(
            gru_layer(GaussianDropout(rate=0.25)(seq_in)))))

# Sentence head: same structure as the document head, but with its own Dense layer.
sent_out = Dense(1,activation="sigmoid")(
    GaussianDropout(rate=0.25)(
        GlobalMaxPooling1D()(
            gru_layer(GaussianDropout(rate=0.25)(seq_in)))))

# Word head: the raw word vectors, the shared GRU output and the entity embedding are
# concatenated per position, passed through a bidirectional GRU, tanh and dropout, then a GRU
# with one unit per label, and finally a softmax over the label axis.
word_out = Softmax(axis=2)(
    CuDNNGRU(len(task3_y_fwd_dict), return_sequences=True)(
        GaussianDropout(rate=0.25)(
    Activation("tanh")(
    Bidirectional(CuDNNGRU(len(task3_y_fwd_dict)*2, return_sequences=True))(
        GaussianDropout(rate=0.25)(
            Concatenate(axis=2)(
                [seq_in, 
                 gru_layer(GaussianDropout(rate=0.25)(seq_in)), 
                 ent_emb(ent_in)])))))))

# One optimizer instance per model, all with the same settings.
rmsprop_1 = RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)
rmsprop_2 = RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)
rmsprop_3 = RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)

# NOTE(review): Model.summary() presumably prints the table itself and returns None, in which
# case each print(...) below also emits a trailing "None" -- confirm.
doc_model = Model(inputs=seq_in,outputs=doc_out)
doc_model.compile(rmsprop_1, loss="binary_crossentropy", metrics=["accuracy"])
print(doc_model.summary())

sent_model = Model(inputs=seq_in,outputs=[sent_out])
sent_model.compile(rmsprop_2, loss="binary_crossentropy", metrics=["accuracy"])
print(sent_model.summary())

word_model = Model(inputs=[seq_in,ent_in],outputs=[word_out])
word_model.compile(rmsprop_3, loss=["categorical_crossentropy"], metrics=["categorical_accuracy"])
print(word_model.summary())

# Endless in-order batch generators, one per task.
gen_doc = generate_data(seq_x_train_task1, seq_y_train_task1, batch_size)
gen_sen = generate_data(seq_x_train_task2, seq_y_train_task2, batch_size)
gen_wor = generate_xxy(seq_x_train_task3, seq_ent_train_task3, seq_y_train_task3, batch_size)

# Per-round (loss, accuracy) history for each task, on the train and dev sets.
d_train_metrics = []
d_dev_metrics = []
s_train_metrics = []
s_dev_metrics = []
w_train_metrics = []
w_dev_metrics = []

# 100 rounds. Each round first evaluates all three models on their full train and dev sets,
# then trains for 20 steps, where one step is one batch for each of the three models in turn.
# Because evaluation comes first, entry ii reflects the weights after ii*20 steps, and the
# final 20 steps are not covered by any recorded metric.
for ii in range(100):
    (loss_d_train, acc_d_train) = doc_model.evaluate(seq_x_train_task1, seq_y_train_task1, batch_size=256, verbose=0)
    (loss_d_dev, acc_d_dev) = doc_model.evaluate([np.stack(seq_x_dev_task1)], np.array(seq_y_dev_task1), batch_size=256, verbose=0)
    (loss_s_train, acc_s_train) = sent_model.evaluate(seq_x_train_task2, [seq_y_train_task2], batch_size=256, verbose=0)
    (loss_s_dev, acc_s_dev) = sent_model.evaluate([np.stack(seq_x_dev_task2)], [np.array(seq_y_dev_task2)], batch_size=256, verbose=0)
    (loss_w_train, acc_w_train_w) = word_model.evaluate([seq_x_train_task3, seq_ent_train_task3], [np.stack(seq_y_train_task3)], batch_size=256, verbose=0)
    (loss_w_dev, acc_w_dev_w) = word_model.evaluate([np.stack(seq_x_dev_task3), seq_ent_dev_task3], [np.stack(seq_y_dev_task3)], batch_size=256, verbose=0)
    # One tab-separated progress line: [train loss, dev loss] [train acc, dev acc] per task.
    print(f"{ii:2.0f}:\t[{loss_d_train:.3f}\t{loss_d_dev:.3f}]\t[{acc_d_train:.3f}\t{acc_d_dev:.3f}]\t[{loss_s_train:.3f}\t{loss_s_dev:.3f}]\t[{acc_s_train:.3f}\t{acc_s_dev:.3f}]\t[{loss_w_train:.3f}\t{loss_w_dev:.3f}]\t[{acc_w_train_w:.3f}\t{acc_w_dev_w:.3f}]")
    
    d_train_metrics.append((loss_d_train, acc_d_train))
    d_dev_metrics.append((loss_d_dev, acc_d_dev))
    s_train_metrics.append((loss_s_train, acc_s_train))
    s_dev_metrics.append((loss_s_dev, acc_s_dev))
    w_train_metrics.append((loss_w_train, acc_w_train_w))
    w_dev_metrics.append((loss_w_dev, acc_w_dev_w))
    
    # NOTE(review): class_weight is given as the string "auto" (or a list of it) rather than a
    # {class: weight} dict; whether the installed Keras version applies any weighting for this
    # value should be confirmed -- it may be silently ignored.
    for ee in range(20):
        batch = next(gen_doc)
        doc_model.train_on_batch(batch[0],batch[1],class_weight="auto")
        batch = next(gen_sen)
        sent_model.train_on_batch(batch[0],[batch[1]],class_weight=["auto","auto"])
        batch = next(gen_wor)
        word_model.train_on_batch(batch[0],[batch[1]],class_weight=["auto"])
    
    

In [None]:
import matplotlib.pyplot as plt

pal = ["#FF0000", "#00A08A", "#F2AD00", "#F98400", "#5BBCD6"]

# Loss curves (element 0 of every recorded metric pair): one colour per task,
# solid line for train and dashed line for dev.
plt.figure(figsize=(3.5,3.5))
for _history, _colour, _line_style, _legend in (
        (d_train_metrics, pal[0], '-', "Task 1 train"),
        (d_dev_metrics, pal[0], '--', "Task 1 dev"),
        (s_train_metrics, pal[2], '-', "Task 2 train"),
        (s_dev_metrics, pal[2], '--', "Task 2 dev"),
        (w_train_metrics, pal[4], '-', "Task 3 train"),
        (w_dev_metrics, pal[4], '--', "Task 3 dev")):
    plt.plot(range(100), np.array([entry[0] for entry in _history]), c=_colour, ls=_line_style, label=_legend)
plt.xlabel('Epoch')
plt.ylabel('Log Loss')
plt.legend()
#plt.savefig("final_task_3/task3_loss.pdf", transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Accuracy curves (element 1 of every recorded metric pair): one colour per task,
# solid line for train and dashed line for dev. Relies on `plt` and `pal` from the loss-plot cell.
plt.figure(figsize=(3.5,3.5))
for _history, _colour, _line_style, _legend in (
        (d_train_metrics, pal[0], '-', "Task 1 train"),
        (d_dev_metrics, pal[0], '--', "Task 1 dev"),
        (s_train_metrics, pal[2], '-', "Task 2 train"),
        (s_dev_metrics, pal[2], '--', "Task 2 dev"),
        (w_train_metrics, pal[4], '-', "Task 3 train"),
        (w_dev_metrics, pal[4], '--', "Task 3 dev")):
    plt.plot(range(100), np.array([entry[1] for entry in _history]), c=_colour, ls=_line_style, label=_legend)
plt.xlabel('Epoch')
plt.ylabel('Classification Accuracy')
plt.legend()
#plt.savefig("final_task_3/task3_accuracy.pdf", transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Show the predicted label next to each (left-padded) word of the dev sentence at index 1.
_example_label_ids = np.argmax(word_model.predict([np.stack(seq_x_dev_task3),seq_ent_dev_task3]), axis=2)[1,:]
_example_labels = [task3_y_rev_dict[label_id] for label_id in _example_label_ids]
_example_words = [""]*(max_len_task3-len(dev_task3[1]["text"])) + dev_task3[1]["text"]
list(zip(_example_labels, _example_words))

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Task 1 (documents): Keras [loss, accuracy], then F1 on the rounded sigmoid outputs.
print(doc_model.evaluate([np.stack(seq_x_dev_task1)], np.array(seq_y_dev_task1)))
print(f1_score(np.array(seq_y_dev_task1), np.round(doc_model.predict([np.stack(seq_x_dev_task1)]))))

# Task 2 (sentences): same two numbers.
print(sent_model.evaluate([np.stack(seq_x_dev_task2)], np.array(seq_y_dev_task2)))
print(f1_score(np.array(seq_y_dev_task2), np.round(sent_model.predict(np.stack(seq_x_dev_task2)))))

# Task 3 (words): gold and predicted label ids are computed once and reused for both macro
# metrics, instead of running a full prediction pass per metric.
# The arrays are flattened over the padded sequences, so the left-padding positions (gold
# label "O") are part of these scores.
print(word_model.evaluate([np.stack(seq_x_dev_task3), seq_ent_dev_task3], np.array(seq_y_dev_task3)))
_dev_true_ids = np.argmax(np.array(seq_y_dev_task3),axis=2).flatten()
_dev_pred_ids = np.argmax(word_model.predict([np.stack(seq_x_dev_task3), seq_ent_dev_task3]), axis=2).flatten()
print(f1_score(_dev_true_ids, _dev_pred_ids, average="macro"))
print(precision_recall_fscore_support(_dev_true_ids, _dev_pred_ids, average='macro'))

In [None]:
def _predict_word_label_ids(x_seq, ent_seq):
    """Run word_model on the two inputs and return, per sample, the list of arg-max label ids."""
    probabilities = word_model.predict([np.stack(x_seq), ent_seq])
    return np.argmax(probabilities, axis=2).tolist()

# Tasks 1 and 2: sigmoid outputs rounded to 0/1.
task1_train_preds = np.round(doc_model.predict([np.stack(seq_x_train_task1)]))
task1_dev_preds = np.round(doc_model.predict([np.stack(seq_x_dev_task1)]))
task2_train_preds = np.round(sent_model.predict(np.stack(seq_x_train_task2)))
task2_dev_preds = np.round(sent_model.predict(np.stack(seq_x_dev_task2)))

# Task 3: most probable label id at every (padded) position.
task3_train_preds = _predict_word_label_ids(seq_x_train_task3, seq_ent_train_task3)
task3_dev_preds = _predict_word_label_ids(seq_x_dev_task3, seq_ent_dev_task3)
task3_preds = _predict_word_label_ids(seq_x_test_task3, seq_ent_test_task3)
china_task3_preds = _predict_word_label_ids(seq_x_china_task3, seq_ent_china_task3)

In [None]:
# Display the gold task-3 label ids of the dev set as one flat array (padding included).
np.array(seq_y_dev_task3).argmax(axis=2).flatten()

In [None]:
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    y_true, y_pred -- flat sequences of labels, passed to ``confusion_matrix``.
    classes        -- tick labels for both axes, in matrix order.
    normalize      -- when True, every row is divided by its sum, so each cell is
                      the fraction of that true class. Rows with no true samples
                      stay all-zero instead of becoming NaN.
    title          -- figure title; a default depending on ``normalize`` is used
                      when empty.
    cmap           -- matplotlib colormap for the image.

    A new figure is created for the plot. Returns the matplotlib Axes holding it.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
#     classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that only occurs in y_pred has an all-zero row. Dividing it by its sum
        # would give NaN cells and a NaN colour threshold below, so such rows are left at 0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
#     ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Cells above half the maximum get white text so they stay readable on the dark colour.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


np.set_printoptions(precision=2)

# plot_confusion_matrix opens its own figure through plt.subplots(), so calling
# plt.figure(figsize=(7,7)) beforehand only produced an empty extra figure. The figure that
# actually holds the matrix is resized instead, and tight_layout is re-run for the new size.
_cm_ax = plot_confusion_matrix(np.argmax(np.array(seq_y_dev_task3),axis=2).flatten(), [item for sublist in task3_dev_preds for item in sublist], classes=[task3_y_rev_dict[a] for a in range(len(task3_y_rev_dict))], normalize=True,
                      title='Normalized confusion matrix')
_cm_ax.figure.set_size_inches(7, 7)
_cm_ax.figure.tight_layout()
#plt.savefig("final_task_3/task3_dev_confusion.pdf",transparent=True, bbox_inches="tight")
plt.show()

In [None]:
def _pair_words_with_labels(samples, pred_rows):
    """Pair every word of every sample with its predicted label string.

    ``pred_rows`` holds one list of label ids per sample, covering the padded
    length ``max_len_task3``. The word list is left-padded with the marker
    "REMOVE" to that same length, zipped with the ids, and every position whose
    word is "REMOVE" is dropped again. Ids are translated with
    ``task3_y_rev_dict``. Returns one list of (word, label) tuples per sample.
    """
    paired = []
    for sample, label_ids in zip(samples, pred_rows):
        words = sample["text"]
        padded_words = ["REMOVE"]*(max_len_task3-len(words)) + words
        paired.append([(word, task3_y_rev_dict[label_id])
                       for word, label_id in zip(padded_words, label_ids)
                       if word!="REMOVE"])
    return paired

# These assignments replace the id lists with (word, label) lists, so this cell must run
# exactly once after the prediction cell.
task3_train_preds = _pair_words_with_labels(train_task3, task3_train_preds)
task3_dev_preds = _pair_words_with_labels(dev_task3, task3_dev_preds)

###

task3_preds = _pair_words_with_labels(test_task3, task3_preds)
china_task3_preds = _pair_words_with_labels(china_task3, china_task3_preds)

In [None]:

def _read_lines(path):
    """Return the lines of the text file at ``path`` without their line endings."""
    with open(path) as f:
        return f.read().splitlines()

def _read_first_column(path):
    """Return, for every line of ``path``, the text before the first tab."""
    return [line.split("\t")[0] for line in _read_lines(path)]

# Line-by-line contents of the original data files, used as the left column of the
# prediction files built further down.
task1_dev_ids = _read_lines("public_data/phase1/task1_dev_filled.data")
task1_train_ids = _read_lines("public_data/phase1/task1_train_filled.data")
task2_dev_ids = _read_lines("public_data/phase1/task2_dev_filled.data")
task2_train_ids = _read_lines("public_data/phase1/task2_train_filled.data")
task3_dev_ids = _read_first_column("task3public9may/dev.txt")
task3_train_ids = _read_first_column("task3public9may/train.txt")
task3_test_ids = _read_lines("task3public9may/task3_test.data")
task3_china_ids = _read_lines("task3public9may/china_test.data")

# Store a left-padded copy of every test / china document's vectors under "doc_vec",
# padded to the task's maximum length.
for _documents, _target_len in ((dict_test_task1, max_len_task1),
                                (dict_test_task2, max_len_task2),
                                (dict_china_task1, max_len_task1),
                                (dict_china_task2, max_len_task2)):
    for _record in _documents.values():
        _vectors = _record["tokenized"]
        _record["doc_vec"] = [np.zeros(300)]*(_target_len - len(_vectors)) + _vectors

# Flatten the per-sentence (word, label) predictions into one label list per split.
task3_train_pred_tokens = [label for sentence in task3_train_preds for _, label in sentence]
task3_dev_pred_tokens = [label for sentence in task3_dev_preds for _, label in sentence]
task3_pred_tokens = [label for sentence in task3_preds for _, label in sentence]
task3_china_pred_tokens = [label for sentence in china_task3_preds for _, label in sentence]

def fix_BI(sequence):
    """Normalise a sequence of predicted label strings.

    "O" and "JUNK" become "O"; "None" and "" become ""; every other label is
    passed through unchanged. Despite the name, no B-/I- prefixes are rewritten:
    that logic only exists as disabled code. Returns a new list.
    """
    def _normalise(token):
        if token == "O" or token == "JUNK":
            return "O"
        if token == "None" or token == "":
            return ""
        return token

    return [_normalise(token) for token in sequence]
     
def _tag_lines(words, pred_tokens):
    """Build "word<TAB>label" lines; an empty word yields an empty line.

    Labels are normalised with ``fix_BI`` first. ``zip`` stops at the shorter of
    the two inputs.
    """
    return [word + "\t" + label if word!='' else ''
            for word, label in zip(words, fix_BI(pred_tokens))]

def _result_lines(ids, preds):
    """Build "id<TAB>label" lines from the first column of ``preds`` cast to int."""
    labels = [int(value) for value in list(preds[:,0])]
    return [str(identifier) + "\t" + str(label) for identifier, label in zip(ids, labels)]

# These assignments replace the word lists with finished output lines, so this cell must
# run exactly once.
task3_train_ids = _tag_lines(task3_train_ids, task3_train_pred_tokens)
task3_dev_ids = _tag_lines(task3_dev_ids, task3_dev_pred_tokens)

task3_test_ids = _tag_lines(task3_test_ids, task3_pred_tokens)
task3_china_ids = _tag_lines(task3_china_ids, task3_china_pred_tokens)

task1_train_results = _result_lines(task1_train_ids, task1_train_preds)
task2_train_results = _result_lines(task2_train_ids, task2_train_preds)
task1_dev_results = _result_lines(task1_dev_ids, task1_dev_preds)
task2_dev_results = _result_lines(task2_dev_ids, task2_dev_preds)

In [None]:
# with open("final_task_3/task3_train.predict","w") as f:
#     for ll in task3_train_ids:
#         f.write("{}\n".format(ll))
        
# with open("final_task_3/task3_dev.predict","w") as f:
#     for ll in task3_dev_ids:
#         f.write("{}\n".format(ll))
        
# with open("final_task_3/task1_train.predict","w") as f:
#     for ll in task1_train_results:
#         f.write("{}\n".format(ll))
        
# with open("final_task_3/task1_dev.predict","w") as f:
#     for ll in task1_dev_results:
#         f.write("{}\n".format(ll))
        
# with open("final_task_3/task2_train.predict","w") as f:
#     for ll in task2_train_results:
#         f.write("{}\n".format(ll))
        
# with open("final_task_3/task2_dev.predict","w") as f:
#     for ll in task2_dev_results:
#         f.write("{}\n".format(ll))
        
# doc_model.save('final_task_3/model_doc.h5')  # creates the HDF5 file 'final_task_3/model_doc.h5'
# sent_model.save('final_task_3/model_sent.h5')
# word_model.save('final_task_3/model_word.h5')
        
