In [None]:
import os
import sys
import json
import pickle
import joblib

import numpy as np
from tqdm import tqdm

import sklearn_crfsuite
from itertools import accumulate
from sklearn.decomposition import PCA
from transformers import AutoModel, AutoTokenizer, AutoConfig
from seqeval.metrics import classification_report, f1_score

sys.path.append('../../wrappers/')

import wrapper_CRF as crf_

In [None]:
# --- Experiment configuration -------------------------------------------
# Corpus and embedding identifiers; both are embedded in the paths below.
CORPUS = "CONLL2003"
EMBEDDING = "bert-base-cased_cl4l_pca256"

# Hugging Face model name used to rebuild the tokenizer, and the number of
# PCA components the embeddings are reduced to.
pretrained_model = "bert-base-cased"
NCOMP = 256

# Results directory of the passive run and the serialized CRF model inside it.
DIR = "../../expt_results/results_passive/passive_" + "_".join((CORPUS, EMBEDDING))
crfModelPath = DIR + "/passive_model/passive"

In [None]:
# Load the persisted CRF model from crfModelPath.
# NOTE: joblib files are pickle-based, so only load model files you trust.
with open(crfModelPath, "rb") as model_file:
    model = joblib.load(filename=model_file)

In [None]:
# The cached embedding file name uses the first two "_"-separated fields of
# EMBEDDING in swapped order (second field first).
tempSplit = EMBEDDING.split("_")
tembName = "_".join([tempSplit[1], tempSplit[0]])

# NOTE: pickle can execute arbitrary code while loading; only point this at
# embedding caches you produced yourself.
with open("../../saved_embeddings/" + CORPUS + "_" + tembName + ".test", "rb") as emb_file:
    embeddings = pickle.load(emb_file)

# Gold tags, POS tags and tokenized sentences of the test split (JSON files).
with open("../../datasets/tokenized/" + CORPUS + "_test.tags", "rb") as tags_file:
    tags = json.load(tags_file)
with open("../../datasets/tokenized/" + CORPUS + "_test.pos", "rb") as pos_file:
    pos_tags = json.load(pos_file)
with open("../../datasets/tokenized/" + CORPUS + "_test.tokenized", "rb") as tok_file:
    tknzd_sent = json.load(tok_file)

tokenizer_ = AutoTokenizer.from_pretrained(pretrained_model, do_basic_tokenize=False)

# Re-tokenize every sentence so sub-token boundaries can be recovered later.
# NOTE(review): newer transformers releases renamed `is_pretokenized` to
# `is_split_into_words`; confirm which spelling the installed version expects.
encode_options = dict(
    return_tensors="pt",
    is_pretokenized=True,
    max_length=512,
    truncation=True,
)
pretrained_tknzd = [tokenizer_(sent, **encode_options) for sent in tqdm(tknzd_sent)]

In [None]:
# Collapse sub-token embeddings into one vector per word: a sub-token whose
# decoded text starts with "#" is added (summed, not averaged) onto the vector
# of the word it continues; any other sub-token starts a new word.
#
# NOTE: this cell rebinds `embeddings` and truncates `tknzd_sent`, `tags` and
# `pos_tags` in place, so it must be run exactly once after the loading cell.
merged_embeddings = []
for i in tqdm(range(len(pretrained_tknzd))):
    input_ids = pretrained_tknzd[i]["input_ids"][0]
    word_vectors = [embeddings[i][0]]
    for j in range(1, len(embeddings[i])):
        # input_ids is read at j + 1, i.e. shifted by one relative to
        # embeddings[i] — presumably to skip a leading special token; confirm.
        piece = tokenizer_.decode([input_ids[j + 1]])
        # NOTE(review): a stand-alone token that is literally "#" would also
        # match here and be merged into the previous word — confirm acceptable.
        if piece.startswith("#"):
            word_vectors[-1] = word_vectors[-1] + embeddings[i][j]
        else:
            word_vectors.append(embeddings[i][j])
    merged_embeddings.append(word_vectors)

    # Keep tokens, NER tags and POS tags aligned with the (possibly truncated)
    # number of word vectors. Each list is sliced from ITSELF: the previous
    # code assigned a slice of `tags` to `pos_tags`, replacing the POS tags
    # with the gold NER labels.
    n_words = len(word_vectors)
    tknzd_sent[i] = tknzd_sent[i][:n_words]
    tags[i] = tags[i][:n_words]
    pos_tags[i] = pos_tags[i][:n_words]

embeddings = merged_embeddings

In [None]:
def pca_r_embeddings(embedding, n_comp=200, seed=29):
    """Reduce per-word embeddings with PCA and regroup them by sentence.

    Args:
        embedding: sequence of sentences, each a sequence of word vectors
            exposing ``.numpy()``.
        n_comp: number of principal components to keep.
        seed: ``random_state`` passed to ``PCA``.

    Returns:
        A list with one entry per sentence, each a slice of the
        PCA-transformed array holding that sentence's word vectors in order.

    Any ``inf`` or ``nan`` entries found in the stacked input are printed
    before the PCA is fit. The PCA is fit on the data passed in.
    """
    stacked = np.array([word.numpy() for sent in embedding for word in sent])

    inf_mask = np.isinf(stacked)
    if inf_mask.any():
        print("inf: ", stacked[inf_mask])

    nan_mask = np.isnan(stacked)
    if nan_mask.any():
        print("nan: ", stacked[nan_mask])

    # Cumulative word counts mark where each sentence starts and ends in the
    # flattened array.
    boundaries = [0] + list(accumulate(len(sent) for sent in embedding))

    reduced = PCA(n_components=n_comp, random_state=seed).fit_transform(stacked)

    return [
        reduced[start:stop]
        for start, stop in zip(boundaries, boundaries[1:])
    ]
# Reduce the test embeddings to NCOMP dimensions.
# NOTE(review): the PCA is fit on the embeddings passed here (the test split);
# confirm this projection is comparable to the one the CRF was trained on.
embeddings_r = pca_r_embeddings(
    embeddings,
    n_comp=NCOMP,
    seed=29,
)

In [None]:
import yaml  # NOTE: consider moving this to the import cell at the top.

# Feature configuration stored next to the experiment results.
# NOTE: yaml.load with FullLoader is kept as-is; prefer yaml.safe_load if the
# config file could ever come from an untrusted source.
feature_cfg_path = os.path.join(DIR, "features_config.yaml")
with open(feature_cfg_path, "r") as cfg_file:
    feature_cfg = yaml.load(cfg_file, Loader=yaml.FullLoader)

In [None]:
# Build the CRF feature input for the test sentences from the feature
# configuration, the tokenized sentences, the reduced embeddings and the POS tags.
# NOTE(review): generator=True presumably makes sent2features produce features
# lazily, in which case X_test can be consumed only once — confirm in wrapper_CRF.
X_test = crf_.sent2features(
    feature_cfg,
    tknzd_sent,
    generator=True,
    embeddings=embeddings_r,
    pos=pos_tags,
)

In [None]:
# Run the model loaded from crfModelPath on the test features.
y_pred = model.predict(X_test)

In [None]:
# Compute the seqeval classification report once and reuse it for display,
# instead of evaluating the same (tags, y_pred) pair twice.
report = classification_report(tags, y_pred)
print(report)