# Setup

After the requirements are installed, you will be prompted to restart the kernel.

## Preparing Repo

In [None]:
!pip install -r requirements.txt

Collecting absl-py==0.13.0
  Downloading absl_py-0.13.0-py3-none-any.whl (132 kB)
[?25l[K     |██▌                             | 10 kB 29.8 MB/s eta 0:00:01[K     |█████                           | 20 kB 36.7 MB/s eta 0:00:01[K     |███████▍                        | 30 kB 39.6 MB/s eta 0:00:01[K     |██████████                      | 40 kB 39.9 MB/s eta 0:00:01[K     |████████████▍                   | 51 kB 37.1 MB/s eta 0:00:01[K     |██████████████▉                 | 61 kB 21.8 MB/s eta 0:00:01[K     |█████████████████▍              | 71 kB 22.8 MB/s eta 0:00:01[K     |███████████████████▉            | 81 kB 24.2 MB/s eta 0:00:01[K     |██████████████████████▎         | 92 kB 25.8 MB/s eta 0:00:01[K     |████████████████████████▉       | 102 kB 27.2 MB/s eta 0:00:01[K     |███████████████████████████▎    | 112 kB 27.2 MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122 kB 27.2 MB/s eta 0:00:01[K     |████████████████████████████████| 132 kB 27.2 M

## Patching Anago

In [None]:
# Switch the kernel's working directory to the repository folder; the patch
# script below is invoked relative to it.
%cd amr_parser
# Run the repository's patch script against the installed anago package. Per the
# recorded output it replaces callbacks.py, layers.py and models.py with the
# versions from a pinned commit of a GitHub fork.
# NOTE(review): the -d path hardcodes the Python 3.7 dist-packages location —
# confirm it matches the interpreter of the runtime this notebook is run on.
!./update-anago.sh -d /usr/local/lib/python3.7/dist-packages/anago

/content/amr_parser
commit_hash is unset, using default hash directory
anago directory: '/usr/local/lib/python3.7/dist-packages/anago'
commit hash: '9afccaa5bcc232676f9c2b59faa4c9531fb25190'
check https://raw.githubusercontent.com/banditelol/anago/9afccaa5bcc232676f9c2b59faa4c9531fb25190 for included files
updating callbacks.py
2021-09-10 16:33:50 URL:https://raw.githubusercontent.com/banditelol/anago/9afccaa5bcc232676f9c2b59faa4c9531fb25190/anago/callbacks.py [1257/1257] -> "/usr/local/lib/python3.7/dist-packages/anago/callbacks.py" [1]
updating layers.py
2021-09-10 16:33:50 URL:https://raw.githubusercontent.com/banditelol/anago/9afccaa5bcc232676f9c2b59faa4c9531fb25190/anago/layers.py [25660/25660] -> "/usr/local/lib/python3.7/dist-packages/anago/layers.py" [1]
updating models.py
2021-09-10 16:33:50 URL:https://raw.githubusercontent.com/banditelol/anago/9afccaa5bcc232676f9c2b59faa4c9531fb25190/anago/models.py [4931/4931] -> "/usr/local/lib/python3.7/dist-packages/anago/models.py" [1]


## Prepare Language

In [None]:
import stanfordnlp
import stanza
import nltk

# Download the language resources: the 'id' models for both stanfordnlp and
# stanza ('id' is presumably the Indonesian language code — confirm), and the
# NLTK 'punkt' resource.
stanfordnlp.download('id')
stanza.download('id')
nltk.download('punkt')

## Download Pretrained Models

- NER
- POS
- Word2Vec


In [None]:
# Fetch the pretrained feature-model archive, unpack it into ./pretrained/
# (the recorded listing shows ner_tagger, pos_tagger and word2vec folders),
# then remove the archive once it has been extracted.
!wget https://storage.googleapis.com/riset_amr/adylan/pretrained_feature_models.zip -O pretrained.zip
!unzip pretrained.zip
!rm pretrained.zip

--2021-09-10 16:37:29--  https://storage.googleapis.com/riset_amr/adylan/pretrained_feature_models.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.1.208, 142.251.45.16, 172.217.164.176, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.1.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180433604 (172M) [application/zip]
Saving to: ‘pretrained.zip’


2021-09-10 16:37:33 (52.2 MB/s) - ‘pretrained.zip’ saved [180433604/180433604]

Archive:  pretrained.zip
   creating: pretrained/
   creating: pretrained/ner_tagger/
  inflating: pretrained/ner_tagger/nerparams.json  
  inflating: pretrained/ner_tagger/nerprepro.pkl  
  inflating: pretrained/ner_tagger/nerweight.h5  
   creating: pretrained/pos_tagger/
  inflating: pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger  
   creating: pretrained/word2vec/
   creating: pretrained/word2vec/id/
  inflating: pretrained/word2vec/id/id.tsv  
  inflating: pretrain

# Evaluating NER

## Import Packages

In [None]:
import os
import sys
import warnings

# Set TensorFlow's native log-level variable up front, before any later cell
# gets a chance to import TensorFlow.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Make the current working directory importable so that project-local packages
# can be imported by the cells below.
module_path = os.path.abspath('.')
if module_path not in sys.path:
    sys.path.append(module_path)

# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
from IPython.display import display, HTML

In [None]:
import penman
from importlib import reload
# Re-execute the penman module inside the running kernel. Being the last
# expression of the cell, the reloaded module object is echoed as the output.
# NOTE(review): presumably this picks up the penman version installed by the
# requirements step without restarting the kernel — confirm it is still needed.
reload(penman)

<module 'penman' from '/usr/local/lib/python3.7/dist-packages/penman.py'>

In [None]:
import pandas as pd
from tqdm import tqdm
from utils.ner.entity_recognizer import get_entities

# Hook tqdm into pandas so that `progress_apply` (used by the cells below)
# is available on Series/DataFrame objects.
tqdm.pandas()

## Defined Functions

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report

def preprocess_data(data):
    """
    Group token-level rows into one row per sentence.

    Missing values are forward-filled first, so a sentence label that is only
    present on the first row of a sentence is propagated to the rows beneath it.
    The rows are then grouped by 'Sentence #' and the 'Word', 'POS' and 'Tag'
    values of each group are collected into lists.

    Parameters
    ----------
    data : pandas.DataFrame
        Token-level table containing the columns 'Sentence #', 'Word', 'POS'
        and 'Tag'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'Sentence #' value (in the sorted order produced
        by ``groupby``), with the columns 'Sentence #', 'Word', 'POS' and
        'Tag'; the last three each hold a Python list per row.
    """
    # Forward-fill every column, exactly as the previous
    # `fillna(method='ffill')` call did; `.ffill()` is the non-deprecated
    # spelling of the same operation.
    # NOTE(review): this also fills gaps in 'Word'/'POS'/'Tag' with the value
    # from the row above — confirm that is intended for tokens parsed as NaN.
    data_filled = data.ffill(axis=0)

    # The columns must be selected with a list: subscripting a groupby with a
    # bare tuple of names ('Word', 'POS', 'Tag') is rejected by pandas >= 2.0.
    grouped = data_filled.groupby(['Sentence #'], as_index=False)
    return grouped[['Word', 'POS', 'Tag']].agg(list)


def evaluation_report(flat_gold, flat_pred):
    """
    Print token-level evaluation metrics for a flat list of predictions.

    Prints, in order: accuracy, macro-averaged precision, macro-averaged
    recall, macro-averaged F1, and the per-label classification report.
    Labels without any predicted/true samples score 0 instead of raising a
    warning (``zero_division=0``). Nothing is returned.

    Parameters
    ----------
    flat_gold : sequence of labels
        The reference (gold) label of every token.
    flat_pred : sequence of labels
        The predicted label of every token, aligned with ``flat_gold``.
    """
    # Keyword arguments shared by the three macro-averaged scores.
    macro = {'average': 'macro', 'zero_division': 0}

    # accuracy: (tp + tn) / (p + n)
    print('Accuracy: %f' % accuracy_score(flat_gold, flat_pred))
    # precision: tp / (tp + fp)
    print('Precision: %f' % precision_score(flat_gold, flat_pred, **macro))
    # recall: tp / (tp + fn)
    print('Recall: %f' % recall_score(flat_gold, flat_pred, **macro))
    # f1: 2 tp / (2 tp + fp + fn)
    print('F1 score: %f' % f1_score(flat_gold, flat_pred, **macro))

    # Per-label breakdown of precision / recall / F1 / support.
    print(classification_report(flat_gold, flat_pred, zero_division=0))

## Load Data and create Sentence

In [None]:
TEST_SET_URL = 'https://storage.googleapis.com/riset_amr/dataset/NER_pretraining/ner_dataset_indo_test_v2.csv'


def _join_words(words):
    """Rebuild a sentence string by joining its words with single spaces."""
    return " ".join(words)


def _strip_tag_prefix(tags):
    """Keep only the text after the last dash of each tag.

    A tag that contains no dash is returned unchanged.
    """
    # NOTE(review): the recorded report contains labels such as '0', '8', '9'
    # and 'I', which suggests some raw tags are malformed — verify the dataset.
    return [tag.rsplit('-', 1)[-1] for tag in tags]


# Read the token-level test set and collapse it to one row per sentence.
data_test = pd.read_csv(TEST_SET_URL, encoding='unicode_escape')
data_group = preprocess_data(data_test)

# Derive the raw sentence text and the prefix-free gold labels.
data_group["Sentence"] = data_group["Word"].progress_apply(_join_words)
data_group["Gold_Tag"] = data_group["Tag"].progress_apply(_strip_tag_prefix)

  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 2396/2396 [00:00<00:00, 328427.48it/s]
100%|██████████| 2396/2396 [00:00<00:00, 102562.15it/s]


Predict the tags for every sentence, then flatten the predicted and gold tags into single lists for evaluation.

In [None]:
# Run the entity recognizer on every sentence string and store its result per
# row (the recorded run took a little over two minutes for 2396 sentences).
# NOTE(review): the evaluation below assumes get_entities returns one tag per
# token, aligned with "Gold_Tag" — confirm against its implementation.
data_group["Pred_Tag"] = data_group["Sentence"].progress_apply(get_entities)

  0%|          | 0/2396 [00:00<?, ?it/s]2021-09-10 16:44:09.164177: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-09-10 16:44:09.165804: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2200150000 Hz
100%|██████████| 2396/2396 [02:10<00:00, 18.33it/s]


In [None]:
# Concatenate the per-sentence tag lists into one flat list each, keeping the
# sentence order, so the two lists can be compared token by token.
flat_yhat = [
    tag
    for sentence_tags in data_group["Pred_Tag"].to_list()
    for tag in sentence_tags
]
flat_true_tags = [
    tag
    for sentence_tags in data_group["Gold_Tag"].to_list()
    for tag in sentence_tags
]

In [None]:
# Gold labels go first and predictions second, matching the
# evaluation_report(flat_gold, flat_pred) signature.
evaluation_report(flat_true_tags, flat_yhat)

Accuracy: 0.792812
Precision: 0.320171
Recall: 0.176501
F1 score: 0.202367
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         2
         CRD       0.61      0.70      0.65      1360
         DAT       0.80      0.26      0.40      2675
          ER       0.00      0.00      0.00         1
         EVT       0.64      0.10      0.17       440
         FAC       0.00      0.00      0.00       239
         GPE       0.70      0.49      0.58      1848
           I       0.00      0.00      0.00         1
         LAW       0.00      0.00      0.00       142
         LOC       0.19      0.17      0.18       723
         MON       0.43      0.30      0.35       780
         NOR       0.56      0.18      0.27      2160
           O       0.86      0.99      0.92     46312
          OC       0.00      0.00      0.00         1
      