## Import Library

In [1]:
!git clone https://github.com/indobenchmark/indonlu.git
!pip install transformers
!pip install -U 'scikit-learn<0.24'
!pip install sklearn-crfsuite

fatal: destination path 'indonlu' already exists and is not an empty directory.


In [2]:
import os, sys
sys.path.append('../')
os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
from tqdm import tqdm

from transformers import BertConfig, BertTokenizer
from nltk.tokenize import word_tokenize


In [3]:
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss

from transformers import BertPreTrainedModel, BertModel, BertConfig
from torch.utils.data import Dataset, DataLoader

import re
from collections import defaultdict, namedtuple

from indonlu.utils.conlleval import conll_evaluation
from gensim.models import Word2Vec

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Embedding,Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow.keras.utils import to_categorical

from itertools import chain

import nltk
import sklearn
import scipy.stats

import sklearn_crfsuite
from sklearn_crfsuite import scorers,CRF
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics

In [4]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [5]:
downloaded = drive.CreateFile({'id':"12mW6Xh3Qc-5YrzGb-rEEGuiDbsAY57xN"})   # replace the id with id of file you want to access
downloaded.GetContentFile('glove_50dim_wiki.id.case.text.txt')          
downloaded = drive.CreateFile({'id':"1fzjyyn0-OvGcegrnmRW_Ab6vf62HP2HT"})   # replace the id with id of file you want to access
downloaded.GetContentFile('idwiki_word2vec_200_new.tar.gz')

In [6]:
!tar -xf "/idwiki_word2vec_200_new.tar.gz"

In [7]:
# Static constant variable
LABEL2INDEX = {'I-PPL': 0, 'B-EVT': 1, 'B-PLC': 2, 'I-IND': 3, 'B-IND': 4, 'B-FNB': 5, 'I-EVT': 6, 'B-PPL': 7, 'I-PLC': 8, 'O': 9, 'I-FNB': 10}
INDEX2LABEL = {0: 'I-PPL', 1: 'B-EVT', 2: 'B-PLC', 3: 'I-IND', 4: 'B-IND', 5: 'B-FNB', 6: 'I-EVT', 7: 'B-PPL', 8: 'I-PLC', 9: 'O', 10: 'I-FNB'}
NUM_LABELS = 11
MAX_LEN = 128

## Dataset Path

In [8]:
train_dataset_path = '/content/indonlu/dataset/nerp_ner-prosa/train_preprocess.txt'
valid_dataset_path = '/content/indonlu/dataset/nerp_ner-prosa/valid_preprocess.txt'
test_dataset_path = '/content/indonlu/dataset/nerp_ner-prosa/test_preprocess_masked_label.txt'

## 1. Contextual Embeddings (BERT)

### 1.1. Utils

In [9]:
class BertForWordClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        subword_to_word_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
            Classification loss.
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
            Classification scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        """

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]

        # average the token-level outputs to compute word-level representations
        max_seq_len = subword_to_word_ids.max() + 1
        word_latents = []
        for i in range(max_seq_len):
            mask = (subword_to_word_ids == i).unsqueeze(dim=-1)
            word_latents.append((sequence_output * mask).sum(dim=1) / mask.sum())
        word_batch = torch.stack(word_latents, dim=1)

        sequence_output = self.dropout(word_batch)
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), scores, (hidden_states), (attentions)

In [10]:
# Forward function for word classification
def forward_word_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    # Unpack batch data
    if len(batch_data) == 4:
        (subword_batch, mask_batch, subword_to_word_indices_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 5:
        (subword_batch, mask_batch, token_type_batch, subword_to_word_indices_batch, label_batch) = batch_data
    
    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    subword_to_word_indices_batch = torch.LongTensor(subword_to_word_indices_batch)
    label_batch = torch.LongTensor(label_batch)

    if device == "cuda":
        subword_batch = subword_batch.cuda()
        mask_batch = mask_batch.cuda()
        token_type_batch = token_type_batch.cuda() if token_type_batch is not None else None
        subword_to_word_indices_batch = subword_to_word_indices_batch.cuda()
        label_batch = label_batch.cuda()

    # Forward model
    outputs = model(subword_batch, subword_to_word_indices_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]
    
    # generate prediction & label list
    list_hyps = []
    list_labels = []
    hyps_list = torch.topk(logits, k=1, dim=-1)[1].squeeze(dim=-1)
    for i in range(len(hyps_list)):
        hyps, labels = hyps_list[i].tolist(), label_batch[i].tolist()        
        list_hyp, list_label = [], []
        for j in range(len(hyps)):
            if labels[j] == -100:
                break
            else:
                list_hyp.append(i2w[hyps[j]])
                list_label.append(i2w[labels[j]])
        list_hyps.append(list_hyp)
        list_labels.append(list_label)
        
    return loss, list_hyps, list_labels

In [11]:
def ner_metrics_fn(list_hyp, list_label):
    metrics = {}
    acc, pre, rec, f1, tm_pre, tm_rec, tm_f1 = conll_evaluation(list_hyp, list_label)
    metrics["ACC"] = acc
    metrics["F1"] = tm_f1
    metrics["REC"] = tm_rec
    metrics["PRE"] = tm_pre
    return metrics

In [12]:
class NerProsaDataset(Dataset):
    LABEL2INDEX = {'I-PPL': 0, 'B-EVT': 1, 'B-PLC': 2, 'I-IND': 3, 'B-IND': 4, 'B-FNB': 5, 'I-EVT': 6, 'B-PPL': 7, 'I-PLC': 8, 'O': 9, 'I-FNB': 10}
    INDEX2LABEL = {0: 'I-PPL', 1: 'B-EVT', 2: 'B-PLC', 3: 'I-IND', 4: 'B-IND', 5: 'B-FNB', 6: 'I-EVT', 7: 'B-PPL', 8: 'I-PLC', 9: 'O', 10: 'I-FNB'}
    NUM_LABELS = 11
    
    def load_dataset(self, path):
        # Read file
        data = open(path,'r').readlines()

        # Prepare buffer
        dataset = []
        sentence = []
        seq_label = []
        for line in data:
            if len(line.strip()) > 0:
                token, label = line[:-1].split('\t')
                sentence.append(token)
                seq_label.append(self.LABEL2INDEX[label])
            else:
                dataset.append({
                    'sentence': sentence,
                    'seq_label': seq_label
                })
                sentence = []
                seq_label = []
        return dataset
    
    def __init__(self, dataset_path, tokenizer, *args, **kwargs):
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):
        data = self.data[index]
        sentence, seq_label = data['sentence'], data['seq_label']
        
        # Add CLS token
        subwords = [self.tokenizer.cls_token_id]
        subword_to_word_indices = [-1] # For CLS
        
        # Add subwords
        for word_idx, word in enumerate(sentence):
            subword_list = self.tokenizer.encode(word, add_special_tokens=False)
            subword_to_word_indices += [word_idx for i in range(len(subword_list))]
            subwords += subword_list
            
        # Add last SEP token
        subwords += [self.tokenizer.sep_token_id]
        subword_to_word_indices += [-1]
        
        return np.array(subwords), np.array(subword_to_word_indices), np.array(seq_label), data['sentence']
    
    def __len__(self):
        return len(self.data)

class NerDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(NerDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len
        
    def _collate_fn(self, batch):
        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)
        max_tgt_len = max(map(lambda x: len(x[2]), batch))
        
        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        subword_to_word_indices_batch = np.full((batch_size, max_seq_len), -1, dtype=np.int64)
        seq_label_batch = np.full((batch_size, max_tgt_len), -100, dtype=np.int64)
        
        seq_list = []
        for i, (subwords, subword_to_word_indices, seq_label, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_to_word_indices = subword_to_word_indices[:max_seq_len]

            subword_batch[i,:len(subwords)] = subwords
            mask_batch[i,:len(subwords)] = 1
            subword_to_word_indices_batch[i,:len(subwords)] = subword_to_word_indices
            seq_label_batch[i,:len(seq_label)] = seq_label

            seq_list.append(raw_seq)
            
        return subword_batch, mask_batch, subword_to_word_indices_batch, seq_label_batch, seq_list

In [13]:
###
# common functions
###
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
def count_param(module, trainable=False):
    if trainable:
        return sum(p.numel() for p in module.parameters() if p.requires_grad)
    else:
        return sum(p.numel() for p in module.parameters())
    
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

In [14]:
# Set random seed
set_seed(53012532)

### 1.2. Load BERT Model

In [15]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = NerProsaDataset.NUM_LABELS

# Instantiate model
model = BertForWordClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 1.3. Load Dataset

In [16]:
train_dataset = NerProsaDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = NerProsaDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = NerProsaDataset(test_dataset_path, tokenizer, lowercase=True)

train_loader = NerDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=True)  
valid_loader = NerDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=False)  
test_loader = NerDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=16, num_workers=16, shuffle=False)

  cpuset_checked))


In [17]:
w2i, i2w = NerProsaDataset.LABEL2INDEX, NerProsaDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'I-PPL': 0, 'B-EVT': 1, 'B-PLC': 2, 'I-IND': 3, 'B-IND': 4, 'B-FNB': 5, 'I-EVT': 6, 'B-PPL': 7, 'I-PLC': 8, 'O': 9, 'I-FNB': 10}
{0: 'I-PPL', 1: 'B-EVT', 2: 'B-PLC', 3: 'I-IND', 4: 'B-IND', 5: 'B-FNB', 6: 'I-EVT', 7: 'B-PPL', 8: 'I-PLC', 9: 'O', 10: 'I-FNB'}


### 1.4. Fine Tuning

In [18]:
optimizer = optim.Adam(model.parameters(), lr=2e-5)
model = model.cuda()

In [19]:
# Train
n_epochs = 8
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = ner_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

  cpuset_checked))
(Epoch 1) TRAIN LOSS:1.2351 LR:0.00002000: 100%|██████████| 420/420 [03:00<00:00,  2.33it/s]


(Epoch 1) TRAIN LOSS:1.2351 ACC:0.91 F1:0.31 REC:0.29 PRE:0.35 LR:0.00002000


VALID LOSS:0.9600 ACC:0.94 F1:0.45 REC:0.45 PRE:0.55: 100%|██████████| 53/53 [00:13<00:00,  4.03it/s]


(Epoch 1) VALID LOSS:0.9600 ACC:0.94 F1:0.45 REC:0.45 PRE:0.55


(Epoch 2) TRAIN LOSS:0.8249 LR:0.00002000: 100%|██████████| 420/420 [03:00<00:00,  2.33it/s]


(Epoch 2) TRAIN LOSS:0.8249 ACC:0.95 F1:0.58 REC:0.57 PRE:0.61 LR:0.00002000


VALID LOSS:0.6983 ACC:0.95 F1:0.58 REC:0.59 PRE:0.59: 100%|██████████| 53/53 [00:13<00:00,  4.05it/s]


(Epoch 2) VALID LOSS:0.6983 ACC:0.95 F1:0.58 REC:0.59 PRE:0.59


(Epoch 3) TRAIN LOSS:0.5882 LR:0.00002000: 100%|██████████| 420/420 [03:01<00:00,  2.32it/s]


(Epoch 3) TRAIN LOSS:0.5882 ACC:0.96 F1:0.70 REC:0.69 PRE:0.71 LR:0.00002000


VALID LOSS:0.5335 ACC:0.95 F1:0.67 REC:0.66 PRE:0.68: 100%|██████████| 53/53 [00:13<00:00,  4.01it/s]


(Epoch 3) VALID LOSS:0.5335 ACC:0.95 F1:0.67 REC:0.66 PRE:0.68


(Epoch 4) TRAIN LOSS:0.4399 LR:0.00002000: 100%|██████████| 420/420 [03:00<00:00,  2.33it/s]


(Epoch 4) TRAIN LOSS:0.4399 ACC:0.97 F1:0.77 REC:0.76 PRE:0.78 LR:0.00002000


VALID LOSS:0.4312 ACC:0.96 F1:0.71 REC:0.73 PRE:0.69: 100%|██████████| 53/53 [00:13<00:00,  4.02it/s]


(Epoch 4) VALID LOSS:0.4312 ACC:0.96 F1:0.71 REC:0.73 PRE:0.69


(Epoch 5) TRAIN LOSS:0.3447 LR:0.00002000: 100%|██████████| 420/420 [03:00<00:00,  2.33it/s]


(Epoch 5) TRAIN LOSS:0.3447 ACC:0.98 F1:0.80 REC:0.80 PRE:0.81 LR:0.00002000


VALID LOSS:0.3739 ACC:0.96 F1:0.75 REC:0.78 PRE:0.72: 100%|██████████| 53/53 [00:13<00:00,  4.01it/s]


(Epoch 5) VALID LOSS:0.3739 ACC:0.96 F1:0.75 REC:0.78 PRE:0.72


(Epoch 6) TRAIN LOSS:0.2751 LR:0.00002000: 100%|██████████| 420/420 [03:00<00:00,  2.32it/s]


(Epoch 6) TRAIN LOSS:0.2751 ACC:0.98 F1:0.83 REC:0.83 PRE:0.83 LR:0.00002000


VALID LOSS:0.3351 ACC:0.96 F1:0.70 REC:0.69 PRE:0.71: 100%|██████████| 53/53 [00:13<00:00,  4.01it/s]


(Epoch 6) VALID LOSS:0.3351 ACC:0.96 F1:0.70 REC:0.69 PRE:0.71


(Epoch 7) TRAIN LOSS:0.2295 LR:0.00002000: 100%|██████████| 420/420 [03:01<00:00,  2.31it/s]


(Epoch 7) TRAIN LOSS:0.2295 ACC:0.98 F1:0.85 REC:0.85 PRE:0.84 LR:0.00002000


VALID LOSS:0.3081 ACC:0.96 F1:0.73 REC:0.75 PRE:0.72: 100%|██████████| 53/53 [00:13<00:00,  3.97it/s]


(Epoch 7) VALID LOSS:0.3081 ACC:0.96 F1:0.73 REC:0.75 PRE:0.72


(Epoch 8) TRAIN LOSS:0.1939 LR:0.00002000: 100%|██████████| 420/420 [03:04<00:00,  2.28it/s]


(Epoch 8) TRAIN LOSS:0.1939 ACC:0.98 F1:0.87 REC:0.87 PRE:0.86 LR:0.00002000


VALID LOSS:0.2908 ACC:0.96 F1:0.67 REC:0.70 PRE:0.65: 100%|██████████| 53/53 [00:14<00:00,  3.56it/s]

(Epoch 8) VALID LOSS:0.2908 ACC:0.96 F1:0.67 REC:0.70 PRE:0.65





### 1.4. Testing

In [20]:
# Evaluate on test
model.eval()
torch.set_grad_enabled(False)

total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('bert_pred.txt', index=False)

print(df)

  cpuset_checked))
100%|██████████| 53/53 [00:10<00:00,  4.96it/s]

     index                                              label
0        0  [O, O, B-IND, O, O, B-PLC, O, O, O, O, O, B-PL...
1        1  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
2        2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
3        3  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
4        4  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
..     ...                                                ...
835    835  [B-PPL, O, O, O, B-PLC, O, O, O, O, O, O, O, O...
836    836  [O, O, O, O, O, O, O, O, O, O, O, O, O, B-PLC,...
837    837  [O, I-IND, O, O, O, O, O, O, O, O, O, B-IND, I...
838    838  [B-PLC, O, B-IND, O, O, O, B-PLC, I-PLC, O, B-...
839    839  [B-EVT, O, O, O, B-EVT, O, O, O, O, O, O, O, O...

[840 rows x 2 columns]





## 2. Non-Contextual Embeddings (GLOVE + Bi-LSTM)

### 2.1. Load W2Vec Vocab

In [21]:
word2vec = Word2Vec.load("/idwiki_word2vec_200_new_lower.model")
all_vocab = word2vec.wv.vocab.keys()

### 2.2. Load Dataset

In [22]:
def load_dataset(path):
        # Read file
        data = open(path,'r').readlines()

        # Prepare buffer
        dataset = []
        sentence = []
        seq_label = []
        for line in data:
            if len(line.strip()) > 0:
                token, label = line[:-1].split('\t')
                sentence.append(token)
                seq_label.append(LABEL2INDEX[label])
            else:
                dataset.append({
                    'sentence_list': sentence,
                    'seq_label': seq_label,
                    # tambahan
                    'sentence': " ".join(sentence)
                })
                sentence = []
                seq_label = []
        return pd.DataFrame(dataset)

In [23]:
train_data = load_dataset(train_dataset_path)
valid_data = load_dataset(valid_dataset_path)
test_data = load_dataset(test_dataset_path)

### 1.3. Preprocess

In [24]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate([train_data['sentence'],valid_data['sentence'],test_data['sentence']]))

In [25]:
seq_train_data = tokenizer.texts_to_sequences(train_data['sentence'])
seq_val_data = tokenizer.texts_to_sequences(valid_data['sentence'])
seq_test_data = tokenizer.texts_to_sequences(test_data['sentence'])

In [26]:
seq_train_label = pad_sequences(maxlen=MAX_LEN,sequences=train_data['seq_label'],padding="post",value=NUM_LABELS)
seq_val_label = pad_sequences(maxlen=MAX_LEN,sequences=valid_data['seq_label'],padding="post",value=NUM_LABELS)
seq_test_label = pad_sequences(maxlen=MAX_LEN,sequences=test_data['seq_label'],padding="post",value=NUM_LABELS)

In [27]:
X_train = pad_sequences(seq_train_data,maxlen=MAX_LEN, padding='post')
y_train = np.array([to_categorical(i,num_classes = NUM_LABELS+1) for i in  seq_train_label])

In [28]:
X_val = pad_sequences(seq_val_data,maxlen=MAX_LEN, padding='post')
y_val = np.array([to_categorical(i,num_classes = NUM_LABELS+1) for i in  seq_val_label])

In [29]:
X_test = pad_sequences(seq_test_data,maxlen=MAX_LEN, padding='post')
y_test = np.array([to_categorical(i,num_classes = NUM_LABELS+1) for i in  seq_test_label])

In [30]:
EMBEDDING_DIM=200
vocabulary_size=len(tokenizer.word_index)+1
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM+2))

In [31]:
for word, i in tokenizer.word_index.items():
  # embedding_vector = glove.get(word,np.random.uniform(low=-1, high=1, size=(EMBEDDING_DIM,)))
  if word in all_vocab:
    embedding_vector = word2vec.wv[word]
  else:
    embedding_vector = np.random.uniform(low=-1, high=1, size=(EMBEDDING_DIM,))
  embedding_matrix[i] = np.array(list(embedding_vector) + [word.isalnum(), word.isdigit()])
embedding_vector = np.random.uniform(low=-1, high=1, size=(EMBEDDING_DIM,))
embedding_matrix[0] = np.array(list(embedding_vector) + [0, 0])

### 2.4. Training

In [32]:
LSTM_UNITS=128

input = Input(shape = (MAX_LEN,))
model = Embedding(input_dim = vocabulary_size ,output_dim = EMBEDDING_DIM+2,input_length = MAX_LEN,weights=[embedding_matrix])(input)

model = Bidirectional(LSTM(units=LSTM_UNITS,return_sequences=True))(model)
out = TimeDistributed(Dense(NUM_LABELS+1,activation = 'softmax'))(model)
model = Model(input,out)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding (Embedding)       (None, 128, 202)          3978188   
                                                                 
 bidirectional (Bidirectiona  (None, 128, 256)         338944    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 128, 12)          3084      
 ibuted)                                                         
                                                                 
Total params: 4,320,216
Trainable params: 4,320,216
Non-trainable params: 0
_________________________________________________________________


In [33]:
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [34]:
history = model.fit(X_train,y_train,validation_data=(X_val,y_val),batch_size = 32,epochs = 7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


### 2.5. Testing

In [35]:
y_pred = model.predict(X_val)

In [36]:
y_pred = np.argmax(y_pred,axis=2)

In [37]:
y_pred.shape

(840, 128)

In [39]:
pred_label = []
for label in y_pred:
  pred = [i for i in label if i != 11]
  pred_label.append([INDEX2LABEL[i] for i in pred])

In [41]:
actual_label = []
for label in seq_val_label:
  actual = [i for i in label if i != 11]
  actual_label.append([INDEX2LABEL[i] for i in actual])

In [42]:
ner_metrics_fn(actual_label, pred_label)

{'ACC': 0.8637042197452229,
 'F1': 0.1991385825286038,
 'PRE': 0.16704412471642643,
 'REC': 0.24955948387074214}

In [43]:
# Save prediction
df = pd.DataFrame({'label':pred_label}).reset_index()
df.to_csv('w2v_pred.txt', index=False)

print(df)

     index                                              label
0        0      [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
1        1  [O, O, B-PLC, O, O, O, B-PPL, B-PPL, I-PPL, I-...
2        2  [O, B-PPL, B-PPL, O, O, O, O, O, O, O, O, O, O...
3        3  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
4        4  [B-PPL, O, O, O, O, O, O, O, O, O, O, O, O, O,...
..     ...                                                ...
835    835  [O, O, B-PPL, O, O, O, O, O, O, O, O, O, O, O,...
836    836         [O, O, O, O, O, O, O, O, O, O, O, O, O, O]
837    837  [O, O, O, O, O, O, O, O, B-PLC, O, O, B-EVT, O...
838    838  [B-PPL, O, O, O, O, O, B-PPL, B-PPL, O, O, O, ...
839    839  [O, O, O, O, B-PLC, B-PLC, O, O, O, O, O, O, O...

[840 rows x 2 columns]


## 3. Traditional ML (CRF)

### 3.1. Load Dataset

In [44]:
def load_dataset(path):
    # Read file
    data = open(path,'r').readlines()

    # Prepare buffer
    all_df = pd.DataFrame()
    dataset = []
    sentence = []
    seq_label = []
    cnt = 1
    for line in data:
        if len(line.strip()) > 0:
            token, label = line[:-1].split('\t')
            sentence.append(token)
            seq_label.append(label)
        else:
            df = pd.DataFrame({
                'no':cnt,
                'word': sentence,
                'label': seq_label,
            })
            all_df = pd.concat([all_df,df])
            cnt+=1
            sentence = []
            seq_label = []
    return all_df

In [45]:
train_data = load_dataset(train_dataset_path)
valid_data = load_dataset(valid_dataset_path)
test_data = load_dataset(test_dataset_path)

### 3.2. Preprocess

In [46]:
# A class to retrieve the sentences from the dataset
class getsentence(object):
    
    def __init__(self, data):
        self.data = data
        agg_func = lambda s: [(w, p) for w, p in zip(s["word"].values.tolist(),s["label"].values.tolist())]
        self.grouped = self.data.groupby("no").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [47]:
train_getter = getsentence(train_data)
valid_getter = getsentence(valid_data)
test_getter = getsentence(test_data)

In [48]:
# Feature set
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isdigit()': word.isdigit(),
        'word.isalnum()':word.isalnum(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.isalnum()':word1.isalnum(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.isalnum()':word1.isalnum(),
        })
    else:
        features['EOS'] = True

    return features

In [49]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for _, label in sent]

In [50]:
x_train = [sent2features(s) for s in train_getter.sentences]
y_train = [sent2labels(s) for s in train_getter.sentences]

x_valid = [sent2features(s) for s in valid_getter.sentences]
y_valid = [sent2labels(s) for s in valid_getter.sentences]

x_test = [sent2features(s) for s in test_getter.sentences]
y_test = [sent2labels(s) for s in test_getter.sentences]

### 3.3. Training

In [51]:
#Creating the CRF model
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [52]:
crf.fit(x_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [53]:
y_pred_valid = crf.predict(x_valid)

In [54]:
print(flat_classification_report(
    y_valid, y_pred_valid, digits=3
))



              precision    recall  f1-score   support

       B-EVT      0.547     0.457     0.498       127
       B-FNB      0.857     0.522     0.649        92
       B-IND      0.835     0.629     0.717       369
       B-PLC      0.826     0.732     0.776       519
       B-PPL      0.861     0.738     0.795       640
       I-EVT      0.403     0.420     0.411       174
       I-FNB      0.857     0.625     0.723        48
       I-IND      0.757     0.643     0.696       286
       I-PLC      0.596     0.578     0.587       225
       I-PPL      0.819     0.764     0.791       386
           O      0.960     0.981     0.970     16763

    accuracy                          0.935     19629
   macro avg      0.756     0.644     0.692     19629
weighted avg      0.932     0.935     0.933     19629



### 3.4. Testing

In [55]:
y_pred_test = crf.predict(x_test)

In [56]:
# Save prediction
df = pd.DataFrame({'label':y_pred_test}).reset_index()
df.to_csv('ml_pred.txt', index=False)

print(df)

     index                                              label
0        0   [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
1        1  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
2        2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
3        3  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
4        4  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
..     ...                                                ...
835    835  [O, O, O, O, B-PLC, O, O, O, O, O, O, O, O, O,...
836    836  [O, O, O, O, O, O, O, O, O, O, O, O, O, B-PLC,...
837    837  [O, O, O, O, O, O, O, O, O, O, O, B-IND, I-IND...
838    838  [B-PLC, O, B-IND, O, O, O, B-PLC, I-PLC, O, B-...
839    839  [B-EVT, I-EVT, I-EVT, O, B-EVT, O, O, O, O, O,...

[840 rows x 2 columns]
