In [None]:
# !pip install --upgrade seqeval evaluate transformers==4.28.1 torch datasets huggingface_hub ipywidgets

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="4"
EXP_NAME = 'exp-991-mbert'
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
import json
import numpy as np
import pandas as pd
from datasets import Dataset, load_metric
from sklearn.metrics import f1_score
from seqeval.metrics import classification_report
from tabulate import tabulate


In [3]:
# Entity classes annotated in the court-decision corpus.
class_list = [
    "Tanggal Putusan",
    "Nama Hakim Ketua",
    "Nama Hakim Anggota",
    "Nama Panitera",
    "Putusan Hukuman",
    "Jenis Amar",
    "Melanggar UU (Pertimbangan Hukum)",
    "Melanggar UU (Dakwaan)",
    "Jenis Dakwaan",
    "Tanggal Kejadian",
    "Nama Saksi",
    "Tuntutan Hukuman",
    "Melanggar UU (Tuntutan)",
    "Jenis Perkara",
    "Nama Terdakwa",
    "Nama Pengadilan",
    "Tingkat Kasus",
    "Nomor Putusan",
    "Nama Jaksa",
    "Nama Pengacara",
]

# BIO tag inventory: 'O' first, then a B-/I- pair per class in class_list order.
label_list = ['O'] + [tag for cl in class_list for tag in (f'B-{cl}', f'I-{cl}')]
print(label_list)

['O', 'B-Tanggal Putusan', 'I-Tanggal Putusan', 'B-Nama Hakim Ketua', 'I-Nama Hakim Ketua', 'B-Nama Hakim Anggota', 'I-Nama Hakim Anggota', 'B-Nama Panitera', 'I-Nama Panitera', 'B-Putusan Hukuman', 'I-Putusan Hukuman', 'B-Jenis Amar', 'I-Jenis Amar', 'B-Melanggar UU (Pertimbangan Hukum)', 'I-Melanggar UU (Pertimbangan Hukum)', 'B-Melanggar UU (Dakwaan)', 'I-Melanggar UU (Dakwaan)', 'B-Jenis Dakwaan', 'I-Jenis Dakwaan', 'B-Tanggal Kejadian', 'I-Tanggal Kejadian', 'B-Nama Saksi', 'I-Nama Saksi', 'B-Tuntutan Hukuman', 'I-Tuntutan Hukuman', 'B-Melanggar UU (Tuntutan)', 'I-Melanggar UU (Tuntutan)', 'B-Jenis Perkara', 'I-Jenis Perkara', 'B-Nama Terdakwa', 'I-Nama Terdakwa', 'B-Nama Pengadilan', 'I-Nama Pengadilan', 'B-Tingkat Kasus', 'I-Tingkat Kasus', 'B-Nomor Putusan', 'I-Nomor Putusan', 'B-Nama Jaksa', 'I-Nama Jaksa', 'B-Nama Pengacara', 'I-Nama Pengacara']


In [4]:
# Two-way lookup between tag strings and their integer ids; the id of a tag
# is its position in label_list.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

In [5]:
def process_input(filename):
    """Read a JSON-lines annotation file into a DataFrame.

    Every non-blank line must hold one JSON object with a ``text`` entry and
    a ``text-tags`` entry. Blank lines (for example a trailing newline at the
    end of the file) are skipped instead of aborting the load.

    Args:
        filename: path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        pandas.DataFrame with a ``tokens`` column (the ``text`` values) and a
        ``tags`` column (the ``text-tags`` values), one row per record.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``text`` or ``text-tags``.
    """
    tokens = []
    tags = []
    with open(filename, 'r', encoding="utf8") as f:
        for line in f:
            # json.loads rejects empty input, so tolerate separator/trailing blank lines.
            if not line.strip():
                continue
            data = json.loads(line)
            tokens.append(data['text'])
            tags.append(data['text-tags'])

    return pd.DataFrame({'tokens': tokens, "tags": tags})

In [6]:
# Parse both splits (train first, then test) and wrap each frame in a
# Hugging Face Dataset.
dataset_train, dataset_test = (
    process_input(path) for path in ("data/out.train.jsonl", "data/out.test.jsonl")
)
ds_train, ds_test = (
    Dataset.from_pandas(frame) for frame in (dataset_train, dataset_test)
)

In [7]:
# Load the tokenizer stored in the directory named by EXP_NAME.
# NOTE(review): EXP_NAME is reassigned further down, before the model is
# loaded, so tokenizer and model come from different directories — confirm
# that both checkpoints share the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained(EXP_NAME)

In [8]:
def split_tokens(row, label_all_data=True):
    """Tokenize one document into chunks of at most 512 tokens and align its tags.

    Uses the notebook-level ``tokenizer`` and ``label2id``.

    Args:
        row: mapping with ``"tokens"`` (the document's words) and ``"tags"``
            (one label string per word).
        label_all_data: when True (the default, matching the previously
            hard-coded behaviour) every sub-word piece carries the label of
            its word; when False only the first piece of a word is labelled
            and the remaining pieces receive the ignore index.

    Returns:
        The tokenizer output extended with three entries:
        ``token``    - per chunk, the list of token strings;
        ``labels``   - per chunk, the list of label ids (-100 = ignored);
        ``word_ids`` - ONE flat list with the word index of every token of
                       every chunk, in order (None for special/padding
                       tokens). Downstream cells rely on this flat layout.
    """
    # Same value the special tokens get; -100 is the label id that the loss
    # and the token-classification collator treat as "ignore". The previous
    # -99 sentinel for continuation pieces was not that value.
    ignore_index = -100

    encoded = tokenizer(
        list(row["tokens"]),
        is_split_into_words=True,
        max_length=512,
        truncation=True,
        return_overflowing_tokens=True,  # keep the overflow as extra chunks
        return_length=True,
        padding=True
    )
    tags = row["tags"]
    chunk_tokens = []
    chunk_labels = []
    flat_word_ids = []

    for chunk_idx in range(len(encoded["length"])):
        chunk_word_ids = encoded.word_ids(chunk_idx)
        chunk_tokens.append(encoded.tokens(chunk_idx))
        flat_word_ids.extend(chunk_word_ids)

        aligned = []
        prev_word_idx = None
        for word_idx in chunk_word_ids:
            if word_idx is None:
                # Special or padding token: never scored.
                aligned.append(ignore_index)
            elif word_idx != prev_word_idx or label_all_data:
                # First piece of a word, or every piece when label_all_data is set.
                aligned.append(label2id[tags[word_idx]])
            else:
                # Continuation piece that should not contribute.
                aligned.append(ignore_index)
            prev_word_idx = word_idx
        chunk_labels.append(aligned)

    encoded['token'] = chunk_tokens
    encoded['labels'] = chunk_labels
    encoded['word_ids'] = flat_word_ids
    return encoded

In [9]:
# Apply split_tokens to every test document (tokenize, chunk, align labels).
test_tokenized_datasets = ds_test.map(split_tokens)

Map:   0%|          | 0/269 [00:00<?, ? examples/s]

In [10]:
# Columns dropped from the tokenized frame before it is flattened: the raw
# word/tag inputs plus two tokenizer bookkeeping fields.
dropped_col = ['tokens', 'tags', 'length', 'overflow_to_sample_mapping']

In [11]:
def flatten_pandas_ds(ds):
    """Unnest every column of ``ds`` by one level.

    Each cell is expected to expose ``.tolist()`` (e.g. a NumPy array); the
    items of all cells of a column are concatenated in row order, and the
    resulting columns are assembled into a new DataFrame.
    """
    flattened = {
        column: [
            item
            for cell in ds[column].values.tolist()
            for item in cell.tolist()
        ]
        for column in list(ds)
    }
    return pd.DataFrame(flattened)

In [12]:
# Move to pandas and discard the columns that are not model inputs.
pd_test_tokenized_datasets = test_tokenized_datasets.to_pandas().drop(columns=dropped_col)
# word_ids is stored flat per document, so it is taken out (and kept for
# the scoring step) before the per-chunk columns are unnested.
word_pd = pd_test_tokenized_datasets.pop('word_ids')
pd_test_tokenized_datasets_flat = flatten_pandas_ds(pd_test_tokenized_datasets)

In [13]:
# Wrap the flattened frame back into a Hugging Face Dataset for the Trainer.
test_flat_ds = Dataset.from_pandas(pd_test_tokenized_datasets_flat)

In [14]:
import evaluate

# Load the seqeval metric through the `evaluate` library.
# NOTE(review): no visible cell uses this `seqeval` object — the report below
# is produced with seqeval.metrics.classification_report — confirm it is
# still needed.
seqeval = evaluate.load('seqeval')

In [15]:
# Batch collator for token classification, configured to return PyTorch tensors.
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="pt")

In [16]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [31]:
# Switch to the experiment being evaluated; from here on EXP_NAME names the
# model directory, the Trainer output_dir and the analytics file names.
# NOTE(review): the tokenizer above was loaded while EXP_NAME still held its
# earlier value — confirm the mismatch is intentional.
EXP_NAME = 'exp-992-mbert'

In [32]:
# Load the fine-tuned checkpoint. The head size is derived from the label map
# instead of a hard-coded 41, so it cannot drift from id2label/label2id.
model = AutoModelForTokenClassification.from_pretrained(
    EXP_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [33]:
# Trainer configuration. In this notebook the Trainer is only used for
# inference (trainer.predict), but the arguments are kept coherent.
#
# The optimizer is configured through constructor arguments rather than
# args.set_optimizer(name=..., beta1=...): set_optimizer also assigns
# learning_rate from its own default, so calling it without learning_rate
# silently replaced the 2e-5 configured here.
args = TrainingArguments(
    output_dir=EXP_NAME,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=16,
    optim="adamw_torch",
    adam_beta1=0.9,
)

# No train_dataset is passed: the only data prepared here is the test split,
# and registering it as training data would let an accidental
# trainer.train() fine-tune on the evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    eval_dataset=test_flat_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

## Evaluation

In [34]:
# Strip the gold labels before prediction; they are read back from
# test_flat_ds when the predictions are scored.
eval_ds = test_flat_ds.remove_columns('labels')

In [35]:
# Run inference on the label-free dataset; pred.predictions is reduced with
# argmax over its last axis in the scoring step below.
pred = trainer.predict(eval_ds)

In [36]:
# Gold label ids, one plain Python list per row of test_flat_ds.
reference = list(map(list, test_flat_ds['labels']))

In [37]:
# Concatenate the per-document word-id sequences into one flat list.
flat_word_id = [word_id for doc_word_ids in word_pd for word_id in doc_word_ids]

In [38]:
import math

# Boolean mask over flat_word_id: True only on the first token of each word.
# Entries that are negative or NaN are never selected and do not update the
# "previous word" marker.
# NOTE(review): prev_word is not reset between documents, so a document whose
# first word id equals the last word id seen in the previous document would
# lose that first word — confirm this cannot happen with the data.
pickup = []
prev_word = -100
for word_id in flat_word_id:
    starts_new_word = (
        not (word_id <= -1 or math.isnan(word_id))
        and word_id != prev_word
    )
    pickup.append(starts_new_word)
    if starts_new_word:
        prev_word = word_id

In [39]:
# Predicted label id per token, one row per chunk.
pred_ids = np.argmax(pred.predictions, -1)

# Trainer.predict pads every prediction row to the longest sequence seen
# across batches, while `reference` keeps each chunk's own length. Trim each
# prediction row to its reference length so the two flattened streams (and
# the `pickup` mask) stay token-aligned even when chunk lengths differ.
aligned_pred_ids = [
    pred_row[:len(ref_row)] for ref_row, pred_row in zip(reference, pred_ids)
]

# Keep one (gold, predicted) tag pair per word: first sub-word tokens only
# (pickup) and only positions with a real label id.
dt_refs, dt_preds = zip(*[
    (id2label[x], id2label[y])
    for x, y, p in zip(flatten(reference), flatten(aligned_pred_ids), pickup)
    if p and x > -1 and y > -1
])

In [40]:
# Build the report before touching the output file: opening in 'w' mode
# truncates it, so a failure inside classification_report would otherwise
# wipe a previously saved report.
rep = classification_report([list(dt_refs)], [list(dt_preds)], digits=4)
print(rep)

# Make sure the output directory exists, and write with an explicit encoding
# (the input files are read as utf8 as well).
os.makedirs('real-analytics', exist_ok=True)
with open(f'real-analytics/{EXP_NAME}-analytics.txt', 'w', encoding="utf8") as f:
    f.write(rep)

                                   precision    recall  f1-score   support

                       Jenis Amar     0.9065    0.9618    0.9333       262
                    Jenis Dakwaan     0.8875    0.9301    0.9083       229
                    Jenis Perkara     0.6109    0.7451    0.6714       255
           Melanggar UU (Dakwaan)     0.7243    0.7592    0.7414       353
Melanggar UU (Pertimbangan Hukum)     0.7456    0.7143    0.7296       357
          Melanggar UU (Tuntutan)     0.8045    0.8492    0.8263       252
               Nama Hakim Anggota     0.8721    0.9416    0.9055       514
                 Nama Hakim Ketua     0.8622    0.9208    0.8905       265
                       Nama Jaksa     0.8790    0.9500    0.9131       260
                    Nama Panitera     0.8754    0.9389    0.9061       262
                   Nama Pengacara     0.8049    0.8800    0.8408        75
                  Nama Pengadilan     0.9487    0.9885    0.9682       262
                       N

In [41]:
# Tag-level confusion matrix: rows are the reference tags, columns the
# predicted tags.
y_true_df_mix = pd.Series(list(dt_refs), name="Reference")
y_pred_df_mix = pd.Series(list(dt_preds), name="Predicted")
df_confusion_mix = pd.crosstab(index=y_true_df_mix, columns=y_pred_df_mix)


In [42]:
# Display the confusion matrix (rows: reference tag, columns: predicted tag).
df_confusion_mix

Predicted,B-Jenis Amar,B-Jenis Dakwaan,B-Jenis Perkara,B-Melanggar UU (Dakwaan),B-Melanggar UU (Pertimbangan Hukum),B-Melanggar UU (Tuntutan),B-Nama Hakim Anggota,B-Nama Hakim Ketua,B-Nama Jaksa,B-Nama Panitera,...,I-Nama Pengadilan,I-Nama Saksi,I-Nama Terdakwa,I-Nomor Putusan,I-Putusan Hukuman,I-Tanggal Kejadian,I-Tanggal Putusan,I-Tingkat Kasus,I-Tuntutan Hukuman,O
Reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B-Jenis Amar,259,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
B-Jenis Dakwaan,0,219,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
B-Jenis Perkara,0,0,223,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,26
B-Melanggar UU (Dakwaan),0,1,0,311,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,40
B-Melanggar UU (Pertimbangan Hukum),0,0,0,0,275,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,76
B-Melanggar UU (Tuntutan),0,0,0,1,0,247,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
B-Nama Hakim Anggota,0,0,0,0,0,0,504,0,0,1,...,0,0,0,0,0,0,0,0,0,2
B-Nama Hakim Ketua,0,0,0,0,0,0,3,258,0,0,...,0,0,0,0,0,0,0,0,0,4
B-Nama Jaksa,0,0,0,0,0,0,0,0,258,1,...,0,0,0,0,0,0,0,0,0,0
B-Nama Panitera,0,0,0,0,0,0,1,0,1,255,...,0,0,0,0,0,0,0,0,0,1


In [43]:
# Save the confusion matrix next to the classification report.
# NOTE(review): to_csv does not create missing directories — 'real-analytics'
# must exist by the time this cell runs.
df_confusion_mix.to_csv(f'real-analytics/{EXP_NAME}-confusion_mat.csv')

In [44]:
# import os
# os._exit(00)