## Model Loading

In [1]:
import sklearn
import numpy as np
import pandas as pd

import psutil

import torch
cuda_available = torch.cuda.is_available()

from simpletransformers.ner import NERModel, NERArgs

from nervaluate import Evaluator



In [2]:
model_name='bert-base-multilingual-cased'

In [3]:
train_df =pd.read_csv("..\..\..\Dataset\\simpletransformers\\train.csv")
eval_df =pd.read_csv("..\..\..\Dataset\\simpletransformers\\val.csv")
test_df =pd.read_csv("..\..\..\Dataset\\simpletransformers\\test.csv")

In [4]:
label_list = [
    "O", 
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
    "B-PROD", "I-PROD",
    "B-WA", "I-WA",
    "B-EV", "I-EV",
]

In [5]:
# Create a NERModel
model_args=NERArgs()
# model_args.train_batch_size=64
# model_args.overwrite_output_dir=True
model_args = {
    "output_dir": "outputs/",
    "cache_dir": "cache_dir/",

    "fp16": True,
    "fp16_opt_level": "O1",
    "max_seq_length": 512,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 1,
    "eval_batch_size": 8,
    # "num_train_epochs": 1,
    "num_train_epochs": 5,
    "weight_decay": 0,
    # "learning_rate": 4e-5,
    "learning_rate": 0.00008452,

    "adam_epsilon": 1e-8,
    "warmup_ratio": 0.06,
    "warmup_steps": 0,
    "max_grad_norm": 1.0,

    "logging_steps": 50,
    "save_steps": 2000,

    "overwrite_output_dir": True,
    "reprocess_input_data": False,
    "evaluate_during_training": True,

    "do_lower_case": False, #True if using uncased models
    "silent":True,
    'use_multiprocessing': True,
    
}


In [6]:
model = NERModel('bert', 
                'checkpoint-2000',
                args=model_args,
                labels=label_list,
                use_cuda=cuda_available,
                )

In [8]:
result, model_outputs, y_pred = model.eval_model(test_df)

2022-02-17 11:13:17 simpletransformers.ner.ner_model INFO:  Features loaded from cache at cache_dir/cached_dev_bert_512_13_1485
2022-02-17 11:15:36 simpletransformers.ner.ner_model INFO: {'eval_loss': 0.12251842792661641, 'precision': 0.8354978354978355, 'recall': 0.8106405320266014, 'f1_score': 0.8228815064842779}


In [9]:
result

{'eval_loss': 0.12251842792661641,
 'precision': 0.8354978354978355,
 'recall': 0.8106405320266014,
 'f1_score': 0.8228815064842779}

## pub/sub pred

In [55]:
data= {
    "created_at": "2022-02-17 04:07:32", 
    "id": 1494161567750901763, 
    "text": "RT @Beritasatu: Mantan Wakil Ketua DPR, Azis Syamsuddin hari ini menghadapi sidang putusan vonis atas perkara dugaan suap. Baca selengkapnya \\ud83d\\udc47 https://t.co/gF3YxluxcV",
    "entities": {
        "hashtags": [], "urls": [], "user_mentions": [], "symbols": []
        }, 
    "user_id": 1363646268153602050, 
    "user_name": "prasetio agung", 
    "user_screen_name": "Prasetiooob", 
    "user_description": "Principal Architect & Founder AKSEN Group", 
    "user_followers_count": 3, 
    "user_friends_count": 3, 
    "user_statuses_count": 2, 
    "user_created_at": "2021-02-22 00:26:31",
    }

In [57]:
from nltk.tokenize import TweetTokenizer
tk = TweetTokenizer()
predictions, raw_outputs = model.predict(
    [tk.tokenize(data["text"])],
    split_on_space=False,
    )

2022-02-17 12:32:51 simpletransformers.ner.ner_model INFO:  Converting to features started.


In [58]:
predictions[0]

[{'RT': 'O'},
 {'@Beritasatu': 'O'},
 {':': 'O'},
 {'Mantan': 'O'},
 {'Wakil': 'O'},
 {'Ketua': 'O'},
 {'DPR': 'O'},
 {',': 'O'},
 {'Azis': 'B-PER'},
 {'Syamsuddin': 'I-PER'},
 {'hari': 'O'},
 {'ini': 'O'},
 {'menghadapi': 'O'},
 {'sidang': 'O'},
 {'putusan': 'O'},
 {'vonis': 'O'},
 {'atas': 'O'},
 {'perkara': 'O'},
 {'dugaan': 'O'},
 {'suap': 'O'},
 {'.': 'O'},
 {'Baca': 'O'},
 {'selengkapnya': 'O'},
 {'\\': 'O'},
 {'ud83d': 'O'},
 {'\\': 'O'},
 {'udc': 'O'},
 {'47': 'O'},
 {'https://t.co/gF3YxluxcV': 'O'}]

In [59]:
def get_entities(row):
    full_entities = []
    prev_tag = "O" #init default tag
    for i in row:
        token_label= list(i.items())[0]
        start_tag=token_label[1][:1]
        tag= token_label[1][2:] if token_label[1] !='O' else 'O'
        if tag == "O":
            prev_tag = tag
            continue

        if start_tag == 'B': # Begin NE 
            full_entities.append([token_label[0], tag])
        elif start_tag == 'I' and prev_tag == tag: # Inside NE
            full_entities[-1][0]  = full_entities[-1][0]+" "+ token_label[0]
        prev_tag = tag
    return full_entities

In [62]:
def encode_entities(entities):
    list_of_entities=list()
    for ent in entities:
        temp_dict=dict()
        temp_dict['token'],temp_dict['entity']=ent
        list_of_entities.append(temp_dict)
    return list_of_entities

In [63]:
encode_entities(get_entities(predictions[0]))

[{'token': 'Azis Syamsuddin', 'entity': 'PER'}]

## Test Data Prediction

In [141]:
temp_df_labels=test_df.groupby("sentence_id")['labels'].apply(list).reset_index(name='labels')
temp_df_tweets=test_df.groupby("sentence_id")['words'].apply(list).reset_index(name='tweet')
test_sentences=temp_df_tweets.merge(temp_df_labels, on='sentence_id',how='inner')
test_sentences

Unnamed: 0,sentence_id,tweet,labels
0,1380685209075544064,"[@GyuuPotter, iyhh, ,, soalnya, mwu, kerjain, ...","[O, O, O, O, O, O, O, O]"
1,1380685355897085952,"[@ShopeeID, Bismillah, yok, menang, ����, SHOP...","[O, O, O, O, O, B-PROD, O, O, O, O]"
2,1380688670282289153,"[RT, @kukluxcats, :, Selevel, gubernur, Jatim,...","[O, O, O, O, B-PER, I-PER, O, O, O, O, O, O, O..."
3,1380692463669547008,"[@andre_rosiade, Kerja, nyata, kalian, ya, itu...","[B-PER, O, O, O, O, O, O, O, B-PER, B-PER, O, ..."
4,1380695624258002949,"[chairman, ke, laut, ,, Secretary, 3, hari, on...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...
1480,1391602658155196421,"[Alumni, SMA, Pangudi, Luhur, Dirikan, Sentra,...","[O, B-ORG, I-ORG, I-ORG, O, O, O, O, O, B-PER,..."
1481,1391605668294004740,"[Melayani, pijat, pria, area, kota, Jogja, dan...","[O, O, O, O, O, B-LOC, O, O, O, O, O, O, O, O,..."
1482,1391609120927543297,"[thr, tuh, duit, kek, ,, ini, malah, tugas]","[O, O, O, O, O, O, O, O]"
1483,1391612015286702084,"[RT, @Brigade01Arwan1, :, Alhamdulillah, ., Ma...","[O, O, O, O, O, O, O, O, O, O, B-PER, O, O, O,..."


In [146]:
def compare_data(test_sentences, predictions):
    pred_labels = [[list(label_pred.values())[0] for label_pred in tweet_pred] for tweet_pred in predictions]
    true_labels = test_sentences[:1024].labels
    
    print(len(pred_labels), len(true_labels))
    diff_length=0
    for i in range(len(pred_labels)):
        if len(pred_labels[i])!=len(true_labels[i]):
            diff_length += 1
            print(test_sentences.loc[i][1])
            print(len(pred_labels[i]), len(true_labels[i]))
            print(f'pred_labels[{i}]     {pred_labels[i]}')
            print(f'true_labels[{i}]     {true_labels[i]}')
            print()
    print(diff_length)

In [147]:
def predict_batched(batch_size, data_size):
    batch_size=int(data_size/batch_size)
    predictions, raw_outputs= list(), list()
    for end in range(batch_size, data_size+batch_size, batch_size):
        start=end-batch_size
        temp_predictions, temp_raw_outputs = model.predict(
            test_sentences[start:end].tweet,
            split_on_space=False, # if the input are list of list
            )
        predictions.extend(temp_predictions)
        raw_outputs.extend(temp_raw_outputs)
    return predictions, raw_outputs

In [150]:
%%time

predictions, raw_outputs = predict_batched(8,1024)

print(f'Ram Usage: {psutil.virtual_memory().percent}')

2022-02-04 17:10:48 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 17:11:05 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 17:11:22 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 17:11:39 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 17:11:57 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 17:12:14 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 17:12:31 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 17:12:48 simpletransformers.ner.ner_model INFO:  Converting to features started.


Ram Usage: 61.5
Wall time: 2min 17s


In [149]:
compare_data(test_sentences, predictions)

1024 1024
['@ShopeeID', 'Bismillah', 'yok', 'menang', '����', 'SHOPEE', '@ShopeeID', '#44ShopeeMamamoo', '#ShopeexMAMAMOO', '#AdaMamamoodiShopee']
9 10
pred_labels[1]     ['O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O']
true_labels[1]     ['O', 'O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O']

['@cheonsaning', 'halo', 'kak', 'aku', 'jual', 'youtube', 'premium', '1', 'bulan', ':', '8,OOO', '2', 'bulan', ':', '13,OOO', 'sudah', 'dijamin', 'trusted', 'bisa', 'liat', 'testi', 'dipinned', '��', '—', '#zonaba', '#zonauang', 'https://t.co/KdD4d6BaHc']
26 27
pred_labels[22]     ['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
true_labels[22]     ['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

['@jntexpressid', 'Pengen', 'banget', 'kirimin', '@Malanu99', 'Mukenah', 'dan', 'sajadah', ',', 'apalagi', 'bent

In [130]:
%%time

predictions, raw_outputs = predict_batched(2,1024)

print(f'Ram Usage: {psutil.virtual_memory().percent}')

2022-02-04 16:38:46 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:39:37 simpletransformers.ner.ner_model INFO:  Converting to features started.


Ram Usage: 51.4
Wall time: 1min 43s


In [131]:
compare_data(test_sentences, predictions)

1024 1024
['@ShopeeID', 'Bismillah', 'yok', 'menang', '����', 'SHOPEE', '@ShopeeID', '#44ShopeeMamamoo', '#ShopeexMAMAMOO', '#AdaMamamoodiShopee']
9 10
['O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O']

['@cheonsaning', 'halo', 'kak', 'aku', 'jual', 'youtube', 'premium', '1', 'bulan', ':', '8,OOO', '2', 'bulan', ':', '13,OOO', 'sudah', 'dijamin', 'trusted', 'bisa', 'liat', 'testi', 'dipinned', '��', '—', '#zonaba', '#zonauang', 'https://t.co/KdD4d6BaHc']
26 27
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

['@jntexpressid', 'Pengen', 'banget', 'kirimin', '@Malanu99', 'Mukenah', 'dan', 'sajadah', ',', 'apalagi', 'bentar', 'lagi', 'kan', 'Ramadhan', '��', 'Bunga', 'dipetik', 'taruh', 'dalam', 't

In [132]:
%%time

predictions, raw_outputs = predict_batched(4,1024)

print(f'Ram Usage: {psutil.virtual_memory().percent}')

2022-02-04 16:40:30 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:40:58 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:41:25 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:41:52 simpletransformers.ner.ner_model INFO:  Converting to features started.


Ram Usage: 52.5
Wall time: 1min 49s


In [133]:
compare_data(test_sentences, predictions)

1024 1024
['@ShopeeID', 'Bismillah', 'yok', 'menang', '����', 'SHOPEE', '@ShopeeID', '#44ShopeeMamamoo', '#ShopeexMAMAMOO', '#AdaMamamoodiShopee']
9 10
['O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O']

['@cheonsaning', 'halo', 'kak', 'aku', 'jual', 'youtube', 'premium', '1', 'bulan', ':', '8,OOO', '2', 'bulan', ':', '13,OOO', 'sudah', 'dijamin', 'trusted', 'bisa', 'liat', 'testi', 'dipinned', '��', '—', '#zonaba', '#zonauang', 'https://t.co/KdD4d6BaHc']
26 27
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

['@jntexpressid', 'Pengen', 'banget', 'kirimin', '@Malanu99', 'Mukenah', 'dan', 'sajadah', ',', 'apalagi', 'bentar', 'lagi', 'kan', 'Ramadhan', '��', 'Bunga', 'dipetik', 'taruh', 'dalam', 't

In [134]:
%%time

predictions, raw_outputs = predict_batched(8,1024)

print(f'Ram Usage: {psutil.virtual_memory().percent}')

2022-02-04 16:42:20 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:42:36 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:42:51 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:43:07 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:43:23 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:43:39 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:43:56 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:44:13 simpletransformers.ner.ner_model INFO:  Converting to features started.


Ram Usage: 52.3
Wall time: 2min 9s


In [135]:
compare_data(test_sentences, predictions)

1024 1024
['@ShopeeID', 'Bismillah', 'yok', 'menang', '����', 'SHOPEE', '@ShopeeID', '#44ShopeeMamamoo', '#ShopeexMAMAMOO', '#AdaMamamoodiShopee']
9 10
['O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'O', 'O', 'O', 'O']

['@cheonsaning', 'halo', 'kak', 'aku', 'jual', 'youtube', 'premium', '1', 'bulan', ':', '8,OOO', '2', 'bulan', ':', '13,OOO', 'sudah', 'dijamin', 'trusted', 'bisa', 'liat', 'testi', 'dipinned', '��', '—', '#zonaba', '#zonauang', 'https://t.co/KdD4d6BaHc']
26 27
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'B-PROD', 'I-PROD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

['@jntexpressid', 'Pengen', 'banget', 'kirimin', '@Malanu99', 'Mukenah', 'dan', 'sajadah', ',', 'apalagi', 'bentar', 'lagi', 'kan', 'Ramadhan', '��', 'Bunga', 'dipetik', 'taruh', 'dalam', 't

In [136]:
%%time

predictions, raw_outputs = predict_batched(16,1024)

print(f'Ram Usage: {psutil.virtual_memory().percent}')

2022-02-04 16:44:29 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:44:40 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:44:51 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:45:02 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:45:13 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:45:24 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:45:35 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:45:45 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:45:56 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:46:07 simpletransformers.ner.ner_model INFO:  Converting to features started.
2022-02-04 16:46:17 simpletransformers.ner.ner_model INFO:  Converting to featur

KeyboardInterrupt: 

In [None]:
compare_data(test_sentences, predictions)

In [None]:
%%time

predictions, raw_outputs = predict_batched(32,1024)

print(f'Ram Usage: {psutil.virtual_memory().percent}')

In [None]:
compare_data(test_sentences, predictions)