In [10]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Request TensorFlow 2.x through the Colab-only line magic. Any exception
# raised while running the magic is deliberately ignored so the rest of the
# notebook can continue.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd

import hazm

import transformers 
from transformers import AutoTokenizer, AutoConfig
from transformers import TFAutoModelForTokenClassification

import os
from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout

# Report the versions of the main libraries, framed by blank lines.
print()
print('tensorflow {}'.format(tf.__version__))
print('transformers {}'.format(transformers.__version__))
print('numpy {}'.format(np.__version__))
print('pandas {}'.format(pd.__version__))
print()

# Ask TensorFlow for the GPU device name once and report whether it is the
# first GPU device.
gpu_device_name = tf.test.gpu_device_name()
print()
if gpu_device_name == '/device:GPU:0':
    print('SUCCESS: Found GPU: {}'.format(gpu_device_name))
else:
    print('WARNING: GPU device not found.')

2024-07-13 13:07:48.632747: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-13 13:07:48.648412: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-13 13:07:48.653185: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-13 13:07:48.665406: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



tensorflow 2.17.0
transformers 4.42.4
numpy 1.24.3
pandas 2.2.2




In [4]:
# Sample Persian sentences passed to parsbert_ner() in the demo cells below.
texts = [
    "مدیرکل محیط زیست استان البرز با بیان اینکه با بیان اینکه موضوع شیرابه‌های زباله‌های انتقال یافته در منطقه حلقه دره خطری برای این استان است، گفت: در این مورد گزارشاتی در ۲۵ مرداد ۱۳۹۷ تقدیم مدیران استان شده است.",
    "به گزارش خبرگزاری تسنیم از کرج، حسین محمدی در نشست خبری مشترک با معاون خدمات شهری شهرداری کرج که با حضور مدیرعامل سازمان‌های پسماند، پارک‌ها و فضای سبز و نماینده منابع طبیعی در سالن کنفرانس شهرداری کرج برگزار شد، اظهار داشت: ۸۰٪  جمعیت استان البرز در کلانشهر کرج زندگی می‌کنند.",
    "وی افزود: با همکاری‌های مشترک بین اداره کل محیط زیست و شهرداری کرج برنامه‌های مشترکی برای حفاظت از محیط زیست در شهر کرج در دستور کار قرار گرفته که این اقدامات آثار مثبتی داشته و تاکنون نزدیک به ۱۰۰ میلیارد هزینه جهت خریداری اکس-ریس صورت گرفته است.",
]

# Maps each PEYMA tag (B_* / I_* pairs plus "O") to the Persian label shown
# next to a highlighted token. "O" maps to None, which parsbert_ner() renders
# as a plain, non-highlighted token.
peyma_translate = {
    "B_DAT": "تاریخ",
    "B_LOC": "موقعیت",
    "B_MON": "پول",
    # Fixed typo: was "سازمنان"; now matches I_ORG and the other tables.
    "B_ORG": "سازمان",
    "B_PCT": "درصد",
    "B_PER": "شخص",
    "B_TIM": "زمان",
    "I_DAT": "تاریخ",
    "I_LOC": "موقعیت",
    "I_MON": "پول",
    "I_ORG": "سازمان",
    "I_PCT": "درصد",
    "I_PER": "شخص",
    "I_TIM": "زمان",
    "O": None,
}
# Maps each ARMAN tag to its Persian display label. The B- and I- variants of
# an entity share one label, so both are generated from a single table; the
# "O" tag maps to None.
arman_translate = {
    "{}-{}".format(prefix, tag): persian_label
    for prefix in ("B", "I")
    for tag, persian_label in (
        ("event", "رویداد"),
        ("fac", "امکانات"),
        ("loc", "موقعیت"),
        ("org", "سازمان"),
        ("pers", "شخص"),
        ("pro", "محصول"),
    )
}
arman_translate["O"] = None

# Maps each tag of the combined ARMAN+PEYMA model to its Persian display
# label. The B- and I- variants of an entity share one label, so both are
# generated from a single table; the "O" tag maps to None.
ner_translate = {
    "{}-{}".format(prefix, tag): persian_label
    for prefix in ("B", "I")
    for tag, persian_label in (
        ("date", "تاریخ"),
        ("event", "رویداد"),
        ("facility", "امکانات"),
        ("location", "موقعیت"),
        ("money", "پول"),
        ("organization", "سازمان"),
        ("person", "شخص"),
        ("product", "محصول"),
        ("time", "زمان"),
        ("percent", "درصد"),
    )
}
ner_translate["O"] = None

In [5]:
# Shared hazm normalizer instance, created once and reused by cleanize().
normalizer = hazm.Normalizer()


def cleanize(text):
    """Return ``text`` after running it through the shared ``normalizer``.

    This is the single hook where extra cleaning steps can be added; at the
    moment normalization is the only step applied.
    """
    normalized_text = normalizer.normalize(text)
    return normalized_text


def parsbert_ner_load_model(model_name):
    """Load a token-classification model together with its tokenizer and labels.

    Args:
        model_name: Model identifier passed unchanged to every
            ``from_pretrained`` call.

    Returns:
        ``(model, tokenizer, labels)`` on success. ``labels`` lists the label
        names from ``config.label2id`` ordered by their class id, so a
        predicted class id can be used as an index into it (assuming the ids
        are contiguous and start at 0).
        If anything fails while loading, the error is printed and a list of
        three ``None`` values is returned instead of raising.
    """
    try:
        config = AutoConfig.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = TFAutoModelForTokenClassification.from_pretrained(model_name)
        # Order labels by class id rather than by dict insertion order: the
        # caller indexes this list with the model's predicted class ids.
        labels = sorted(config.label2id, key=config.label2id.get)

        return model, tokenizer, labels
    except Exception as error:
        # Best-effort loader: report the reason instead of hiding it, but keep
        # the "three Nones" failure contract. Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        print('Failed to load model {!r}: {}'.format(model_name, error))
        return [None] * 3

def _ner_predictions_to_html(predictions):
    """Render ``(token, label)`` pairs as one HTML paragraph of token spans."""
    from html import escape  # local import keeps this block self-contained

    spans = []
    for token, label in predictions:
        # The special tokens added by the tokenizer are not shown.
        if token in ('[CLS]', '[SEP]'):
            continue
        # Tokens and labels are escaped so text such as "<" or "&" in the
        # input cannot be interpreted as markup.
        if label:
            spans.append(
                '<span class="token token-ner">%s<span class="ner-label">%s</span></span>'
                % (escape(token), escape(str(label))))
        else:
            spans.append('<span class="token">%s</span>' % escape(token))

    return '<p class="ner-box">%s</p>' % ' '.join(spans)


def parsbert_ner(texts, model_name, label_translate, visualize=True):
    """Predict named entities for each text and optionally render them as HTML.

    Args:
        texts: Iterable of input strings; each one is passed through
            ``cleanize`` before tokenization.
        model_name: Model identifier forwarded to ``parsbert_ner_load_model``.
        label_translate: Mapping from every model label to the text displayed
            for it; a falsy value (e.g. ``None``) marks a non-entity token.
        visualize: When true, each text is displayed as highlighted HTML and
            nothing is collected. When false, nothing is rendered per text and
            the predictions are collected and returned.

    Returns:
        A list with one entry per text, each a list of
        ``(token, translated_label)`` tuples (empty when ``visualize`` is
        true). If the model cannot be loaded, the string
        ``'Something wrong has been happened!'`` is returned instead.

    Raises:
        KeyError: If a predicted label is missing from ``label_translate``.
    """
    global css_is_load

    css = """<style>
    .ner-box {
        direction: rtl;
        font-size: 18px !important;
        line-height: 20px !important;
        margin: 0 0 15px;
        padding: 10px;
        text-align: justify;
        color: #343434 !important;
    }
    .token, .token span {
        display: inline-block !important;
        padding: 2px;
        margin: 2px 0;
    }
    .token.token-ner {
        background-color: #f6cd61;
        font-weight: bold;
        color: #000;
    }
    .token.token-ner .ner-label {
        color: #9a1f40;
        margin: 0px 2px;
    }
    </style>"""

    # The stylesheet is emitted on every call, exactly as before: the old
    # "if not css_is_load" guard could never skip it because the flag was reset
    # to False just above the check. The module-level flag is still set so any
    # code reading it keeps seeing True after a call.
    display(HTML(css))
    css_is_load = True

    model, tokenizer, labels = parsbert_ner_load_model(model_name)

    if not model or not tokenizer or not labels:
        return 'Something wrong has been happened!'

    output_predictions = []
    for sequence in texts:
        sequence = cleanize(sequence)
        # Encode once and derive the tokens from those very ids, so tokens and
        # per-position predictions are aligned one-to-one (a decode/re-tokenize
        # round trip does not guarantee that).
        input_ids = tokenizer.encode(sequence)
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        # Wrap the ids in an outer list to form a batch of one sequence.
        outputs = model(tf.constant([input_ids]))[0]
        class_ids = tf.argmax(outputs, axis=2)[0].numpy()
        predictions = [
            (token, label_translate[labels[class_id]])
            for token, class_id in zip(tokens, class_ids)
        ]

        if not visualize:
            output_predictions.append(predictions)
        else:
            display(HTML(_ner_predictions_to_html(predictions)))

    return output_predictions

In [6]:
# Demo: render the sample texts with the ARMAN model and its label table.
model_name = 'HooshvareLab/bert-base-parsbert-armanner-uncased'
_ = parsbert_ner(texts, model_name, arman_translate, visualize=True)

config.json:   0%|          | 0.00/937 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# Demo: render the sample texts with the PEYMA model and its label table.
model_name = 'HooshvareLab/bert-base-parsbert-peymaner-uncased'
_ = parsbert_ner(texts, model_name, peyma_translate, visualize=True)

config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
# Demo: render the sample texts with the combined ARMAN+PEYMA model and its
# label table.
model_name = 'HooshvareLab/bert-base-parsbert-ner-uncased'
_ = parsbert_ner(texts, model_name, ner_translate, visualize=True)

In [11]:
#@title Live Playground { display-mode: "form" }

# Input controls of the playground: a submit button, a free-text area and a
# radio group whose options are the keys of the `configs` table below.
submit_wd = widgets.Button(description='Send', disabled=False, button_style='success', tooltip='Submit')
text_wd = widgets.Textarea(placeholder='Please enter you text ...', rows=5, layout=Layout(width='90%'))

config_wd = widgets.RadioButtons(options=['ARMAN', 'PEYMA', 'ARMAN+PEYMA'], description='Select your model', disabled=False)

# Output area that submit_text() writes its progress message and result into.
output_wd = widgets.Output()

display(HTML("""
<h2>Persian NER <small>[ARMAN, PEYMA, ARMAN+PEYMA]</small></h2>
<p style="padding: 2px 20px; margin: 0 0 20px;">
</p>
<br /><br />
"""))

display(config_wd)
display(text_wd)
display(submit_wd)
# Fixed: stray text had been pasted into the identifier here, which made the
# whole cell a syntax error.
display(output_wd)

# Playground options: each radio-button label maps to a two-item list of
# [model name, label translation table], in the order submit_text() reads them.
configs = dict([
    ('ARMAN', ['HooshvareLab/bert-base-parsbert-armanner-uncased', arman_translate]),
    ('PEYMA', ['HooshvareLab/bert-base-parsbert-peymaner-uncased', peyma_translate]),
    ('ARMAN+PEYMA', ['HooshvareLab/bert-base-parsbert-ner-uncased', ner_translate]),
])

def submit_text(sender):
    """Click handler: run NER on the text area's content with the chosen model.

    ``sender`` is the widget that triggered the callback; it is not used.
    Everything printed or displayed is captured by ``output_wd``, which is
    cleared first.
    """
    with output_wd:
        clear_output(wait=True)
        user_text = text_wd.value
        selected_model, selected_translate = configs[config_wd.value]
        print('Predicting .... [please wait!]')
        parsbert_ner([user_text], selected_model, selected_translate, visualize=True)


# Run submit_text whenever the "Send" button is clicked.
submit_wd.on_click(submit_text)

RadioButtons(description='Select your model', options=('ARMAN', 'PEYMA', 'ARMAN+PEYMA'), value='ARMAN')

Textarea(value='', layout=Layout(width='90%'), placeholder='Please enter you text ...', rows=5)

Button(button_style='success', description='Send', style=ButtonStyle(), tooltip='Submit')

Output()