In [1]:
# !nvidia-smi
# https://pytorch.org/get-started/locally/
# !pip install torch --index-url https://download.pytorch.org/whl/cu121
# !pip install spacy-transformers==1.2.2
# !pip install hu_core_news_trf-3.7.0-py3-none-any.whl
# download https://huggingface.co/huspacy/hu_core_news_trf/resolve/main/hu_core_news_trf-any-py3-none-any.whl
# and rename to hu_core_news_trf-3.7.0-py3-none-any.whl

In [2]:
!pip install -U setuptools wheel

!pip install torch
!pip install spacy

!pip install huspacy

!pip install hu_core_news_md-3.7.0-py3-none-any.whl

Defaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
Processing ./hu_core_news_md-3.7.0-py3-none-any.whl
hu-core-news-md is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0m

In [3]:
import spacy

# spacy.require_gpu()
# nlp = spacy.load('hu_core_news_trf')
NLP = spacy.load('hu_core_news_md')

In [4]:
text = "Elisabeth megfogott 2 almát és beleharapott mindkettőbe."

doc = NLP(text)
doc
print([f'{token}:{token.ent_type_}' for token in doc])

['Elisabeth:PER', 'megfogott:', '2:', 'almát:', 'és:', 'beleharapott:', 'mindkettőbe:', '.:']


In [5]:
!pip install pandas
!pip install numpy
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0m

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

In [7]:
def convert_ent_type(ent_type):
    if ent_type == 'ORG':
        return 'I-ORG'
    elif ent_type == 'LOC':
        return 'I-LOC'
    elif ent_type == 'PER':
        return 'I-PER'
    elif ent_type == 'MISC':
        return 'I-MISC'
    elif ent_type == '':
        return '0'

In [8]:
df = pd.read_csv('hun_ner_corpus.txt', names=['token', 'ent_type'], skiprows=1, sep='\t', encoding='latin-1') # https://rgai.inf.u-szeged.hu/file/67#overlay-context=
sample_len = 5000
sample = df.iloc()[:sample_len].copy()
print(list(set(sample['ent_type'])))
LABELS = ['0', 'I-ORG', 'I-LOC', 'I-PER', 'I-MISC']
sample['ent_type'] = sample.apply(lambda x: LABELS.index(x['ent_type']), axis=1)
chunk_size = 50
chunk_overlap = 10

data = []
for idx in range(0, sample_len - chunk_size, chunk_size - chunk_overlap):
    chunk = sample[idx:idx+chunk_size]['token'].values
    text = ' '.join(list(sample[idx:idx+chunk_size]['token'].values))
    ent_types = sample[idx:idx+chunk_size]['ent_type'].values
    data.append((chunk, text, ent_types))

dataset = pd.DataFrame(data, columns=['chunk', 'text', 'ent_types'])

['I-PER', 'I-MISC', '0', 'I-LOC', 'I-ORG']


In [9]:
def apply_spacy(row, axis=None):
    doc = NLP(row['text'])
    row['spacy_chunk'] = [token for token in doc]
    row['spacy_preds'] = np.array(list(map(lambda y: LABELS.index(convert_ent_type(y.ent_type_)), doc)))
    return row

In [10]:
dataset = dataset.apply(apply_spacy, axis=1)
dataset.head()

Unnamed: 0,chunk,text,ent_types,spacy_chunk,spacy_preds
0,"[Hétfõn, folytatódik, az, Investicná, a, Rozvo...",Hétfõn folytatódik az Investicná a Rozvojová B...,"[0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, ...","[Hétfõn, folytatódik, az, Investicná, a, Rozvo...","[0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, ..."
1,"[-, tájékoztatták, a, pénzintézetnél, kedden, ...",- tájékoztatták a pénzintézetnél kedden az MTI...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, ...","[-, tájékoztatták, a, pénzintézetnél, kedden, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, ..."
2,"[brit, Manchester, United, labdarúgó, klub, a,...",brit Manchester United labdarúgó klub a világ ...,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[brit, Manchester, United, labdarúgó, klub, a,...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[anyacége, ,, a, UAL, Corp., könyvvizsgálót, v...","anyacége , a UAL Corp. könyvvizsgálót vált - a...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[anyacége, ,, a, UAL, Corp., könyvvizsgálót, v...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,"[nap, újabb, számviteli, botrány, nélkül, az, ...",nap újabb számviteli botrány nélkül az Egyesül...,"[0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, ...","[nap, újabb, számviteli, botrány, nélkül, az, ...","[0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, ..."


In [12]:
# Not ideal bot there isn't many predictions where the tokenization is different
print((dataset.apply(lambda x: x['spacy_preds'].shape[0], axis=1) > 50).value_counts())
dataset['spacy_preds'] = dataset.apply(lambda x: x['spacy_preds'][:50], axis=1)
y = list(np.stack(dataset['ent_types'].values, axis=-1).flatten())
preds = list(np.stack(dataset['spacy_preds'].values, axis=-1).flatten())

report = classification_report(y, preds, zero_division=np.nan, target_names=LABELS)
print(report)

False    124
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5210
       I-ORG       0.99      0.96      0.98       887
       I-LOC       0.94      0.97      0.96        33
       I-PER       0.97      0.93      0.95        61
      I-MISC       0.14      0.78      0.24         9

    accuracy                           0.99      6200
   macro avg       0.81      0.93      0.82      6200
weighted avg       0.99      0.99      0.99      6200



In [13]:
!pip install openai
!pip install python-dotenv

Defaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement json (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for json[0m[31m
[0m

In [14]:
from openai import OpenAI
from dotenv import load_dotenv
import json

In [15]:
load_dotenv()
CLIENT = OpenAI()
MODEL = 'gpt-3.5-turbo'

In [16]:
def request_completion(sys_msg, ast_msg, usr_msg, logit_bias=None):
    completion_response = CLIENT.chat.completions.create(
                            messages=[
                                {'role': 'system', 'content': sys_msg},
                                {'role': 'assistant', 'content': ast_msg},
                                {'role': 'user', 'content': usr_msg}
                            ],
                            response_format={ "type": "json_object" },
                            temperature=0,
                            top_p=1,
                            frequency_penalty=0,
                            presence_penalty=0,
                            model=MODEL,
                            logit_bias=logit_bias)

    return completion_response

def perform_ner_on_text(text, entity_types, logit_bias=None):
    sys_msg = """You are a language expert working for a hungarian news organisation designed to output JSON.
You are analysing text chunks from hungarian news articles regarding business.
Your task is to find named entities in the text and sort them into the following categories:
""" + '\n'.join([f'{entity_type}, ' for entity_type in entity_types])
    ast_msg = """EXAMPLE:
    The text is: 'Sam DiPiazza, a PWC vezérigazgatója szerint az EU által már kötelezővé tett úgynevezett GAS az eddiginél nagyobb betekintést ad az adott cég pénzügyeibe, ami megelőzheti az Enronéhoz hasonló botrányok megismétlődését Európában.'
    {{
        "I-ORG": ["PWC", "EU", "Enronéhoz"],
        "I-PER": ["Sam DiPiazza"],
        "I-LOC": ["Európában"],
        "I-MISC": ["GAS"]
    }}
    """
    usr_msg = "The text is: " + text 
    return json.loads(request_completion(sys_msg, ast_msg, usr_msg, logit_bias).choices[0].message.content)

def assign_label(chunk, labels):
    def is_token_in_list(token, ent_list):
        is_in = False
        for ent in [ word for ent in ent_list for word in ent.split(' ')]:
            is_in = is_in or token == ent
        return is_in
    
    ent_types = [0 for _ in chunk]
    for idx, token in enumerate(chunk):
        for label, ents in labels.items():
            if is_token_in_list(token, ents):
                ent_types[idx] = LABELS.index(label)

    return ent_types

In [17]:
dataset['gpt_preds'] = dataset.apply(lambda x: assign_label(x['chunk'], perform_ner_on_text(x['text'], LABELS[1:])), axis=1)
dataset.head()

Unnamed: 0,chunk,text,ent_types,spacy_chunk,spacy_preds,gpt_preds
0,"[Hétfõn, folytatódik, az, Investicná, a, Rozvo...",Hétfõn folytatódik az Investicná a Rozvojová B...,"[0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, ...","[Hétfõn, folytatódik, az, Investicná, a, Rozvo...","[0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, ..."
1,"[-, tájékoztatták, a, pénzintézetnél, kedden, ...",- tájékoztatták a pénzintézetnél kedden az MTI...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, ...","[-, tájékoztatták, a, pénzintézetnél, kedden, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
2,"[brit, Manchester, United, labdarúgó, klub, a,...",brit Manchester United labdarúgó klub a világ ...,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[brit, Manchester, United, labdarúgó, klub, a,...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[anyacége, ,, a, UAL, Corp., könyvvizsgálót, v...","anyacége , a UAL Corp. könyvvizsgálót vált - a...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[anyacége, ,, a, UAL, Corp., könyvvizsgálót, v...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,"[nap, újabb, számviteli, botrány, nélkül, az, ...",nap újabb számviteli botrány nélkül az Egyesül...,"[0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, ...","[nap, újabb, számviteli, botrány, nélkül, az, ...","[0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, ..."


In [19]:
y = list(np.stack(dataset['ent_types'].values, axis=-1).flatten())
preds = list(np.stack(dataset['gpt_preds'].values, axis=-1).flatten())

report = classification_report(y, preds, zero_division=np.nan, target_names=LABELS)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      5210
       I-ORG       0.95      0.86      0.90       887
       I-LOC       0.56      0.70      0.62        33
       I-PER       0.90      0.92      0.91        61
      I-MISC       0.00      0.00      0.00         9

    accuracy                           0.95      6200
   macro avg       0.68      0.69      0.68      6200
weighted avg       0.97      0.95      0.96      6200

