In [1]:
from llm import gemini
import re
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single shared instance of gemini.Gemini (from the local `llm` package).
# pipeline_label() calls its get_label_list() and reset_history() methods.
model = gemini.Gemini()

In [3]:
# Label schemas passed to the model as plain text: one entry per line, written
# as `NAME<TAB>description` (a few names carry a trailing space before the tab).
#
# Schema whose case-specific labels are CRIMINAL / VICTIM / CRIME_* entries.
# NOTE(review): presumably the criminal-case schema ("hinh su") — confirm.
# It is not referenced by any cell visible here; pipeline_label uses
# labels_case_dan_su. It names the date label DATE, where the other schema
# uses TIME for the same description.
labels_case_hinh_su = """PER	Person
ORG 	Organization
LOC	Location
DATE	Date, time, season
AMT	Amount of something
DOC_ID 	Document identifier
CRIMINAL	Criminal suspect
VICTIM	Victim
CRIME_TOOL	Crime tool
CRIME_LOCATION	Where did the crime happened?
CRIME_TIME	When did the crime happened?
COURT	Name of the court which has delivered the current judgement
STATUTE	Name of the act or law mentioned in the judgement
PROVISION	Sections, sub-sections, articles, orders, rules under a statute"""

# Schema whose case-specific labels are PETITIONER / RESPONDENT / RIGHT.
# NOTE(review): presumably the civil-case schema ("dan su") — confirm.
# This is the schema pipeline_label currently sends to the model.
labels_case_dan_su = """PER	Person
ORG 	Organization
LOC	Location
TIME	Date, time, season
AMT	Amount of something
DOC_ID 	Document identifier
PETITIONER	Name of the petitioners / appellants /revisionist from current case
RESPONDENT	Name of the respondents / defendents /opposition from current case
COURT	Name of the court which has delivered the current judgement
STATUTE	Name of the act or law mentioned in the judgement
PROVISION	Sections, sub-sections, articles, orders, rules under a statute
RIGHT	Civil/human rights"""

In [4]:
def convert_to_jsonl(ents, text):
    """Locate every occurrence of each entity string in ``text``.

    Args:
        ents: Mapping of label name -> iterable of entity strings.
        text: The document the entities were extracted from.

    Returns:
        A dict ``{"text": text, "label": [[start, end, label], ...]}`` where
        ``start``/``end`` are character offsets into ``text`` (``end`` is
        exclusive). Spans are emitted in the order labels and entities are
        iterated, and within one entity in left-to-right order.

    Entities are matched literally: the string is escaped before being handed
    to the regex engine, so characters such as ``.``, ``(`` or ``+`` inside an
    entity are not interpreted as regex syntax (which would otherwise produce
    wrong spans or raise ``re.error``).
    """
    js = {'text': text, 'label': []}

    for label in ents:
        # (start, length) pairs already emitted for this label, so an entity
        # listed twice does not produce duplicate spans.
        seen = set()
        for ent in ents[label]:
            ent = ent.strip()
            if not ent:
                # Whitespace-only entities would match everywhere; skip them.
                continue
            length = len(ent)
            # re.escape => literal, non-overlapping matches of the entity.
            for match in re.finditer(re.escape(ent), text):
                start = match.start()
                key = (start, length)
                if key in seen:
                    continue
                seen.add(key)
                js['label'].append([start, start + length, label])
    return js

In [5]:
def write_to_file(path, text, encoding="utf-8"):
    """Write ``text`` to ``path``, replacing any existing file.

    Args:
        path: Destination file path.
        text: String content to write.
        encoding: Text encoding for the file. Defaults to UTF-8 rather than
            the platform locale, because callers write JSON produced with
            ``ensure_ascii=False`` (raw non-ASCII characters) and the inputs
            are read as UTF-8.
    """
    with open(path, "w", encoding=encoding) as f:
        f.write(text)

In [6]:
def pipeline_label(path, dest, labels=None):
    """Label one text file with the model and write the result as JSON.

    Steps: read ``path`` as UTF-8, ask ``model.get_label_list`` for the
    entities, reset the model's history, convert the entities to character
    spans with ``convert_to_jsonl``, and write the JSON-serialised result to
    ``dest`` via ``write_to_file``.

    Args:
        path: Input text file (read as UTF-8).
        dest: Output file path for the JSON record.
        labels: Label schema text passed to ``model.get_label_list``. When
            omitted, ``labels_case_dan_su`` is used (the previous hard-coded
            behaviour), so existing two-argument calls are unaffected.

    Returns:
        Always ``1``.
    """
    if labels is None:
        labels = labels_case_dan_su
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()
    result = model.get_label_list(labels, data)
    # Reset after each document, before processing the result.
    # NOTE(review): presumably keeps one file's conversation from leaking
    # into the next — confirm against the Gemini wrapper.
    model.reset_history()
    jsonl_labeled_text = convert_to_jsonl(result, data)
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    final_text = json.dumps(jsonl_labeled_text, ensure_ascii=False)
    write_to_file(dest, final_text)
    return 1

In [7]:
# Directory scanned for input .txt files by the labeling loop.
data_path = 'sample_data_160424'
# Directory that receives one output file per input, under the same file name.
data_out_path = 'last_label'

In [8]:
# Label every .txt file in data_path and write the result under the same file
# name in data_out_path. Files whose output already exists are skipped, so the
# cell can be re-run to resume after an interruption.
# Create the output directory up front; otherwise the first write fails on a
# fresh checkout where it does not exist yet.
os.makedirs(data_out_path, exist_ok=True)
# sorted(): os.listdir order is arbitrary; sorting makes the run order stable.
for f in sorted(os.listdir(data_path)):
    if not f.endswith(".txt"):
        continue
    src = os.path.join(data_path, f)
    dest = os.path.join(data_out_path, f)
    print(src)
    if not os.path.isfile(dest):
        pipeline_label(src, dest)
    else:
        print(dest, 'existed!')
    print(' ')

sample_data_160424/CASE000001.txt
PER round: 0 2759
ORG round: 0 2759
LOC round: 0 2759
TIME round: 0 2759
AMT round: 0 2759
DOC_ID round: 0 2759
PETITIONER round: 0 2759
RESPONDENT round: 0 2759
COURT round: 0 2759
STATUTE round: 0 2759
PROVISION round: 0 2759
RIGHT round: 0 2759
 
sample_data_160424/CASE000002.txt
PER round: 0 3554
ORG round: 0 3554
LOC round: 0 3554
TIME round: 0 3554
AMT round: 0 3554
DOC_ID round: 0 3554
PETITIONER round: 0 3554
RESPONDENT round: 0 3554
COURT round: 0 3554
STATUTE round: 0 3554
PROVISION round: 0 3554
RIGHT round: 0 3554
 
sample_data_160424/DOC025297.txt
PER round: 0 884
ORG round: 0 884
LOC round: 0 884
TIME round: 0 884
AMT round: 0 884
DOC_ID round: 0 884
PETITIONER round: 0 884
RESPONDENT round: 0 884
COURT round: 0 884
STATUTE round: 0 884
PROVISION round: 0 884
RIGHT round: 0 884
 
sample_data_160424/LAW000026.txt
PER round: 0 8000
round: 8001 16000
round: 16001 24000
round: 24001 32000
round: 32001 32421
ORG round: 0 8000
round: 8001 16000