In [1]:
import json
%load_ext rich
import datasets as ds
from rich import print
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the three JSON resources used throughout the notebook:
#   examples     - list of few-shot example dicts (read below via 'group', 'text', 'roles')
#   role_mapping - predicate group -> rule set embedded into the system prompt
#   form_mapping - predicate group -> list of word forms belonging to that group
# The encoding is given explicitly so that non-ASCII (Russian) content is decoded
# the same way on every platform instead of depending on the locale default.
# NOTE(review): assumes the files are UTF-8 encoded (the JSON default) - confirm.
with open('groupped_examples.json', 'r', encoding='utf-8') as f:
    examples = json.load(f)

with open("role-mapping.json", 'r', encoding='utf-8') as f:
    role_mapping = json.load(f)

with open("form-mapping.json", 'r', encoding='utf-8') as f:
    form_mapping = json.load(f)

In [3]:
import spacy
# Load the spaCy pipeline "ru_core_news_lg" once; `model` is reused by later cells
# for lemmas (`lemma_`), part-of-speech tags (`pos_`) and sentence splitting (`sents`).
model = spacy.load("ru_core_news_lg")

In [4]:
def spacy_lemmatize(word):
    """Return the lemma of the first token the spaCy pipeline yields for ``word``.

    Only the first token is consulted, so any further tokens in ``word`` are
    ignored. If the pipeline yields no tokens at all, ``StopIteration``
    propagates to the caller.
    """
    tokens = iter(model(word))
    first_token = next(tokens)
    return first_token.lemma_

In [5]:
# Invert form_mapping: lemma of every listed word form -> the group that lists it.
# When two groups produce the same lemma, the group visited later wins.
inv_form_mapping = {}
for group_name, group_forms in form_mapping.items():
    for form_lemma in map(spacy_lemmatize, group_forms):
        inv_form_mapping[form_lemma] = group_name

In [6]:
# Bucket the few-shot examples by their 'group' field:
# group -> list of example dicts, in their original order.
inv_examples_mapping = {}
for ex in examples:
    inv_examples_mapping.setdefault(ex['group'], []).append(ex)

In [None]:
# Read the comments CSV into a `datasets.Dataset`; every later cell rebinds `data`
# to a progressively filtered / enriched version of it.
data = ds.Dataset.from_csv("./data/KP_robotics_comments.csv")

In [9]:
def remapper(example):
    """Produce the ``text_fixed`` value for one record.

    The source is picked by priority:
      1. ``tokens``     - joined with single spaces,
      2. ``text``       - used as is,
      3. ``text_fixed`` - used as is (``KeyError`` if it is missing too).

    Returns a dict with the single key ``'text_fixed'``.
    """
    if 'tokens' in example:
        fixed = " ".join(example['tokens'])
    elif 'text' in example:
        fixed = example['text']
    else:
        fixed = example['text_fixed']
    return {'text_fixed': fixed}

In [10]:
# Apply `remapper` to every row so each record carries a `text_fixed` value.
data = data.map(remapper)

In [11]:
def find_verbs(text):
    """Collect the verbs spaCy finds in ``text``.

    Returns a tuple ``(lemmas, forms)`` of two parallel lists: the lemma and the
    surface text of every token tagged ``"VERB"``, in document order.
    """
    verb_tokens = [token for token in model(text) if token.pos_ == "VERB"]
    lemmas = [token.lemma_ for token in verb_tokens]
    surface_forms = [token.text for token in verb_tokens]
    return lemmas, surface_forms

In [12]:
# Set of lemmas that identify a target predicate (the keys of inv_form_mapping).
target_lemmas = set(inv_form_mapping)

In [13]:
import datasets as ds
from tqdm.auto import tqdm

In [14]:
def filter_fn(example):
    """Return True when ``example['text_fixed']`` contains a target verb.

    A record passes if at least one verb lemma reported by ``find_verbs``
    is a member of ``target_lemmas``.
    """
    verb_lemmas = find_verbs(example['text_fixed'])[0]
    unique_lemmas = set(verb_lemmas)
    return any(lemma in target_lemmas for lemma in unique_lemmas)

In [None]:
# Drop rows whose `text_fixed` is None. The check is `is not None`,
# so empty strings are NOT removed here.
data = data.filter(lambda x: x['text_fixed'] is not None)

In [None]:
# Keep only examples with target verbs: `filter_fn` is evaluated per row
# (batched=False) across 16 worker processes.
data = data.filter(filter_fn, num_proc=16, batched=False)

In [17]:
# Explode every record into one record per sentence, using spaCy's sentence
# boundaries. All other fields are copied unchanged onto each sentence-level
# record and `text_fixed` is replaced by the sentence text.
data_list = data.to_list()
new_data = []
for record in tqdm(data_list):
    shared_fields = {key: value for key, value in record.items() if key != "text_fixed"}
    full_text = record["text_fixed"]
    new_data.extend(
        {**shared_fields, 'text_fixed': sentence.text}
        for sentence in model(full_text).sents
    )
print(f"{len(data)} => {len(new_data)}")

100%|██████████| 6984/6984 [02:11<00:00, 53.06it/s] 


In [18]:
# filter again to keep only sentences with target verbs
# (after splitting, many sentences of a matching comment no longer contain one)
data = ds.Dataset.from_list(new_data)
data = data.filter(filter_fn, num_proc=16, batched=False)
# Last expression of the cell: displays the number of sentences kept.
len(data)

Filter (num_proc=16): 100%|██████████| 50842/50842 [00:29<00:00, 1711.51 examples/s]


[1;36m7602[0m

In [20]:
def predicate_extractor(text):
    """Extract the target predicates that occur in ``text``.

    Runs ``find_verbs`` on the text and keeps only the verbs whose lemma is a
    key of ``inv_form_mapping``.

    Returns:
        dict with two parallel lists:
          ``'predicate'`` - surface forms of the matching verbs,
          ``'lemma'``     - their lemmas.
        Both lists are empty when nothing matches.
    """
    lemmas, forms = find_verbs(text)
    # Test membership directly on the dict (O(1)) rather than building a fresh
    # set of its keys for every verb inside the loop.
    matches = [
        (lemma, form)
        for lemma, form in zip(lemmas, forms)
        if lemma in inv_form_mapping
    ]
    return {
        'predicate': [form for _, form in matches],
        'lemma': [lemma for lemma, _ in matches]
    }

In [21]:
def predicate_extractor_fn(example):
    """Row-level adapter: run ``predicate_extractor`` on the record's ``text_fixed``."""
    sentence = example['text_fixed']
    return predicate_extractor(sentence)

In [22]:
# extract predicates to identify relevant few-shot examples
# (`predicate_extractor_fn` returns the 'predicate' and 'lemma' lists for each row)
data = data.map(predicate_extractor_fn)

Map: 100%|██████████| 7602/7602 [00:35<00:00, 214.19 examples/s]


In [23]:
def map_to_pred_group(example):
    """Resolve the predicate group of a record from its ``lemma`` list.

    Every lemma that is a key of ``inv_form_mapping`` is looked up; when several
    match, the group of the LAST matching lemma is used. The result is ``None``
    when no lemma matches.

    Returns a dict with the single key ``'predicate_group'``.
    """
    matched_groups = [
        inv_form_mapping[lemma]
        for lemma in example['lemma']
        if lemma in inv_form_mapping
    ]
    resolved = matched_groups[-1] if matched_groups else None
    return {'predicate_group': resolved}

In [24]:
# Attach `predicate_group` to every row; it later selects the rule set and the
# few-shot examples used to build the prompt.
data = data.map(map_to_pred_group)

Map:   0%|          | 0/7602 [00:00<?, ? examples/s]

Map: 100%|██████████| 7602/7602 [00:00<00:00, 48752.11 examples/s]


In [19]:
from typing import List, Literal
from pydantic import BaseModel, Field, model_validator
from typing_extensions import Annotated

# Schema of one predicted argument of the target predicate.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose -
# pydantic is understood to copy a model's docstring into the generated JSON schema
# ("description"), which would change the schema passed as `guided_json`; confirm
# before converting these comments into a docstring.
class SemanticRole(BaseModel):
    # Short free-text justification, 16-64 characters; declared before the role label.
    short_reasoning: Annotated[str, Field(min_length=16, max_length=64)]
    # Role label, restricted to these literals; "Not-Applicable" is the value the vLLM
    # prompt asks for when no role can be extracted.
    arg_role: Literal["Cause", "Experiencer", "Causator", "Deliberative", "Instrument", "Object", "Not-Applicable"]
    # The argument as a phrase or clause, 1-64 characters.
    arg_phrase_or_clause: Annotated[str, Field(min_length=1, max_length=64)]
    # The main indicative word of that phrase, 1-32 characters.
    arg_main_indicative_word: Annotated[str, Field(min_length=1, max_length=32)]

# Top-level response schema: the complete list of predicted roles for one target
# sentence. Its JSON schema is what `make_request_vllm` sends as `guided_json`.
class SemanticRoleMarkup(BaseModel):
    # All predicted arguments, one SemanticRole per entry.
    roles: List[SemanticRole]
    # Explicit title / description for the model configuration.
    model_config = {
        "title": "SemanticRoleMarkup",
        "description": "Semantic Role Markup"
    }

In [20]:
def make_prompt_for_example_vllm(example):
    """Build the two-message chat prompt used for the vLLM-served model.

    The system message embeds the rule set of the record's predicate group
    (pretty-printed JSON). The single user message contains all few-shot
    examples of that group rendered as plain text, followed by the target
    sentence (``example['text_fixed']``) and the answering instructions.
    Few-shot role entries whose ``entity`` contains ``'#predicate'`` are omitted.

    Args:
        example: record with ``'predicate_group'`` and ``'text_fixed'``.

    Returns:
        ``[system_message, user_message]`` - a list of two role/content dicts.

    Raises:
        KeyError: if the predicate group is missing from ``role_mapping`` or
            ``inv_examples_mapping``.
    """
    group = example['predicate_group']
    rules_json = json.dumps(role_mapping[group], ensure_ascii=False, indent=4)

    system_message = {
        'role': 'system',
        'content': f'You are native russian linguist specializing in semantic role labelling.\nYou must follow those rules in your work:\n\n```json{rules_json}```'
    }

    # Render each few-shot example as "Example Text" / "Example Semantic Roles".
    rendered_parts = []
    for shot in inv_examples_mapping[group]:
        entities = [role['entity'] for role in shot['roles'] if '#predicate' not in role['entity']]
        shot_text = shot['text']
        bullet_list = "".join(f"- {entity}\n" for entity in entities)
        rendered_parts.append(f"Example Text: {shot_text}\n")
        rendered_parts.append(f"Example Semantic Roles: {bullet_list}\n\n")
    few_shot_block = "".join(rendered_parts)

    user_content = f"""
Given a series of few-shot examples, please predict semantic roles in a target example.
Here are the few-shot examples:
{few_shot_block}

Here is the target example:
{example['text_fixed']}

Instructions:
- Do not mark semantic roles for implied, implicit or otherwise not presented arguments
- Reason out loud (concisely) before answering
- Predict both argument phrase (or clause) and a main indicative word of such phrase
- Some arguments may not have a phrase and will be represented by a single word. In this case use it as both argument phrase and main indicative word

Important: If there are no semantic roles for any argument that you can extract - reply with a ONLY ONE SINGLE argument markup that will have a role "Not-Applicable".
""".strip()

    user_message = {
        'role': 'user',
        'content': user_content
    }
    return [system_message, user_message]

In [21]:
def make_prompt_openai_for_example(example):
    """Build the multi-turn chat prompt used for the OpenAI-style model.

    Layout of the returned message list:
      * one system message embedding the rule set of the record's predicate
        group (pretty-printed JSON) and the ``No-Roles`` reply convention;
      * for every few-shot example of that group, a user message with its text
        followed by an assistant message listing its roles as ``- entity``
        lines (entries containing ``'#predicate'`` are omitted);
      * a final user message with the target sentence ``example['text_fixed']``.

    Raises:
        KeyError: if the predicate group is missing from ``role_mapping`` or
            ``inv_examples_mapping``.
    """
    group = example['predicate_group']
    rules_json = json.dumps(role_mapping[group], ensure_ascii=False, indent=4)

    messages = [{
        'role': 'system',
        'content': f'You are native russian linguist specializing in semantic role labelling.If there are no roles: reply with - No-Roles#No-Roles.\nYou must follow those rules in your work:\n\n```json{rules_json}```'
    }]

    # Each few-shot example becomes one user turn plus the expected assistant turn.
    for shot in inv_examples_mapping[group]:
        entities = [role['entity'] for role in shot['roles'] if '#predicate' not in role['entity']]
        messages.extend([
            {'role': 'user', 'content': shot['text']},
            {'role': 'assistant', 'content': "".join(f"- {entity}\n" for entity in entities)},
        ])

    messages.append({'role': 'user', 'content': example['text_fixed']})
    return messages

In [30]:
from openai import OpenAI

In [22]:
def make_request_openai(example):
    """Label one record with the OpenAI-compatible chat endpoint.

    Credentials are read from the environment instead of being written into the
    notebook:
      * ``OPENAI_API_KEY``  - API key (empty string when unset, so construction
        of the client itself does not fail and the error surfaces per request);
      * ``OPENAI_BASE_URL`` - endpoint URL; when unset, the client's own default
        is used.
    NOTE(review): the model id 'openai/gpt-4o' suggests a routing proxy rather
    than api.openai.com - set OPENAI_BASE_URL accordingly; confirm.

    A client is created per call, so the function can be used with
    ``Dataset.map(..., num_proc=N)``.

    Args:
        example: record with ``'predicate_group'`` and ``'text_fixed'``.

    Returns:
        ``{'llm-response': <model reply>}``; if the request raises, the value is
        the string ``"ERROR + <exception>"`` so one failed row does not abort
        the whole run.
    """
    import os  # local import: keeps this cell runnable on its own

    messages = make_prompt_openai_for_example(example)
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY", ""),
        base_url=os.environ.get("OPENAI_BASE_URL") or None
    )
    try:
        response = client.chat.completions.create(
            model='openai/gpt-4o', messages=messages, max_tokens=128, temperature=0.0,
        )
        return {
            'llm-response': response.choices[0].message.content
        }
    except Exception as e:
        # Deliberate best-effort: record the failure in the row instead of raising.
        return {
            'llm-response': f"ERROR + {e}"
        }

In [23]:
def make_request_vllm(example):
    """Label one record with the locally served model behind an OpenAI-style API.

    The request goes to ``http://localhost:8000/v1`` and passes the JSON schema
    of ``SemanticRoleMarkup`` as ``guided_json`` in the extra request body.

    Args:
        example: record with ``'predicate_group'`` and ``'text_fixed'``.

    Returns:
        ``{'llm-response': <model reply>}``; if anything inside the request
        raises, the value is the string ``"ERROR + <exception>"``.
    """
    chat_messages = make_prompt_for_example_vllm(example)
    local_client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="token-abc123"
    )
    try:
        completion = local_client.chat.completions.create(
            model='t-tech/T-lite-it-1.0',
            messages=chat_messages,
            max_completion_tokens=1024,
            temperature=0.0,
            extra_body={"guided_json": SemanticRoleMarkup.model_json_schema()},
        )
        reply = completion.choices[0].message.content
    except Exception as err:
        # Best-effort: keep the failure text in the row rather than raising.
        reply = f"ERROR + {err}"
    return {'llm-response': reply}

In [34]:
# Smoke test: push one hand-written record through the OpenAI request path and
# print the raw reply (or the "ERROR + ..." string) before launching the full run.
print(make_request_openai({'text_fixed': "Вася боиться за брата", "predicate_group": "бояться"})['llm-response'])

In [35]:
import time

In [36]:
# Label the whole dataset with 16 worker processes and time the run.
t1 = time.time()
data = data.map(make_request_openai, num_proc=16)
# NOTE: despite its name, `t2` is the elapsed wall-clock time in seconds, not a timestamp.
t2 = time.time() - t1

Map (num_proc=16): 100%|██████████| 7602/7602 [09:45<00:00, 12.98 examples/s]  


In [38]:
def fix_roles_vllm(example):
    """Parse the JSON reply of the vLLM model into a list of role dicts.

    The reply is expected to be a JSON object with a ``'roles'`` list whose
    items carry ``'arg_role'`` and ``'arg_phrase_or_clause'``. Items where
    either value equals ``'Not-Applicable'`` are dropped; the role label is
    whitespace-stripped.

    Parsing is best-effort: on any exception the error is printed and the roles
    collected so far are returned.

    Returns:
        ``{'roles': [{'role': ..., 'argument': ...}, ...]}`` - an empty list
        when the reply is ``None`` or cannot be parsed.
    """
    raw_reply = example['llm-response']
    if raw_reply is None:
        return {'roles': []}

    collected = []
    try:
        for entry in json.loads(raw_reply)['roles']:
            label = entry['arg_role']
            phrase = entry['arg_phrase_or_clause']
            if 'Not-Applicable' in {label, phrase}:
                continue
            collected.append({
                'role': label.strip(),
                'argument': phrase
            })
    except Exception as e:
        print(e)
    return {'roles': collected}

In [39]:
def fix_roles_openai(example):
    """Parse the line-based reply of the OpenAI model into a list of role dicts.

    Each line is expected to look like ``- <argument>#<role>``. The reply is
    treated as "no roles" when it is ``None``, contains the
    ``- No-Roles#No-Roles`` marker, or is an ``"ERROR + ..."`` string.

    Parsing rules:
      * blank lines (e.g. from a trailing newline) are ignored;
      * only the leading ``"- "`` bullet is removed, so a ``"- "`` inside the
        argument is preserved;
      * the role is the text after the LAST ``#``; everything before it is the
        argument;
      * lines without a ``#`` separator or with an empty role cannot be parsed
        and are skipped (this can happen when the reply is cut off by the
        token limit);
      * argument and role are whitespace-stripped.

    Returns:
        ``{'roles': [{'argument': ..., 'role': ...}, ...]}``.
    """
    response = example['llm-response']
    if response is None or '- No-Roles#No-Roles' in response or "ERROR +" in response:
        return {'roles': []}

    roles = []
    for line in response.splitlines():
        line = line.strip()
        if line.startswith("- "):
            line = line[2:]
        argument, separator, role = line.rpartition("#")
        role = role.strip()
        if not separator or not role:
            # Blank or malformed line: nothing usable to record.
            continue
        roles.append({
            'argument': argument.strip(),
            'role': role
        })
    return {'roles': roles}

In [40]:
# Parse every raw `llm-response` into a structured `roles` list
# (OpenAI reply format: one "- argument#role" line per role).
data = data.map(fix_roles_openai)

Map: 100%|██████████| 7602/7602 [00:00<00:00, 28534.62 examples/s]


In [41]:
# Spot-check one processed record (index 1): raw reply next to the parsed roles.
data[1]


[1m{[0m
    [32m'global_id'[0m: [32m'-170528132_4268704_4268822_vk'[0m,
    [32m'text'[0m: [32m'Почти как собачка! А боится машин дёргается видно страшно!'[0m,
    [32m'text_fixed'[0m: [32m'А боится машин дёргается видно страшно!'[0m,
    [32m'predicate'[0m: [1m[[0m[32m'боится'[0m[1m][0m,
    [32m'lemma'[0m: [1m[[0m[32m'бояться'[0m[1m][0m,
    [32m'predicate_group'[0m: [32m'бояться'[0m,
    [32m'llm-response'[0m: [32m'- [0m[32m[[0m[32mбоится машин[0m[32m][0m[32m@[0m[32m[[0m[32mмашин[0m[32m][0m[32m#Causator'[0m,
    [32m'roles'[0m: [1m[[0m[1m{[0m[32m'argument'[0m: [32m'[0m[32m[[0m[32mбоится машин[0m[32m][0m[32m@[0m[32m[[0m[32mмашин[0m[32m][0m[32m'[0m, [32m'role'[0m: [32m'Causator'[0m[1m}[0m[1m][0m
[1m}[0m

In [50]:
from typing import List, Dict

def create_roles_dataframe(examples: List[Dict]) -> pd.DataFrame:
    """Flatten labelled records into a DataFrame with one row per predicted role.

    Args:
        examples: iterable of record dicts. Each may carry a ``'roles'`` list of
            ``{'argument': ..., 'role': ...}`` dicts plus metadata fields.
            Missing metadata fields become ``None``.

    Returns:
        DataFrame with the columns ``group, global_id, date, text, predicate,
        lemma, predicate_group, llm_response, has_negation, argument, role``.
        The columns are present even when no rows are produced, so later
        column-based operations keep working on empty input.

    Notes:
        * Roles labelled ``'No-Roles'`` are skipped.
        * Only the FIRST predicate / lemma of a record is kept; a missing or
          empty list yields ``''``.
    """
    columns = [
        'group', 'global_id', 'date', 'text', 'predicate', 'lemma',
        'predicate_group', 'llm_response', 'has_negation', 'argument', 'role',
    ]
    rows = []

    for example in tqdm(examples):
        # `or` covers a missing key as well as a present-but-empty list / None,
        # either of which would otherwise break the `[0]` lookup.
        predicates = example.get('predicate') or ['']
        lemmas = example.get('lemma') or ['']

        for role_dict in example.get('roles') or []:
            if role_dict.get('role') == 'No-Roles':
                continue

            # One output row per role, carrying the record's metadata.
            rows.append({
                'group': example.get('group'),
                'global_id': example.get('global_id'),
                'date': example.get('date'),
                'text': example.get('text_fixed'),
                'predicate': predicates[0],
                'lemma': lemmas[0],
                'predicate_group': example.get('predicate_group'),
                'llm_response': example.get('llm-response'),
                'has_negation': example.get('has_negation'),
                'argument': role_dict.get('argument'),
                'role': role_dict.get('role')
            })

    # Passing `columns` fixes the column order and keeps the schema for empty input.
    return pd.DataFrame(rows, columns=columns)

In [46]:
# Flatten the labelled dataset into one DataFrame row per predicted role.
data_frame = create_roles_dataframe(data)

  0%|          | 0/7602 [00:00<?, ?it/s]

100%|██████████| 7602/7602 [00:00<00:00, 29381.39it/s]


In [47]:
# Drop the raw model reply from the final table. `errors='ignore'` keeps this cell
# idempotent: it rebinds `data_frame`, so a second run would otherwise raise KeyError
# because the column is already gone.
data_frame = data_frame.drop(columns=['llm_response'], errors='ignore')