In [1]:
import json
%load_ext rich
import datasets as ds
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("role-mapping.json", 'r') as f:
    role_mapping = json.loads(f.read())

In [3]:
import spacy
model = spacy.load("ru_core_news_lg")

In [4]:
def spacy_lemmatize(word):
    return next(iter(model(word))).lemma_

In [5]:
data = ds.load_dataset("Rexhaif/framebank_srl")

In [6]:
def remapper(example):
    return {
        'text_fixed': " ".join(example['tokens']) if 'tokens' in example else example['text'] if 'text' in example else example['text_fixed']
    }

In [7]:
data = data.map(remapper)

In [8]:
def filter_tagged(example):
    return any(example['srl'])

In [9]:
data = data.filter(filter_tagged)

In [10]:
def get_predicate(example):
    predicate = example['lemmas'][example['rank'].index('Предикат')]
    return predicate

def register_predicates(ds):
    predicates_count = {}
    def handle_predicate(example):
        predicate = get_predicate(example)
        if predicate not in predicates_count:
            predicates_count[predicate] = 1
        else:
            predicates_count[predicate] += 1
        return {'predicate': predicate}

    ds.map(handle_predicate)
    print(predicates_count)
    return predicates_count

In [11]:
predicates_count = register_predicates(data['train'])

{}


In [12]:
good_predicates = [pred for (pred, cnt) in predicates_count.items() if cnt >= 10 and pred in role_mapping.keys()]

In [13]:
def filter_fn(example):
    predicate = get_predicate(example)
    return predicate in good_predicates and all([role in role_mapping[predicate] for role in example['srl'] if role is not None])

In [14]:
data['train'].filter(filter_fn)


[1;35mDataset[0m[1m([0m[1m{[0m
    features: [1m[[0m[32m'doc_id'[0m, [32m'sent_idx'[0m, [32m'tokens'[0m, [32m'lemmas'[0m, [32m'rank'[0m, [32m'srl'[0m, [32m'text_fixed'[0m[1m][0m,
    num_rows: [1;36m0[0m
[1m}[0m[1m)[0m

In [15]:
def find_verbs(text):
    verbs, forms = list(), list()
    for tok in model(text):
        if tok.pos_ == "VERB":
            verbs.append(tok.lemma_)
            forms.append(tok.text)
    return verbs, forms

In [16]:
import datasets as ds
from tqdm.auto import tqdm

In [25]:
def extract_examples(ds, n_examples=5):
    examples = {}
    def pick_as_example(example):
        predicate = get_predicate(example)
        if predicate not in examples:
            examples[predicate] = []
        exclude = False
        if len(examples[predicate]) < n_examples:
            examples[predicate].append(example)
            exclude = True
        return {'exclude': exclude}

    pick_as_example(ds[0])
    new_ds = ds.map(pick_as_example)
    new_ds = new_ds.filter(lambda example: not example['exclude'])
    return new_ds, examples

In [37]:
data = data.shuffle(seed=42)
data, examples_for_predicates = extract_examples(data)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21716/21716 [00:02<00:00, 9733.71 examples/s]
Filter: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21716/21716 [00:01<00:00, 20024.29 examples/s]


In [27]:
from typing import List, Literal
from pydantic import BaseModel, Field, model_validator
from typing_extensions import Annotated

class SemanticRole(BaseModel):
    short_reasoning: Annotated[str, Field(min_length=16, max_length=64)]
    arg_role: Literal["Cause", "Experiencer", "Causator", "Deliberative", "Instrument", "Object", "Not-Applicable"]
    arg_phrase_or_clause: Annotated[str, Field(min_length=1, max_length=64)]
    arg_main_indicative_word: Annotated[str, Field(min_length=1, max_length=32)]

class SemanticRoleMarkup(BaseModel):
    roles: List[SemanticRole]
    model_config = {
        "title": "SemanticRoleMarkup",
        "description": "Semantic Role Markup"
    }

In [28]:
def make_prompt_for_example_vllm(example):
    example_predicate = get_predicate(example)
    rule_set = role_mapping[example_predicate]
    rule_set = json.dumps(rule_set, ensure_ascii=False, indent=4)
    example_set = examples_for_predicates[example_predicate]

    prompt = [
        {
            'role': 'system',
            'content': f'''
You are a native russian linguist specializing in semantic role labelling. \
You must find the verb "{example_predicate}" and its arguments in the sentence \
and determine the semantic roles of the arguments from one of the following:
```json{rule_set}```, or assign the role 'Not-Applicable' if none of the listed roles fit. '''
        }
    ]
    examples = ""
    for ex in example_set:
        examples += f"Example Text:\n{ex['text_fixed']}\n"

        semantic_roles = ""
        for (word, role) in zip(ex['lemmas'], ex['srl']):
            if role is not None:
                semantic_roles += f'{word}#{role}\n'
        examples += f"Example Semantic Roles:\n{semantic_roles}\n\n"

    inputs = f"""
Given a series of few-shot examples, please predict semantic roles in a target example.
Here are the few-shot examples:
{examples}

Here is the target sentence:
{example['text_fixed']}

Instructions:
- Do not mark semantic roles for implied, implicit or otherwise not presented arguments
- Reason out loud (concisely) before answering
- Predict both argument phrase (or clause) and a main indicative word of such phrase
- Some arguments may not have a phrase and will be represented by a single word. In this case use it as both argument phrase and main indicative word

Important: If there are no semantic roles for any argument that you can extract - reply with a ONLY ONE SINGLE argument markup that will have a role "Not-Applicable".
""".strip()

    

    prompt.append({
        'role': 'user',
        'content': inputs
    })
    return prompt

In [38]:
random_example = data[0]
prompt = make_prompt_for_example_vllm(random_example)
print(prompt[0]['content'], prompt[1]['content'])


You are a native russian linguist specializing in semantic role labelling. You must find the verb "возмущать" and its arguments in the sentence and determine the semantic roles of the arguments from one of the following:
```json[
    "говорящий - субъект психологического состояния",
    "субъект психологического состояния",
    "причина",
    "содержание высказывания"
]```, or assign the role 'Not-Applicable' if none of the listed roles fit.  Given a series of few-shot examples, please predict semantic roles in a target example.
Here are the few-shot examples:
Example Text:
Хотя бы подумали о пустой трате средств , возмущался Данилов . . .
Example Semantic Roles:
подумать#содержание высказывания
данилов#говорящий - субъект психологического состояния


Example Text:
Как обыватель я имею возможность либо возмущаться , либо , простите , глотать слюньки , читая , как из Москвы в Питер привозили « сливки » нашей интеллигенции для того , чтобы они , то есть « сливки » , могли в костюмах эпо

In [39]:
import ollama

def make_request_ollama(model, example):
    messages = make_prompt_for_example_vllm(example)
    client = ollama.Client(
      host='http://localhost:11435'
    )
    try:
        stream = client.chat(model=model, messages=messages, stream=True)

        response = ""
        for chunk in stream:
            print(chunk['message']['content'], end='', flush=True)
            response += chunk['message']['content']

        return {
            'llm-response': response
        }
    except Exception as e:
        return {
            'llm-response': f"ERROR + {e}"
        }

In [40]:
make_request_ollama('deepseek-r1:32b', random_example)

<think>
Alright, I'm trying to figure out the semantic roles in the target sentence using the examples provided. The task is to identify the verb "возмущать" and its arguments, then assign the appropriate roles from the given list.

First, let's look at the target sentence:
"Текли они от страха и отчаяния , и поэтому юнкер все время возмущался , тщетно пытаясь выдать их за слезы оскорбленной гордости ."

The verb here is "возмущался," which means "was upset" or "protested." The subject of this verb is "юнкер," a noun referring to a cadet. So, the main argument is "юнкер."

Looking at the examples provided, when the verb is "возмущать," the subject (like "Данилов," "я," "мать," etc.) is usually assigned the role of "говорящий - субъект психологического состояния." This makes sense because they are expressing their emotional state through protest or upset.

In this case, "юнкер" is the one who is upset. There's no other explicit argument in the sentence that fits into the roles like "при


[1m{[0m
    [32m'llm-response'[0m: [32m'[0m[32m<[0m[32mthink[0m[32m>\nAlright, I\'m trying to figure out the semantic roles in the target sentence using the examples provided. The task is to identify the verb "возмущать" and its arguments, then assign the appropriate roles from the given list.\n\nFirst, let\'s look at the target sentence:\n"Текли они от страха и отчаяния , и поэтому юнкер все время возмущался , тщетно пытаясь выдать их за слезы оскорбленной гордости ."\n\nThe verb here is "возмущался," which means "was upset" or "protested." The subject of this verb is "юнкер," a noun referring to a cadet. So, the main argument is "юнкер."\n\nLooking at the examples provided, when the verb is "возмущать," the subject [0m[32m([0m[32mlike "Данилов," "я," "мать," etc.[0m[32m)[0m[32m is usually assigned the role of "говорящий - субъект психологического состояния." This makes sense because they are expressing their emotional state through protest or upset.\n\nIn this cas

In [35]:
def map_to_words_info(example):
    values = zip(example['tokens'],
               example['lemmas'],
               example['rank'],
               example['srl'])
    return {'words': list(values)}

In [41]:
map_to_words_info(random_example)


[1m{[0m
    [32m'words'[0m: [1m[[0m
        [1m([0m[32m'Текли'[0m, [32m'течь'[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m'они'[0m, [32m'они'[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m'от'[0m, [32m'от'[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m'страха'[0m, [32m'страх'[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m'и'[0m, [32m'и'[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m'отчаяния'[0m, [32m'отчаяние'[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m','[0m, [3;35mNone[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m'и'[0m, [32m'и'[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m'поэтому'[0m, [32m'поэтому'[0m, [3;35mNone[0m, [3;35mNone[0m[1m)[0m,
        [1m([0m[32m'юнкер'[0m, [32m'юнкер'[0m, [32m'Субъект'[0m, [32m'говорящий - субъект психологическог

In [35]:
import time

In [36]:
t1 = time.time()
data = data.map(make_request_openai, num_proc=16)
t2 = time.time() - t1

Map (num_proc=16): 100%|██████████| 7602/7602 [09:45<00:00, 12.98 examples/s]  


In [38]:
def fix_roles_vllm(example):
    if example['llm-response'] is not None:
        roles = []
        try:
            response = json.loads(example['llm-response'])
            for item in response['roles']:
                #print(item)
                role = item['arg_role']
                content = item['arg_phrase_or_clause']
                if 'Not-Applicable' not in {role, content}:
                    roles.append({
                        'role': role.strip(),
                        'argument': content
                    })
        except Exception as e:
            print(e)
        return {'roles': roles}
    else:
        return {'roles': []}

In [39]:
def fix_roles_openai(example):
    if example['llm-response'] is not None and '- No-Roles#No-Roles' not in example['llm-response'] and "ERROR +" not in example['llm-response']:
        roles = []
        for item in example['llm-response'].split("\n"):
            role = item.split("#")[-1]
            argument = item.replace(f"#{role}", "")
            argument = argument.replace("- ", "")
            roles.append({
                'argument': argument,
                'role': role
            })
        return {'roles': roles}
    else:
        return {'roles': []}

In [40]:
data = data.map(fix_roles_openai)

Map: 100%|██████████| 7602/7602 [00:00<00:00, 28534.62 examples/s]


In [73]:
from typing import List, Dict

def create_roles_dataframe(examples: List[Dict]) -> pd.DataFrame:
    # List to store all rows
    rows = []
    
    for example in tqdm(examples):
        # Get roles list from the example
        roles = example.get('roles', [])

        # For each role in the example (except No-Roles)
        for role_dict in roles:
            if role_dict['role'] == 'No-Roles':
                continue
                
            # Create a new row with all metadata and role information
            row = {
                'group': example.get('group'),
                'global_id': example.get('global_id'),
                'date': example.get('date'),
                'text': example.get('text_fixed'),
                'predicate': example.get('predicate', [''])[0],  # Take first predicate
                'lemma': example.get('lemma', [''])[0],  # Take first lemma
                'predicate_group': example.get('predicate_group'),
                'llm_response': example.get('llm-response'),
                'has_negation': example.get('has_negation'),
                'argument': role_dict.get('argument'),
                'role': role_dict.get('role')
            }
            rows.append(row)
    
    # Create DataFrame from all rows
    df = pd.DataFrame(rows)
    
    return df

In [74]:
df = create_roles_dataframe(data['train'])
df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3489/3489 [00:00<00:00, 7282.84it/s]


In [46]:
data_frame = create_roles_dataframe(data)

  0%|          | 0/7602 [00:00<?, ?it/s]

100%|██████████| 7602/7602 [00:00<00:00, 29381.39it/s]


In [47]:
data_frame = data_frame.drop(['llm_response'], axis=1)