### Imports


In [1]:
import numpy as np
from datasets import Dataset
from datasets import DatasetDict
from datasets import load_dataset
from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

2025-10-07 15:04:24.082478: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759849464.279628      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759849464.337438      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Dataset

In [2]:

dataset = load_dataset("/kaggle/input/true-finred")  
print(dataset)
print(dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 11400
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 2136
    })
})
{'input': 'NEW YORK (Reuters) - Apple Inc Chief Executive Steve Jobs sought to soothe investor concerns about his health on Monday, saying his weight loss was caused by a hormone imbalance that is relatively simple to treat.', 'output': 'founded_by: Apple Inc, Steve Jobs; chief_executive_officer: Apple Inc, Steve Jobs', 'instruction': 'Given phrases that describe the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be "relation1: word1, word2; relation2: word3, word4". Options: product/material produced, manufacturer, distributed by, industry, position held, original broadcaster, owned by, founded by, distribution format, headquarte

In [3]:
relation_list = [
    "no_relation",  
    "product_or_material_produced",
    "manufacturer",
    "distributed_by",
    "industry",
    "position_held",
    "original_broadcaster",
    "owned_by",
    "founded_by",
    "distribution_format",
    "headquarters_location",
    "stock_exchange",
    "currency",
    "parent_organization",
    "chief_executive_officer",
    "director_manager",
    "owner_of",
    "operator",
    "member_of",
    "employer",
    "chairperson",
    "platform",
    "subsidiary",
    "legal_form",
    "publisher",
    "developer",
    "brand",
    "business_division",
    "location_of_formation",
    "creator"
]
label_to_id = {rel: i for i, rel in enumerate(relation_list)}
id_to_label = {i: rel for rel, i in label_to_id.items()}
num_labels = len(relation_list)

print("Relations:", relation_list)

Relations: ['no_relation', 'product_or_material_produced', 'manufacturer', 'distributed_by', 'industry', 'position_held', 'original_broadcaster', 'owned_by', 'founded_by', 'distribution_format', 'headquarters_location', 'stock_exchange', 'currency', 'parent_organization', 'chief_executive_officer', 'director_manager', 'owner_of', 'operator', 'member_of', 'employer', 'chairperson', 'platform', 'subsidiary', 'legal_form', 'publisher', 'developer', 'brand', 'business_division', 'location_of_formation', 'creator']


In [4]:
def parse_output(output_str):
    relations = []
    parts = output_str.split(';')
    for part in parts:
        part = part.strip()
        if not part:
            continue
        if ':' in part:
            rel, pair = part.split(':', 1)
            rel = rel.strip().replace('/', '_').replace(' ', '_').lower()  
            pair_parts = pair.split(',', 1)
            if len(pair_parts) == 2:
                e1 = pair_parts[0].strip()
                e2 = pair_parts[1].strip()
                if e1 and e2:
                    relations.append((e1, e2, rel))
    return relations


re_data = {'text': [], 'entity1': [], 'entity2': [], 'label': []}
for ex in dataset['train']:
    output_str = ex['output']
    if ':' not in output_str:  
        continue
    text = ex['input']
    positives = parse_output(output_str)
    if not positives:
        continue
    
 
    entities = set()
    pos_dict = {}
    for e1, e2, rel in positives:
        if rel not in label_to_id:
            continue  
        entities.add(e1)
        entities.add(e2)
        pos_dict[(e1, e2)] = rel
    
    entities = list(entities)
    num_entities = len(entities)
    if num_entities < 2:
        continue
    
    for i in range(num_entities):
        for j in range(num_entities):
            if i == j:
                continue
            e1 = entities[i]
            e2 = entities[j]
            if (e1, e2) in pos_dict:
                label = label_to_id[pos_dict[(e1, e2)]]
            else:
                label = label_to_id['no_relation']
            re_data['text'].append(text)
            re_data['entity1'].append(e1)
            re_data['entity2'].append(e2)
            re_data['label'].append(label)
re_dataset = Dataset.from_dict(re_data).train_test_split(test_size=0.1)  
print(re_dataset)
print(re_dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'entity1', 'entity2', 'label'],
        num_rows: 30733
    })
    test: Dataset({
        features: ['text', 'entity1', 'entity2', 'label'],
        num_rows: 3415
    })
})
{'text': 'In Europe, the Euro Stoxx 50, which tracks large-cap companies in the eurozone plunged 2.84%.', 'entity1': 'eurozone', 'entity2': 'Euro', 'label': 12}


In [5]:
re_model_name = "microsoft/deberta-v3-small" 
re_tokenizer = AutoTokenizer.from_pretrained(re_model_name)

def tokenize_re(examples):

    inputs = [e1 + " " + re_tokenizer.sep_token + " " + e2 + " " + re_tokenizer.sep_token + " " + t 
              for e1, e2, t in zip(examples['entity1'], examples['entity2'], examples['text'])]
    tokenized = re_tokenizer(inputs, truncation=True, max_length=512)
    tokenized['labels'] = examples['label']
    return tokenized

tokenized_re = re_dataset.map(tokenize_re, batched=True)

re_model = AutoModelForSequenceClassification.from_pretrained(
    re_model_name, 
    num_labels=num_labels, 
    id2label=id_to_label, 
    label2id=label_to_id
)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/30733 [00:00<?, ? examples/s]

Map:   0%|          | 0/3415 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import torch
torch.cuda.empty_cache()


In [7]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    return {"precision": precision, "recall": recall, "f1": f1}

re_training_args = TrainingArguments(
    output_dir="./re_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./re_logs",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    disable_tqdm=False,
    report_to="none"
)

re_trainer = Trainer(
    model=re_model,
    args=re_training_args,
    train_dataset=tokenized_re["train"],
    eval_dataset=tokenized_re["test"],  
    tokenizer=re_tokenizer,
    compute_metrics=compute_metrics
)

print("Starting RE training...")
re_trainer.train()
print("RE training done.")

  re_trainer = Trainer(


Starting RE training...




Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.801,0.719643,0.7144,0.792094,0.741715
2,0.4215,0.423819,0.875976,0.869107,0.861353
3,0.2562,0.300124,0.920291,0.916837,0.915995
4,0.1884,0.247818,0.937115,0.934407,0.934346
5,0.1794,0.225217,0.94559,0.943777,0.943553




RE training done.


In [8]:
re_trainer.save_model("/kaggle/working/models/re_model")
re_tokenizer.save_pretrained("/kaggle/working/models/re_model")

('/kaggle/working/models/re_model/tokenizer_config.json',
 '/kaggle/working/models/re_model/special_tokens_map.json',
 '/kaggle/working/models/re_model/spm.model',
 '/kaggle/working/models/re_model/added_tokens.json',
 '/kaggle/working/models/re_model/tokenizer.json')

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_path = "/kaggle/working/models/re_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

re_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

text = "Donald Trump found Microsoft."
entity1 = "Donald Trump"
entity2 = "Russia"

input_text = entity1 + " " + tokenizer.sep_token + " " + entity2 + " " + tokenizer.sep_token + " " + text

re_results = re_pipeline(input_text)

for result in re_results:
    print(f"Label: {result['label']}, Score: {result['score']:.4f}")

Device set to use cuda:0


Label: no_relation, Score: 0.9456
