In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score


train_df = pd.read_csv('data/563_anti_punto_switcher/train.csv')
test_df = pd.read_csv('data/563_anti_punto_switcher/test.csv')

with open('data/563_anti_punto_switcher/valid.txt', 'r', encoding='utf-8') as f:
    valid_labels = [line.strip().strip('"') for line in f.readlines()]


layout_dict = {}
for _, row in train_df.iterrows():
    s_words = str(row['text']).split()
    l_words = str(row['label']).split()
    for sw, lw in zip(s_words, l_words):
        if sw != lw:
            layout_dict[sw] = lw
print(f"Размер собранного словаря: {len(layout_dict)}")


def apply_dict(text, mapping):
    words = str(text).split()
    fixed_words = [mapping.get(w, w) for w in words]
    return " ".join(fixed_words)


dict_preds = [apply_dict(t, layout_dict) for t in test_df['text']]
dict_acc = accuracy_score(valid_labels, dict_preds)
print(f"Accuracy (только словарь): {dict_acc:.4f}")

Размер собранного словаря: 3973
Accuracy (только словарь): 0.8403


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset

MODEL_NAME = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def prepare_token_data(df):
    data = []
    for _, row in df.iterrows():
        text = str(row['text'])
        label_text = str(row['label'])
        
        tokenized = tokenizer(text, truncation=True, max_length=128, return_offsets_mapping=True)

        s_words = text.split()
        l_words = label_text.split()
        
        char_labels = np.zeros(len(text))
        curr = 0
        for sw, lw in zip(s_words, l_words):
            start = text.find(sw, curr)
            end = start + len(sw)
            if sw != lw:
                char_labels[start:end] = 1
            curr = end
            
        token_labels = []
        for start, end in tokenized['offset_mapping']:
            if start == end: 
                token_labels.append(-100)
            else:
                token_labels.append(int(char_labels[start:end].max()))
        
        data.append({
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "labels": token_labels
        })
    return data


train_dataset = Dataset.from_list(prepare_token_data(train_df))
data_collator = DataCollatorForTokenClassification(tokenizer)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2)

args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    num_train_epochs=3,
    learning_rate=3e-5,
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.1471
200,0.0335
300,0.0242
400,0.0209
500,0.0166
600,0.0116
700,0.0091
800,0.0087
900,0.009
1000,0.0066


TrainOutput(global_step=1779, training_loss=0.01848309774824982, metrics={'train_runtime': 31.6359, 'train_samples_per_second': 1797.959, 'train_steps_per_second': 56.234, 'total_flos': 24937245504576.0, 'train_loss': 0.01848309774824982, 'epoch': 3.0})

In [4]:
def switch_layout(text):
    en = "qwertyuiop[]asdfghjkl;'zxcvbnm,./`QWERTYUIOP{}ASDFGHJKL:\"ZXCVBNM<>?~"
    ru = "йцукенгшщзхъфывапролджэячсмитьбю.ёЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ,Ё"

    to_ru = str.maketrans(en, ru)
    to_en = str.maketrans(ru, en)

    if any(c in en for c in text):
        return text.translate(to_ru)
    return text.translate(to_en)

def bert_fix_sentence(text, model, tokenizer):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(model.device)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    preds = torch.argmax(logits, dim=2)[0].cpu().numpy()
    offsets = tokenizer(text, return_offsets_mapping=True)["offset_mapping"]
    
    words = text.split()
    fixed_sentence = []
    curr_idx = 0
    
    for word in words:
        start = text.find(word, curr_idx)
        end = start + len(word)
        curr_idx = end

        should_flip = False
        for i, (o_start, o_end) in enumerate(offsets):
            if i < len(preds) and preds[i] == 1:
                if max(start, o_start) < min(end, o_end):
                    should_flip = True
                    break
        
        fixed_sentence.append(switch_layout(word) if should_flip else word)
        
    return " ".join(fixed_sentence)


bert_preds = [bert_fix_sentence(t, model, tokenizer) for t in test_df['text']]
bert_acc = accuracy_score(valid_labels, bert_preds)

In [5]:
def hybrid_fix(text, mapping, model, tokenizer):
    dict_fixed_text = apply_dict(text, mapping)
    final_fixed_text = bert_fix_sentence(dict_fixed_text, model, tokenizer)
    return final_fixed_text

hybrid_preds = [hybrid_fix(t, layout_dict, model, tokenizer) for t in test_df['text']]
hybrid_acc = accuracy_score(valid_labels, hybrid_preds)
print(f"Accuracy (Словарь + BERT): {hybrid_acc:.4f}")

print("\nСводка по Accuracy:")
print(f"Только словарь: {dict_acc:.4f}")
print(f"Только BERT:    {bert_acc:.4f}")
print(f"Гибрид:         {hybrid_acc:.4f}")

Accuracy (Словарь + BERT): 0.9909

Сводка по Accuracy:
Только словарь: 0.8403
Только BERT:    0.9884
Гибрид:         0.9909


In [11]:
with open('data/563_anti_punto_switcher/result.txt', 'w', encoding='utf-8') as f: 
    f.write('\n'.join([f'"{text}"' for text in hybrid_preds]))