In [1]:
import json
import torch
import numpy as np
import re
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split


def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed()

MODEL_NAME = "DeepPavlov/rubert-base-cased"
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

VOWELS = set("аеёиоуыэюяАЕЁИОУЫЭЮЯ")

Using device: cuda


In [2]:
def get_vowel_indices(word):
    return [i for i, char in enumerate(word) if char in VOWELS]

def get_stress_label(source_word, target_word):
    vowels_indices = get_vowel_indices(source_word)
    
    stressed_index = -1
    for i, char in enumerate(target_word):
        if char.isupper() and char in VOWELS:
            stressed_index = i
            break
    
    if stressed_index == -1:
        return 0

    try:
        label = vowels_indices.index(stressed_index)
        return label
    except ValueError:
        return 0

def preprocess_data(path, is_train=True):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    processed = []
    
    for item in data:
        source_text = item['source']
        match = re.search(r'\b[А-ЯЁ]{2,}\b', source_text)
        
        if match:
            target_word_caps = match.group(0)
            start, end = match.span()
            text_with_marker = source_text[:start] + f"[SEP] {target_word_caps} [SEP]" + source_text[end:]
            
            entry = {
                'text': text_with_marker,
                'original_word': target_word_caps,
                'original_text': source_text
            }
            
            if is_train:
                target_word_stressed = item['target']
                words_source = source_text.split()
                words_target = target_word_stressed.split()

                target_token_stress = ""
                for ws, wt in zip(words_source, words_target):
                    if ws == target_word_caps:
                        target_token_stress = wt
                        break
                        
                if not target_token_stress:
                     pattern = re.compile(target_word_caps, re.IGNORECASE)
                     found = pattern.search(target_word_stressed)
                     if found:
                         target_token_stress = target_word_stressed[found.start():found.end()]

                label = get_stress_label(target_word_caps, target_token_stress)
                entry['label'] = label
            
            processed.append(entry)
            
    return processed

train_data_raw = preprocess_data('data/373_homographs/train.json', is_train=True)
test_data_raw = preprocess_data('data/373_homographs/test.json', is_train=False)

print(f"Train samples: {len(train_data_raw)}")
print(f"Test samples: {len(test_data_raw)}")
print(f"Example train: {train_data_raw[0]}")

Train samples: 140954
Test samples: 17076
Example train: {'text': 'титул [SEP] АВГУСТА [SEP] носили римские императоры вплоть до диоклетиана .', 'original_word': 'АВГУСТА', 'original_text': 'титул АВГУСТА носили римские императоры вплоть до диоклетиана .', 'label': 0}


In [3]:
class StressDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item['text']
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        inputs = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        
        if 'label' in item:
            inputs['labels'] = torch.tensor(item['label'], dtype=torch.long)
            
        return inputs

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

train_split, val_split = train_test_split(train_data_raw, test_size=0.1, random_state=42)

train_dataset = StressDataset(train_split, tokenizer, MAX_LEN)
val_dataset = StressDataset(val_split, tokenizer, MAX_LEN)
test_dataset = StressDataset(test_data_raw, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [5]:
NUM_LABELS = 10 

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, data_loader, optimizer, scheduler):
    model.train()
    losses = []
    correct_predictions = 0
    
    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        logits = outputs.logits
        
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(model, data_loader):
    model.eval()
    correct_predictions = 0
    
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            
    return correct_predictions.double() / len(data_loader.dataset)

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, scheduler)
    print(f"Train loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")
    
    val_acc = eval_model(model, val_loader)
    print(f"Val Accuracy: {val_acc:.4f}")
    print("-" * 20)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1


Training:   0%|          | 0/1983 [00:00<?, ?it/s]

Train loss: 0.6450, Accuracy: 0.7014
Val Accuracy: 0.8361
--------------------


In [8]:
def apply_stress(word, vowel_idx):
    word_lower = word.lower()
    vowel_positions = get_vowel_indices(word_lower)

    if vowel_idx >= len(vowel_positions):
        idx_to_stress = vowel_positions[-1]
    else:
        idx_to_stress = vowel_positions[vowel_idx]
        
    chars = list(word_lower)
    chars[idx_to_stress] = chars[idx_to_stress].upper()
    return "".join(chars)


model.eval()
predictions_list = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting"):
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        
        predictions_list.extend(preds.cpu().numpy())


output_words = []
for i, item in enumerate(test_data_raw):
    word = item['original_word']
    pred_label = predictions_list[i]
    
    result_word = apply_stress(word, pred_label)
    output_words.append(result_word)

output_file = 'data/373_homographs/submission.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    for word in output_words:
        f.write(word + '\n')

print(f"Готово! Файл {output_file} сохранен. Первые 5 предсказаний:")
print(output_words[:5])

Predicting:   0%|          | 0/267 [00:00<?, ?it/s]

Готово! Файл data/373_homographs/submission.txt сохранен. Первые 5 предсказаний:
['Августа', 'авгУста', 'авгУста', 'авгУста', 'адОнис']
