In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
import random

def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

tqdm.pandas()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# === Очистка текста ===
BAD_CHARS = set(['"', "'", '“', '”', '‘', '’', '«', '»', '‹', '›', '„', '‟', '‚', '`',
                 '\u200b', '\u200c', '\u200d', '\ufeff', '\u202a', '\u202b',
                 '\u202c', '\u202d', '\u202e'])

def is_valid_char(c):
    if c in [' ', '.', ',']:
        return True
    if c in BAD_CHARS:
        return False
    name = unicodedata.name(c, '')
    return 'CYRILLIC' in name

def clean_text(text):
    text = str(text)
    cleaned = ''.join(c for c in text if is_valid_char(c))
    cleaned = re.sub(r'[.,]{2,}', '.', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

def is_clean_sentence(sent):
    sent = sent.strip()
    words = sent.split()
    if len(words) < 5 or len(words) > 25:
        return False
    if sent.isupper():
        return False
    if sum(1 for c in sent if c.isupper()) > len(sent) * 0.4:
        return False
    if re.search(r'(.)\1{2,}', sent):
        return False
    if re.match(r'^[А-Яа-я]{1,2}(\s|$)', sent):
        return False
    if len(re.findall(r'[аеиоуыіұүәө]', sent, flags=re.IGNORECASE)) < 3:
        return False
    return True

# === Загрузка данных ===
df_raw = pd.read_csv("/kaggle/input/tst-day-4/train.csv")
test_df = pd.read_csv("/kaggle/input/tst-day-4/public_test.csv")

sentences = []
for row in tqdm(df_raw["sentence"]):
    cleaned = clean_text(row)
    parts = re.split(r'[.!?]', cleaned)
    for part in parts:
        part = part.strip()
        if is_clean_sentence(part):
            sentences.append(part)

clean_df = pd.DataFrame({"clean_sentence": list(set(sentences))})
print(f"✅ Предобработка завершена: {len(clean_df)} предложений")

# === Генерация обучающих примеров (по 3 маски) ===
def generate_n_masks(sentence, n=3):
    words = sentence.split()
    if len(words) < n + 1:
        return []
    idxs = np.random.choice(len(words), size=n, replace=False)
    examples = []
    for i in idxs:
        masked = " ".join(words[:i] + words[i+1:])
        examples.append((masked, i))
    return examples

train_sentences, val_sentences = train_test_split(clean_df["clean_sentence"], test_size=0.1, random_state=42)

train_data = []
for sent in tqdm(train_sentences):
    train_data.extend(generate_n_masks(sent, n=3))

val_data = []
for sent in tqdm(val_sentences):
    val_data.extend(generate_n_masks(sent, n=3))

train_df = pd.DataFrame(train_data, columns=["masked_sentence", "label"])
val_df = pd.DataFrame(val_data, columns=["masked_sentence", "label"])

# === Токенизатор и модель ===
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

class MaskedSentenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=32):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoded = self.tokenizer(row["masked_sentence"], return_tensors="pt", 
                                 max_length=self.max_len, padding="max_length", 
                                 truncation=True)
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        label = torch.tensor(row["label"])
        return input_ids, attention_mask, label

train_ds = MaskedSentenceDataset(train_df, tokenizer)
val_ds = MaskedSentenceDataset(val_df, tokenizer)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

class BertPositionModel(nn.Module):
    def __init__(self, bert_name="bert-base-multilingual-cased", hidden_size=768):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_name)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(hidden_size, 25)  # Предположим максимум 25 слов

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls = outputs.last_hidden_state[:, 0, :]
        cls = self.dropout(cls)
        return self.classifier(cls)

model = BertPositionModel().to(DEVICE)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# === Обучение ===
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in tqdm(train_loader):
        input_ids, attention_mask, labels = input_ids.to(DEVICE), attention_mask.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"📚 Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

    # Валидация
    model.eval()
    preds, true = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            input_ids, attention_mask = input_ids.to(DEVICE), attention_mask.to(DEVICE)
            logits = model(input_ids, attention_mask)
            preds.extend(logits.argmax(dim=1).cpu().numpy())
            true.extend(labels.numpy())
    acc = accuracy_score(true, preds)
    print(f"🎯 Validation Accuracy: {acc:.4f}")

# === Инференс на тесте ===
test_df["masked_sentence_clean"] = test_df["masked_sentence"].progress_apply(clean_text)

class TestDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=32):
        self.sentences = df["masked_sentence_clean"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        encoded = self.tokenizer(self.sentences[idx], return_tensors="pt",
                                 padding="max_length", truncation=True,
                                 max_length=self.max_len)
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        return input_ids, attention_mask

test_ds = TestDataset(test_df, tokenizer)
test_loader = DataLoader(test_ds, batch_size=64)

model.eval()
all_preds = []
with torch.no_grad():
    for input_ids, attention_mask in tqdm(test_loader):
        input_ids, attention_mask = input_ids.to(DEVICE), attention_mask.to(DEVICE)
        logits = model(input_ids, attention_mask)
        pred = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(pred)

# Эвристика: если начало с маленькой буквы — позиция = 0
def is_lower_start(s):
    s = s.strip()
    return len(s) > 0 and s[0].islower()

test_df["start_lower"] = test_df["masked_sentence_clean"].apply(is_lower_start)
final_preds = np.where(test_df["start_lower"], 0, all_preds)

submission = pd.DataFrame({
    "ID": test_df["ID"],
    "word_index": final_preds
})
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv сохранён")


2025-06-29 08:36:23.322863: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751186183.543135      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751186183.608712      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
100%|██████████| 26162/26162 [00:02<00:00, 12759.33it/s]


✅ Предобработка завершена: 28861 предложений


100%|██████████| 25974/25974 [00:00<00:00, 53016.67it/s]
100%|██████████| 2887/2887 [00:00<00:00, 44957.21it/s]


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

100%|██████████| 1218/1218 [04:39<00:00,  4.36it/s]


📚 Epoch 1: Loss = 2.1266
🎯 Validation Accuracy: 0.2829


100%|██████████| 1218/1218 [04:37<00:00,  4.38it/s]


📚 Epoch 2: Loss = 1.9368
🎯 Validation Accuracy: 0.3177


100%|██████████| 1218/1218 [04:38<00:00,  4.38it/s]


📚 Epoch 3: Loss = 1.8152
🎯 Validation Accuracy: 0.3403


100%|██████████| 1218/1218 [04:38<00:00,  4.38it/s]


📚 Epoch 4: Loss = 1.6862
🎯 Validation Accuracy: 0.3883


100%|██████████| 1218/1218 [04:37<00:00,  4.38it/s]


📚 Epoch 5: Loss = 1.5368
🎯 Validation Accuracy: 0.4151


100%|██████████| 1218/1218 [04:38<00:00,  4.37it/s]


📚 Epoch 6: Loss = 1.3971
🎯 Validation Accuracy: 0.4244


100%|██████████| 1218/1218 [04:37<00:00,  4.39it/s]


📚 Epoch 7: Loss = 1.2658
🎯 Validation Accuracy: 0.4531


100%|██████████| 1218/1218 [04:37<00:00,  4.38it/s]


📚 Epoch 8: Loss = 1.1441
🎯 Validation Accuracy: 0.4534


100%|██████████| 1218/1218 [04:38<00:00,  4.37it/s]


📚 Epoch 9: Loss = 1.0350
🎯 Validation Accuracy: 0.4611


100%|██████████| 1218/1218 [04:39<00:00,  4.36it/s]


📚 Epoch 10: Loss = 0.9368
🎯 Validation Accuracy: 0.4677


100%|██████████| 4997/4997 [00:00<00:00, 35204.93it/s]
100%|██████████| 79/79 [00:06<00:00, 12.71it/s]

✅ submission.csv сохранён



