In [None]:
!pip install -U transformers

In [None]:
import warnings
import os

# Ask Python processes that inherit this environment to ignore UserWarning.
os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"
# In this process, silence UserWarning raised from modules whose name matches "pydantic.*".
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic.*")


In [None]:
import pandas as pd
import random
import json
import torch
import numpy as np
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# CONFIGURATION
# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
WINDOW_SIZE = 510  # 510 because room is needed for [CLS] and [SEP]
#STRIDE = 256       # ~50% overlap between consecutive windows
#STRIDE = 128       # ~75% overlap between consecutive windows
STRIDE = 64       # ~87% overlap between consecutive windows
MODEL_NAME = "Gherman/bert-base-NER-Russian"

# # Настройка устройства
# if torch.backends.mps.is_available():
#     device = torch.device("mps")
#     pin_memory_setting = False
#     print("MPS, pin_memory отключен")
# else:
#     pin_memory_setting = True
#     print("CPU/GPU, pin_memory включен")

In [None]:
# ЗАГРУЗКА ТОКЕНИЗАТОРА 
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the tokenizer for the checkpoint named in the configuration cell so the
# tokenizer and the configured model can never drift apart.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import json

def read_jsonl_dataset(file_path):
    """Read a JSONL NER dataset into a list of documents.

    Every non-blank line of the file must be a JSON object with two lists of
    equal length: ``tokens`` and ``ner_tags``.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        A list with one entry per document; each entry is a list of
        ``(token, tag)`` tuples.

    Raises:
        ValueError: If a record has a different number of tokens and tags.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens`` or ``ner_tags``.
    """
    sentences = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            # Blank lines (typically a trailing newline) are not records.
            if not line:
                continue

            item = json.loads(line)
            tokens = item['tokens']
            ner_tags = item['ner_tags']

            # zip() would silently drop the surplus items and hide a
            # token/label misalignment, so fail loudly instead.
            if len(tokens) != len(ner_tags):
                raise ValueError(
                    f"{file_path}, line {line_no}: {len(tokens)} tokens "
                    f"but {len(ner_tags)} ner_tags"
                )

            sentences.append(list(zip(tokens, ner_tags)))

    print(f"Загружено {len(sentences)} предложений из JSONL файла")
    return sentences

def create_label_mapping(sentences):
    """Build the label <-> id lookup tables from the tags found in the data.

    Args:
        sentences: Iterable of documents, each a sequence of ``(word, tag)`` pairs.

    Returns:
        ``(label2id, id2label, label_list)`` where ``label_list`` starts with
        ``'O'`` (always present, id 0) followed by the remaining tags in
        sorted order.
    """
    # Collect every tag except 'O'; 'O' is pinned to position 0 below.
    entity_tags = {
        tag
        for sentence in sentences
        for _, tag in sentence
        if tag != 'O'
    }
    label_list = ['O', *sorted(entity_tags)]

    label2id = {}
    id2label = {}
    for idx, tag in enumerate(label_list):
        label2id[tag] = idx
        id2label[idx] = tag

    print(f"Найдено {len(label_list)} меток: {label_list}")
    return label2id, id2label, label_list

def align_labels_with_tokens(word_ids, labels):
    """Project word-level labels onto sub-word tokens.

    Only the first sub-token of each word receives the word's label; special
    tokens (``word_id`` is ``None``) and continuation sub-tokens get ``-100``.

    Args:
        word_ids: Per-token word index (or ``None``), as returned by
            ``BatchEncoding.word_ids()``.
        labels: Word-level label ids, indexed by word index.

    Returns:
        A list of label ids with the same length as ``word_ids``.
    """
    aligned_labels = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None or word_idx == previous_word_idx:
            aligned_labels.append(-100)
        else:
            aligned_labels.append(labels[word_idx])
        previous_word_idx = word_idx

    return aligned_labels


def create_sliding_windows_complete(words, labels, tokenizer, window_size=510, stride=256):
    """Split one document into overlapping token windows that cover all of it.

    The document is tokenized once without special tokens and the labels are
    aligned once over the whole sequence, so a window that starts in the
    middle of a word keeps ``-100`` on that continuation sub-token. Every
    window is then wrapped in the tokenizer's CLS/SEP ids (when it defines
    them), which is why ``window_size`` counts content tokens only.

    Args:
        words: The document as a list of word strings.
        labels: Word-level label ids, parallel to ``words``.
        tokenizer: A fast Hugging Face tokenizer (``word_ids()`` is required).
        window_size: Maximum number of content tokens per window.
        stride: Number of content tokens the window start advances each step.
            Use ``stride <= window_size``; a larger stride leaves gaps.

    Returns:
        A list of ``(input_ids, label_ids)`` pairs, one per window. The last
        window is anchored to the end of the document.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive.
    """
    if window_size <= 0 or stride <= 0:
        raise ValueError(
            f"window_size and stride must be positive, got {window_size} and {stride}"
        )

    # Special tokens are added per window below, not once for the whole text.
    encoding = tokenizer(
        words,
        is_split_into_words=True,
        add_special_tokens=False,
        truncation=False,
        padding=False,
    )

    content_ids = encoding['input_ids']
    content_labels = align_labels_with_tokens(encoding.word_ids(), labels)
    total = len(content_ids)

    cls_id = getattr(tokenizer, 'cls_token_id', None)
    sep_id = getattr(tokenizer, 'sep_token_id', None)
    prefix = [cls_id] if cls_id is not None else []
    suffix = [sep_id] if sep_id is not None else []

    def build_window(start, end):
        """Wrap the content slice [start, end) in special tokens with ignored labels."""
        ids = prefix + content_ids[start:end] + suffix
        window_labels = (
            [-100] * len(prefix) + content_labels[start:end] + [-100] * len(suffix)
        )
        return ids, window_labels

    # Short document: a single window holds everything.
    if total <= window_size:
        return [build_window(0, total)]

    windows = []
    start_idx = 0
    while True:
        end_idx = min(start_idx + window_size, total)
        windows.append(build_window(start_idx, end_idx))

        if end_idx == total:
            break

        start_idx += stride
        # Anchor the final window to the end of the text so the tail is
        # always covered by a full-size window.
        if start_idx + window_size >= total:
            start_idx = total - window_size

    print(f"Создано {len(windows)} окон для текста из {total} токенов")
    return windows

def prepare_dataset_with_sliding_windows(jsonl_file_path, tokenizer, window_size=510, stride=256, val_fraction=0.2):
    """Build train/validation datasets of sliding windows, split by whole documents.

    Documents are read from a JSONL file, cut into overlapping token windows,
    and then whole documents are assigned to validation until roughly
    ``val_fraction`` of all windows is reached. Windows of one document never
    end up on both sides of the split.

    Args:
        jsonl_file_path: Path to the JSONL dataset.
        tokenizer: Tokenizer forwarded to ``create_sliding_windows_complete``.
        window_size: Window size forwarded to the window builder.
        stride: Stride forwarded to the window builder.
        val_fraction: Target share of windows to place in validation.

    Returns:
        ``(dataset, label2id, id2label, label_list)`` where ``dataset`` is a
        ``DatasetDict`` with ``train`` and ``validation`` splits holding
        ``input_ids``, ``attention_mask`` and ``labels``.

    Raises:
        ValueError: If the file contains no documents or yields no windows.
    """
    print("Подготовка данных с перекрывающимися окнами...")

    # 1. Read the documents.
    sentences = read_jsonl_dataset(jsonl_file_path)
    print(f"Загружено {len(sentences)} исходных документов")
    if not sentences:
        # The statistics below would otherwise fail with ZeroDivisionError.
        raise ValueError(f"No documents found in {jsonl_file_path!r}")

    # 2. Build the label mapping.
    label2id, id2label, label_list = create_label_mapping(sentences)

    # 3. Cut every document into windows, remembering the source document.
    all_windows = []
    all_labels = []
    doc_indices = []      # doc_indices[i] is the document that window i came from
    windows_per_doc = []  # windows_per_doc[d] is the number of windows of document d

    for doc_idx, sentence in enumerate(sentences):
        words = [word for word, _ in sentence]
        label_ids = [label2id[label] for _, label in sentence]

        windows = create_sliding_windows_complete(words, label_ids, tokenizer, window_size, stride)

        for window_tokens, window_labels in windows:
            all_windows.append(window_tokens)
            all_labels.append(window_labels)
            doc_indices.append(doc_idx)

        windows_per_doc.append(len(windows))

    total_windows = len(all_windows)
    if total_windows == 0:
        raise ValueError(f"No windows could be built from {jsonl_file_path!r}")

    print(f"\nСтатистика по окнам:")
    print(f"Всего окон: {total_windows}")
    print(f"Среднее число окон на документ: {sum(windows_per_doc)/len(sentences):.1f}")
    print(f"Мин/макс окон на документ: {min(windows_per_doc)} / {max(windows_per_doc)}")

    # 4. Fill the validation split with whole documents.
    target_val_windows = int(total_windows * val_fraction)
    print(f"\nЦелевой размер валидации: {target_val_windows} окон ({val_fraction*100:.0f}%)")

    # A private Random(42) yields the same shuffle as random.seed(42) followed
    # by random.shuffle, but leaves the global RNG state untouched.
    unique_docs = sorted(set(doc_indices))
    random.Random(42).shuffle(unique_docs)

    val_doc_indices = set()
    val_windows_count = 0

    for doc_idx in unique_docs:
        doc_window_count = windows_per_doc[doc_idx]

        # Accept the document only if validation stays within 110% of the target.
        if val_windows_count + doc_window_count <= target_val_windows * 1.1:
            val_doc_indices.add(doc_idx)
            val_windows_count += doc_window_count
            print(f"Добавлен документ {doc_idx}: +{doc_window_count} окон (всего: {val_windows_count}/{target_val_windows})")

        if val_windows_count >= target_val_windows:
            break

    # Every remaining document goes to train.
    train_doc_indices = set(unique_docs) - val_doc_indices

    print(f"\nИтоговое разделение:")
    print(f"Train: {len(train_doc_indices)} документов")
    print(f"Validation: {len(val_doc_indices)} документов")

    train_indices = [i for i, doc_idx in enumerate(doc_indices) if doc_idx in train_doc_indices]
    val_indices = [i for i, doc_idx in enumerate(doc_indices) if doc_idx in val_doc_indices]

    print(f"Train окон: {len(train_indices)} ({len(train_indices)/total_windows*100:.1f}%)")
    print(f"Validation окон: {len(val_indices)} ({len(val_indices)/total_windows*100:.1f}%)")

    # Leak check: no document may contribute windows to both splits.
    train_docs_in_val = {doc_indices[i] for i in train_indices} & {doc_indices[i] for i in val_indices}
    print(f"\nПроверка на утечку: {'Утечка' if train_docs_in_val else 'Всё хорошо'}")

    # 5. Assemble the datasets.
    def build_encodings(indices):
        """Collect the model inputs for the selected windows (nothing is padded here)."""
        return {
            'input_ids': [all_windows[i] for i in indices],
            'attention_mask': [[1] * len(all_windows[i]) for i in indices],
            'labels': [all_labels[i] for i in indices],
        }

    dataset = DatasetDict({
        'train': Dataset.from_dict(build_encodings(train_indices)),
        'validation': Dataset.from_dict(build_encodings(val_indices)),
    })

    return dataset, label2id, id2label, label_list

In [None]:
def compute_metrics(p):
    """Compute token-level accuracy and F1 scores for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair; predictions are per-token class
            scores (argmax is taken over axis 2) and labels use ``-100`` for
            positions that must be ignored.

    Returns:
        A dict with ``accuracy``, ``f1`` (weighted), ``f1_weighted`` and
        ``f1_macro``.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)
    gold = np.asarray(gold)

    # Keep only the positions that carry a real label (drop the -100 ones).
    keep = gold != -100
    y_true = gold[keep]
    y_pred = predicted_ids[keep]

    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted_f1,
        'f1_weighted': weighted_f1,
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
    }
    return metrics

In [None]:
def train_with_frozen_layers_for_dataset(dataset_path, output_dir='/kaggle/working/', model_name='Gherman/bert-base-NER-Russian'):
    """Two-stage fine-tuning for one dataset: classifier head first, then the whole model.

    Args:
        dataset_path: Path to the JSONL dataset.
        output_dir: Prefix for the output directories; ``_stage1``, ``_stage2``
            and ``_final`` are appended to it.
        model_name: Checkpoint to start from.

    Returns:
        ``(trainer_stage2, model)``: the second-stage trainer and the model.
    """
    # Windowed train/validation data and the label mapping of this dataset.
    dataset, label2id, id2label, label_list = prepare_dataset_with_sliding_windows(
        dataset_path, tokenizer, WINDOW_SIZE, STRIDE
    )

    # ignore_mismatched_sizes lets the classifier head be replaced when the
    # label count differs from the checkpoint's.
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )

    # Arguments that are identical in both stages.
    common_args = dict(
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        logging_steps=10,
        eval_strategy='epoch',
        report_to='none',
        use_cpu=False,
        dataloader_pin_memory=False,
    )

    def build_trainer(stage_args):
        """Create a Trainer over this dataset for the given stage arguments."""
        return Trainer(
            model=model,
            args=stage_args,
            train_dataset=dataset['train'],
            eval_dataset=dataset['validation'],
            data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
            compute_metrics=compute_metrics,
        )

    # ---- Stage 1: train only the classifier head ----
    print("Обучение только классификационной головы...")

    for param_name, param in model.named_parameters():
        is_head = "classifier" in param_name
        param.requires_grad = is_head
        if is_head:
            print(f"Обучается: {param_name}")

    training_args_stage1 = TrainingArguments(
        output_dir=output_dir + '_stage1',
        learning_rate=1e-3,  # high LR: only the freshly initialised head is trained
        num_train_epochs=10,
        weight_decay=0.0,
        save_strategy='no',
        **common_args,
    )
    trainer_stage1 = build_trainer(training_args_stage1)

    print("Обучаем только классификатор...")
    trainer_stage1.train()

    # ---- Stage 2: unfreeze everything and fine-tune ----
    print("Разморозка и тонкая настройка всей модели...")
    model.requires_grad_(True)

    training_args_stage2 = TrainingArguments(
        output_dir=output_dir + '_stage2',
        learning_rate=3e-5,  # low LR for full-model fine-tuning
        num_train_epochs=5,
        weight_decay=0.01,
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        save_total_limit=1,
        warmup_steps=0,
        **common_args,
    )
    trainer_stage2 = build_trainer(training_args_stage2)

    print("Тонкая настройка всей модели...")
    trainer_stage2.train()

    # Persist the model together with its tokenizer.
    final_dir = output_dir + '_final'
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)

    return trainer_stage2, model



In [None]:
# Run the two-stage training for each of the three models.
# Groups 1 and 2 are commented out; uncomment them to train those models too.

# print("\n1. Model 1 (Group 1)...")
# trainer1, model1 = train_with_frozen_layers_for_dataset(
#     '/kaggle/input/datasets/zirok05/resumes-307-3-models-ner/group1_dataset.jsonl',
#     '/kaggle/working/model_group1_frozen'
# )

# print("\n2. Model 2 (Group 2)...")
# trainer2, model2 = train_with_frozen_layers_for_dataset(
#     '/kaggle/input/datasets/zirok05/resumes-307-3-models-ner/group2_dataset.jsonl',
#     '/kaggle/working/model_group2_frozen'
# )

# The second argument is a path prefix: outputs are written to
# <prefix>_stage1, <prefix>_stage2 and <prefix>_final.
print("\n3. Model 3 (Group 3)...")
trainer3, model3 = train_with_frozen_layers_for_dataset(
    '/kaggle/input/datasets/zirok05/resumes-307-3-models-ner/group3_dataset.jsonl',
    '/kaggle/working/model_group3_frozen'
)


In [None]:
## Продолжаем обучение ещё на 5-10 эпох:

In [None]:
from transformers import TrainerCallback, EarlyStoppingCallback
def continue_training(dataset_path, output_dir='/kaggle/working/', model_name='Gherman/bert-base-NER-Russian',
                      run_name='new_model_low_lr', learning_rate=1e-6, num_train_epochs=5,
                      early_stopping_patience=5):
    """Continue fine-tuning an existing checkpoint at a low learning rate.

    Args:
        dataset_path: Path to the JSONL dataset.
        output_dir: Directory under which the run directory is created.
        model_name: Hub id or local path of the checkpoint to continue from.
        run_name: Sub-directory of ``output_dir`` that receives the checkpoints.
        learning_rate: Learning rate for the continued training.
        num_train_epochs: Number of additional epochs.
        early_stopping_patience: Number of consecutive evaluations without an
            ``f1_macro`` improvement after which training stops. Evaluation
            runs once per epoch, so with the defaults (5 epochs, patience 5)
            early stopping cannot trigger; lower the patience or raise the
            epoch count to make it effective.

    Returns:
        The ``Trainer`` after training has finished.
    """
    dataset, label2id, id2label, label_list = prepare_dataset_with_sliding_windows(
        dataset_path, tokenizer, WINDOW_SIZE, STRIDE
    )

    # NOTE(review): ignore_mismatched_sizes=True re-initialises the classifier
    # head if the checkpoint's label count differs from this dataset's, which
    # would discard the trained head. Confirm the checkpoint was trained on
    # the same label set.
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )

    # Honour output_dir instead of a hard-coded path; with the defaults this
    # resolves to '/kaggle/working/new_model_low_lr', exactly as before.
    training_args = TrainingArguments(
        output_dir=os.path.join(output_dir, run_name),
        learning_rate=learning_rate,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        logging_steps=5,
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1_macro',
        greater_is_better=True,
        save_total_limit=3,
        report_to="none",
        dataloader_pin_memory=False,
        warmup_steps=0,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
    )

    trainer.train()

    return trainer


In [None]:
# Run the continued training for each of the three models.
# Groups 1 and 2 are commented out; uncomment them to continue those models too.

# print("\n1. Model 1 (Group 1)...")
# trainer1 = continue_training(
#     dataset_path = '/kaggle/input/datasets/zirok05/resumes-307-3-models-ner/group1_dataset.jsonl',
#     model_name = '/kaggle/working/model_group1_frozen_final'
# )

# print("\n2. Model 2 (Group 2)...")
# trainer2 = continue_training(
#     dataset_path = '/kaggle/input/datasets/zirok05/resumes-307-3-models-ner/group2_dataset.jsonl',
#     model_name = '/kaggle/working/model_group2_frozen_final'
# )

# NOTE(review): the checkpoint below lives under 'new_model_mid_lr', a directory
# that no cell in this notebook writes as currently configured (continue_training
# writes to 'new_model_low_lr'). It presumably comes from an earlier run with a
# different setting; confirm it exists before running on a fresh kernel.
print("\n3. Model 3 (Group 3)...")
trainer3 = continue_training(
    dataset_path = '/kaggle/input/datasets/zirok05/resumes-307-3-models-ner/group3_dataset.jsonl',
    model_name = '/kaggle/working/new_model_mid_lr/checkpoint-1225'
)


In [None]:
import shutil
import os

def create_model_zip(model_path, output_name, output_dir='/kaggle/working'):
    """Pack a saved model directory into a ZIP archive for download.

    Args:
        model_path: Directory whose contents are archived.
        output_name: Archive file name without the ``.zip`` extension.
        output_dir: Directory in which the archive is created.

    Returns:
        The path of the created archive, or ``None`` when ``model_path`` does
        not exist.
    """
    if not os.path.exists(model_path):
        print(f"Модель {model_path} не найдена!")
        return

    # make_archive appends '.zip' itself and returns the real archive path.
    # Using that return value avoids stripping '.zip' with str.replace, which
    # also mangled any '.zip' occurring elsewhere in the path.
    base_name = os.path.join(output_dir, output_name)
    zip_path = shutil.make_archive(base_name, 'zip', model_path)

    print(f"zip-архив создан: {zip_path}")
    print(f"Размер: {os.path.getsize(zip_path) / 1024 / 1024:.1f} MB")

    return zip_path

# Usage: archive the selected checkpoint directory as model_3_final.zip.
# NOTE(review): this path assumes a 'new_model_mid_lr' run already exists on disk — confirm.
model_path = '/kaggle/working/new_model_mid_lr/checkpoint-1225'
zip_file = create_model_zip(model_path, 'model_3_final')


In [None]:
from IPython.display import FileLink
# Render a download link for the archive. The path is relative, so it
# presumably relies on the working directory being /kaggle/working — confirm.
FileLink(r'model_3_final.zip')

In [None]:
# import shutil
# import os

# # Удаление папки
# folder_path = '/kaggle/working/model_1_final.zip'
# if os.path.exists(folder_path):
#     shutil.rmtree(folder_path)
#     print(f"Папка {folder_path} удалена")
# else:
#     print(f"Папка {folder_path} не существует")