In [None]:
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
import re

# --- Settings ---
# Checkpoint to fine-tune and the directory the result is written to.
MODEL_NAME = "distilbert-base-cased"
OUTPUT_DIR = "./fine_tuned_ner_model"

# Tokenization / training hyper-parameters.
MAX_LENGTH = 96   # maximum sequence length (in tokens) passed to the tokenizer
BATCH_SIZE = 32   # per-device batch size for both training and evaluation
EPOCHS = 3        # number of training epochs

# Load the dataset.
# The location is kept in a named constant so it can be changed in one place
# when the notebook is run outside of this Colab session.
DATA_PATH = "/content/generated_unstructured_texts (5).csv"

# The file is ';'-separated; columns are named explicitly.
# NOTE(review): passing `names=` without `header=` makes pandas treat the first
# line of the file as data. If the CSV has a header row, add `header=0` so that
# row is not used as a training example — confirm against the file.
df = pd.read_csv(DATA_PATH, sep=';', names=['id', 'label', 'text'])

# Build the label inventory (BIO scheme).
# Entity types in the order their B-/I- pairs appear in the label list.
_ENTITY_TYPES = [
    "DEPTH",
    "RHOB_VAL", "RHOB_UN",
    "GR_VAL", "GR_UN",
    "NPHI_VAL",
    "PEF_VAL", "PEF_UN",
    "DTC_VAL", "DTC_UN",
    "DATE",
    "RESPONSIBLE",
]

# "O" (outside) comes first, followed by a B-/I- pair for every entity type.
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in _ENTITY_TYPES
    for prefix in ("B", "I")
]

# Lookup tables between label strings and integer class ids.
id_to_label = dict(enumerate(label_list))
label_to_id = {label: idx for idx, label in id_to_label.items()}

In [None]:
# (entity type, compiled pattern). In every pattern, capture group 1 is the
# entity value itself; the key that precedes it ("RHOB=", "Дата:", ...) is
# matched only to locate the value and is not part of the labelled span.
_ENTITY_PATTERNS = (
    ("DEPTH", re.compile(r'глубин[еы]\s*([\d.]+)\s*м', re.IGNORECASE)),
    ("RHOB_VAL", re.compile(r'(?:RHOB|RHOВ|r hob|r hobv|rhov|rhoв|rhob)=\s*([\d.]+)', re.IGNORECASE)),
    ("GR_VAL", re.compile(r'(?:GR|GР|gr|gр)=\s*([\d.]+)', re.IGNORECASE)),
    ("NPHI_VAL", re.compile(r'(?:NPHI|nphi|nphi_)=\s*(не\s*определено|[^\s,;]+)', re.IGNORECASE)),
    ("PEF_VAL", re.compile(r'(?:PEF|pef|pef_)=\s*([\d.]+)', re.IGNORECASE)),
    ("DTC_VAL", re.compile(r'(?:DTC|dtc|dtc_)=\s*([\d.]+)', re.IGNORECASE)),
    ("DATE", re.compile(r'Дата:\s*(\d{2}\.\d{2}\.\d{4})')),
    # IGNORECASE so that capitalised forms such as "Отв.:" are matched by the
    # "отв\.:" alternative; without it that form never matches.
    ("RESPONSIBLE", re.compile(r'(?:Отв\.|Отв:|отв\.:|отв\.)\s*([А-Яа-яЁё\s\.]+)', re.IGNORECASE)),
)


def _tag_token_range(labels, token_start, token_end, entity):
    """Write B-/I- tags for `entity` into labels[token_start:token_end] in place."""
    # Clamp so a span reported past the end of the sequence cannot raise.
    token_end = min(token_end, len(labels))
    if token_start < 0 or token_start >= len(labels):
        return
    labels[token_start] = f"B-{entity}"
    for index in range(token_start + 1, token_end):
        labels[index] = f"I-{entity}"


def create_labels(text):
    """Produce one BIO label per token of `text` using regex-based weak labelling.

    The text is tokenized with the module-level `tokenizer` (no special
    tokens), every token starts as "O", and the first match of each pattern in
    `_ENTITY_PATTERNS` is tagged B-<ENTITY> / I-<ENTITY>.

    The labelled character span is exactly capture group 1 of the match (with
    trailing whitespace removed), taken from the match object. Previously the
    span began at the key and extended past the end of the match, and the depth
    value was located by searching for the number from the start of the text,
    which could tag an earlier identical number.

    Only DEPTH, the *_VAL entities, DATE and RESPONSIBLE are produced here; the
    *_UN (unit) labels are never assigned by this function.

    Args:
        text: Raw input string.

    Returns:
        list[str]: labels, same length and order as `tokenizer.tokenize(text)`.
    """
    tokens = tokenizer.tokenize(text)
    labels = ["O"] * len(tokens)

    for entity, pattern in _ENTITY_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue

        # Character range of the value only, without trailing whitespace that
        # a permissive character class may have swallowed.
        start_pos = match.start(1)
        end_pos = start_pos + len(match.group(1).rstrip())
        if end_pos <= start_pos:
            continue

        token_start, token_end = get_token_span(tokens, text, start_pos, end_pos)
        if token_start is None or token_end is None:
            continue

        # Later entities overwrite earlier ones if their token ranges overlap.
        _tag_token_range(labels, token_start, token_end, entity)

    return labels

# Helper: find the tokens that correspond to a character range of the text.
def get_token_span(tokens, text, start_pos, end_pos):
    """Return the half-open token index range covering text[start_pos:end_pos].

    Token boundaries are taken from the offset mapping of the module-level
    `tokenizer` (a fast tokenizer is required for `return_offsets_mapping`),
    encoded without special tokens so that indices line up with
    `tokenizer.tokenize(text)`.

    The previous implementation rebuilt a character-to-token table from token
    string lengths. It stored the running character count instead of the token
    index and ignored whitespace between tokens and the "##" continuation
    prefix, so the returned indices did not refer to the right tokens.

    Args:
        tokens: Tokens of `text` as returned by `tokenizer.tokenize(text)`;
            used to verify that the offsets describe the same tokenization.
        text: The original string.
        start_pos: Inclusive start character index of the span.
        end_pos: Exclusive end character index of the span.

    Returns:
        (start_token, end_token) with `end_token` exclusive, or (None, None)
        when the range is empty/invalid, no token overlaps it, or the offsets
        do not correspond to `tokens`.
    """
    # Reject invalid ranges explicitly; a negative start (e.g. from a failed
    # str.find) must not be interpreted as indexing from the end.
    if start_pos is None or end_pos is None or start_pos < 0 or end_pos <= start_pos:
        return None, None

    offsets = tokenizer(
        text,
        add_special_tokens=False,
        return_offsets_mapping=True,
    )["offset_mapping"]

    # The caller's token list must describe the same tokenization.
    if len(offsets) != len(tokens):
        return None, None

    # A token belongs to the span if its character range overlaps the request.
    covering = [
        index
        for index, (token_start, token_end) in enumerate(offsets)
        if token_start < end_pos and token_end > start_pos
    ]
    if not covering:
        return None, None

    return covering[0], covering[-1] + 1

# --- Tokenization and labelling ---
# Tokenizer for MODEL_NAME. The functions defined above reference this
# module-level name, so it must exist before any of them is called.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of texts and attach per-token label ids.

    Intended for `Dataset.map(..., batched=True)`. Each text is tokenized with
    padding/truncation to MAX_LENGTH; `create_labels` supplies one label per
    real token (no special tokens), and those labels are assigned in order to
    the non-special positions of the encoded sequence.

    Positions whose offset is (0, 0) — special and padding tokens — get -100 so
    the loss ignores them. If truncation leaves fewer positions than labels the
    surplus labels are dropped; if labels run out first the remaining positions
    get -100.

    Plain Python lists are requested from the tokenizer (no `return_tensors`):
    the per-token loop below would otherwise compare tensor elements one by
    one, and `Dataset.map` stores the values as lists either way.

    Args:
        examples: Batch mapping with a "text" key holding a list of strings.

    Returns:
        The tokenizer output with a "labels" entry added and "offset_mapping"
        removed.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_offsets_mapping=True,  # needed to tell real tokens from special/padding ones
        is_split_into_words=False,
    )

    ignore_index = -100  # label id skipped by the loss
    labels = []
    for text, offset_mapping in zip(examples["text"], tokenized_inputs["offset_mapping"]):
        # Labels for the real tokens, consumed in order.
        token_labels = iter(create_labels(text))

        label_ids = []
        for start, end in offset_mapping:
            if start == 0 and end == 0:
                # Special tokens (CLS, SEP, PAD) have offset (0, 0).
                label_ids.append(ignore_index)
                continue
            tag = next(token_labels, None)
            label_ids.append(ignore_index if tag is None else label_to_id[tag])

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    # The offsets were only needed for alignment; drop them from the output.
    tokenized_inputs.pop("offset_mapping")
    return tokenized_inputs


# --- Train/validation split ---
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each frame in a Dataset, then tokenize it and attach token labels.
# The original "id" and "label" columns are dropped during the map.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame).map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=["id", "label"],
    )
    for frame in (train_df, val_df)
)

# Model: token-classification head sized to the label inventory, with the
# id <-> label mappings stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    id2label=id_to_label,
    label2id=label_to_id,
    num_labels=len(label_list),
)

# Training configuration.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    # Mixed precision only when a CUDA device is present, so the notebook also
    # runs on a CPU-only runtime instead of requesting fp16 unconditionally.
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=False,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=10,
    # Restore the checkpoint with the lowest validation loss after training.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none",
    max_grad_norm=1.0
)

# --- Добавляем compute_metrics ---
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    """Compute token-level macro precision/recall/F1 and accuracy.

    Args:
        p: A pair (predictions, labels). `predictions` holds per-token class
            scores with the class dimension on axis 2; `labels` holds the gold
            label ids, with -100 marking positions to ignore.

    Returns:
        dict with "precision", "recall" and "f1" (macro averages) and
        "accuracy", all taken from sklearn's classification_report.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    gold_tags = []
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        predicted_tags.extend(label_list[predicted] for predicted, _ in kept)
        gold_tags.extend(label_list[gold] for _, gold in kept)

    report = classification_report(gold_tags, predicted_tags, output_dict=True, zero_division=0)
    macro = report["macro avg"]

    return {
        "precision": macro["precision"],
        "recall": macro["recall"],
        "f1": macro["f1-score"],
        "accuracy": report["accuracy"]
    }

# --- Build the Trainer, passing compute_metrics so evaluation reports them ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

# Run training
trainer.train()

# Save the model and the tokenizer side by side in OUTPUT_DIR
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Модель для извлечения фактов сохранена в {OUTPUT_DIR}")

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.9952,0.948286,0.374815,0.326567,0.32743,0.709241


Модель для извлечения фактов сохранена в ./fine_tuned_ner_model


In [None]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer from disk into an NER pipeline.
# The path is the same literal value as OUTPUT_DIR in the settings cell.
# aggregation_strategy="simple" makes the pipeline return grouped entities,
# which is why the results are read through the 'entity_group' key below.
ner_pipeline = pipeline("ner", model="./fine_tuned_ner_model", tokenizer="./fine_tuned_ner_model", aggregation_strategy="simple")

# Sample record used as a smoke test for the trained model.
text = "На глубине 496.656 м: RHOВ=2.63 г/см³, GР=76.9 API, NPHI=не определено, PEF=3.73 барн/г, DTC=194.4 мкс/фут. Дата: 30.10.2025. Отв.: Игошина К.А."

results = ner_pipeline(text)

# One line per detected entity: group, surface text and confidence score.
for result in results:
    print(f"{result['entity_group']}: {result['word']} ({result['score']:.2f})")

In [None]:
# Query the GPU status of the current runtime via the shell.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found
