In [None]:
# Install required libraries
!pip install transformers torch

import torch
from transformers import BertTokenizerFast, BertForTokenClassification, BertForQuestionAnswering, Trainer, TrainingArguments
from torch.utils.data import Dataset
import json

# Load the dataset.
# The file is decoded as UTF-8 explicitly so that parsing does not depend on
# the platform's default locale encoding.
with open('/content/processed_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Prepare data for named entity recognition (NER)
def prepare_ner_data(data):
    """Build word-level BIO training examples from dataset entries.

    Each entry contributes one example whose words are the whitespace-split
    tokens of its 'procedure', 'impact' and 'tactic' fields, concatenated in
    that order. The first word of each field is tagged ``B-<TAG>`` and the
    remaining words ``I-<TAG>``.

    Args:
        data: Iterable of mappings with string values under the keys
            'procedure', 'impact' and 'tactic'.

    Returns:
        A list of ``(words, labels)`` tuples, where both elements are lists
        of equal length.
    """
    field_tags = (
        ('procedure', 'PROC'),
        ('impact', 'IMPACT'),
        ('tactic', 'TACTIC'),
    )

    ner_data = []
    for entry in data:
        words = []
        labels = []
        for field, tag in field_tags:
            field_words = entry[field].split()
            words.extend(field_words)
            # Spans are derived from each field's own token count rather than
            # by searching the joined text: a substring search can match the
            # impact text inside the procedure text and mislabel those words.
            labels.extend(
                f'B-{tag}' if position == 0 else f'I-{tag}'
                for position in range(len(field_words))
            )
        ner_data.append((words, labels))

    return ner_data

# Word-level (words, labels) pairs for every entry in the loaded dataset.
ner_data = prepare_ner_data(data)

# BIO tag inventory; a tag's position in this list is its integer class id.
label_list = [
    'O',
    'B-PROC', 'I-PROC',
    'B-IMPACT', 'I-IMPACT',
    'B-TACTIC', 'I-TACTIC',
]
# Two lookup tables derived from the list: class id -> tag and tag -> class id.
id_to_label = dict(enumerate(label_list))
label_to_id = {tag: class_id for class_id, tag in id_to_label.items()}

class NERDataset(Dataset):
    """Dataset that tokenizes pre-split words and aligns their labels to tokens.

    Each item is a dict of tensors with the keys 'input_ids',
    'attention_mask' and 'labels'. Tokens that map to no source word
    (word id ``None``) receive the label id -100; every other token receives
    the id of the label of the word it came from.
    """

    def __init__(self, ner_data, tokenizer, max_len):
        # ner_data: indexable collection of (words, labels) pairs.
        # tokenizer: callable whose result exposes word_ids().
        # max_len: length passed to the tokenizer as max_length.
        self.ner_data, self.tokenizer, self.max_len = ner_data, tokenizer, max_len

    def __len__(self):
        """Return the number of (words, labels) examples."""
        return len(self.ner_data)

    def __getitem__(self, idx):
        """Tokenize example *idx* and return its tensors."""
        tokens, tags = self.ner_data[idx]
        encoded = self.tokenizer(
            tokens,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            is_split_into_words=True,
        )

        # Align labels with tokens: one label id per produced token.
        aligned = []
        for word_idx in encoded.word_ids():
            if word_idx is None:
                aligned.append(-100)
            else:
                aligned.append(label_to_id[tags[word_idx]])

        item = {
            name: torch.tensor(encoded[name])
            for name in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(aligned)
        return item

# Load tokenizer and model for NER.
# The classification head is sized to the number of BIO tags in label_list.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
ner_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))

# Create dataset: every example is padded/truncated to 128 tokens.
dataset = NERDataset(ner_data, tokenizer, max_len=128)

# Training arguments for NER (checkpoints go to ./results, logs to ./logs).
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer for NER.
# NOTE(review): no eval_dataset is passed, so only training loss is reported.
trainer = Trainer(
    model=ner_model,
    args=training_args,
    train_dataset=dataset,
)

# Train the NER model.
# NOTE(review): presumably Trainer moves ner_model to the GPU when one is
# available, so later inference must put its inputs on the same device - confirm.
trainer.train()

# Save the NER model and tokenizer side by side in the same directory.
ner_model.save_pretrained('./fine_tuned_ner_model')
tokenizer.save_pretrained('./fine_tuned_ner_model')

# Load model for passage retrieval (question answering).
# No training happens between loading and saving, so the directory below holds
# an unchanged copy of the downloaded checkpoint.
# NOTE(review): the checkpoint name suggests it is already fine-tuned on SQuAD - confirm.
qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_model.save_pretrained('./fine_tuned_qa_model')

def extract_entities(text, ner_model, tokenizer):
    """Tag *text* with the NER model and group the BIO predictions into entities.

    Args:
        text: Raw input string to analyse.
        ner_model: Token-classification model. It is called with the tokenizer
            output and must return an object exposing ``logits``.
        tokenizer: Fast tokenizer whose output provides ``word_ids()``.

    Returns:
        A dict with the keys "procedure", "impact" and "tactic", each mapping
        to a list of extracted entity strings (possibly empty).
    """
    # Maps the suffix of a BIO tag to the key used in the returned dict.
    # 'PROC' must map to "procedure"; lower-casing the suffix alone yields
    # "proc", which is not a key of the result.
    tag_to_key = {"PROC": "procedure", "IMPACT": "impact", "TACTIC": "tactic"}

    # The model may live on a GPU while the tokenizer produces CPU tensors,
    # so the inputs are moved to the model's device before the forward pass.
    device = next(ner_model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Run inference in eval mode (dropout disabled) and restore the previous
    # mode afterwards so the caller's model state is left unchanged.
    was_training = ner_model.training
    ner_model.eval()
    try:
        with torch.no_grad():
            outputs = ner_model(**inputs)
    finally:
        if was_training:
            ner_model.train()

    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    # word_ids() already returns the per-token list for one batch entry;
    # indexing its result would yield a single element, not the list.
    word_ids = inputs.word_ids(batch_index=0)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

    entities = {"procedure": [], "impact": [], "tactic": []}
    current_entity = None

    def close_current():
        # Store the entity being built, if any, under its result key.
        if current_entity:
            entities.setdefault(current_entity["type"], []).append(
                " ".join(current_entity["tokens"])
            )

    for word_id, label_id, token in zip(word_ids, predictions, tokens):
        if word_id is None:
            # Special or padding token: carries no entity information.
            continue

        label = id_to_label[label_id]
        if label.startswith("B-"):
            close_current()
            suffix = label[2:]
            current_entity = {
                "type": tag_to_key.get(suffix, suffix.lower()),
                "tokens": [token],
            }
        elif label.startswith("I-"):
            suffix = label[2:]
            entity_type = tag_to_key.get(suffix, suffix.lower())
            # An I- tag only extends an open entity of the same type.
            if current_entity and current_entity["type"] == entity_type:
                current_entity["tokens"].append(token)
        else:
            close_current()
            current_entity = None

    close_current()

    # Clean up subword tokens
    for entity_type in entities:
        entities[entity_type] = [e.replace(' ##', '') for e in entities[entity_type]]

    return entities

def classify_and_retrieve(text, ner_model, qa_model, tokenizer, document):
    """Extract entities from *text* and find a supporting answer span in *document*.

    The first extracted procedure (if any) is turned into a question, which
    the QA model answers against *document*.

    Args:
        text: Raw input string passed to the NER model.
        ner_model: Token-classification model used for entity extraction.
        qa_model: Question-answering model whose output exposes
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer used for both models.
        document: Passage in which the answer span is searched.

    Returns:
        ``(entities, answer)`` on success, where ``answer`` is the decoded
        span (an empty string when the predicted span is empty or inverted).
        ``(None, None)`` if any step raises; the error is printed.
    """
    try:
        # Entity extraction
        entities = extract_entities(text, ner_model, tokenizer)

        # Construct a query for the QA model
        if entities['procedure']:
            query = f"What is the impact of {entities['procedure'][0]}?"
        else:
            query = "What is the main impact described?"

        # Passage retrieval. The encoded pair is moved to the QA model's
        # device so the forward pass works wherever that model lives.
        # A single sequence needs no padding, and leaving it out keeps the
        # argmax below from ever selecting a pad position.
        qa_device = next(qa_model.parameters()).device
        qa_encoding = tokenizer(
            query,
            document,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        ).to(qa_device)
        input_ids = qa_encoding['input_ids']

        with torch.no_grad():
            qa_outputs = qa_model(
                input_ids,
                attention_mask=qa_encoding['attention_mask'],
                token_type_ids=qa_encoding['token_type_ids'],
            )

        start_idx = int(torch.argmax(qa_outputs.start_logits))
        end_idx = int(torch.argmax(qa_outputs.end_logits)) + 1  # exclusive end

        if end_idx <= start_idx:
            # Start and end are predicted independently and can be inverted.
            answer = ""
        else:
            answer_ids = input_ids[0][start_idx:end_idx].tolist()
            answer = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(answer_ids)
            )

        return entities, answer
    except Exception as e:
        # Best-effort boundary: report the failure and signal it with Nones.
        print(f"An error occurred: {e}")
        return None, None

# Example usage
# raw_text is the sentence sent through the NER model; document is the passage
# the QA model searches for a supporting answer span.
raw_text = "Babuk is a ransomware that can stop specific services related to backups. It is used for Credential Access."
document = "Babuk is a ransomware strain that emerged in 2021. It is known for its ability to encrypt files and steal sensitive data. One of its notable features is that it can stop specific services related to backups, making it harder for victims to recover their data without paying the ransom. This capability is part of its credential access tactics, as it attempts to prevent easy recovery and increase the pressure on victims to pay."

# Both results are None when classify_and_retrieve hits an error; the error
# itself is printed inside that function.
entities, supporting_passage = classify_and_retrieve(raw_text, ner_model, qa_model, tokenizer, document)
print(f"Extracted entities: {entities}")
print(f"Supporting passage: {supporting_passage}")



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.357
1000,0.0333
1500,0.0036


config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


An error occurred: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)
Extracted entities: None
Supporting passage: None
