In [8]:
import spacy
from spacy.language import Language
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load spaCy's larger model for better NER capabilities
nlp = spacy.load("en_core_web_lg")  # or "en_core_web_lg"

# Load the Hugging Face tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bnsapa/cybersecurity-ner")
model = AutoModelForTokenClassification.from_pretrained("bnsapa/cybersecurity-ner")

# Define a custom component to use Hugging Face model for NER
@Language.component("cybersecurity_ner")
def cybersecurity_ner(doc):
    tokens = [token.text for token in doc]
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**inputs).logits

    predicted_token_class_indices = torch.argmax(outputs, dim=2).squeeze().tolist()
    predicted_labels = [model.config.id2label[idx] for idx in predicted_token_class_indices]

    # Assign labels only to the first token of each entity
    previous_word_id = None
    for i, token in enumerate(doc):
        word_id = inputs.word_ids()[i]
        if word_id != previous_word_id:
            if predicted_labels[i] != 'O':  # Only assign non-'O' labels
                # Custom logic to improve accuracy
                if predicted_labels[i] in ["B-Malware", "B-Process"]:
                    token.ent_type_ = predicted_labels[i]
                # Prevent incorrect labels for known terms
                elif token.text.lower() in ["applicationimpersonation", "set-casmailbox"]:
                    token.ent_type_ = "B-Process"
                else:
                    token.ent_type_ = predicted_labels[i]
        previous_word_id = word_id

    return doc

# Add the custom NER component to the spaCy pipeline
nlp.add_pipe("cybersecurity_ner", last=True)

# Sample text for cybersecurity NER
text = """
Aoqin Dragon has used a dropper that employs a worm infection strategy using a removable device to breach a secure network environment.[1]
"""
# Process the text using the spaCy pipeline
doc = nlp(text)

# Display recognized entities from spaCy and the custom component
print("spaCy Recognized Entities:")
for ent in doc.ents:
    print(f"Token: {ent.text}, Label: {ent.label_}")

print("\nCustom Model Recognized Entities:")
output_found = False
for token in doc:
    if token.ent_type_ and token.ent_type_ != "O":  # Only show non-"O" labels
        print(f"Token: {token.text}, Label: {token.ent_type_}")
        output_found = True

if not output_found:
    print("No entities found by the custom model.")

spaCy Recognized Entities:
Token: Aoqin Dragon, Label: B-Organization

Custom Model Recognized Entities:
Token: Aoqin, Label: B-Organization
Token: Dragon, Label: PERSON
Token: used, Label: B-Organization


In [1]:
import spacy
from spacy.language import Language
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the Hugging Face tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bnsapa/cybersecurity-ner")
model = AutoModelForTokenClassification.from_pretrained("bnsapa/cybersecurity-ner")

# Initialize a blank spaCy pipeline
nlp = spacy.blank("en")

# Define a custom component to use the Hugging Face model for NER
@Language.component("cybersecurity_ner")
def cybersecurity_ner(doc):
    # Tokenize the doc using the Hugging Face tokenizer
    tokens = [token.text for token in doc]

    # Handle potential issues with tokenization
    try:
        inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, truncation=True, padding=True)
    except Exception as e:
        print(f"Tokenization error: {e}")
        return doc

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs).logits

    # Extract token predictions and map them to their labels
    predicted_token_class_indices = torch.argmax(outputs, dim=2).squeeze().tolist()
    predicted_labels = [model.config.id2label[idx] for idx in predicted_token_class_indices]

    # Initialize variables for subword processing
    subword_mask = inputs.word_ids()
    previous_word_id = None
    full_word = ""
    full_word_label = ""

    for i, token in enumerate(doc):
        word_id = subword_mask[i]

        if word_id != previous_word_id:
            # Finalize the previous word if needed
            if previous_word_id is not None and full_word_label:
                doc[previous_word_id].ent_type_ = full_word_label
            
            # Reset for new word
            full_word = token.text
            full_word_label = predicted_labels[i] if predicted_labels[i] != 'O' else ''
        else:
            # Accumulate the subword tokens
            full_word += token.text.replace("##", "")
        
        previous_word_id = word_id

    # Assign the last processed word
    if previous_word_id is not None and full_word_label:
        doc[previous_word_id].ent_type_ = full_word_label

    # Add additional label handling
    for token in doc:
        if token.text.lower().startswith("alert") or token.text.lower().startswith("report"):
            token.ent_type_ = "Alerting or Reporting"
        elif "registry key" in token.text.lower():
            token.ent_type_ = "Registry Keys"
        elif token.text.startswith("HKLM\\") or "SOFTWARE" in token.text.upper() or "Microsoft" in token.text:
            token.ent_type_ = "Registry Keys"
        elif token.text.startswith("\\"):
            token.ent_type_ = "Paths"

    return doc

# Add the custom NER component to the spaCy pipeline
nlp.add_pipe("cybersecurity_ner", last=True)

# Sample text to test
text = ('''Prevent administrator accounts from being enumerated when an application is elevating through UAC since it can lead to the disclosure of account names. The Registry key is located HKLM\\ SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Policies\\CredUI\\EnumerateAdministrators. It can be disabled through GPO: Computer Configuration > [Policies] > Administrative Templates > Windows Components > Credential User Interface: E numerate administrator accounts on elevation. [8]''')


# Process the text
doc = nlp(text)

# Display the recognized entities
output_found = False
for token in doc:
    if token.ent_type_ and token.ent_type_ != "O":  # Only show non-"O" labels
        print(f"Token: {token.text}, Label: {token.ent_type_}")
        output_found = True

if not output_found:
    print("No entities found.")


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Token: UAC, Label: B-System
Token: HKLM\, Label: Registry Keys
Token: SOFTWARE\Microsoft\Windows\CurrentVersion\Policies\CredUI\EnumerateAdministrators, Label: Registry Keys
