In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [88]:
import os

import pprint
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document

from langchain_openai import ChatOpenAI

os.environ['OPENAI_API_KEY'] = 'xxx'

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [89]:
df = pd.read_csv('./DATA/TTPHunter_dataset.csv')

In [90]:
df

Unnamed: 0,Domain,Technique ID,Technique Name,Sentences
0,Enterprise,T1543,Create or Modify System Process: Windows Service,Carbanak malware installs itself as a service ...
1,Enterprise,T1562,Impair Defenses: Disable or Modify System Fire...,Carbanak may use netsh to add local firewall r...
2,Enterprise,T1036,Masquerading: Masquerade Task or Service,Carbanak has copied legitimate service names t...
3,Enterprise,T1036,Masquerading: Match Legitimate Name or Location,"Carbanak has named malware ""svchost.exe,"" whic..."
4,Enterprise,T1588,Obtain Capabilities: Tool,Carbanak has obtained and used open-source too...
...,...,...,...,...
8382,Enterprise,T1573,Encrypted Channel: Symmetric Cryptography,NETEAGLE will decrypt resources it downloads w...
8383,Enterprise,T1041,Exfiltration Over C2 Channel,NETEAGLE is capable of reading files over the ...
8384,Enterprise,T1083,File and Directory Discovery,NETEAGLE allows adversaries to enumerate and m...
8385,Enterprise,T1057,Process Discovery,NETEAGLE can send process listings over the C2...


In [91]:
len(set(df["Technique ID"]))

50

In [92]:
df = df[["Technique ID", "Sentences"]]

In [94]:
train, test = train_test_split(df, test_size=0.005, random_state=0)

In [95]:
len(test)

42

In [10]:
### embedding
model = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
embeddings = HuggingFaceEmbeddings(model_name = model)

#embeddings=OpenAIEmbeddings()

docs = []
for index, row in train.iterrows():
    doc = Document(page_content=row["Sentences"], metadata={"TTP_ID": row["Technique ID"]})
    docs.append(doc)

print(len(docs))

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


8345


In [28]:
docs[0]

Document(page_content='Wizard Spider has exfiltrated domain credentials and network enumeration information over command and control (C2) channels.[6]', metadata={'TTP_ID': 'T1041'})

In [29]:
db = FAISS.from_documents(docs, embeddings)

In [30]:
db.save_local("TTPHunter_50_TTPs")

In [11]:
db = FAISS.load_local("TTPHunter_50_TTPs", embeddings, allow_dangerous_deserialization=True)

In [12]:
test

Unnamed: 0,Technique ID,Sentences
6290,T1082,Unknown Logger can obtain information about th...
6326,T1036,Skidmap has created a fake rm binary to replac...
6494,T1012,Carberp has searched the Image File Execution ...
2632,T1056,FakeM contains a keylogger module.[1]
7128,T1113,ZxShell can capture screenshots.[1]
5114,T1036,The CozyCar dropper has masqueraded a copy of ...
5659,T1053,Bad Rabbit’s infpub.dat file creates a schedul...
6126,T1106,PLEAD can use ShellExecute to execute applicat...
1921,T1071,APT32 has used JavaScript that communicates ov...
882,T1518,Operation Wocao has used scripts to detect sec...


### Few-Shot Learning

In [53]:
from langchain_openai import ChatOpenAI

# Choose the LLM that will drive the agent
llm = ChatOpenAI(model="gpt-4o", temperature=0)###, model_kwargs={"top_p": 0.0})
#llm = ChatOpenAI(model="chatgpt-4o-latest", temperature=0)

#llm.invoke(prompt)


predicted_labels = []
# Iterating over the DataFrame row by row
for index, row in test.iterrows():
    #print(f"Index: {index}")
    text_to_predict = row["Sentences"]

    ##### Prompt 1
    search_results = db.similarity_search_with_score(text_to_predict, k=45)

    
    xs = ["\nText:"+ doc.page_content.lower().strip()+"\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results if score < 2]
    #xs = ["\nText:'"+ doc.page_content.lower().strip()+"'\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results]
    

    xs = ''.join(xs)+"\nInputText:"+ text_to_predict.lower().strip() +"\nClassification:"
    xs = "Your task is to classify the last sentence of 'InputText' based on the provided examples. Each example shows a sentence, inverse similarity to InputText (the lower the better), and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.\n" + xs
    #xs = "You are an expert in text classification. Below are examples of texts with their classifications based on their similarity to the 'InputText'. For the 'InputText' provided, choose the most appropriate classification based on the examples given. Only return the classification label (e.g., TXXXX). Use the closest similarity as the primary criterion for classification.\n\n" + xs
    #####
    
    ### search
    '''
    search_results = db.similarity_search_with_score(text_to_predict, k=55)
    xs = ["\n"+ "Example "+ str(i) +":\nText:"+ doc.page_content.lower()+"\nClassification:"+doc.metadata['TTP_ID']+"\n" for i, (doc, _) in enumerate(search_results)]
    ##xs = ["\nText:"+" ".join(str(doc.page_content).lower().split())+"\nClassification:"+doc.metadata['TTP_ID'] for doc, _ in search_results]
    

    ### mmr search
    #search_results = db.max_marginal_relevance_search(text_to_predict,k=45, fetch_k=250)
    #xs = ["\nText:"+doc.page_content.lower()+"\nClassification:"+doc.metadata['TTP_ID'] for doc in search_results]

    
    xs += "\n\nNote: The examples show a pattern where certain attack behaviors correspond to specific classification labels. Use this pattern recognition to classify the new text accurately.\nNow, classify the following text:\n"
    xs = ''.join(xs)+"\nText:"+ text_to_predict.lower() +"\nClassification:"
    #xs = "Your task is to classify the last sentence of 'Text' based on the provided examples. Each example shows a sentence and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.\n" + xs
    xs = "Your task is to classify the last sentence of 'Text' based on the diverse examples provided below. Learn the patterns from these examples to make accurate classifications. Output only the classification label beginning with 'T'.\n\nExamples:\n" + xs
    '''
    #print(xs)
    prompt = xs#prompt#"Write a poem about a cat chasing a butterfly."
    result = llm.invoke(prompt)
    predicted_label = result.content
    
    #predicted_label = most_common_ttp_id
    #print(predicted_label)
    predicted_labels.append(predicted_label)
    print("actual:", row["Technique ID"], ", predicted:", predicted_label)
    #break

actual: T1082 , predicted: T1082
actual: T1036 , predicted: T1036
actual: T1012 , predicted: T1012
actual: T1056 , predicted: T1056
actual: T1113 , predicted: T1113
actual: T1036 , predicted: T1036
actual: T1053 , predicted: T1053
actual: T1106 , predicted: T1106
actual: T1071 , predicted: T1071
actual: T1518 , predicted: T1518
actual: T1083 , predicted: T1083
actual: T1071 , predicted: T1071
actual: T1562 , predicted: T1562
actual: T1021 , predicted: T1021
actual: T1074 , predicted: T1074
actual: T1112 , predicted: T1112
actual: T1218 , predicted: T1218
actual: T1057 , predicted: T1057
actual: T1059 , predicted: T1059
actual: T1106 , predicted: T1055
actual: T1016 , predicted: T1016
actual: T1102 , predicted: T1102
actual: T1547 , predicted: T1547
actual: T1562 , predicted: T1562
actual: T1069 , predicted: T1069
actual: T1555 , predicted: T1555
actual: T1562 , predicted: T1562
actual: T1056 , predicted: T1056
actual: T1204 , predicted: T1204
actual: T1140 , predicted: T1140
actual: T1

### Evaluate

In [54]:
cleaned_labels = [label.replace('Classification:', '') for label in predicted_labels]

# Example vectors containing strings
y_true = test["Technique ID"].values#["cat", "dog", "fish", "cat", "dog"]
#y_true = train_data.Key
y_pred = cleaned_labels#.values#["cat", "dog", "cat", "cat", "dog"]

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

# Calculate precision
precision = precision_score(y_true, y_pred, average='weighted')
print(f"Precision: {precision}")

# Calculate recall
recall = recall_score(y_true, y_pred, average='weighted')
print(f"Recall: {recall}")

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.9523809523809523
Precision: 0.9761904761904762
Recall: 0.9523809523809523
F1 Score: 0.9603174603174602


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
print(xs)

Your task is to classify the last sentence of 'InputText' based on the provided examples. Each example shows a sentence, inverse similarity to InputText (the lower the better), and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.

Text:powerstats uses powershell for obfuscation and execution.[1][4][5]
Similarity:0.36848262
Classification:T1059

InputText:powerstats uses character replacement, powershell environment variables, and xor encoding to obfuscate code. powerstats's backdoor code is a multi-layer obfuscated, encoded, and compressed blob. [3][4] powerstats has used powershell code with custom string obfuscation [5]
Classification:


In [2]:
["\nClassification:"+str(score)+"\n" for doc, score in search_results]

NameError: name 'search_results' is not defined

### Few Shot learning 2

In [53]:
# Aggregate predictions using majority voting
from collections import Counter

def majority_voting(predictions):
    count = Counter(predictions)
    return count.most_common(1)[0][0]

In [55]:
import random

def create_prompt(template, search_results, text_to_predict):
    xs = ["\nText:"+ doc.page_content.lower()+"\nClassification:"+doc.metadata['TTP_ID'] for doc, _ in search_results]
    xs = ''.join(xs)+"\nText:"+ text_to_predict.lower() +"\nClassification:"
    return template + xs

def split_list(lst):
    random.shuffle(lst)  # Shuffle the list to ensure randomness
    n = len(lst)
    third = n // 3
    remainder = n % 3

    # Determine the splitting points
    split1 = third
    split2 = third * 2

    if remainder == 1:
        split1 += 1
    elif remainder == 2:
        split1 += 1
        split2 += 1

    # Split the list
    first_third = lst[:split1]
    second_third = lst[split1:split2]
    third_third = lst[split2:]

    return first_third, second_third, third_third



predicted_labels = []
# Iterating over the DataFrame row by row
for index, row in test.iterrows():
    #print(f"Index: {index}")
    text_to_predict = row["Sentences"]

    search_results = db.similarity_search_with_score(text_to_predict, k=150)

    template1 = "Your task is to classify the last sentence of 'Text' based on the provided examples. Each example shows a sentence and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.\n"
    template2 = "Please classify the last sentence of 'Text' based on the provided examples. Each example shows a sentence and its classification label. Use the patterns from these examples to classify the given text correctly. Return only the classification label starting with 'T'.\n"
    template3 = "Your task is to determine the classification of the last sentence of 'Text' using the provided examples. Each example pairs a sentence with its classification label. Analyze these examples to classify the given text. Output only the classification label starting with 'T'.\n"

    ## Split the list
    first_part, second_part, third_part = split_list(search_results)

    
    prompt1 = create_prompt(template1, first_part, text_to_predict)
    prompt2 = create_prompt(template2, second_part, text_to_predict)
    prompt3 = create_prompt(template3, third_part, text_to_predict)
    
    # Collect predictions
    prompts = [prompt1, prompt2, prompt3]
    predictions = []
    
    for prompt in prompts:
        
        prediction = llm.invoke(prompt).content
        predictions.append(prediction)

    print(predictions)
    
    
    final_prediction = majority_voting(predictions)
    #print("Final Prediction:", final_prediction)
    predicted_labels.append(final_prediction)
    print("actual:", row["Technique ID"], ", predicted:", final_prediction)

['T1082', 'T1082', 'T1082']
actual: T1082 , predicted: T1082
['T1036', 'T1036', 'T1036']
actual: T1036 , predicted: T1036
['T1083', 'T1518', 'T1518']
actual: T1012 , predicted: T1518
['T1056', 'T1056', 'T1056']
actual: T1056 , predicted: T1056
['T1113', 'T1113', 'T1113']
actual: T1113 , predicted: T1113
['T1036', 'T1036', 'T1036']
actual: T1036 , predicted: T1036
['T1053', 'T1053', 'T1053']
actual: T1053 , predicted: T1053
['T1106', 'T1106', 'T1106']
actual: T1106 , predicted: T1106
['T1071', 'T1071', 'T1071']
actual: T1071 , predicted: T1071
['T1518', 'T1518', 'T1518']
actual: T1518 , predicted: T1518
['T1083', 'T1083', 'T1083']
actual: T1083 , predicted: T1083
['T1071', 'T1071', 'T1071']
actual: T1071 , predicted: T1071
['T1562', 'T1562', 'T1562']
actual: T1562 , predicted: T1562
['T1021', 'T1021', 'T1021']
actual: T1021 , predicted: T1021
['T1074', 'T1074', 'T1074']
actual: T1074 , predicted: T1074
['T1112', 'T1112', 'T1112']
actual: T1112 , predicted: T1112
['T1218', 'T1218', 'T121

### Evaluate

In [61]:
cleaned_labels = [label.replace('Classification:', '') for label in predicted_labels]

# Example vectors containing strings
y_true = test["Technique ID"].values#["cat", "dog", "fish", "cat", "dog"]
#y_true = train_data.Key
y_pred = cleaned_labels#.values#["cat", "dog", "cat", "cat", "dog"]

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

# Calculate precision
precision = precision_score(y_true, y_pred, average='weighted')
print(f"Precision: {precision}")

# Calculate recall
recall = recall_score(y_true, y_pred, average='weighted')
print(f"Recall: {recall}")

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.9285714285714286
Precision: 0.9642857142857143
Recall: 0.9285714285714286
F1 Score: 0.9365079365079364


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


temperature:0, k=45

Accuracy: 0.9523809523809523
Precision: 0.9761904761904762
Recall: 0.9523809523809523
F1 Score: 0.9603174603174602

temperature:1, k=45

Accuracy: 0.9285714285714286
Precision: 0.9642857142857143
Recall: 0.9285714285714286
F1 Score: 0.9365079365079364

### V2 - SecureBERT Embeddings

In [96]:
### embedding
model2 = "ehsanaghaei/SecureBERT"
embeddings2 = HuggingFaceEmbeddings(model_name = model2)

docs2 = []
for index, row in train.iterrows():
    doc = Document(page_content=row["Sentences"], metadata={"TTP_ID": row["Technique ID"]})
    docs2.append(doc)

print(len(docs2))

No sentence-transformers model found with name ehsanaghaei/SecureBERT. Creating a new one with MEAN pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at ehsanaghaei/SecureBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


8345


In [97]:
db2 = FAISS.from_documents(docs2, embeddings2)


In [108]:
from langchain_openai import OpenAIEmbeddings

embeddings3 = OpenAIEmbeddings(
    model="text-embedding-3-large",
    # With the `text-embedding-3` class
    # of models, you can specify the size
    # of the embeddings you want returned.
    # dimensions=1024
)

db3 = FAISS.from_documents(docs2, embeddings3)


In [13]:
db2

<langchain_community.vectorstores.faiss.FAISS at 0x7b116955bdf0>

In [109]:
from langchain_openai import ChatOpenAI

# Choose the LLM that will drive the agent
llm = ChatOpenAI(model="gpt-4o", temperature=0)###, model_kwargs={"top_p": 0.0})
#llm = ChatOpenAI(model="o1-mini", temperature=1)
#llm = ChatOpenAI(model="chatgpt-4o-latest", temperature=0)

#llm.invoke(prompt)

scores_list = []
predicted_labels = []
# Iterating over the DataFrame row by row
for index, row in test.iterrows():
    #print(f"Index: {index}")
    text_to_predict = row["Sentences"]

    ##### Prompt 1
    search_results = db3.similarity_search_with_score(text_to_predict, k=45)

    
    xs = ["\nText:"+ doc.page_content.lower().strip()+"\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results if score < 2]
    #xs = ["\nText:'"+ doc.page_content.lower().strip()+"'\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results]
    

    xs = ''.join(xs)+"\nInputText:"+ text_to_predict.lower().strip() +"\nClassification:"
    xs = "Your task is to classify the last sentence of 'InputText' based on the provided examples. Each example shows a sentence, inverse similarity to InputText (the lower the better), and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.\n" + xs
    #xs = "You are an expert in text classification. Below are examples of texts with their classifications based on their similarity to the 'InputText'. For the 'InputText' provided, choose the most appropriate classification based on the examples given. Only return the classification label (e.g., TXXXX). Use the closest similarity as the primary criterion for classification.\n\n" + xs
    #####
    [scores_list.append(score) for doc, score in search_results]
    ### search
    '''
    search_results = db.similarity_search_with_score(text_to_predict, k=55)
    xs = ["\n"+ "Example "+ str(i) +":\nText:"+ doc.page_content.lower()+"\nClassification:"+doc.metadata['TTP_ID']+"\n" for i, (doc, _) in enumerate(search_results)]
    ##xs = ["\nText:"+" ".join(str(doc.page_content).lower().split())+"\nClassification:"+doc.metadata['TTP_ID'] for doc, _ in search_results]
    

    ### mmr search
    #search_results = db.max_marginal_relevance_search(text_to_predict,k=45, fetch_k=250)
    #xs = ["\nText:"+doc.page_content.lower()+"\nClassification:"+doc.metadata['TTP_ID'] for doc in search_results]

    
    xs += "\n\nNote: The examples show a pattern where certain attack behaviors correspond to specific classification labels. Use this pattern recognition to classify the new text accurately.\nNow, classify the following text:\n"
    xs = ''.join(xs)+"\nText:"+ text_to_predict.lower() +"\nClassification:"
    #xs = "Your task is to classify the last sentence of 'Text' based on the provided examples. Each example shows a sentence and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.\n" + xs
    xs = "Your task is to classify the last sentence of 'Text' based on the diverse examples provided below. Learn the patterns from these examples to make accurate classifications. Output only the classification label beginning with 'T'.\n\nExamples:\n" + xs
    '''
    #print(xs)
    prompt = xs#prompt#"Write a poem about a cat chasing a butterfly."
    result = llm.invoke(prompt)
    predicted_label = result.content
    
    #predicted_label = most_common_ttp_id
    #print(predicted_label)
    predicted_labels.append(predicted_label)
    print("actual:", row["Technique ID"], ", predicted:", predicted_label)
    #break

actual: T1082 , predicted: T1082
actual: T1036 , predicted: T1036
actual: T1012 , predicted: T1012
actual: T1056 , predicted: T1056
actual: T1113 , predicted: T1113
actual: T1036 , predicted: T1036
actual: T1053 , predicted: T1053
actual: T1106 , predicted: T1106
actual: T1071 , predicted: T1105
actual: T1518 , predicted: T1518
actual: T1083 , predicted: T1083
actual: T1071 , predicted: T1071
actual: T1562 , predicted: T1562
actual: T1021 , predicted: T1021
actual: T1074 , predicted: T1074
actual: T1112 , predicted: T1112
actual: T1218 , predicted: T1218
actual: T1057 , predicted: T1057
actual: T1059 , predicted: T1059
actual: T1106 , predicted: T1055
actual: T1016 , predicted: T1016
actual: T1102 , predicted: T1102
actual: T1547 , predicted: T1547
actual: T1562 , predicted: T1562
actual: T1069 , predicted: T1069
actual: T1555 , predicted: T1555
actual: T1562 , predicted: T1562
actual: T1056 , predicted: T1056
actual: T1204 , predicted: T1204
actual: T1140 , predicted: T1140
actual: T1

### No LLM Call

In [22]:
from langchain_openai import ChatOpenAI

# Choose the LLM that will drive the agent
llm = ChatOpenAI(model="gpt-4o", temperature=0)###, model_kwargs={"top_p": 0.0})
#llm = ChatOpenAI(model="chatgpt-4o-latest", temperature=0)

#llm.invoke(prompt)

scores_list = []
predicted_labels = []
# Iterating over the DataFrame row by row
for index, row in test.iterrows():
    #print(f"Index: {index}")
    text_to_predict = row["Sentences"]

    ##### Prompt 1
    search_results = db2.similarity_search_with_score(text_to_predict, k=45)

    
    xs = ["\nText:"+ doc.page_content.lower().strip()+"\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results if score < 2]
    #xs = ["\nText:'"+ doc.page_content.lower().strip()+"'\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results]
    

    xs = ''.join(xs)+"\nInputText:"+ text_to_predict.lower().strip() +"\nClassification:"
    xs = "Your task is to classify the last sentence of 'InputText' based on the provided examples. Each example shows a sentence, inverse similarity to InputText (the lower the better), and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.\n" + xs
    #xs = "You are an expert in text classification. Below are examples of texts with their classifications based on their similarity to the 'InputText'. For the 'InputText' provided, choose the most appropriate classification based on the examples given. Only return the classification label (e.g., TXXXX). Use the closest similarity as the primary criterion for classification.\n\n" + xs
    #####
    [scores_list.append(score) for doc, score in search_results]
    
    #print(xs)
    #prompt = xs#prompt#"Write a poem about a cat chasing a butterfly."
    #result = llm.invoke(prompt)
    #predicted_label = result.content
    
    #predicted_label = most_common_ttp_id
    #print(predicted_label)
    #predicted_labels.append(predicted_label)
    #print("actual:", row["Technique ID"], ", predicted:", predicted_label)
    #break

In [15]:
### Evaluate

In [110]:
cleaned_labels = [label.replace('Classification:', '') for label in predicted_labels]

# Example vectors containing strings
y_true = test["Technique ID"].values#["cat", "dog", "fish", "cat", "dog"]
#y_true = train_data.Key
y_pred = cleaned_labels#.values#["cat", "dog", "cat", "cat", "dog"]

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

# Calculate precision
precision = precision_score(y_true, y_pred, average='weighted')
print(f"Precision: {precision}")

# Calculate recall
recall = recall_score(y_true, y_pred, average='weighted')
print(f"Recall: {recall}")

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.9285714285714286
Precision: 0.9761904761904762
Recall: 0.9285714285714286
F1 Score: 0.9396825396825396


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Secure Bert Embeddings - 
Accuracy: 0.9523809523809523
Precision: 0.9880952380952381
Recall: 0.9523809523809523
F1 Score: 0.9619047619047618

In [17]:
from statistics import mean
print(max(scores_list))
print(mean(scores_list))
print(min(scores_list))

0.28191215
0.15465398
4.058244e-12


In [23]:
from statistics import mean
print(max(scores_list))
print(mean(scores_list))
print(min(scores_list))

0.28191215
0.15465398
4.058244e-12


In [18]:
# Find incorrectly classified labels
incorrect_classifications = []
for index, (true_label, pred_label) in enumerate(zip(y_true, y_pred)):
    if true_label != pred_label:
        incorrect_classifications.append({
            'Index': index,
            'True Label': true_label,
            'Predicted Label': pred_label
        })

# Output the results
if incorrect_classifications:
    print("Incorrectly Classified Labels:")
    for item in incorrect_classifications:
        print(f"Index {item['Index']}: True Label = {item['True Label']}, Predicted Label = {item['Predicted Label']}")
else:
    print("All labels are correctly classified.")

Incorrectly Classified Labels:
Index 8: True Label = T1071, Predicted Label = T1105
Index 19: True Label = T1106, Predicted Label = T1055
Index 32: True Label = T1036, Predicted Label = T1553


In [52]:
print(xs)

Your task is to classify the last sentence of 'InputText' based on the provided examples. Each example shows a sentence, inverse similarity to InputText (the lower the better), and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.

Text:netwalker's powershell script has been obfuscated with multiple layers including base64 and hexadecimal encoding and xor-encryption, as well as obfuscated powershell functions and variables. netwalker's dll has also been embedded within the powershell script in hex format.[1][2]
Similarity:0.14041124
Classification:T1027

Text:machete has used pyobfuscate, zlib compression, and base64 encoding for obfuscation. machete has also used some visual obfuscation techniques by naming variables as combinations of letters to hinder analysis.[4][1]
Similarity:0.17555375
Classification:T1027

Text:comrat has used encryption and base64 to obfu

In [22]:
train["Technique ID"].value_counts()["T1036"]#.head()

245

### SImilarity Investigation

In [41]:
score_list = []
very_similar_string = "kerrdown can use a vbs base64 decoder function published by motobit"
totaly_wrong_string = "we have a nice weather today"
search_results2 = db2.similarity_search_with_score(very_similar_string, k=45)
[score_list.append(score) for doc, score in search_results2]
["\nText:"+ doc.page_content.lower().strip()+"\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results2 if score < 2]
#score_list

['\nText:kerrdown can use a vbs base64 decoder function published by motobit.[2]\nSimilarity:0.12265928\nClassification:T1059\n',
 '\nText:pteranodon can use a dynamic windows hashing algorithm to map api components.[4]\nSimilarity:0.23756832\nClassification:T1027\n',
 '\nText:carrotball has used a custom base64 alphabet to decode files.[1]\nSimilarity:0.23999828\nClassification:T1027\n',
 '\nText:mechaflounder has the ability to use base16 encoded strings in c2.[1]\nSimilarity:0.24867018\nClassification:T1132\n',
 '\nText:fysbis has masqueraded as trusted software rsyncd and dbus-inotifier.[2]\nSimilarity:0.25146213\nClassification:T1036\n',
 '\nText:konni has used a custom base64 key to encode stolen data before exfiltration.[4]\nSimilarity:0.25383323\nClassification:T1132\n',
 '\nText:esentutl can be used to copy files from a given url.[3]\nSimilarity:0.25800383\nClassification:T1105\n',
 '\nText:charmpower can send additional modules over c2 encoded with base64.[1]\nSimilarity:0.26

### V3

In [99]:
# Choose the LLM that will drive the agent
llm = ChatOpenAI(model="gpt-4o", temperature=0)###, model_kwargs={"top_p": 0.0})
#llm = ChatOpenAI(model="chatgpt-4o-latest", temperature=0)

#llm.invoke(prompt)

def predict_label(text_to_predict):
    ##### Prompt 1
    search_results = db2.similarity_search_with_score(text_to_predict, k=45)

    
    xs = ["\nText:"+ doc.page_content.lower().strip()+"\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results if score < 2]
    #xs = ["\nText:'"+ doc.page_content.lower().strip()+"'\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results]
    

    xs = ''.join(xs)+"\nInputText:"+ text_to_predict.lower().strip() +"\nClassification:"
    xs = """You are a MITRE ATT&CK TTP classification expert.
    Your task is to classify the 'InputText' based on the provided examples below.
    Each example shows a sentence, inverse similarity to InputText (the lower the better),
    and its corresponding classification label starting with 'T'.
    Use these examples to determine the correct classification for the given text (InputText).
    If 'InputText' is completely not related to any of the provided examples, return class 'T0000'.
    To determine if the 'InputText' is not related to the given examples, you can use the 'Similarity' propery of the examples.
    Low 'Similarity' values indicate the high similarity in strings.
    Return only the classification label starting with 'T' or 'T0000' if you found no appropriate class for 'InputText'.\n""" + xs
    #xs = "You are an expert in text classification. Below are examples of texts with their classifications based on their similarity to the 'InputText'. For the 'InputText' provided, choose the most appropriate classification based on the examples given. Only return the classification label (e.g., TXXXX). Use the closest similarity as the primary criterion for classification.\n\n" + xs
    #####
    
    #print(xs)
    prompt = xs#prompt#"Write a poem about a cat chasing a butterfly."
    result = llm.invoke(prompt)
    predicted_label = result.content
    return predicted_label


scores_list = []
predicted_labels = []
# Iterating over the DataFrame row by row
for index, row in test.iterrows():
    #print(f"Index: {index}")
    text_to_predict = row["Sentences"]

    predicted_label = predict_label(text_to_predict)
    
    #predicted_label = most_common_ttp_id
    #print(predicted_label)
    predicted_labels.append(predicted_label)
    print("actual:", row["Technique ID"], ", predicted:", predicted_label)
    #break

actual: T1082 , predicted: T1082
actual: T1036 , predicted: T1036
actual: T1012 , predicted: T1012
actual: T1056 , predicted: T1056
actual: T1113 , predicted: T1113
actual: T1036 , predicted: T1036
actual: T1053 , predicted: T1053
actual: T1106 , predicted: T1106
actual: T1071 , predicted: T1105
actual: T1518 , predicted: T1518
actual: T1083 , predicted: T1083
actual: T1071 , predicted: T1071
actual: T1562 , predicted: T1562
actual: T1021 , predicted: T1021
actual: T1074 , predicted: T1074
actual: T1112 , predicted: T1112
actual: T1218 , predicted: T1218
actual: T1057 , predicted: T1057
actual: T1059 , predicted: T1059
actual: T1106 , predicted: T1055
actual: T1016 , predicted: T1016
actual: T1102 , predicted: T1102
actual: T1547 , predicted: T1547
actual: T1562 , predicted: T1562
actual: T1069 , predicted: T1069
actual: T1555 , predicted: T1555
actual: T1562 , predicted: T1562
actual: T1056 , predicted: T1056
actual: T1204 , predicted: T1204
actual: T1140 , predicted: T1140
actual: T1

In [100]:
print(xs)

You are a MITRE ATT&CK TTP classification expert.
    Your task is to classify the 'InputText' based on the provided examples.
    Each example shows a sentence, inverse similarity to InputText (the lower the better),
    and its corresponding classification label starting with 'T'.
    Use these examples to determine the correct classification for the given text (InputText).
    If 'InputText' is completely not related to any of the provided examples, return 'T0000'.
    To determine if the 'InputText' is not related to the given examples, you can use the 'Similarity' propery of the examples.
    Low 'Similarity' values indicate the similarity in strings.
    Return only the classification label starting with 'T' or 'T0000' if you found no appropriate class for 'InputText'.

Text:netwalker's powershell script has been obfuscated with multiple layers including base64 and hexadecimal encoding and xor-encryption, as well as obfuscated powershell functions and variables. netwalker's dll h

In [101]:
cleaned_labels = [label.replace('Classification:', '') for label in predicted_labels]

# Example vectors containing strings
y_true = test["Technique ID"].values#["cat", "dog", "fish", "cat", "dog"]
#y_true = train_data.Key
y_pred = cleaned_labels#.values#["cat", "dog", "cat", "cat", "dog"]

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

# Calculate precision
precision = precision_score(y_true, y_pred, average='weighted')
print(f"Precision: {precision}")

# Calculate recall
recall = recall_score(y_true, y_pred, average='weighted')
print(f"Recall: {recall}")

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.9285714285714286
Precision: 0.9761904761904762
Recall: 0.9285714285714286
F1 Score: 0.9396825396825396


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [102]:
predict_label("We have a beautiful wether today")

'T0000'

### Text to Predict

In [89]:
text_to_predict = """
sofacy uses dealerschoice to target european government agency
robert falcone	by robert falcone
march 15, 2018 at 1:00 pm
category: unit 42 tags: dealerschoice, european government agency, sofacy summary back in october 2016, unit 42 published an initial analysis on a flash exploitation framework used by the sofacy threat group called dealerschoice. the attack consisted of microsoft word delivery documents that contained adobe flash objects capable of loading additional malicious flash objects embedded in the file or directly provided by a command and control server. sofacy continued to use dealerschoice throughout the fall of 2016, which we also documented in our december 2016 publication discussing sofacy s larger campaign. on march 12 and march 14, we observed the sofacy group carrying out an attack on a european government agency involving an updated variant of dealerschoice. the updated dealerschoice documents used a similar process to obtain a malicious flash object from a c2 server, but the inner mechanics of the flash object contained significant differences in comparison to the original samples we analyzed. one of the differences was a particularly clever evasion technique: to our knowledge this has never been observed in use. with the previous iterations of dealerschoice samples, the flash object would immediately load and begin malicious tasks. in the march attacks, the flash object is only loaded if the user scrolls through the entire content of the delivery document and views the specific page the flash object is embedded on. also, dealerschoice requires multiple interactions with an active c2 server to successfully exploit an end system. the overall process to result in a successful exploitation is: user must open the microsoft word email attachment
user must scroll to page three of the document, which will run the dealerschoice flash object
the flash object must contact an active c2 server to download an additional flash object containing exploit code
the initial flash object must contact the same c2 server to download a secondary payload
victim host must have a vulnerable version of flash installed the attack the attack involving this updated variant of dealerschoice was targeting a european government organization. the attack relied on a spear-phishing email with a subject of defence & security 2018 conference agenda that had an attachment with a filename of defence & security 2018 conference . the attached document contains a conference agenda that the sofacy group appears to have copied directly from the website for the underwater defence & security 2018 conference here. opening the attached defence & security 2018 conference file does not immediately run malicious code to exploit the system. instead, the user must scroll to the third page of the document, which will load a flash object that contains actionscript that will attempt to exploit the user s system to install a malicious payload. the flash object embedded within this delivery document is a variant of an exploit tool that we call dealerschoice. this suggests that the sofacy group is confident that the targeted individuals would be interested enough in the content to peruse through it. we analyzed the document to determine the reason that the malicious flash object only ran when the user scrolled to the third page. according to the file, the dealerschoice loader swf exists after the covert-shores- image file within the delivery document. this image file exists on the third page of the document, so the user would have to scroll down in the document to this third page to get the swf file to run. the user may not notice the flash object on the page, as word displays it as a tiny black box in the document, as seen in figure 1. this is an interesting anti-sandbox technique, as it requires human interaction prior to the document exhibiting any malicious activity. updated dealerschoice this dealerschoice flash object shares a similar process to previous variants; however, it appears that the sofacy actors have made slight changes to its internal code. also, it appears that the actors used actionscript from an open source video player called f4player , which is freely available on github with the following description: f4player is an open source flash (as3) video player and library project. it is so small that it is only 10kb (with skin file) and totally free under gpl license. the sofacy developer modified the f4player s actionscript to include additional code to load an embedded flash object. the additions include code to decrypt an embedded flash object and an event handler that calls a newly added function ( skinevent2 ) that plays the decrypted object, as seen in the code snippet below: the above code allows dealerschoice to load a second swf object, specifically loading it with an argument that includes a c2 url of hxxp://ndpmedia24[.]com/0pq6m4f.m3u8 . the embedded swf extracts the domain from the c2 url passed to it and uses it to craft a url to get the server s file in order to obtain permissions to load additional flash objects from the c2 domain. the actionscript relies on event listeners to call specific functions when the event event.complete is triggered after successful http requests are issued to the c2 server. the event handlers call functions with the following names, which includes an incrementing number that represents the order in which the functions are called: with these event handlers created, the actionscript starts by gathering system data from the flash.system.capabilities.serverstring property (just like in the original samples) and issues an http get with the system data as a parameter to the c2 url that was passed as an argument to the embedded swf when it was initially loaded. when this http request completes, the event listener will call the onload1 function. the onload1 function parses the response data from the request to the c2 url using regular expressions. according to the following code snippet, it appears the regular expression is looking for a hexadecimal string after / and before /sec , as well as any string between /hls/ and /tracks : the regular expressions suggest that the c2 server responds with content that is meant to resemble http live steaming (hls) traffic, which is a protocol that uses http to deliver audio and video files for streaming. the use of hls coincides with the use of actionscript code from the f4player to make the traffic seem legitimate. the variables storing the results of the regular expression matches are used within the actionscript for further interaction with the c2 server. the following is a list of these variables and their purpose: the onload1 function then sends an http get request to the c2 domain using the value stored in the r3 variable as a url. when this http request completes, the event listener will call the onload2 function. the onload2 function decrypts the response received from the http request issued in onload1 function. it does so by calling a sub-function to decrypt the content, using the value stored in the r1 variable as a key. the sub-function to decrypt the content skips the first 4 bytes, suggesting that the first four bytes of the downloaded content is in cleartext (most likely the fws or cws header to look legitimate). after decrypting the content, the onload2 function will issue another http get request with the system data as a parameter, but this time to the c2 using a url from the r4 variable. when this request completes, the event listener will call the onload3 function. the onload3 function will take the response to the http request in onload2 and treat it as the payload. the actionscript will read each byte of the c2 response and get the hexadecimal value of each byte and create a text array of 4-byte hexadecimal values with 0x prepended and , appended to each using the following code: this hexadecimal string will most likely be a string of shellcode that will contain and decrypt the ultimate portable executable (pe) payload. the string of comma separated hexadecimal values is passed as a parameter when loading the swf file downloaded in onload2 . this function creates an event listener for when the swf file is successfully loaded, which will call the onload5 function. the onload5 function is responsible for adding the newly loaded swf object as a child object to the current running object using the following code: this loads the swf file, effectively running the malicious code on the system. during our analysis, we were unable to coerce the c2 into providing a malicious swf or payload. as mentioned in our previous blogs on dealerschoice, the payload of choice for previous variants was sofacycarberp (seduploader), but we have no evidence to suggest this tool was used in this attack. we are actively researching and will update this blog in the event we discover the malicious flash object and payload delivered in this attack. linkage to prior campaign the delivery document used in this attack was last modified by a user named nick daemoji , which provides a linkage to previous sofacy related delivery documents. the previous documents that used this user name were macro-laden delivery documents that installed sofacycarberp/seduploader payloads, as discussed in talos blog. this overlap also points to a similar social engineering theme between these two campaigns, as both used content from upcoming military and defense conferences as a lure. conclusion the sofacy threat group continues to use their dealerschoice framework to exploit flash vulnerabilities in their attack campaigns. in the most recent variant, sofacy modified the internals of the malicious scripts, but continues to follow the same process used by previous variants by obtaining a malicious flash object and payload directly from the c2 server. unlike previous samples, this dealerschoice used a docx delivery document that required the user to scroll through the document to trigger the malicious flash object. the required user interaction turned out to be an interesting anti-sandbox technique that we had not seen this group perform in the past.
"""

In [90]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

text = "Hello world. How are you? I'm doing great!"
sentences = sent_tokenize(text)

print(sentences)

['Hello world.', 'How are you?', "I'm doing great!"]


[nltk_data] Downloading package punkt to /home/hamzicd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [94]:
###
text_sentences = sent_tokenize(text_to_predict)

text_ttps = []
for sentence in text_sentences:
    text_ttp = predict_label(sentence)
    print(sentence, " ->", text_ttp, "\n---")
    text_ttps.append(text_ttp)


sofacy uses dealerschoice to target european government agency
robert falcone	by robert falcone
march 15, 2018 at 1:00 pm
category: unit 42 tags: dealerschoice, european government agency, sofacy summary back in october 2016, unit 42 published an initial analysis on a flash exploitation framework used by the sofacy threat group called dealerschoice.  -> T0000 
---
the attack consisted of microsoft word delivery documents that contained adobe flash objects capable of loading additional malicious flash objects embedded in the file or directly provided by a command and control server.  -> T1204 
---
sofacy continued to use dealerschoice throughout the fall of 2016, which we also documented in our december 2016 publication discussing sofacy s larger campaign.  -> T1204 
---
on march 12 and march 14, we observed the sofacy group carrying out an attack on a european government agency involving an updated variant of dealerschoice.  -> T0000 
---
the updated dealerschoice documents used a sim

In [95]:
set(text_ttps)

{'T0000',
 'T1027',
 'T1036',
 'T1059',
 'T1071',
 'T1082',
 'T1083',
 'T1105',
 'T1106',
 'T1132',
 'T1140',
 'T1203',
 'T1204',
 'T1497',
 'T1562',
 'T1566',
 'T1574',
 'T1588'}

In [103]:
### 
predict_label("Process Hashes and Process Privilege Checks Upon being loaded, the malware starts a subroutine that hashes each running process on the system, and compares each hash with 3 hardcoded hashes: 0x6403527E → avp.exe associated with Kaspersky AV 0x23214B44 → ns.exe associated with Norton Security 0x651B3005 → ccSvcHst.exe associated with Symantec A global flag PROC_FLAG is used to track which of the 3 processes are running on the system. PROC_FLAG")

'T1057'

In [104]:
predict_label("PROC_FLAG")

'T0000'

### V4 - DB Index Optimization

In [9]:
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import KMeans

# Load SecureBERT
tokenizer = AutoTokenizer.from_pretrained(model2)
model = AutoModel.from_pretrained(model2)

# Move model to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Function to generate SecureBERT embeddings for a single text
def get_embedding(text):
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**tokens)
    # Pool the last hidden state to get sentence embedding
    embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move embedding to CPU
    return embedding

# Extract texts and labels
texts = list(train["Sentences"].values)  # List of text items
labels = list(train["Technique ID"].values)  # List of labels

# Get SecureBERT embeddings for each document, item by item
embeddings = []

for text in texts:
    embedding = get_embedding(text)
    embeddings.append(embedding)

# Convert list of embeddings to a numpy array
embeddings = np.vstack(embeddings)  # Stack embeddings vertically to create a single array

# Normalize embeddings (L2 normalization)
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)





Some weights of RobertaModel were not initialized from the model checkpoint at ehsanaghaei/SecureBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
# Normalize embeddings (L2 normalization)
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Supervised clustering using labels

# Get unique labels
unique_labels = set(labels)
n_classes = len(set(labels))
label_embeddings = {label: [] for label in unique_labels}

# Group embeddings by label
for embedding, label in zip(embeddings, labels):
    label_embeddings[label].append(embedding)

# Create label-guided centroids using KMeans for each class
centroids = []
for label, emb_list in label_embeddings.items():
    emb_array = np.array(emb_list)
    if len(emb_list) > 1:
        # Use KMeans to create centroids for each class, useful if there are multiple samples per class
        kmeans = KMeans(n_clusters=1, random_state=42)
        kmeans.fit(emb_array)
        centroids.append(kmeans.cluster_centers_[0])
    else:
        # If only one sample, use it directly as the centroid
        centroids.append(emb_array[0])

# Convert centroids to a numpy array
centroids = np.array(centroids)

# Initialize FAISS Index
d = embeddings.shape[1]  # dimension of the embeddings
nlist = n_classes  # number of clusters, one per class

#quantizer = faiss.IndexFlatL2(d)  # the quantizer used for clustering
#index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

index = faiss.IndexFlatL2(d)

# Train the FAISS index with the supervised centroids
#index.train(centroids)
#index.train(embeddings)
# Add embeddings to the index
index.add(embeddings)



In [78]:
centroids

array([[-1.6527151e-03,  3.8150046e-04, -1.0620928e-02, ...,
        -8.6208305e-04, -1.1872868e-03, -7.6173092e-03],
       [-4.0231571e-03, -3.1447550e-03, -1.2281622e-02, ...,
        -5.1729507e-03, -2.3175338e-05, -4.6655484e-03],
       [-3.3973509e-03,  4.7342048e-04, -1.1983840e-02, ...,
        -2.9581608e-03, -4.1522929e-03, -4.7668172e-03],
       ...,
       [-1.4198234e-03,  9.8186999e-04, -1.2064490e-02, ...,
         6.9928961e-04, -5.5266325e-03, -6.1790240e-03],
       [-4.9770181e-03, -4.2459662e-03, -1.3979421e-02, ...,
        -1.7549030e-02, -5.6825363e-04, -4.0208292e-03],
       [-2.4662260e-04,  7.1807654e-04, -7.2919363e-03, ...,
         1.9297655e-03, -4.2824163e-03, -7.3954808e-03]], dtype=float32)

In [31]:
# Example query


query_texts = ["Search the document with relevant content."]
query_embeddings = get_embedding(query_texts)
query_embeddings = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)

# Perform search
k = 45  # number of nearest neighbors
distances, indices = index.search(query_embeddings, k)

print(distances)

'''
# Print the search results
for i, query in enumerate(query_texts):
    print(f"Query: {query}")
    for j in range(k):
        if indices[i][j] != -1:  # valid index check
            print(f"Neighbor {j+1}: Document: {documents[indices[i][j]]['text']}, Distance: {distances[i][j]}")
'''

[[0.08969013 0.09562056 0.09712166 0.09998698 0.10022077 0.10045148
  0.10063277 0.10251164 0.10299776 0.10328254 0.10380837 0.10381554
  0.10431865 0.10431865 0.10483451 0.10497555 0.10505636 0.10540015
  0.10561184 0.10614038 0.10618956 0.10621552 0.10649391 0.10664568
  0.1068537  0.10705724 0.1081408  0.10863376 0.10905172 0.10916521
  0.10925332 0.10938725 0.11014912 0.11049917 0.11102483 0.11114426
  0.11129779 0.11182325 0.11188525 0.1119341  0.11204872 0.11249419
  0.11252905 0.11258065 0.11263433]]


'\n# Print the search results\nfor i, query in enumerate(query_texts):\n    print(f"Query: {query}")\n    for j in range(k):\n        if indices[i][j] != -1:  # valid index check\n            print(f"Neighbor {j+1}: Document: {documents[indices[i][j]][\'text\']}, Distance: {distances[i][j]}")\n'

In [84]:
llm = ChatOpenAI(model="gpt-4o", temperature=0)###, model_kwargs={"top_p": 0.0})
predicted_labels = []
scores_list = []
for i, row in test.iterrows():
    #print(f"Index: {index}")
    text_to_predict = row["Sentences"]

    query_texts = text_to_predict#["Search the document with relevant content."]
    query_embeddings = get_embedding(query_texts)
    query_embeddings = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)
    
    # Perform search
    k = 60  # number of nearest neighbors
    distances, indices = index.search(query_embeddings, k)

    retrieved_docs = [texts[i] for i in indices[0] if i != -1]
    retrieved_labels = [labels[i] for i in indices[0] if i != -1]


    
    #[print(doc, score, label) for doc, score, label in zip(retrieved_docs, distances, labels) if score < 2]
    
    xs = ["\nText:"+ doc.lower().strip()+"\nSimilarity:"+ str(score) +"\nClassification:"+label+"\n" for doc, score, label in zip(retrieved_docs, distances.flatten(), retrieved_labels) if score < 2]
    #xs = ["\nText:'"+ doc.page_content.lower().strip()+"'\nSimilarity:"+ str(score) +"\nClassification:"+doc.metadata['TTP_ID']+"\n" for doc, score in search_results]
    

    xs = ''.join(xs)+"\nInputText:"+ text_to_predict.lower().strip() +"\nClassification:"
    xs = "Your task is to classify the last sentence of 'InputText' based on the provided examples. Each example shows a sentence, inverse similarity to InputText (the lower the better), and its corresponding classification label. Use these examples to determine the correct classification for the given text. Return only the classification label starting with 'T'.\n" + xs
    
    #####
    
    #print(xs)
    prompt = xs#prompt#"Write a poem about a cat chasing a butterfly."
    result = llm.invoke(prompt)
    predicted_label = result.content

    predicted_labels.append(predicted_label)
    print("actual:", row["Technique ID"], ", predicted:", predicted_label)
    
    
    #print(distances)
    #print("----")
   
    [scores_list.append(score) for score in distances.flatten()]

actual: T1082 , predicted: T1082
actual: T1036 , predicted: T1036
actual: T1012 , predicted: T1012
actual: T1056 , predicted: T1056
actual: T1113 , predicted: T1113
actual: T1036 , predicted: T1036
actual: T1053 , predicted: T1053
actual: T1106 , predicted: T1106
actual: T1071 , predicted: T1105
actual: T1518 , predicted: T1518
actual: T1083 , predicted: T1083
actual: T1071 , predicted: T1071
actual: T1562 , predicted: T1562
actual: T1021 , predicted: T1021
actual: T1074 , predicted: T1074
actual: T1112 , predicted: T1112
actual: T1218 , predicted: T1218
actual: T1057 , predicted: T1057
actual: T1059 , predicted: T1059
actual: T1106 , predicted: T1055
actual: T1016 , predicted: T1016
actual: T1102 , predicted: T1102
actual: T1547 , predicted: T1547
actual: T1562 , predicted: T1562
actual: T1069 , predicted: T1069
actual: T1555 , predicted: T1555
actual: T1562 , predicted: T1562
actual: T1056 , predicted: T1056
actual: T1204 , predicted: T1204
actual: T1140 , predicted: T1140
actual: T1

In [85]:
cleaned_labels = [label.replace('Classification:', '') for label in predicted_labels]

# Example vectors containing strings
y_true = test["Technique ID"].values#["cat", "dog", "fish", "cat", "dog"]
#y_true = train_data.Key
y_pred = cleaned_labels#.values#["cat", "dog", "cat", "cat", "dog"]

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

# Calculate precision
precision = precision_score(y_true, y_pred, average='weighted')
print(f"Precision: {precision}")

# Calculate recall
recall = recall_score(y_true, y_pred, average='weighted')
print(f"Recall: {recall}")

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')
print(f"F1 Score: {f1}")

Accuracy: 0.9285714285714286
Precision: 0.9761904761904762
Recall: 0.9285714285714286
F1 Score: 0.9396825396825396


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [81]:
#scores_list

In [82]:
from statistics import mean
print(max(scores_list))
print(mean(scores_list))
print(min(scores_list))

0.08248685
0.048083905
0.0


In [46]:
len(retrieved_docs)

45