# 

In [1]:
# !git clone https://github.com/blackorbird/APT_REPORT

In [2]:
# install required libraries
# !pip install -q -U git+https://github.com/huggingface/transformers.git

# !pip install accelerate
# !pip install bitsandbytes
# !pip install langchain
# !pip install sentence-transformers
# !conda install -c pytorch -c nvidia faiss-gpu=1.8.0
# !pip install pypdf

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [4]:
from transformers import BitsAndBytesConfig
import torch

In [5]:
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor

In [6]:
access_token = "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"

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
class Assistant:
    """Gemma 2b based assistant that replies given the retrieved documents"""
    def __init__(self):

        self.nf4_config = BitsAndBytesConfig(
               load_in_4bit=True,
               bnb_4bit_quant_type="nf4",
               bnb_4bit_use_double_quant=True,
               bnb_4bit_compute_dtype=torch.bfloat16
            )

        # self.tokenizer = AutoTokenizer.from_pretrained("Nexusflow/Starling-LM-7B-beta", token=access_token)
        # self.model = AutoModelForCausalLM.from_pretrained("Nexusflow/Starling-LM-7B-beta", token=access_token, quantization_config=self.nf4_config)
        
        self.tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", token=access_token)
        self.model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", token=access_token, quantization_config=self.nf4_config)

    def create_prompt(self, query, retrieved_info):
        # instruction to areply to query given the retrived information
        # prompt = f"""You are helpful Cyber Security Assistant Chatbot. You need either to explain the concept or answer the question about Cyber Security.
        # Be detailed, use simple words and examples in your explanations. If required, utilize the relevant information.
        # Instruction: {query}
        # Relevant information: {retrieved_info}
        # Output:
        # """

        # prompt = f"""You are helpful Cyber Security Assistant Chatbot. You need either to explain the concept or answer the question about Cyber Security.
        # # Be detailed, use simple words and examples in your explanations. If required, utilize the relevant information. GPT4 Correct User: {query} Additional Information: {retrieved_info}<|end_of_turn|>GPT4 Correct Assistant:"""

        ## Mistral Prompt
        prompt = f""" ### [INST] 
        Instruction: You are helpful Cyber Security Assistant Chatbot. You need either to explain the concept or answer the question about Cyber Security.
        Be detailed, use simple words and examples in your explanations but do not miss important technical details. If required, utilize the relevant information. Here is context to help: {retrieved_info}

        ### QUESTION: {query} 
        [/INST]
        """
        
        return prompt

    def reply(self, query, retrieved_info):
        
        prompt = self.create_prompt(query, retrieved_info)
        input_ids = self.tokenizer(query, return_tensors="pt").input_ids.to(device)
        
        # Generate text with a focus on factual responses
        
        generated_text = self.model.generate(
            input_ids,
            max_length=2048, # let answers be not that long
            temperature=0.9, # Adjust temperature according to the task, for code generation it can be 0.9
            do_sample=True,
        )
        
        # Decode and return the answer
        answer = self.tokenizer.decode(generated_text[0], skip_special_tokens=True)
        
        return answer


In [9]:
class Retriever:
    def __init__(self, num_retrieved_docs=100, pdf_folder_path=None, debug=False):
        
        ## remove the [:40] to index all the 300-ish documents
        self.pdf_files = self.get_all_pdfs(pdf_folder_path)

        if debug == True:
            
            print("Documents used: ")
            [print("\n ", pdf_file) for pdf_file in self.pdf_files]
            
            # print("\n")
            
            print("Total Number of Documents", len(self.pdf_files))
            
        self.embeddings_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
        
        self.all_documents = self.read_documents()
        
        self.db = FAISS.from_documents(self.all_documents, self.embeddings_model)
        self.retriever = self.db.as_retriever(search_kwargs={"k": num_retrieved_docs})

    def read_documents(self):
    
        with tqdm(total=len(self.pdf_files)) as pbar:
            with ThreadPoolExecutor(max_workers=24) as executor:
                future_to_pdf = {executor.submit(self.load_and_process_document, pdf_file): pdf_file for pdf_file in self.pdf_files}
                all_documents = []
                for future in as_completed(future_to_pdf):
                    documents = future.result()
                    all_documents.extend(documents)
                    pbar.update(1)
                return all_documents

    def get_all_pdfs(self, directory):
        """Get the list of pdf files in the directory."""
        pdf_files = []
        for root, dirs, files in os.walk(directory):
            for file in files:
                if file.endswith(".pdf"):
                    pdf_files.append(os.path.join(root, file))
        return pdf_files

    def load_and_process_document(self, pdf_file):
        loader = PyPDFLoader(pdf_file)
        raw_documents = loader.load()
        
        text_splitter = CharacterTextSplitter(
            separator="\n\n",
            chunk_size=256,
            chunk_overlap=28,
            length_function=len,
        )
        
        documents = text_splitter.split_documents(raw_documents)
        return documents
    
    
    def search(self, query):
        docs = self.retriever.get_relevant_documents(query)
        return docs

In [10]:
chatbot = Assistant()

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
%%time
retriever = Retriever(pdf_folder_path="./APT_REPORT", num_retrieved_docs=100, debug=False)

Documents used: 

  ./APT_REPORT/Threat Group Cards.pdf

  ./APT_REPORT/APT_2021_global_research_qianxin.pdf

  ./APT_REPORT/A_Threat_Actor_Encyclopedia.pdf

  ./APT_REPORT/Threat_Group_Cards_v2.0.pdf

  ./APT_REPORT/National Cyber Power Index 2020.pdf

  ./APT_REPORT/SideCopy/SideCopy.pdf

  ./APT_REPORT/UNC****/NoName/study_of_a_targeted_attack_on_a_russian_enterprise_in_the_mechanical_engineering_sector_en.pdf

  ./APT_REPORT/UNC****/UNC3347/GroundPeony_Crawling_with_Malice.pdf

  ./APT_REPORT/TransparentTribe/transparent-tribe-threat-insight-en2020.pdf

  ./APT_REPORT/Osint/Arastirma-KitiPriviaSecurity.pdf

  ./APT_REPORT/Agrius/evol-agrius.pdf

  ./APT_REPORT/.ipynb_checkpoints/Threat Group Cards-checkpoint.pdf

  ./APT_REPORT/.ipynb_checkpoints/A_Threat_Actor_Encyclopedia-checkpoint.pdf

  ./APT_REPORT/.ipynb_checkpoints/National Cyber Power Index 2020-checkpoint.pdf

  ./APT_REPORT/APT-hunting/THREAT HUNTING PLAYBOOK .pdf

  ./APT_REPORT/APT-hunting/hunting-cobaltstrike-beacons-

 12%|█████████████████████▍                                                                                                                                                        | 42/340 [01:19<17:59,  3.62s/it]could not convert string to float: '0.00-30' : FloatObject (b'0.00-30') invalid; use 0.0 instead
could not convert string to float: '0.00-30' : FloatObject (b'0.00-30') invalid; use 0.0 instead
could not convert string to float: '0.00-30' : FloatObject (b'0.00-30') invalid; use 0.0 instead
could not convert string to float: '0.00-30' : FloatObject (b'0.00-30') invalid; use 0.0 instead
could not convert string to float: '0.00-30' : FloatObject (b'0.00-30') invalid; use 0.0 instead
 42%|████████████████████████████████████████████████████████████████████████▊                                                                                                    | 143/340 [06:02<05:49,  1.77s/it]Created a chunk of size 382, which is longer than the specified 256
Created a chunk of size

In [None]:
def generate_reply(query):
    related_docs = retriever.search(query)
    # print('related docs', related_docs)
    reply = chatbot.reply(query, related_docs)
    return reply

In [None]:
# example
# %%time
reply = generate_reply("What are some persistence mechanisms being used by APT groups?")
for s in reply.split('\n'):
    print(s)