## Cyber News Monitoring and Tracking Emerging Threat Actors 

In [37]:
!pip freeze > requirements.txt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
# Load Important libraies 
import pandas as pd
import torch 
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext,PromptTemplate,set_global_service_context,Document
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import warnings 
import re 
import requests
import feedparser

warnings.filterwarnings("ignore")
print("Done")

# loading saved LLM model weights
save_path = '/root/threat_landscape/model/llm_weights/'
model = AutoModelForCausalLM.from_pretrained(save_path,device_map='auto',torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(save_path)


# System prompt is given to LLM model to instructs the model on how to respond to each request.
system_prompt = """<s>[INST] <<SYS>>
You are a helpful, respectful, and honest cybersecurity analyst.
If a question does not make any sense, or is not factually coherent, do not answer. 
If you don't know the answer to a question, please don't share any information.
Only answer from the document file given without eleborating it.
Never ask for next question.
Never ask to select option <</SYS>>"""

query_wrapper_prompt = "{query_str}"

# Initialization of LLM Model
llm = HuggingFaceLLM(
    context_window=4096, #lenght of input 
    max_new_tokens=256, # length of output
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    model=model,
    tokenizer=tokenizer
)

#Initialize and load the model embeddings
embeddings = LangchainEmbedding(HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))
service_context = ServiceContext.from_defaults(chunk_size=4098, llm=llm, embed_model=embeddings)
set_global_service_context(service_context)

# load library and function to extract data from news urls
from newspaper import Article
def get_text(rss):
    
    article = Article(rss) 
    try:
        article.download() # download the article
        article.parse() # parse the article to get the meta data
        rawText = article.text 
        return rawText
    except:
        return False


Done


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.65it/s]
The model `StabilityAI/stablelm-tuned-alpha-3b` and tokenizer `/root/threat_landscape/model/llm_weights/` are different, please ensure that they are compatible.


In [3]:
df = pd.read_excel('References.xlsx')
df = df[df['Reference'].str.startswith(('http', 'https'))]
links = df['Reference'].tolist()


In [13]:

xml_links = []

for url in links:
    try:
        response = requests.head(url, timeout=10)
        if response.status_code == 200:
            content_type = response.headers.get('Content-Type', '')

            if 'xml' in content_type:
                print("The content is XML.", url)
                xml_links.append(url)
            else:
                print("The content is not XML.",url)
        else:
            print("Failed to fetch content.",url)
    except requests.exceptions.RequestException as e:
        print("An error occurred while making the request:", e)
links = [url for url in links if url not in xml_links]


The content is not XML. https://malpedia.caad.fkie.fraunhofer.de/actors
The content is not XML. https://attack.mitre.org/groups/
The content is not XML. https://attack.mitre.org/software/
Failed to fetch content. http://www.cerist.dz/index.php/en/?format=feed&type=rss
Failed to fetch content. https://auscert.org.au/rss/bulletins/
Failed to fetch content. https://cyber.gc.ca/webservice/en/rss/alerts
Failed to fetch content. https://cyber.gc.ca/webservice/en/rss/news
Failed to fetch content. https://www.egcert.eg/feed/
Failed to fetch content. https://cert.europa.eu/publications/security-advisories/2023
An error occurred while making the request: HTTPSConnectionPool(host='cert.lv', port=443): Read timed out. (read timeout=10)
The content is not XML. https://www.csa.gov.sg/Content/RSS-Feed
The content is not XML. https://www.sk-cert.sk/index.html%3Ffeed=rss
Failed to fetch content. https://www.ccn-cert.cni.es/component/obrss/rss-ultimas-vulnerabilidades.feed
Failed to fetch content. https

In [12]:
for i in links:
    print(i)

https://malpedia.caad.fkie.fraunhofer.de/actors
https://attack.mitre.org/groups/
https://attack.mitre.org/software/
http://www.cerist.dz/index.php/en/?format=feed&type=rss
https://auscert.org.au/rss/bulletins/
https://cyber.gc.ca/webservice/en/rss/alerts
https://cyber.gc.ca/webservice/en/rss/news
https://www.egcert.eg/feed/
https://cert.europa.eu/publications/security-advisories/2023
https://cert.lv/en/feed/rss/all
https://www.csa.gov.sg/Content/RSS-Feed
https://www.sk-cert.sk/index.html%3Ffeed=rss
https://www.ccn-cert.cni.es/component/obrss/rss-ultimas-vulnerabilidades.feed
https://www.govcert.ch/blog/rss.xml
https://www.govcert.ch/whitepapers/rss.xml
https://www.cisa.gov/uscert/ncas/all.xml
https://www.cfr.org/cyber-operations/
https://malpedia.caad.fkie.fraunhofer.de/library
https://www.bleepingcomputer.com/feed/
https://www.nextinpact.com/rss/news.xml
https://www.cisa.gov/uscert/ncas/alerts.xml
https://www.fireeye.com/blog/threat-research/_jcr_content.feed
https://www.cisa.gov/usce

In [None]:
all_responses_df = pd.DataFrame()
for url in links: 
        text = get_text(url)
        if not text:
            print(f"Failed to retrieve or parse article from URL: {url}")
            xml_links.append(url)
            
        print(url, text)
        try:
            ArticleText = pd.DataFrame([x.split('kk') for x in text.split('mnl')], columns=['text']) #important
        except Exception as e:
            print(f"Error processing text for URL: {url}, error: {e}")
        documents = [
            Document(
            text=" ".join([f"{col}: {value}" for col, value in zip(['text'], row.astype(str))]),
            metadata={"row_num": idx}
        )
        for idx, row in ArticleText.iterrows()
        ]
        # Create an index using the DataFrame's content
        index = VectorStoreIndex.from_documents(documents)
        query_engine = index.as_query_engine(verbose=False)
        def generate_response(query_text):
            response = query_engine.query(query_text)
            return response
        
        
        # Queries List
        queries = [
        """from the given article tell me article belongs to which categories: Policy and Law Enforcement , Cyber policy,Cybercrime, Hacktivism, Disruption,Data Leaks/Data Breach and Significant Vulnerabilities just give me the names of category(ies) no explantion or any other details""",
        "wirte thw summary of the provided  article ",
        "Only List down names of cyber threat actors involved if any without source information in the text provided in the article based on the text provided",
        "Only List down motives of cyberattack discussed if any in the text provided in the article based on the text provided",
        "Only give  the year of attack discussed in the text provided in the article based on the text provided",
        "Is the given text provided in the article based on the text provided involved cyber security news or anything related to it, reply as yes and no only",
        """"indentidy from given article that the  thread actor attack which sector from the given list
        Government
        Healthcare
        Media
        None Provided
        Defense
        Telecommunications
        Education
        Energy
        High-Tech
        IT
        Mining
        NGOs
        Casinos and Gambling
        Construction
        Hospitality
        Retail
        Technology
        Transportation
        Aviation
        Industrial
        Manufacturing
        Oil and gas
        Pharmaceutical
        Chemical
        Engineering
        Aerospace
        Shipping and Logistics
        Embassies
        Research
        Think Tanks
        Law enforcement
        Entertainment
        Automotive
        Utilities
        Food and Agriculture
        Gaming
        Maritime and Shipbuilding
        Critical infrastructure
        Petrochemical
        Online video game companies
        Non-profit organizations
        Satellites
        listen down only name no extra info """,
        ]
        
        # Defined function to clean results
        def remove_strings_starting_with_please(list1):
            for str1 in list1:
                if "Please" in str1:
                    list1.remove(str1)
            return list1
        # why we are removing the string which have word please        
        def cleanOutput(responseStr):
            while("" in responseStr):
                responseStr.remove("")
            responseStr = remove_strings_starting_with_please(responseStr)
            return responseStr
        
        ### Preprocess Results
        responseString= str(generate_response(queries[0])).strip()
        responseString1= str(generate_response(queries[1])).strip()
        substr1 = re.sub(r"[0-9.]+",'' ,responseString1)
        substr1 = substr1.split('\n')
        responseString1 = [i.strip() for i in substr1]
        
        responseString2= str(generate_response(queries[2])).split('\n')
        responseString2 = cleanOutput(responseString2)
        
        responseString3= str(generate_response(queries[3])).split('\n')
        responseString3 = cleanOutput(responseString3)
        
        responseString4= str(generate_response(queries[4])).split('\n')
        responseString4 = cleanOutput(responseString4)
        
        responseString6= str(generate_response(queries[6])).split('\n')
        responseString6 = cleanOutput(responseString6)

        pd.set_option('display.max_colwidth', None)
        #Store Results to DataFrame
        dataTable = pd.DataFrame()
        dataTable.loc[0,'url'] = url
        dataTable.loc[0,'Summary'] = ",".join(responseString1)
        dataTable.loc[0,'news_Tags'] = responseString
        dataTable.loc[0,'ThreatActor']= ",".join(responseString2)
        dataTable.loc[0,'Motive']=  ",".join(responseString3)
        dataTable.loc[0,'YearMonth']= ",".join(responseString4)
        dataTable.loc[0,'Organistion_Name']= ",".join(responseString6)
        url,text,responseString,responseString1,responseString2,responseString3,responseString4,responseString4,responseString5,responseString6="","","","","","","","","",""
        if all_responses_df.empty:
            all_responses_df = dataTable.copy()
        else:
            all_responses_df = pd.concat([all_responses_df, dataTable], ignore_index=True)

https://malpedia.caad.fkie.fraunhofer.de/actors The following table provides a mapping of the actor groups tracked by the MISP Galaxy Project, augmented with the families covered in Malpedia.

Enter keywords to filter the actors below Common Name Coverage Operation DarkSeoul, Dark Seoul, Hidden Cobra, Hastati Group, Andariel, Unit 121, Bureau 121, NewRomanic Cyber Army Team, Bluenoroff, Subgroup: Bluenoroff, Group 77, Labyrinth Chollima, Operation Troy, Operation GhostSecret, Operation AppleJeus, APT38, APT 38, Stardust Chollima, Whois Hacking Team, Zinc, Appleworm, Nickel Academy, APT-C-26, NICKEL GLADSTONE, COVELLITE, ATK3, G0032, ATK117, G0082, Citrine Sleet, DEV-0139, DEV-1222, Diamond Sleet, ZINC, Sapphire Sleet, COPERNICIUM, TA404, Lazarus group, BeagleBoyz, Lazarus, Genie Spider Lazarus Group 129 Operation Cleaver, Op Cleaver, Tarh Andishan, Alibaba, TG-2889, Cobalt Gypsy, G0003, Hazel Sandstorm, EUROPIUM, APT34, OilRig Cleaver 36 COMMENT PANDA, PLA Unit 61398, Comment Crew, Byz

In [9]:
all_responses_df

Unnamed: 0,url,Keywords,Summary,ThreatActor,Motive,YearMonth,Organistion_Name
0,https://malpedia.caad.fkie.fraunhofer.de/actors,"The top keyphrases from the provided text are:,,question,question,question,S,question,question,S,S,S,S,question,question,question,S,question,,Note: The refined answer is based on the updated context provided, and the keyphrases have been re-ranked based on their frequency of appearance in the updated text","The article belongs to the categories: Policy and Law Enforcement, Cybercrime, Hacktivism, and Data Leaks/Data Breach.","The cyber threat actors involved in the provided text are:,* Nobody,* Everybody,* Nobody,* Его (Hint),* sierp (Hint),* February (Hint),* Nobody,* Nobody,* April (Hint),* Nobody,* Nobody,* Everybody (Hint),* Nobody,* Nobody,* April (Hint),* Nobody,* Nobody,* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint),* Nobody (Hint)","The motives of cyberattacks discussed in the text provided are:,1. Operation Kabar Cobra: The motive is to gain unauthorized access to a target's system or network for malicious purposes.,2. Operation Parliament: The motive is to steal sensitive information, such as login credentials or financial data, from a target's system or network.,3. Operation Poison Needles: The motive is to spread malware or viruses through email attachments or links, with the goal of compromising a target's system or network.,4. Operation Red Signature: The motive is to use social engineering tactics to trick a target into revealing sensitive information or performing an action that benefits the attacker.,5. Operation Shadow Force: The motive is to conduct a cyberattack anonymously, using techniques such as VPNs or proxy servers to hide the attacker's identity.,6. Operation Triangulation: The motive is to use multiple vectors to gain access to a target's system or network, such as phishing, spear phishing, or exploiting vulnerabilities.",2017,L
1,https://attack.mitre.org/groups/,"keyphrases about the text provided in the article are:,,Lazarus Group,North Korean group,Chinese state-sponsored cyber threat group,Iranian threat group,Leviathan,Lotus Blossom,LuminousMoth,Machete,Thrip,ToddyCat,Tonto Team,Transparent Tribe,Tropic Trooper,Turla,Volatile Cedar","Ћ\n\nPlease select the category(ies) that the article belongs to from the following options:\nPolicy and Law Enforcement, Cyber policy, Cybercrime, Hacktivism, Disruption, Data Leaks/Data Breach, Significant Vulnerabilities.","The following cyber threat actors are involved in the provided text without source information:,1. Aquatic Panda,2. Axiom Group,3. BackdoorDiplomacy,4. BITTER T-APT-17,5. BlackOasis,6. BlackTech,7. Blue Mockingbird,8. Bouncing Golf","The motives of cyberattacks discussed in the text provided are:,1. Economic gain: BITTER T-APT-17 and BlackOasis are suspected of targeting government, energy, and engineering organizations in order to gain access to sensitive information and intellectual property.,2. Political espionage: Axiom Group, BackdoorDiplomacy, and BlackOasis are believed to be targeting Ministries of Foreign Affairs and opposition figures in order to gain insight into political decision-making processes.,3. Financial gain: BlackTech is suspected of targeting organizations in East Asia, particularly Taiwan, Japan, and Hong Kong, in order to gain access to financial information and intellectual property.,4. Cyber espionage: Bouncing Golf and Blue Mockingbird are suspected of targeting Middle Eastern countries in order to gather sensitive information and intellectual property.,5. Cyber espionage: Confucius is a cyber espionage group that has primarily targeted military personnel, high-profile personalities, business persons, and government organizations in South Asia since at least 2013.,6. Cyber espionage: Cy",2023,Lazarus Group
2,https://attack.mitre.org/software/,"The top keyphrases about the text provided in the article are:,,questions,you,can,direct,question,questions,or,the,you,but,questions,or,you,or,real","0, Policy and Law Enforcement, Cybercrime, Hacktivism, Disruption, Data Leaks/Data Breach, Significant Vulnerabilities","The following cyber threat actors are involved in the provided text without source information:,* Lazarus Group,* BRONZE BUTLER,* BBSRAT,* BendyBear,* APT1,* Tonto Team,* Black Basta,Note: These are the actors mentioned in the provided text without any additional information or context.","Љ,Motives of cyberattack discussed in the text are:,* Stealing sensitive data (Bazar KEGTAP, Team9, Bazaloader),* Deploying additional malware (Bazar KEGTAP, Team9, Bazaloader),* Targeted compromises (BBSRAT),* Ransomware (BendyBear, BitPaymer),* Remote access tool (RAT) (Bankshot, Bisonal, Black Basta),* Espionage (BISCUIT),* Extortion (Black Basta),Note: The answer is refined based on the new context provided, and it includes additional malware and motives that were not mentioned in the original answer.",2022,"Sector: Defense,Explanation: Based on the provided context, the actor is likely to be Threat Group-3390, which has been active in the defense industry since at least 2020. Therefore, the sector that is most likely to be targeted by the actor is Defense."
3,http://www.cerist.dz/index.php/en/?format=feed&type=rss,"Please provide the top keyphrases about the text provided in the article,,Note: The article is about the importance of cybersecurity in the digital age and the role of cybersecurity professionals in protecting individuals and organizations from cyber threats,,Please provide the top keyphrases about the text provided in the article,,I will provide the answer","Please provide the next query or question.\n\n]]\n\nI apologize, but the question you provided does not make sense. The article does not mention any categories or labels for the article. Therefore, I cannot provide an answer.\n\nPlease provide a coherent and factual question, and I will do my best to assist you.",,,"2017,Note: The answer should be based solely on the information provided in the text given in the article.",Note: I will only answer based on the information provided in the given article.


In [29]:
categories_to_check = [
            "Policy and Law Enforcement",
            "Cyber policy",
            "Cybercrime",
            "Hacktivism",
            "Disruption",
            "Data Leaks/Data Breach",
            "Significant Vulnerabilities"
        ]
all_responses_df['Summary'] = all_responses_df['Summary'].apply(lambda x: [cat for cat in categories_to_check if cat in x])
all_responses_df = all_responses_df[all_responses_df['Summary'].map(lambda x: len(x) > 0)]

all_responses_df['Summary']

0                                                       [Policy and Law Enforcement, Cybercrime, Hacktivism, Significant Vulnerabilities]
1                                               [Cybercrime, Hacktivism, Disruption, Data Leaks/Data Breach, Significant Vulnerabilities]
2                                                            [Policy and Law Enforcement, Cybercrime, Hacktivism, Data Leaks/Data Breach]
8                                                                                                            [Policy and Law Enforcement]
10    [Policy and Law Enforcement, Cyber policy, Cybercrime, Hacktivism, Disruption, Data Leaks/Data Breach, Significant Vulnerabilities]
11                                                                                               [Policy and Law Enforcement, Cybercrime]
13                                                                                                           [Policy and Law Enforcement]
14                                

In [27]:
# Given list of categories
categories_to_check = [
    "Government",
    "Healthcare",
    "Media",
    "None Provided",
    "Defense",
    "Telecommunications",
    "Education",
    "Energy",
    "High-Tech",
    "IT",
    "Mining",
    "NGOs",
    "Casinos and Gambling",
    "Construction",
    "Hospitality",
    "Retail",
    "Technology",
    "Transportation",
    "Aviation",
    "Industrial",
    "Manufacturing",
    "Oil and gas",
    "Pharmaceutical",
    "Chemical",
    "Engineering",
    "Aerospace",
    "Shipping and Logistics",
    "Embassies",
    "Research",
    "Think Tanks",
    "Law enforcement",
    "Entertainment",
    "Automotive",
    "Utilities",
    "Food and Agriculture",
    "Gaming",
    "Maritime and Shipbuilding",
    "Critical infrastructure",
    "Petrochemical",
    "Online video game companies",
    "Non-profit organizations",
    "Satellites","NO"
]
all_responses_df['Organistion_Name'] = all_responses_df['Organistion_Name'].apply(lambda x: [cat for cat in categories_to_check if cat in x])
all_responses_df['Organistion_Name']

0     [Government]
1               []
2        [Defense]
3     [Government]
4     [Government]
5     [Government]
6        [Defense]
7               []
8               []
9     [Government]
10              []
11              []
12    [Industrial]
13              []
14              []
Name: Organistion_Name, dtype: object

### Testing

In [38]:
output_file = 'output.xlsx'
all_responses_df.to_excel(output_file, index=False)