In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
from urllib.parse import unquote
import os

def get_filename(url: str):
    try:
        with requests.get(url) as req:
            if content_disposition := req.headers.get("Content-Disposition"):
                param, options = werkzeug.http.parse_options_header(content_disposition)
                if param == 'attachment' and (filename := options.get('filename')):
                    return filename

            path = urlparse(req.url).path
            name = path[path.rfind('/') + 1:]
            return unquote(name)
        
    except requests.exceptions.RequestException as e:
        raise e
        
index = []

for page_id in range(0,100):
    
    page_now = f"https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A65&page={page_id}"
        
    r_page = requests.get(page_now)
    
    if r_page.status_code == 200:
        print(f"Page {page_id}")
        
        soup = BeautifulSoup(r_page.content, "html.parser")

        titles = soup.find_all("h3",class_="c-teaser__title")
        
        for title in titles:
            ref_link = title.find_all("a",href=True)
            if len(ref_link) > 0:
                article_name = ref_link[0].text.strip()
                article_url = "https://www.cisa.gov" + ref_link[0]['href']
                print(article_name)

                soup_report = BeautifulSoup(requests.get(article_url).content, "html.parser")

                downloads = soup_report.find_all("div",class_="c-file__download")

                for download in downloads:
                    down_link = download.find_all("a",class_="c-file__link",href=True)
                    file_name = down_link[0].text.strip()
                    file_url = "https://www.cisa.gov" + down_link[0]['href']

                    r = requests.get(file_url, allow_redirects=True)

                    original_name = get_filename(file_url)

                    info = {"name":article_name,"url":article_url,"report_name":file_name,"file_name":original_name,"file_url":file_url}
                    index.append(info)
                    
                    if os.path.isfile(original_name): continue
                    else: open(original_name, 'wb').write(r.content)
                    
                if len(downloads) == 0:
                    downloads = soup_report.find_all("a",href=True)
                    for download in downloads:
                        
                        if download['href'].startswith("https://"):
                            file_url = download['href']
                        else:
                            file_url = "https://www.cisa.gov" + download['href']
                        file_name = download.text.strip()
                        
                        if file_url.endswith("xml") or file_url.endswith("json"):
                            info = {"name":article_name,"url":article_url,"report_name":file_name,"file_name":original_name,"file_url":file_url}
                            index.append(info)
                            r = requests.get(file_url, allow_redirects=True)

                            original_name = get_filename(file_url)
                            if os.path.isfile(original_name): continue
                            else: open(original_name, 'wb').write(r.content)
                        
                        else: continue
            else: continue
    else:
        print("All pages parsed")
        break
    
info_df = pd.DataFrame(index)
info_df.to_csv("cisa_reports.csv",index=False)

Page 0
MAR-10448362-1.v1 Volt Typhoon
MAR-10478915-1.v1 Citrix Bleed
MAR-10430311-1.v1 Multiple Nation-State Threat Actors Exploit CVE-2022-47966 and CVE-2022-42475
MAR-10454006.r5.v1 SUBMARINE, SKIPJACK, SEASPRAY, WHIRLPOOL, and SALTWATER Backdoors
Infamous Chisel Malware Analysis Report
MAR-10459736.r1.v1 WHIRLPOOL Backdoor
MAR-10454006.r4.v2 SEASPY and WHIRLPOOL Backdoors
MAR-10454006-r1.v2 SUBMARINE Backdoor
MAR-10454006-r2.v1 SEASPY Backdoor
MAR-10454006-r3.v1 Exploit Payload Backdoor
Page 1
MAR-10445155-1.v1 Truebot Activity Infects U.S. and Canada Based Networks
MAR-10443863-1.v1 CVE-2017-9248 Exploitation in U.S. Government IIS Server
MAR-10435108-1.v1 ICONICSTEALER
MAR-10413062-1.v1 Telerik Vulnerability in U.S. Government IIS Server
MAR-10365227-1.v1 - Impacket
MAR-10365227-2.v1 - Impacket 2
MAR-10365227-3.v1 - Impacket 3
MAR-10365227-2.v1
Download link: #main
Download link: /
Download link: /topics
Download link: /topics/cybersecurity-best-practices
Download link: /topics/cy

In [5]:
info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         138 non-null    object
 1   url          138 non-null    object
 2   report_name  138 non-null    object
 3   file_name    138 non-null    object
 4   file_url     138 non-null    object
dtypes: object(5)
memory usage: 5.5+ KB


In [6]:
info_df.head(4)

Unnamed: 0,name,url,report_name,file_name,file_url
0,MAR-10448362-1.v1 Volt Typhoon,https://www.cisa.gov/news-events/analysis-repo...,MAR-10448362-1.v1 Volt Typhoon,MAR-10448362.c1.v1.CLEAR_.pdf,https://www.cisa.gov/sites/default/files/2024-...
1,MAR-10448362-1.v1 Volt Typhoon,https://www.cisa.gov/news-events/analysis-repo...,AR24-038A STIX JSON,MAR-10448362.c1.v1.CLEAR_stix2.json,https://www.cisa.gov/sites/default/files/2024-...
2,MAR-10478915-1.v1 Citrix Bleed,https://www.cisa.gov/news-events/analysis-repo...,MAR-10478915-1.v1 Citrix Bleed,MAR-10478915.r1.v1.CLEAR_.pdf,https://www.cisa.gov/sites/default/files/2023-...
3,MAR-10478915-1.v1 Citrix Bleed,https://www.cisa.gov/news-events/analysis-repo...,AR23-325A JSON,MAR-10478915.r1.v1.CLEAR_stix2.json,https://www.cisa.gov/sites/default/files/2023-...


In [7]:
info_df.tail(4)

Unnamed: 0,name,url,report_name,file_name,file_url
134,MAR-10201537 – HIDDEN COBRA FASTCash-Related M...,https://www.cisa.gov/news-events/analysis-repo...,MAR-10201537,MAR-10164494.r1.v1.stix.xml,https://www.cisa.gov/sites/default/files/publi...
135,MAR-10135536-17 – North Korean Trojan: KEYMARBLE,https://www.cisa.gov/news-events/analysis-repo...,MAR-10135536-17.stix,MAR-10201537_stix.xml,https://www.cisa.gov/sites/default/files/publi...
136,MAR-10135536-12 – North Korean Trojan: TYPEFRAME,https://www.cisa.gov/news-events/analysis-repo...,MAR-10135536-12.stix,MAR-10135536.r17.v1.WHITE_stix.xml,https://www.cisa.gov/sites/default/files/publi...
137,MAR-10135536-3 - HIDDEN COBRA RAT/Worm,https://www.cisa.gov/news-events/analysis-repo...,MAR-10135536-3.stix,MAR-10135536-12_WHITE_stix.xml.xml,https://www.cisa.gov/sites/default/files/publi...
