In [1]:
import requests
import re
import execjs
import hashlib
import json
from requests.utils import add_dict_to_cookiejar


def getCookie(data):
    """
    通过加密对比得到正确cookie参数
    :param data: 参数
    :return: 返回正确cookie参数
    """
    chars = len(data['chars'])
    for i in range(chars):
        for j in range(chars):
            clearance = data['bts'][0] + data['chars'][i] + data['chars'][j] + data['bts'][1]
            encrypt = None
            if data['ha'] == 'md5':
                encrypt = hashlib.md5()
            elif data['ha'] == 'sha1':
                encrypt = hashlib.sha1()
            elif data['ha'] == 'sha256':
                encrypt = hashlib.sha256()
            encrypt.update(clearance.encode())
            result = encrypt.hexdigest()
            if result == data['ct']:
                return clearance

def setup_session(session, url, header):
    # 使用session保持会话
    res1 = session.get(url, headers=header)
    jsl_clearance_s = re.findall(r'cookie=(.*?);location', res1.text)[0]
    # 执行js代码
    jsl_clearance_s = str(execjs.eval(jsl_clearance_s)).split('=')[1].split(';')[0]
    # add_dict_to_cookiejar方法添加cookie
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': jsl_clearance_s})
    res2 = session.get(url, headers=header)
    # 提取go方法中的参数
    data = json.loads(re.findall(r';go\((.*?)\)', res2.text)[0])
    jsl_clearance_s = getCookie(data)
    # 修改cookie
    add_dict_to_cookiejar(session.cookies, {'__jsl_clearance_s': jsl_clearance_s})

    return session

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import re
from tqdm import tqdm
import time

# Initialize a session
session = requests.Session()
url = 'https://www.cnvd.org.cn/flaw/typeResult?typeId=33&max=100&offset=0'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

def clean_text(text):
    # Replace sequences of whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing whitespace
    return text.strip()

def extract_data(html):
    soup = BeautifulSoup(html, 'html.parser')
    data = {}

    # Extracting and cleaning data
    data['CNVD-ID'] = clean_text(soup.find(string='CNVD-ID').find_next().text)
    data['CVE-ID'] = clean_text(soup.find(string='CVE ID').find_next().text)
    data['Public Date'] = clean_text(soup.find(string='公开日期').find_next().text)
    data['Harm Level'] = clean_text(soup.find(string='危害级别').find_next().text)
    data['Affected Products'] = [clean_text(product) for product in soup.find(string='影响产品').find_next().stripped_strings]
    data['Description'] = clean_text(soup.find(string='漏洞描述').find_next().text)
    data['Vulnerability Type'] = clean_text(soup.find(string='漏洞类型').find_next().text)
    data['Reference Link'] = clean_text(soup.find(string='参考链接').find_next().text)
    data['Solution'] = clean_text(soup.find(string='漏洞解决方案').find_next().text)
    data['Vendor Patch'] = clean_text(soup.find(string='厂商补丁').find_next().text)

    return data



def extract_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Find all 'a' tags with 'href' attributes containing '/flaw/show/CNVD-'
    flaw_links = soup.find_all('a', href=lambda href: href and '/flaw/show/CNVD-' in href)

    # Prepend the base URL to each link and store them in a list
    base_url = "https://www.cnvd.org.cn"
    full_links = [base_url + link['href'] for link in flaw_links]
    return full_links

def save_list_to_text_file(data, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        for item in data:
            file.write(f'{item}\n')

def file_to_list(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        data = file.readlines()
        return [item.strip() for item in data]



In [None]:
# Get a working session
session = setup_session(session, url, header)

# Retrieve the links to the vulnerabilities
cnvd_links = []
for i in tqdm(range(0, 1500, 100), desc='Retrieving links', unit='pages'):
    url = f'https://www.cnvd.org.cn/flaw/typeResult?typeId=33&max=100&offset={i}'
    response = session.get(url, headers=header)
    #print(response.text)
    if response.status_code == 200:
        #extracted_data = extract_data(response.text)
        page_links = extract_links(response.text)
        cnvd_links.extend(page_links)
    else:
        print(f'Failed to retrieve page {i//1500 + 1}')
        break

# Save the links to a text file
save_list_to_text_file(cnvd_links, 'iot_cnvd_links.txt')

In [3]:
# Retrieve the data from each link
list_of_links = file_to_list('iot_cnvd_links.txt')
print(f'Retrieving data from {len(list_of_links)} links')
session.close()
# Initialize CSV file and write the headers
csv_file = 'cnvd-iot-vulnerabilities.csv'
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['CNVD-ID', 'CVE-ID', 'Public Date', 'Harm Level', 'Affected Products', 'Description', 'Vulnerability Type', 'Reference Link', 'Solution', 'Vendor Patch'])
    writer.writeheader()

session = setup_session(session, url, header)

for link in tqdm(list_of_links, desc='Retrieving data', unit='links'):
    if (list_of_links.index(link) + 1) % 5 == 0:
        time.sleep(60)
    response = session.get(link, headers=header)
    print(response.status_code)
    if response.status_code == 200:
        extracted_data = extract_data(response.text)
        with open(csv_file, 'a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['CNVD-ID', 'CVE-ID', 'Public Date', 'Harm Level', 'Affected Products', 'Description', 'Vulnerability Type', 'Reference Link', 'Solution', 'Vendor Patch'])
            writer.writerow(extracted_data)
        time.sleep(5)
    else:
        print(f'Failed to retrieve data from {link}')

Retrieving data from 1506 links


Retrieving data:   0%|          | 0/1506 [00:00<?, ?links/s]

200


Retrieving data:   0%|          | 1/1506 [00:05<2:14:32,  5.36s/links]

200


Retrieving data:   0%|          | 2/1506 [00:10<2:14:31,  5.37s/links]

200


Retrieving data:   0%|          | 3/1506 [00:16<2:14:28,  5.37s/links]

200


Retrieving data:   0%|          | 4/1506 [00:21<2:15:20,  5.41s/links]

200


Retrieving data:   0%|          | 5/1506 [01:26<11:16:11, 27.03s/links]

200


Retrieving data:   0%|          | 6/1506 [01:32<8:11:36, 19.66s/links] 

200


Retrieving data:   0%|          | 7/1506 [01:38<5:49:53, 14.00s/links]

200





AttributeError: 'NoneType' object has no attribute 'find_next'

In [None]:
 # Write the extracted data to the CSV file
        with open(csv_file, 'a', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['CNVD-ID', 'CVE-ID', 'Public Date', 'Harm Level', 'Affected Products', 'Description', 'Vulnerability Type', 'Reference Link', 'Solution', 'Vendor Patch'])
            writer.writerow(extracted_data)
