## Import libraries

In [1]:
import json
import os

## Read data

In [2]:
# Parse the NVD CVE feed. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection.
with open('Dataset/CVE_data/nvdcve-1.1-2024.json', encoding='utf-8') as feed_file:
    data = json.load(feed_file)

# Flattened per-CVE records; populated by the extraction loop further down.
modified_data = []

In [3]:
# Flatten every feed item into the few fields the rest of the notebook uses.
# The list is rebuilt here (not just appended to) so that re-running this cell
# does not duplicate every record, and no variable named `id` is created, so
# the builtin id() stays usable in the session.
modified_data = []
for item in data['CVE_Items']:
    meta = item['cve']['CVE_data_meta']
    # Only the first description entry is kept.
    first_description = item['cve']['description']['description_data'][0]
    modified_data.append({
        'id': meta['ID'],
        'assigner': meta['ASSIGNER'],
        'lang': first_description['lang'],
        'value': first_description['value'],
        # Keep the part before 'T' (presumably the date of an ISO-style
        # timestamp -- confirm against the feed schema).
        'publish_date': item['publishedDate'].split('T')[0],
    })


### Store modified data

In [4]:
# Persist the flattened records as pretty-printed JSON.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale encoding, matching the other open() calls here.
# json.dump escapes non-ASCII characters by default, so the bytes written are
# the same as before on any ASCII-compatible locale.
with open('Dataset/CVE_data/modified_nvdcve.json', 'w', encoding='utf-8') as file:
    json.dump(modified_data, file, indent=4)

## Create Markdown files

In [5]:
# Write one Markdown file per CVE for (at most) the first 10,000 records.
output_dir = 'Dataset/markdown_files/10_000'
os.makedirs(output_dir, exist_ok=True)

# Slicing instead of indexing over range(10_000) stops cleanly when fewer
# than 10,000 records were loaded, rather than raising IndexError.
for cve in modified_data[:10_000]:
    cve_id = cve['id']  # not named `id`, to avoid shadowing the builtin

    # Heading line, description paragraph, then the publish date in bold.
    markdown_content = (
        f"# {cve_id} - {cve['assigner']}\n\n"
        f"{cve['value']}\n\n"
        f"**Publish date:** {cve['publish_date']}\n"
    )

    # Save the Markdown file, named after the CVE id.
    with open(f'{output_dir}/{cve_id}.md', 'w', encoding='utf-8') as file:
        file.write(markdown_content)

### (Optional) Save PDF files

In [None]:
from fpdf import FPDF

# Render one single-page A4 PDF per CVE for (at most) the first 10,000 records.
pdf_dir = 'pdf_files'
os.makedirs(pdf_dir, exist_ok=True)

# Slicing instead of indexing over range(10_000) stops cleanly when fewer
# than 10,000 records were loaded, rather than raising IndexError.
for cve in modified_data[:10_000]:
    cve_id = cve['id']  # not named `id`, to avoid shadowing the builtin
    assigner = cve['assigner']
    description = cve['value']
    publish_date = cve['publish_date']

    # Create the PDF: portrait orientation, millimetre units, A4 page.
    pdf = FPDF('P', 'mm', 'A4')
    pdf.add_page()
    pdf.set_margins(0, 0, 0)

    # Page title
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, f'{cve_id} - {assigner}', 0, 1, 'L')
    pdf.ln(2)

    # Page body
    pdf.set_font('Arial', '', 12)
    # Characters that cannot be encoded as latin-1 are replaced with '?'
    # before the text reaches fpdf (presumably because the selected font
    # only covers latin-1 -- confirm against the fpdf documentation).
    description = description.encode('latin-1', 'replace').decode('latin-1')
    pdf.multi_cell(0, 10, description)
    pdf.ln(2)
    pdf.set_font('Arial', 'B', 12)
    pdf.multi_cell(0, 10, 'Publish date: ' + publish_date)
    pdf.ln()

    # Save the PDF, named after the CVE id.
    pdf.output(f'{pdf_dir}/{cve_id}.pdf', 'F')