# In [None]:
# Build a dataset whose records contain the following fields: num, cve, des, cwe_list, cwe_name_lst, url, html_url, filename_list, patch_list
import json
import os
import pandas as pd
import re
import requests

# CWE reference table. Further down, the script compares CWE numbers against
# its 'CWE-ID' column and reads the matching 'Name' column.
cwe_data = pd.read_csv('./data/nvd_data/1000_CWE_raw data.csv',sep=',',encoding="utf-8")

# HTTP headers passed to `requests.get` when fetching commit data.
# NOTE(review): the value 'token' looks like a placeholder for a real
# credential -- confirm it is filled in before running.
headers = {'Authorization': 'token'} # API tokens

# Tool: Return all files starting with a prefix in the directory
def prefix_file_list(folder_path, prefix_number):
    """Collect regular files in a folder whose names begin with ``<prefix_number>_``.

    Args:
        folder_path: Directory to scan (its direct entries only).
        prefix_number: Value that must appear, followed by an underscore,
            at the start of the file name.

    Returns:
        A list of ``os.path.join(folder_path, name)`` paths, in the order
        ``os.listdir`` reported the names. Directories are excluded.
    """
    wanted = f"{prefix_number}_"
    candidates = (
        (entry, os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )
    return [
        full_path
        for entry, full_path in candidates
        if entry.startswith(wanted) and os.path.isfile(full_path)
    ]

# Finds the ``def`` keyword as a whole word in a hunk-header context and
# captures the rest of the line (function name and signature).
_PY_DEF_RE = re.compile(r"\bdef\b\s*(.*)")


# Tool: Extract all modified code blocks from the patch
def extract_modified_blocks(language, patch):
    """Split a unified-diff patch into its hunks and keep only the changed lines.

    Args:
        language: ``"python"`` or ``"java"`` selects how the enclosing
            function is read from the hunk header; any other value leaves
            the function name empty.
        patch: Patch text whose hunks start with ``@@ ... @@`` header lines.

    Returns:
        A list with one dict per hunk, in patch order, with the keys
        ``"Modified_Block_Num"`` (1-based index as a string),
        ``"Modified_Function"`` (text taken from the hunk header, or ``""``)
        and ``"Modified_Line"`` (every line starting with ``+`` or ``-``,
        each terminated by ``"\\n"``). Lines that precede the first hunk
        header are ignored.
    """
    hunks = []  # (function text, list of changed lines) per hunk

    for line in patch.split('\n'):
        if line.startswith("@@"):
            # "@@ -a,b +c,d @@ context": limiting the split keeps any "@@"
            # inside the context intact, and a short header cannot raise.
            parts = line.split("@@", 2)
            context = parts[2] if len(parts) > 2 else ""

            if language == "python":
                # Whole-word match: "default" or "undefined" must not be
                # mistaken for the ``def`` keyword, and a signature that
                # contains "def" again must not be cut short.
                found = _PY_DEF_RE.search(context)
                function = found.group(1).strip() if found else ""
            elif language == "java":
                function = context.strip()
            else:
                function = ""
            hunks.append((function, []))
        elif hunks and line.startswith(("+", "-")):
            # Context lines and "\ No newline at end of file" markers start
            # with neither sign, so they never reach this branch.
            hunks[-1][1].append(line + "\n")

    return [
        {
            "Modified_Block_Num": str(index),
            "Modified_Function": function,
            "Modified_Line": "".join(changed),
        }
        for index, (function, changed) in enumerate(hunks, start=1)
    ]

# Language selection: switch these two values together to process java,
# python, or another language.
language = "python"
file_extension = ".py"

# Numbers of the records that survived filtering: every entry in the filter
# folder is named "<num>_...", so the text before the first underscore is the
# record number. Duplicates are dropped while keeping first-seen order, then
# the numbers (still strings) are sorted by their integer value.
folder_path = f"./data/{language}/{language}_filter/"
num_save = list(
    dict.fromkeys(entry.split("_", 1)[0] for entry in os.listdir(folder_path))
)
sorted_num_save = sorted(num_save, key=int)

# NVD yearly feeds are loaded on demand and parsed at most once per year.
# Each parsed feed is indexed by CVE id, so a record lookup is a dict access
# instead of re-parsing and re-scanning the whole feed for every record.
NVD_FEED_TEMPLATE = './data/nvd_data/nvdcve-1.1-{year}.json'
_nvd_index_cache = {}


def _nvd_index_for_year(year):
    """Return ``{cve_id: CVE item}`` for one yearly feed, or ``None`` if its file is missing."""
    if year not in _nvd_index_cache:
        try:
            with open(NVD_FEED_TEMPLATE.format(year=year), 'rb') as feed:
                items = json.load(feed)["CVE_Items"]
        except FileNotFoundError:
            _nvd_index_cache[year] = None
        else:
            index = {}
            for item in items:
                # Keep the first occurrence if an id is repeated.
                index.setdefault(item["cve"]["CVE_data_meta"]["ID"], item)
            _nvd_index_cache[year] = index
    return _nvd_index_cache[year]


def _cwe_name_lookup():
    """Build ``{str(CWE-ID): Name}`` from ``cwe_data``; the first row wins on duplicates."""
    lookup = {}
    for cwe_number, cwe_title in zip(cwe_data['CWE-ID'], cwe_data['Name']):
        lookup.setdefault(str(cwe_number), cwe_title)
    return lookup


def _collect_cwe_ids(cve_info, fallback):
    """Return the distinct CWE ids of an NVD item, or ``fallback`` when NVD gives none."""
    cwe_ids = []
    for problem in cve_info['cve']['problemtype']['problemtype_data']:
        for entry in problem['description']:
            if entry['value'] not in cwe_ids:
                cwe_ids.append(entry['value'])
    if not cwe_ids or cwe_ids == ['NVD-CWE-noinfo']:
        return fallback
    return cwe_ids


def _cwe_names(cwe_ids, name_by_number):
    """Map CWE ids to distinct names; ids without a known name become ``"none"``."""
    names = []
    for cwe_id in cwe_ids:
        name = "none"
        if cwe_id != "NVD-CWE-noinfo":
            number = ''.join(re.findall(r'\d+', cwe_id))  # digits of the CWE id
            name = name_by_number.get(number, "none")
        if name not in names:
            names.append(name)
    return names


def _fetch_patch_list(url, num):
    """Download one commit and return its modified blocks per matching source file.

    The request is tried up to three times. A failed download or a non-200
    answer is reported on stdout and yields an empty list.
    """
    response = None
    for _ in range(3):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            break
        except requests.RequestException:
            # Network-level failure: try again. Other exceptions (for
            # example KeyboardInterrupt) are deliberately not swallowed.
            continue
    if response is None:
        print(f'{num} cannot get commit content')
        return []
    if response.status_code != 200:
        print(f'{num} request failed with HTTP status {response.status_code}')
        return []

    patches = []
    for changed in response.json().get("files", []):
        lowered = changed['filename'].lower()
        # Both filters are substring checks on the lower-cased path: the
        # language extension must occur in it and "test" must not.
        if file_extension not in lowered or "test" in lowered:
            continue
        if "patch" not in changed:
            continue
        patches.append({
            "patch_file": changed['filename'],
            "patch_content": extract_modified_blocks(language, changed['patch']),
        })
    return patches


START_LINE = 0   # first REEF line (0-based) to process, usually 0
END_LINE = 863   # stop before this line; totals: python-863, java-541

filtered_nums = set(sorted_num_save)
cwe_name_by_number = _cwe_name_lookup()
filter_dir = f"./data/{language}/{language}_filter/"
output_path = f"./data/{language}/{language}_vul.jsonl"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Read the original REEF file line by line, with one CVE per line
with open(f'./data/REEF_data/query_{language}.jsonl', 'rb') as f1:
    for num, line in enumerate(f1):
        if num >= END_LINE:
            break
        # Only process lines inside the window whose number has filtered files saved
        if num < START_LINE or str(num) not in filtered_nums:
            continue
        data_info = json.loads(line)

        print(f"{num} Start construct basic info")
        cve = data_info['cve_id']  # cve id
        year = cve.split("-")[1]

        # Find the CVE in the NVD feed of its year. A missing feed or a
        # missing entry skips the record, so data from a previously
        # processed CVE can never be written under this one.
        nvd_index = _nvd_index_for_year(year)
        if nvd_index is None:
            print(f"{num} no NVD feed available for year {year}, skipped")
            continue
        cve_info = nvd_index.get(cve)
        if cve_info is None:
            print(f"{num} {cve} not found in NVD feed {year}, skipped")
            continue

        # description information
        des = cve_info['cve']['description']['description_data'][0]['value']

        # CWE information (falls back to the CWEs recorded in the REEF line)
        cweid = _collect_cwe_ids(cve_info, data_info['CWEs'])
        cwe_name_lst = _cwe_names(cweid, cwe_name_by_number)

        url = data_info['url']
        html_url = data_info['html_url']
        filename_list = prefix_file_list(filter_dir, num)

        # acquire diff code in the patch from GitHub
        patch_lst = _fetch_patch_list(url, num)

        data = {
            "num": num,
            "cve": cve,
            "des": des,
            "cwe_list": cweid,
            "cwe_name_lst": cwe_name_lst,
            "url": url,
            "html_url": html_url,
            "filename_list": filename_list,
            "patch_list": patch_lst,
        }

        # Append one JSON line per record so finished records survive an
        # interrupted run; mode 'a' also creates the file when it is absent.
        with open(output_path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')

print('end')