In [None]:
# Filter and download source files
# Acquire data_info: CVE, CWE, and fix-commit content

import re
import os
import json
import os
import requests

# HTTP headers passed to every requests.get call in this script.
# The value below is a placeholder: replace it with a real credential before
# running, otherwise the requests are sent with the literal string 'token'.
# NOTE(review): GitHub normally expects "token <PAT>" or "Bearer <PAT>" here - confirm.
headers = {'Authorization': 'token'}

# Tool: Delete txt files starting with the specified number in the specified folder
def delete_files_with_prefix(folder_path, prefix_number):
    """Delete every ``<prefix_number>_*.txt`` file directly inside a folder.

    Only names that start with ``f"{prefix_number}_"`` and end with ``.txt``
    are removed, so prefix 3 matches ``3_0@...txt`` but not ``30_0@...txt``.

    Args:
        folder_path: Directory to clean. A missing directory is treated as
            "nothing to delete" instead of raising, because cleanup can be
            requested before any file was ever written there.
        prefix_number: Number (or string) that forms the filename prefix.
    """
    if not os.path.isdir(folder_path):
        return

    prefix = f"{prefix_number}_"  # invariant, so built once outside the loop
    for filename in os.listdir(folder_path):
        if filename.startswith(prefix) and filename.endswith('.txt'):
            os.remove(os.path.join(folder_path, filename))

# Tool: Remove comments and blank lines from source code to reduce input tokens
def remove_comments(language, source_code):
    """Strip comments and blank lines from source code to reduce input tokens.

    This is a text-based heuristic, not a parser: comment markers that appear
    inside string literals are removed as well.

    Args:
        language: ``"python"`` or ``"java"``. Any other value leaves the
            input untouched.
        source_code: The source text to clean.

    Returns:
        The cleaned text. For Python, triple-quoted strings and ``#`` comments
        are removed; for Java, ``//`` and ``/* ... */`` comments are removed.
        In both cases lines that end up empty or whitespace-only are dropped.
    """
    if language == "python":
        # Triple-quoted blocks first (docstrings), then '#' to end of line.
        source_code = re.sub(r'(\'\'\'[\s\S]*?\'\'\'|\"\"\"[\s\S]*?\"\"\")', '', source_code)
        source_code = re.sub(r'#.*', '', source_code)
        lines = source_code.split('\n')
        non_empty_lines = [line for line in lines if line.strip() != '']
        source_code = '\n'.join(non_empty_lines)
    elif language == "java":
        # One pass with an alternation so whichever comment opener comes first
        # wins: a '//' inside a block comment (e.g. a URL) no longer eats the
        # closing '*/', and a '/*' inside a line comment does not open a block.
        source_code = re.sub(r'//[^\n]*|/\*.*?\*/', '', source_code, flags=re.DOTALL)
        source_code = '\n'.join([line for line in source_code.splitlines() if line.strip()])
    # elif language == "...": # Can be extended to other languages
    return source_code



# Switching between using java, python, or other languages
language = "python"
file_extension = ".py"

# Bookkeeping lists filled in while the REEF file is processed.
cve_valuable_lst = []     # CVE ids that passed the year / duplicate / rejected filters
cve_onlyadd_lst = []      # line numbers whose commit or pre-fix file did not return HTTP 200
cve_onelanguage_lst = []  # line numbers with no patch file left after filtering

# Check which line numbers have already been saved: every stored file is named
# "<line number>_<file index>@...", so the text before the first "_" is the
# line number.
num_save = []
folder_path = f"./data/{language}/{language}_filter/"
if os.path.isdir(folder_path):  # a fresh run has no output folder yet
    _seen_prefixes = set()
    for filename in os.listdir(folder_path):
        num1 = filename.split("_", 1)[0]
        # Ignore stray entries (e.g. ".ipynb_checkpoints", ".DS_Store") whose
        # prefix is not a number; int() in the sort below would reject them.
        if num1.isdecimal() and num1 not in _seen_prefixes:
            _seen_prefixes.add(num1)
            num_save.append(num1)
sorted_num_save = sorted(num_save, key=int)


def _read_nvd_feed(year):
    """Return the raw bytes of the NVD 1.1 JSON feed for ``year``."""
    with open(f'./data/nvd_data/nvdcve-1.1-{year}.json', 'rb') as feed:
        return feed.read()


# If there is enough memory, preload all files first
content2 = _read_nvd_feed(2020)
content3 = _read_nvd_feed(2021)
content4 = _read_nvd_feed(2022)
content5 = _read_nvd_feed(2023)
content6 = _read_nvd_feed(2024)

# Read the original REEF file line by line, with one CVE per line.
START_LINE = 0   # first line (0-based) to process, usually set to 0
END_LINE = 863   # stop before this line; total: python-863, java-541

# Raw NVD feeds keyed by CVE year; only vulnerabilities from these years are kept.
_NVD_RAW_BY_YEAR = {
    "2020": content2,
    "2021": content3,
    "2022": content4,
    "2023": content5,
    "2024": content6,
}
# Parsed lookup tables, filled lazily so each feed is parsed at most once
# instead of once per REEF line.
_nvd_descriptions_by_year = {}


def _nvd_descriptions(year):
    """Return ``{CVE id: description_data list}`` for one NVD feed year.

    Only the description list is kept so the rest of the parsed feed can be
    garbage-collected after indexing.
    """
    index = _nvd_descriptions_by_year.get(year)
    if index is None:
        index = {}
        for item in json.loads(_NVD_RAW_BY_YEAR[year])["CVE_Items"]:
            # setdefault keeps the first entry for an id, like a first-match scan.
            index.setdefault(
                item["cve"]["CVE_data_meta"]["ID"],
                item["cve"]["description"]["description_data"],
            )
        _nvd_descriptions_by_year[year] = index
    return index


def _get_with_retries(url, attempts=3):
    """GET ``url`` with the shared headers; return None if every attempt raised.

    Only network-level ``requests`` errors are retried, so Ctrl-C and
    programming errors are no longer swallowed.
    """
    for _ in range(attempts):
        try:
            return requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            continue
    return None


def _download_patch_files(num, commit_url, patch_files):
    """Save the pre-fix version of every patched file for REEF line ``num``.

    Each file is stored twice: unchanged under ``<language>_source/`` and with
    comments stripped under ``<language>_filter/``.

    Returns:
        True when all files were stored (or were already present), False as
        soon as any request fails, in which case the caller should discard
        whatever was written for this line.
    """
    # The commit metadata is identical for every file of the patch, so it is
    # requested once per CVE rather than once per file.
    commit_resp = _get_with_retries(commit_url)
    if commit_resp is None:
        print(f'{num}_0 cannot get commit content')
        return False
    if commit_resp.status_code != 200:  # e.g. the GitHub limit was exceeded
        print(f'{num}_0 3 timeout retries')
        if num not in cve_onlyadd_lst:
            cve_onlyadd_lst.append(num)
        return False
    # The first parent of the fix commit is used as the pre-fix revision.
    sha2 = commit_resp.json()["parents"][0]["sha"]

    for j, patch in enumerate(patch_files):
        # raw_url is split on "/": index 3 = org, 4 = project, 7 = the
        # %2F-encoded file path.
        comp = patch['raw_url'].split("/")
        org, project, encoded_path = comp[3], comp[4], comp[7]
        stem = f"{num}_{j}@{org}@{project}@{sha2}@{encoded_path}.txt"
        safe_filename1 = f"./data/{language}/{language}_source/{stem}".replace("%2F", "_")
        safe_filename2 = f"./data/{language}/{language}_filter/{stem}".replace("%2F", "_")

        if os.path.exists(safe_filename1):
            # Already downloaded in an earlier run: nothing more to do.
            print(f'{num} vul file has downloaded')
            return True

        file_path = encoded_path.replace("%2F", "/")
        vul_file_source = f'https://raw.githubusercontent.com/{org}/{project}/{sha2}/{file_path}'
        file1 = _get_with_retries(vul_file_source)
        if file1 is None:
            print(f'{num}_{j} cannot get vul file content')
            return False
        if file1.status_code != 200:
            # Not 200 means the original version lacks the file, i.e. the
            # patch only added it.
            if num not in cve_onlyadd_lst:
                cve_onlyadd_lst.append(num)
            return False

        # Build the filtered text before touching the disk so a decode error
        # cannot leave a source file behind without its filtered twin.
        filtered = remove_comments(language, file1.content.decode()).encode()

        os.makedirs(os.path.dirname(safe_filename1), exist_ok=True)
        with open(safe_filename1, "wb") as source_out:  # save the source file
            source_out.write(file1.content)
        os.makedirs(os.path.dirname(safe_filename2), exist_ok=True)
        with open(safe_filename2, "wb") as filter_out:  # save the filter file
            filter_out.write(filtered)
    return True


def _process_cve(num, data_info):
    """Filter one REEF record and download its pre-fix files if it qualifies."""
    # Filter vulnerability data from 2020 to 2024
    cve = data_info['cve_id']
    year = cve.split("-")[1]
    if year not in _NVD_RAW_BY_YEAR:
        return

    # If there is a duplicate in CVE, skip it
    if cve in cve_valuable_lst:
        return

    # Find the CVE in the NVD data of the corresponding year. A CVE missing
    # from the feed is skipped rather than judged on another entry's data.
    description_data = _nvd_descriptions(year).get(cve)
    if description_data is None:
        print(f'{num} cannot find {cve} in NVD data')
        return

    # Remove "rejected" vulnerabilities
    # NOTE(review): confirm "Rejected" matches how the feeds in use mark withdrawn entries.
    des = description_data[0]["value"]
    if "Rejected" in des:
        return
    cve_valuable_lst.append(cve)

    # Filter patches. Rule: the URL contains the language suffix (e.g. ".py")
    # and does not contain "test".
    patch_file_list2 = []
    for patch in data_info['details']:
        file_string = patch['raw_url'].lower()
        if file_extension in file_string and "test" not in file_string:
            patch_file_list2.append(patch)

    # Skip when no file is left after filtering
    if not patch_file_list2:
        if num not in cve_onelanguage_lst:
            cve_onelanguage_lst.append(num)
        return

    # Download every file in the patch from GitHub; if any stage fails,
    # delete all stored files of this line.
    if not _download_patch_files(num, data_info['url'], patch_file_list2):
        delete_files_with_prefix(f"./data/{language}/{language}_source/", num)
        delete_files_with_prefix(f"./data/{language}/{language}_filter/", num)
        print(f"delete all files of vul {num}")


_saved_line_numbers = set(sorted_num_save)  # lines already stored by earlier runs

num = 0
with open(f'./data/REEF_data/query_{language}.jsonl', 'rb') as f1:
    for line in f1:
        if num >= END_LINE:
            break
        # Lines before the starting point or already saved are only counted.
        if num >= START_LINE and str(num) not in _saved_line_numbers:
            _process_cve(num, json.loads(line))
        num += 1  # next line

print("end")