In [None]:
!pip install gdown --quiet
!mkdir DiverseVul
!gdown  12IWKhmLhq7qn5B_iXgn5YerOQtkH-6RG 
!mv diversevul_20230702.json ./DiverseVul/diversevul.json
!ls -l

In [None]:
import json
import pandas as pd
import re
import hashlib

# Pre-compiled patterns shared by the two normalisation helpers below.
# rm_ws_re    : any run of whitespace characters.
# rm_cmmnt_re : a closed /* ... */ block comment, or a // comment up to end of line.
rm_ws_re = re.compile(r'\s+')
rm_cmmnt_re = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')


def remove_whitespaces(code):
    """Return ``code`` with every whitespace character deleted."""
    return rm_ws_re.sub('', code)


def remove_comments(code):
    """Return ``code`` with C-style comments deleted.

    Closed ``/* ... */`` blocks (possibly spanning lines) and ``//`` comments
    up to the end of their line are removed. The pattern is purely textual:
    it does not recognise string literals, so a ``//`` inside a string is
    stripped as well, and an unterminated ``/*`` is left in place.
    """
    return rm_cmmnt_re.sub('', code)


# in the diversevul dataset, each line is a separate json object
# so each line has to be treated as essentially its own json file
def load_diversevul(path="./DiverseVul/diversevul.json", progress_every=10000):
    """Load the DiverseVul JSON-lines dump into a DataFrame.

    Each non-blank line of ``path`` is parsed as one JSON object and becomes
    one row. A ``hash`` column is added holding the SHA-1 hex digest of the
    record's ``func`` text after comments and all whitespace are removed, so
    functions that differ only in formatting or comments share a hash.

    Records whose comment-stripped ``func`` still contains ``/*`` or ``*/``
    are skipped (see the inline comment for the rationale).

    Parameters
    ----------
    path : str
        Location of the JSON-lines file.
    progress_every : int or None
        Print the current line number every this many lines; a falsy value
        disables progress output.

    Returns
    -------
    pandas.DataFrame
        One row per kept record, indexed by the record's 0-based line number
        in the file (so the index has gaps where lines were skipped).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record has no ``func`` field.
    """
    records = []
    line_numbers = []
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising every line at once.
        for i, line in enumerate(f):
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            x = json.loads(line)

            # hash after removing comments and whitespaces
            normalized_func = remove_comments(x['func'])
            # exceptional case where some multiline comments are not closed
            # or the function is simply not scraped correctly, we simply skip these cases
            # though this does eliminate normal functions with "/*" in string literals
            if "/*" in normalized_func or "*/" in normalized_func:
                continue

            normalized_func = remove_whitespaces(normalized_func)
            x['hash'] = hashlib.sha1(normalized_func.encode('utf-8')).hexdigest()

            records.append(x)
            line_numbers.append(i)

            if progress_every and i % progress_every == 0:
                print("\r{}".format(i), end=' ')

    # Build the frame once from row records; the index keeps the original
    # line numbers so row labels stay stable for downstream lookups.
    return pd.DataFrame(records, index=line_numbers)

# Load the dataset from the default path and print a summary of the
# resulting frame (columns, non-null counts, dtypes, memory usage).
df = load_diversevul()
df.info()

In [None]:
# Discard every row whose normalised hash occurs more than once.
# keep=False flags all members of a duplicate group, so no representative
# of a repeated function survives.
is_repeated_hash = df.duplicated(subset=['hash'], keep=False)
df = df[~is_repeated_hash]
df.info()

In [None]:
# Partition the deduplicated frame by the `target` label:
# rows with target == 1 go to df_vuln, rows with target == 0 to df_nonvuln.
df_vuln = df.loc[df['target'].eq(1)]
df_nonvuln = df.loc[df['target'].eq(0)]

# Summarise both partitions, separated by a divider line.
df_vuln.info()
print("==================================================")
df_nonvuln.info()

In [None]:
# Number of rows remaining after deduplication.
df.shape[0]

In [None]:
from difflib import SequenceMatcher
import hashlib

# Maps the row label of each vulnerable function to the row label of the
# non-vulnerable function chosen as its patch.
vuln_patch_map = {}

# How often each matching strategy decided the pairing.
one_match_ctr = 0
many_matches_ctr = 0
no_match_ctr = 0


def _normalized_signature(func):
    """Return the comment- and whitespace-free text of ``func`` up to its first ')'."""
    return remove_whitespaces(remove_comments(func)).split(")")[0]


def _most_similar_position(vuln_func, candidate_funcs):
    """Return the position in ``candidate_funcs`` most similar to ``vuln_func``.

    Similarity is ``SequenceMatcher.ratio()`` on the raw function text; ties
    go to the earliest candidate.
    """
    # NOTE(review): SequenceMatcher's default autojunk heuristic applies to
    # second sequences of 200+ items and may distort scores for long functions;
    # autojunk=False is more faithful but considerably slower. Left unchanged.
    scores = [
        SequenceMatcher(None, vuln_func, candidate_func).ratio()
        for candidate_func in candidate_funcs
    ]
    return scores.index(max(scores))


# Group the candidate patches once by (commit_id, project) instead of
# re-scanning all of df_nonvuln for every vulnerable function.
# Row order inside each group follows df_nonvuln.
candidates_by_commit = {
    key: funcs
    for key, funcs in df_nonvuln.groupby(['commit_id', 'project'], sort=False)['func']
}

for vuln_index, vuln_func, vuln_commit_id, vuln_project in zip(df_vuln.index, df_vuln['func'], df_vuln['commit_id'], df_vuln['project']):
    candidates = candidates_by_commit.get((vuln_commit_id, vuln_project))
    if candidates is None:
        # no non-vulnerable function recorded for this commit/project
        continue

    # compare normalized function signatures
    # 1. one match: we have the patch
    # 2. more than one match: similarity check on the set of matches
    # 3. no match: similarity check on all candidates from the commit
    vuln_signature = _normalized_signature(vuln_func)
    strong_positions = [
        pos
        for pos, candidate_func in enumerate(candidates)
        if _normalized_signature(candidate_func) == vuln_signature
    ]

    if len(strong_positions) == 1:
        best_position = strong_positions[0]
        one_match_ctr += 1
    elif strong_positions:
        strong_candidates = candidates.iloc[strong_positions]
        best_position = strong_positions[_most_similar_position(vuln_func, strong_candidates)]
        many_matches_ctr += 1
    else:
        best_position = _most_similar_position(vuln_func, candidates)
        no_match_ctr += 1

    # translate the position within the group back to the frame's row label
    vuln_patch_map[vuln_index] = candidates.index[best_position]

    print(("\r1 match: {}, many matches: {}, no matches: {}" + " " * 200).format(one_match_ctr, many_matches_ctr, no_match_ctr), end=' ')

In [None]:
# Split the {vulnerable row label -> patch row label} mapping into two
# parallel lists; the i-th entries of both lists form one pair.
vuln_idx_list = list(vuln_patch_map.keys())
patch_idx_list = list(vuln_patch_map.values())

# Vulnerable side: keeps its metadata columns, with the code and hash renamed.
df_vuln_part = (
    df.loc[vuln_idx_list]
    .reset_index()
    .rename(columns={"func": "vuln", "hash": "vuln_hash"})
    .drop(columns=['target'])
)

# Patch side: only the code and its hash survive; the listed columns are
# dropped here while the vulnerable side retains its own copies.
patch_columns_to_drop = ['target', 'commit_id', 'size', 'message', 'project', 'cwe']
df_patch_part = (
    df.loc[patch_idx_list]
    .reset_index()
    .rename(columns={"func": "patch", "hash": "patch_hash"})
    .drop(columns=patch_columns_to_drop)
)

# Both parts were re-indexed 0..n-1 in the same order, so a column-wise
# concat lines each vulnerable function up with its patch. The leftover
# 'index' columns from reset_index() are then removed.
df_pairs_side_by_side = pd.concat([df_vuln_part, df_patch_part], axis=1)
df_final = df_pairs_side_by_side.drop(columns='index')
df_final.to_json('DiverseVulnPatchPairs.json')

df_final.info()
df_final.head()