In [None]:
!pip install gdown --quiet
!mkdir DiverseVul
!gdown  12IWKhmLhq7qn5B_iXgn5YerOQtkH-6RG 
!mv diversevul_20230702.json ./DiverseVul/diversevul.json
!ls -l

In [None]:
import json
import pandas as pd
import re
import hashlib

# Pre-compiled patterns used by the two normalisation helpers below.
# rm_ws_re:    any run of whitespace characters.
# rm_cmmnt_re: a closed /* ... */ block comment, or a // comment up to end of line.
rm_ws_re = re.compile(r'\s+')
rm_cmmnt_re = re.compile(r'(/\*([^*]|(\*+[^*/]))*\*+/)|(//.*)')


def remove_whitespaces(code):
    """Return ``code`` with every whitespace character stripped out."""
    return rm_ws_re.sub('', code)


def remove_comments(code):
    """Return ``code`` without closed block comments and ``//`` line comments.

    The match is purely textual: a ``//`` inside a string literal is treated
    as a comment start, and an unterminated ``/*`` is left in place.
    """
    without_comments = rm_cmmnt_re.sub('', code)
    return without_comments


# In the DiverseVul dataset each line is a separate JSON object,
# so every line has to be parsed as essentially its own JSON document.
def load_diversevul(path="./DiverseVul/diversevul.json"):
    """Load the line-delimited DiverseVul dump into a DataFrame.

    Every non-blank line is parsed with ``json.loads``. The record's ``func``
    source is normalised (comments removed, then all whitespace removed) and
    the SHA-1 hex digest of the result is stored under the ``hash`` key,
    replacing any value the record already had under that key.

    Records whose ``func`` still contains ``/*`` or ``*/`` after comment
    removal are skipped: these are unterminated comments or badly scraped
    functions. This also drops valid functions that carry those markers
    inside string literals.

    Args:
        path: Location of the JSON-lines file.

    Returns:
        pandas.DataFrame with one row per kept record, indexed by the
        zero-based line number of the record in the file (skipped lines
        leave gaps in the index).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``func`` key.
    """
    progress_every = 10000  # printing on every record floods notebook output

    records = []
    line_numbers = []
    # Iterate over the file object directly rather than readlines(), so the raw
    # text of the whole dump is not kept in memory next to the parsed records.
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i % progress_every == 0:
                print("\r{}".format(i), end=' ')

            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            x = json.loads(line)

            # hash after removing comments and whitespaces
            normalized_func = remove_comments(x['func'])
            # leftover comment markers mean the comment was never closed or the
            # function was not scraped correctly; skip those records
            if "/*" in normalized_func or "*/" in normalized_func:
                continue

            normalized_func = remove_whitespaces(normalized_func)
            x['hash'] = hashlib.sha1(normalized_func.encode('utf-8')).hexdigest()

            records.append(x)
            line_numbers.append(i)

    # Build the frame once from the collected records; keys missing from a
    # record become NaN in that row.
    return pd.DataFrame(records, index=line_numbers)

# Load the dataset from the default path and print a column/dtype summary.
df = load_diversevul()
df.info()

In [None]:
# Keep only rows whose hash occurs exactly once. keep=False flags every member
# of a duplicate group, so no representative of a repeated hash survives.
df = df[~df.duplicated(subset=['hash'], keep=False)]
df.info()

In [None]:
!rm -r *
df.to_csv("./DiverseVul-Cleaned.csv")