In [1]:
import pandas as pd
import json

In [37]:
# Input: cleaned MSR CSV that is read into the `msr` DataFrame.
data_file_path = "./dataset/msr/MSR_data_cleaned.csv"
# Target for a full JSON export of `msr`; the export call is currently commented out.
json_out_file_path = "./dataset/msr/msr.json"
# NOTE(review): not referenced by any visible cell - presumably reserved for a
# larger balanced subset; confirm or remove.
msr_1000_file_path = "./dataset/msr/msr_balanced_1000.json"
# Output: JSON dump of the `msr_balanced_500` subset.
msr_500_file_path = "./dataset/msr/msr_balanced_500.json"
# Outputs: text files holding the row-index labels of the train / test / validation splits.
train_idxs_out_file = "./dataset/msr/train_msr.txt"
test_idxs_out_file = "./dataset/msr/test_msr.txt"
valid_idxs_out_file = "./dataset/msr/valid_msr.txt"

In [3]:
# Access Gained,Attack Origin,Authentication Required,Availability,CVE ID,CVE Page,CWE ID,Complexity,Confidentiality,Integrity,Known Exploits,Publish Date,Score,Summary,Update Date,Vulnerability Classification,add_lines,codeLink,commit_id,commit_message,del_lines,file_name,files_changed,func_after,func_before,lang,lines_after,lines_before,parentID,patch,project,project_after,project_before,vul,vul_func_with_fix

# Only these columns are read from the CSV; everything else is skipped at parse time.
cols = ["project", "lang", "commit_id", "func_before", "vul"]
# Column names after the later rename/drop step (kept for reference).
cols_mapped = ["project", "commit_id", "func", "target"]

msr = pd.read_csv(data_file_path, usecols=cols)
msr.head()

Unnamed: 0,commit_id,func_before,lang,project,vul
0,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static bool check_rodc_critical_attribute(stru...,C,samba,0
1,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_entry(struct samldb_ctx ...,C,samba,0
2,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_entry_callback(struct ld...,C,samba,0
3,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_handle_msDS_IntId(struct...,C,samba,0
4,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_step(struct samldb_ctx *...,C,samba,0


In [5]:
# Keep only C functions: collect the labels of every non-C row, then drop them in place.
non_c_labels = msr.loc[msr["lang"] != "C"].index
msr.drop(index=non_c_labels, inplace=True)

# Label distribution after filtering.
msr["vul"].value_counts()

0    175211
1     10786
Name: vul, dtype: int64

In [6]:
msr.lang.value_counts()

C    185997
Name: lang, dtype: int64

In [7]:
# Switch to the func/target column naming, then discard the language column,
# which is no longer needed once only C rows remain.
column_renames = {"func_before": "func", "vul": "target"}
msr.rename(columns=column_renames, inplace=True)
msr.drop(columns=["lang"], inplace=True)

In [12]:
# import json
#
# msr.to_json(
#     path_or_buf=json_out_file_path,
#     # index=False,
#     orient="records"
# )
#
# # with open("sample.json", "w") as outfile:
# #     json.dump(dictionary, outfile)

In [16]:
msr.head()

Unnamed: 0,commit_id,func,project,target
0,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static bool check_rodc_critical_attribute(stru...,samba,0
1,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_entry(struct samldb_ctx ...,samba,0
2,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_entry_callback(struct ld...,samba,0
3,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_handle_msDS_IntId(struct...,samba,0
4,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_step(struct samldb_ctx *...,samba,0


In [8]:
from sklearn.model_selection import train_test_split


def split_3(
    df_input: pd.DataFrame,
    stratify_col: str = 'target',
    frac_train: float = 0.6,
    frac_test: float = 0.25,
    frac_val: float = 0.15,
    random_state=None
):
    """Split a DataFrame into stratified train / validation / test parts.

    The split is done in two passes of ``train_test_split``: first train vs.
    the rest, then the rest into validation vs. test. Both passes stratify on
    ``stratify_col``.

    Args:
        df_input: Frame to split; all columns are carried into every part.
        stratify_col: Column whose value distribution is preserved in each
            part. Falls back to ``'target'`` (with a printed notice) when the
            given name is not a column of ``df_input``.
        frac_train: Fraction of rows for the training part.
        frac_test: Fraction of rows for the test part.
        frac_val: Fraction of rows for the validation part.
        random_state: Passed to both ``train_test_split`` calls; set it to get
            a reproducible split.

    Returns:
        Tuple ``(df_train, df_val, df_test)``.

    Note:
        If the three fractions do not sum to 1 (within floating-point
        tolerance) they are replaced by 0.6 / 0.25 / 0.15 and a notice is
        printed.
    """
    # Compare with a tolerance: an exact `!= 1.0` rejects valid ratios such as
    # 0.7 / 0.2 / 0.1, whose float sum is 0.9999999999999999.
    if abs((frac_train + frac_val + frac_test) - 1.0) > 1e-9:
        frac_train = 0.6
        frac_test = 0.25
        frac_val = 0.15
        print(f"Invalid ratio, defaulting to train {frac_train}, test {frac_test}, val {frac_val}")

    if stratify_col not in df_input:
        stratify_col = 'target'
        print(f"Invalid col, defaulting to {stratify_col}")

    X = df_input  # Contains all columns.
    y = df_input[[stratify_col]]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X,
        y,
        stratify=y,
        test_size=(1.0 - frac_train),
        random_state=random_state
    )

    # Split the temp dataframe into val and test dataframes.
    # The test share is expressed relative to the temp frame, not the full input.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state
    )

    # Sanity check: every input row ended up in exactly one part.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test

In [10]:
# Fix the seed so the split - and the index files written from it - is
# reproducible across kernel restarts. 69 is the seed this notebook already
# uses when shuffling the balanced subset.
train, val, test = split_3(
    df_input=msr,
    random_state=69
)
print(train.shape, val.shape, test.shape)

(111598, 4) (27899, 4) (46500, 4)


In [12]:
train.target.value_counts()

0    105126
1      6472
Name: target, dtype: int64

In [25]:
val.target.value_counts()

0    26281
1     1618
Name: target, dtype: int64

In [26]:
test.target.value_counts()

0    43804
1     2696
Name: target, dtype: int64

In [27]:
train.head()

Unnamed: 0,commit_id,func,project,target
99101,6c5d779aaf0dec9628da8a20751e95fd09554b14,void ResourceDispatcherHost::OnReadCompleted(U...,Chrome,0
38813,31400a673325147e1205326008e32135a78b4d8a,box_ht(BOX *box)\n{\n\treturn box->high.y - bo...,postgres,0
17690,55caa8b08c84af2b50fbc936cf334a5a93dd7db5,ProcXFixesQueryVersion(ClientPtr client)\n{\n ...,xserver,0
142177,d4e0a7273cd8d7a9ee667ad5b5c8aad0f5f59251,std::string GetMimeType(const AddEntriesMess...,Chrome,0
7489,7d65a3a6ed8815e34a99c680ac3869fde49dbbd4,_dbus_validate_signature_with_reason (const DB...,dbus,0


In [32]:
import numpy as np
def indexes_to_file(
    df_input: pd.DataFrame,
    out_file: str
):
    """Write the index labels of ``df_input`` to ``out_file``, one per line.

    Args:
        df_input: Frame whose index labels are written.
        out_file: Path of the text file to create or overwrite.
    """
    # Read the index of the frame that was passed in. The previous version read
    # the global `train` here, so every output file got the training indices.
    idxs = list(df_input.index.values)
    np.savetxt(out_file, idxs, delimiter="\n", fmt="%s")

# Write the row labels of each split to its own index file.
for split_frame, idx_path in (
    (train, train_idxs_out_file),
    (test, test_idxs_out_file),
    (val, valid_idxs_out_file),
):
    indexes_to_file(split_frame, idx_path)

In [25]:
# def generate_balance_dataset(df: pd.DataFrame):
# Build a class-balanced subset: the first 500 rows with target == 0 and the
# last 500 rows with target == 1. Filtering by label before head/tail means the
# balance no longer depends on the frame starting with 500 zeros and ending
# with 500 ones.
msr_head = msr[msr.target == 0].head(500)
msr_tail = msr[msr.target == 1].tail(500)
assert len(msr_head) == len(msr_tail) == 500, "fewer than 500 rows available for one of the classes"
msr_balanced_500 = pd.concat([msr_head, msr_tail])

print(msr_head.target.value_counts())
print(msr_tail.target.value_counts())
print(msr_balanced_500.target.value_counts())

0    500
Name: target, dtype: int64
1    500
Name: target, dtype: int64
1    500
0    500
Name: target, dtype: int64


In [26]:
msr_balanced_500

Unnamed: 0,commit_id,func,project,target
0,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static bool check_rodc_critical_attribute(stru...,samba,0
1,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_entry(struct samldb_ctx ...,samba,0
2,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_entry_callback(struct ld...,samba,0
3,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_handle_msDS_IntId(struct...,samba,0
4,b000da128b5fb519d2d3f2e7fd20e4a25b7dae7d,static int samldb_add_step(struct samldb_ctx *...,samba,0
...,...,...,...,...
188631,58a6822d7140137ce957c6d2fc20bae1374186c1,void impeg2d_dec_p_mb_params(dec_state_t *ps_d...,Android,1
188632,58a6822d7140137ce957c6d2fc20bae1374186c1,void impeg2d_dec_pnb_mb_params(dec_state_t *ps...,Android,1
188633,d72ea85c78a1a68bf99fd5804ad9784b4102fe57,int equalizer_get_parameter(effect_context_t *...,Android,1
188634,9fe27a9b445f7e911286ed31c1087ceac567736b,"uint8_t rfc_parse_data(tRFC_MCB* p_mcb, MX_FRA...",Android,1


In [27]:
# Shuffle every row with a fixed seed so the order is the same on each run.
shuffle_seed = 69
msr_balanced_500 = msr_balanced_500.sample(frac=1, random_state=shuffle_seed)
msr_balanced_500

Unnamed: 0,commit_id,func,project,target
188459,04839626ed859623901ebd3a5fd483982186b59d,"SeekHead::SeekHead(\n Segment* pSegment,\n ...",Android,1
188635,d4a34fefbf292d1e02336e4e272da3ef1e3eef85,"uint8_t rfc_parse_data(tRFC_MCB* p_mcb, MX_FRA...",Android,1
188527,5a9753fca56f0eeb9f61e342b2fccffc364f9426,virtual void SetUp() {\n UUT_ = GET_PAR...,Android,1
188417,04839626ed859623901ebd3a5fd483982186b59d,void CuePoint::Load(IMkvReader* pReader)\n{\n ...,Android,1
188603,5a9753fca56f0eeb9f61e342b2fccffc364f9426,"SvcTest()\n : codec_iface_(0),\n ...",Android,1
...,...,...,...,...
188238,295c883fe3105b19bcd0f9e07d54c6b589fc5bff,OMX_ERRORTYPE SoftOpus::internalGetParameter(\...,Android,1
188255,ca8ac8acdad662230ae37998c6c4091bb39402b6,bool CmapCoverage::getCoverage(SparseBitSet& c...,Android,1
188605,5a9753fca56f0eeb9f61e342b2fccffc364f9426,"int main(int argc, char **argv) {\n ::testing:...",Android,1
203,1ddf72180a52d247db88ea42a3e35f824a8fbda2,"int phar_mount_entry(phar_archive_data *phar, ...",php,0


In [35]:
import numpy as np
len_size = len(msr_balanced_500)
# Bucket consecutive rows into chunks of 100 by integer-dividing the row position.
slices = msr_balanced_500.groupby(np.arange(len_size) // 100)
# Materialise (chunk_number, chunk_frame) pairs. The loop variable is named
# `chunk` rather than `slice` so the builtin `slice` is not shadowed in the kernel.
slices = [(s, chunk.copy()) for s, chunk in slices]
for s, chunk in slices:
    print(s)
    print(chunk.target.value_counts())


0
0    52
1    48
Name: target, dtype: int64
1
0    53
1    47
Name: target, dtype: int64
2
1    56
0    44
Name: target, dtype: int64
3
0    55
1    45
Name: target, dtype: int64
4
1    52
0    48
Name: target, dtype: int64
5
0    54
1    46
Name: target, dtype: int64
6
0    57
1    43
Name: target, dtype: int64
7
1    56
0    44
Name: target, dtype: int64
8
1    51
0    49
Name: target, dtype: int64
9
1    56
0    44
Name: target, dtype: int64


In [39]:
# Export the balanced subset as JSON in "records" orientation.
msr_balanced_500.to_json(msr_500_file_path, orient="records")