In [1]:
import pandas as pd
import os
import clang.cindex
import gc
import json
import os
import subprocess
import swifter
import tempfile

import dask.dataframe as dd
import numpy as np
import pandas as pd

from tqdm import tqdm

# This cell might not be needed for you.
clang.cindex.Config.set_library_file(
    '/usr/lib/llvm-7/lib/libclang-7.so.1'
)



In [2]:
import networkx as nx

In [3]:
def generate_ast_roots(code):
    """
    Takes in a list of files/datapoints from juliet.csv.zip (as loaded with pandas) matching one particular
    testcase, and preprocesses it ready for the feature matrix.
    """
    index = clang.cindex.Index.create()
    parse_list = [('test.cpp', code)]
    translation_unit = index.parse(
    path='test.cpp',
    unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor
    
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)
    
    return ast_root

def generate_features(ast_root):
    """
    Given a concretised & numbered clang ast, return a dictionary of
    features in the form:
        {
            <node_id>: [<degree>, <type>, <identifier>, <line_num>],
            ...
        }
    """
    features = {}

    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        in_degree = 1
        degree = out_degree + in_degree

        features[node.identifier] = [degree, str(node.kind), node.displayname, node.location.line]

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return features

def concretise_ast(node):
    """
    Everytime you run .get_children() on a clang ast node, it
    gives you new objects. So if you want to modify those objects
    they will lose their changes everytime you walk the tree again.
    To avoid this problem, concretise_ast walks the tree once,
    saving the resulting list from .get_children() into a a concrete
    list inside the .children.
    You can then use .children to consistently walk over tree, and
    it will give you the same objects each time.
    """
    node.children = list(node.get_children())

    for child in node.children:
        counter = concretise_ast(child)


def number_ast_nodes(node, counter=1):
    """
    Given a concretised clang ast, assign each node with a unique
    numerical identifier. This will be accessible via the .identifier
    attribute of each node.
    """
    node.identifier = counter
    counter += 1

    node.children = list(node.get_children())
    for child in node.children:
        counter = number_ast_nodes(child, counter)

    return counter

def generate_edgelist(ast_root):
    """
    Given a concretised & numbered clang ast, return a list of edges
    in the form:
        [
            [<start_node_id>, <end_node_id>],
            ...
        ]
    """
    edges = []

    def walk_tree_and_add_edges(node):
        for child in node.children:
            edges.append([node.identifier, child.identifier])
            walk_tree_and_add_edges(child)

    walk_tree_and_add_edges(ast_root)

    return edges

def generate_long_path(ast_root):
    """
    Given a concretised & numbered clang ast, return long paths
    """
    long_path = []
    edgelist = generate_edgelist(ast_root)
    G = nx.DiGraph()
    G.add_edges_from(edgelist)
    leafnodes = [x for x in G.nodes() if G.out_degree(x)==0 and G.in_degree(x)==1]
    rootnode = [x for x in G.nodes() if G.in_degree(x)==0]
    for node in leafnodes:
        long_path.append(nx.shortest_path(G, source=rootnode[0], target=node))
    return long_path

In [95]:
samll_data = pd.read_csv("./data/all_function_no_comments_drop_duplicates.csv")

In [96]:
samll_data["long_path_combine"] = None#[[[[5, 4, 3, 2, 1, 2, 3, 6, 7], [0, 1, 13, 5]], 1],...,...]
samll_data["long_path_greedy"] = None
samll_data["long_path"] = None
samll_data["nodes"] = None
samll_data["edge_list"] = None
samll_data["flaw_loc"] = None
samll_data["longest_path_token_num"] = None
samll_data["path_num"] = None

In [97]:
samll_data

Unnamed: 0.1,Unnamed: 0,id,bug,code,flaw,func_with_fix,vul_lines,vul_lines_fix,code_no_comment,long_path_combine,long_path_greedy,long_path,nodes,edge_list,flaw_loc,longest_path_token_num,path_num
0,0,0,1,pango_glyph_string_set_size (PangoGlyphString ...,CWE-189,pango_glyph_string_set_size (PangoGlyphString ...,\tstring->space = 1;\n\tstring->space *= 2;\n ...,\t{\n\t string->space = 4;\n\t}\n\t const gu...,pango_glyph_string_set_size (PangoGlyphString ...,,,,,,,,
1,1,1,1,"dispatch_cmd(conn c)\n{\n int r, i, timeout...",,"dispatch_cmd(conn c)\n{\n int r, i, timeout...","return reply_msg(c, MSG_JOB_TOO_BI...",/* throw away the job body and res...,"dispatch_cmd(conn c)\n{\n int r, i, timeout...",,,,,,,,
2,2,2,1,static GIOChannel *irssi_ssl_get_iochannel(GIO...,CWE-20,static GIOChannel *irssi_ssl_get_iochannel(GIO...,,static GIOChannel *irssi_ssl_get_iochannel(GIO...,static GIOChannel *irssi_ssl_get_iochannel(GIO...,,,,,,,,
3,3,3,1,int irssi_ssl_handshake(GIOChannel *handle)\n{...,CWE-20,int irssi_ssl_handshake(GIOChannel *handle)\n{...,\tret = !chan->verify || irssi_ssl_verify(chan...,\tret = !chan->verify || irssi_ssl_verify(chan...,int irssi_ssl_handshake(GIOChannel *handle)\n{...,,,,,,,,
4,4,4,1,"static gboolean irssi_ssl_verify(SSL *ssl, SSL...",CWE-20,"static gboolean irssi_ssl_verify(SSL *ssl, SSL...",,/* Checks if the given string has internal NUL...,"static gboolean irssi_ssl_verify(SSL *ssl, SSL...",,,,,,,,
5,5,5,1,"GIOChannel *net_connect_ip_ssl(IPADDR *ip, int...",CWE-20,"GIOChannel *net_connect_ip_ssl(IPADDR *ip, int...",\tssl_handle = irssi_ssl_get_iochannel(handle...,"GIOChannel *net_connect_ip_ssl(IPADDR *ip, int...","GIOChannel *net_connect_ip_ssl(IPADDR *ip, int...",,,,,,,,
6,6,6,1,static void server_real_connect(SERVER_REC *se...,CWE-20,static void server_real_connect(SERVER_REC *se...,"\t\t\tnet_connect_ip_ssl(ip, port, own_ip, ser...","\t\t\tnet_connect_ip_ssl(ip, port, server->con...",static void server_real_connect(SERVER_REC *se...,,,,,,,,
7,7,7,1,static int try_read_command(conn *c) {\n as...,CWE-20,static int try_read_command(conn *c) {\n as...,"if (strcmp(ptr, ""get "") && str...",if (ptr - c->rcurr > 100 ||\n ...,static int try_read_command(conn *c) {\n as...,,,,,,,,
8,8,8,1,test_js (void) {\n GString *result = g_stri...,CWE-264,test_js (void) {\n GString *result = g_stri...,/* uzbl commands can be run from javascrip...,,test_js (void) {\n GString *result = g_stri...,,,,,,,,
9,9,9,1,"eval_js(WebKitWebView * web_view, gchar *scrip...",CWE-264,"eval_js(WebKitWebView * web_view, gchar *scrip...",JSStringRef var_name;\n /* uzbl javascr...,,"eval_js(WebKitWebView * web_view, gchar *scrip...",,,,,,,,


In [98]:
for index, row in samll_data.iterrows():
    
    #get flaw line
    flaw_loc= []
    flaw_line_code = row["vul_lines"]
    if not str(flaw_line_code) == "nan":
        flaw_line_code = flaw_line_code.strip().split("\n")
        for each_flaw_line in range(len(flaw_line_code)):
            flaw_line_code[each_flaw_line] = flaw_line_code[each_flaw_line].strip()
        code = row["code"].strip().split("\n")
        for code_line in range(len(code)):
            code[code_line] = code[code_line].strip()
            
        for flaw_line in flaw_line_code:
            for line_loc , line in enumerate(code):
                if flaw_line == line:
                    flaw_loc.append(line_loc)
    flaw_loc = list(set(flaw_loc))
    if len(flaw_loc)>0:
        samll_data.loc[index,"flaw_loc"] = str(flaw_loc)
    
    #get ast
    code = row["code"]
    ast_root = generate_ast_roots(code)
    
    #get edge
    edge_list = generate_edgelist(ast_root)
    
    #get nodes
    nodes = generate_features(ast_root)
    
    #get long path
    long_path = generate_long_path(ast_root)
    if not ( (len(long_path)==1 and len(long_path[0])==2) or (len(long_path)==0) ):
        samll_data.loc[index,"long_path"] = str(long_path)
        samll_data.loc[index,"edge_list"] = str(edge_list)
        samll_data.loc[index,"nodes"] = str(nodes)
        
        long_path_greedy = sorted(long_path,key = lambda i:len(i),reverse=True)
        
        path_len = len(long_path_greedy)
        long_path_greedy_2=[]
        long_path_2=[]
        i = 0
        if path_len%2 == 0:
            while i <  path_len:
                start1 = long_path_greedy[i][::-1]
                end1 = long_path_greedy[i+1][1:]
                path1 = start1+end1
                long_path_greedy_2.append(path1)

                start2 = long_path[i][::-1]
                end2 = long_path[i+1][1:]
                path2 = start2+end2
                long_path_2.append(path2)

                i+=2
        else:
            while i <  path_len:
                if i == path_len-1:
                    start1 = long_path_greedy[i][::-1]
                    end1 = long_path_greedy[i][1:]
                    path1 = start1+end1
                    long_path_greedy_2.append(path1)

                    start2 = long_path[i][::-1]
                    end2 = long_path[i][1:]
                    path2 = start2+end2
                    long_path_2.append(path2)
                    i+=2
                else:    
                    start1 = long_path_greedy[i][::-1]
                    end1 = long_path_greedy[i+1][1:]
                    path1 = start1+end1
                    long_path_greedy_2.append(path1)

                    start2 = long_path[i][::-1]
                    end2 = long_path[i+1][1:]
                    path2 = start2+end2
                    long_path_2.append(path2)

                    i+=2
        
        long_path_greedy_2_with_cover_lines = []
        long_path_2_with_cover_lines = []
        
        for each_path in long_path_greedy_2:
            cover_lines = []
            for node_id in each_path:
                cover_lines.append(nodes[node_id][3])
            long_path_greedy_2_with_cover_lines.append([each_path,list(set(cover_lines))])
            
        for each_path in long_path_2:
            cover_lines = []
            for node_id in each_path:
                cover_lines.append(nodes[node_id][3])
            long_path_2_with_cover_lines.append([each_path,list(set(cover_lines))])
        
        long_path_greedy_2_with_label = []
        long_path_2_with_label = []
        
        for each_path in long_path_greedy_2_with_cover_lines:
            loc_find = False
            for node_id in each_path[0]:
                if nodes[node_id][3] in flaw_loc:
                    loc_find = True
            if loc_find:
                long_path_greedy_2_with_label.append([each_path,1])
            else:
                long_path_greedy_2_with_label.append([each_path,0])
                    
        for each_path in long_path_2_with_cover_lines:
            loc_find = False
            for node_id in each_path[0]:
                if nodes[node_id][3] in flaw_loc:
                    loc_find = True
            if loc_find:
                long_path_2_with_label.append([each_path,1])
            else:
                long_path_2_with_label.append([each_path,0])
    
        samll_data.loc[index,"long_path_greedy"] = str(long_path_greedy_2_with_label)
        samll_data.loc[index,"long_path_combine"] = str(long_path_2_with_label)
        samll_data.loc[index,"path_num"] = len(long_path_2_with_label)
        longest_path_token_num = 0
        for path in long_path_2_with_label:
            if len(path[0][0]) > longest_path_token_num:
                longest_path_token_num = len(path[0][0])
        samll_data.loc[index,"longest_path_token_num"] = longest_path_token_num
    #print(flaw_loc)
    
#     graph_rep.loc[index,"nodes"] = str(dic)
    if (index%999 ==0):    
        print("index",index)

index 0
index 999
index 1998
index 2997
index 3996
index 4995
index 5994
index 6993
index 7992
index 8991
index 9990
index 10989
index 11988
index 12987
index 13986
index 14985
index 15984
index 16983
index 17982
index 18981
index 19980
index 20979
index 21978
index 22977
index 23976
index 24975
index 25974
index 26973
index 27972
index 28971
index 29970
index 30969
index 31968
index 32967
index 33966
index 34965
index 35964
index 36963
index 37962
index 38961
index 39960
index 40959
index 41958
index 42957
index 43956
index 44955
index 45954
index 46953
index 47952
index 48951
index 49950
index 50949
index 51948
index 52947
index 53946
index 54945
index 55944
index 56943
index 57942
index 58941
index 59940
index 60939
index 61938
index 62937
index 63936
index 64935
index 65934
index 66933
index 67932
index 68931
index 69930
index 70929
index 71928
index 72927
index 73926
index 74925
index 75924
index 76923
index 77922
index 78921
index 79920
index 80919
index 81918
index 82917
index 8

In [99]:
samll_data

Unnamed: 0.1,Unnamed: 0,id,bug,code,flaw,func_with_fix,vul_lines,vul_lines_fix,code_no_comment,long_path_combine,long_path_greedy,long_path,nodes,edge_list,flaw_loc,longest_path_token_num,path_num
0,0,0,1,pango_glyph_string_set_size (PangoGlyphString ...,CWE-189,pango_glyph_string_set_size (PangoGlyphString ...,\tstring->space = 1;\n\tstring->space *= 2;\n ...,\t{\n\t string->space = 4;\n\t}\n\t const gu...,pango_glyph_string_set_size (PangoGlyphString ...,"[[[[3, 2, 1, 2, 3], [0, 1, 2]], 0]]","[[[[3, 2, 1, 2, 3], [0, 1, 2]], 0]]","[[1, 2, 3]]","{1: [2, 'CursorKind.TRANSLATION_UNIT', 'test.c...","[[1, 2], [2, 3]]","[7, 8, 9, 11, 12]",5,1
1,1,1,1,"dispatch_cmd(conn c)\n{\n int r, i, timeout...",,"dispatch_cmd(conn c)\n{\n int r, i, timeout...","return reply_msg(c, MSG_JOB_TOO_BI...",/* throw away the job body and res...,"dispatch_cmd(conn c)\n{\n int r, i, timeout...","[[[[5, 4, 3, 2, 1, 2, 3, 4, 6], [0, 1, 2, 3]],...","[[[[76, 75, 74, 73, 44, 42, 41, 37, 3, 2, 1, 2...","[[1, 2, 3, 4, 5], [1, 2, 3, 4, 6], [1, 2, 3, 4...","{1: [2, 'CursorKind.TRANSLATION_UNIT', 'test.c...","[[1, 2], [2, 3], [3, 4], [4, 5], [4, 6], [4, 7...",[40],19,39
2,2,2,1,static GIOChannel *irssi_ssl_get_iochannel(GIO...,CWE-20,static GIOChannel *irssi_ssl_get_iochannel(GIO...,,static GIOChannel *irssi_ssl_get_iochannel(GIO...,static GIOChannel *irssi_ssl_get_iochannel(GIO...,"[[[[2, 1, 3], [0, 3, 4]], 0], [[[4, 1, 5], [0,...","[[[[2, 1, 3], [0, 3, 4]], 0], [[[4, 1, 5], [0,...","[[1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7...","{1: [10, 'CursorKind.TRANSLATION_UNIT', 'test....","[[1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7...",,3,5
3,3,3,1,int irssi_ssl_handshake(GIOChannel *handle)\n{...,CWE-20,int irssi_ssl_handshake(GIOChannel *handle)\n{...,\tret = !chan->verify || irssi_ssl_verify(chan...,\tret = !chan->verify || irssi_ssl_verify(chan...,int irssi_ssl_handshake(GIOChannel *handle)\n{...,"[[[[3, 2, 1, 2, 4, 5, 6], [0, 1, 2, 3]], 0], [...","[[[[30, 29, 28, 23, 20, 19, 14, 4, 2, 1, 2, 4,...","[[1, 2, 3], [1, 2, 4, 5, 6], [1, 2, 4, 7, 8], ...","{1: [2, 'CursorKind.TRANSLATION_UNIT', 'test.c...","[[1, 2], [2, 3], [2, 4], [4, 5], [5, 6], [4, 7...",[36],18,10
4,4,4,1,"static gboolean irssi_ssl_verify(SSL *ssl, SSL...",CWE-20,"static gboolean irssi_ssl_verify(SSL *ssl, SSL...",,/* Checks if the given string has internal NUL...,"static gboolean irssi_ssl_verify(SSL *ssl, SSL...",,,,,,,,
5,5,5,1,"GIOChannel *net_connect_ip_ssl(IPADDR *ip, int...",CWE-20,"GIOChannel *net_connect_ip_ssl(IPADDR *ip, int...",\tssl_handle = irssi_ssl_get_iochannel(handle...,"GIOChannel *net_connect_ip_ssl(IPADDR *ip, int...","GIOChannel *net_connect_ip_ssl(IPADDR *ip, int...","[[[[2, 1, 3], [0, 4, 7]], 0], [[[4, 1, 4], [8,...","[[[[2, 1, 3], [0, 4, 7]], 0], [[[4, 1, 4], [8,...","[[1, 2], [1, 3], [1, 4]]","{1: [4, 'CursorKind.TRANSLATION_UNIT', 'test.c...","[[1, 2], [1, 3], [1, 4]]",[6],3,2
6,6,6,1,static void server_real_connect(SERVER_REC *se...,CWE-20,static void server_real_connect(SERVER_REC *se...,"\t\t\tnet_connect_ip_ssl(ip, port, own_ip, ser...","\t\t\tnet_connect_ip_ssl(ip, port, server->con...",static void server_real_connect(SERVER_REC *se...,"[[[[3, 2, 1, 2, 4], [0, 1]], 0], [[[5, 2, 1, 2...","[[[[8, 7, 6, 2, 1, 2, 6, 9, 10], [0, 1, 3, 4, ...","[[1, 2, 3], [1, 2, 4], [1, 2, 5], [1, 2, 6, 7,...","{1: [2, 'CursorKind.TRANSLATION_UNIT', 'test.c...","[[1, 2], [2, 3], [2, 4], [2, 5], [2, 6], [6, 7...",[24],9,7
7,7,7,1,static int try_read_command(conn *c) {\n as...,CWE-20,static int try_read_command(conn *c) {\n as...,"if (strcmp(ptr, ""get "") && str...",if (ptr - c->rcurr > 100 ||\n ...,static int try_read_command(conn *c) {\n as...,"[[[[3, 2, 1, 2, 4, 5, 6], [0, 1, 6]], 0], [[[9...","[[[[70, 69, 68, 67, 66, 65, 64, 61, 59, 58, 53...","[[1, 2, 3], [1, 2, 4, 5, 6], [1, 2, 4, 5, 7, 8...","{1: [2, 'CursorKind.TRANSLATION_UNIT', 'test.c...","[[1, 2], [2, 3], [2, 4], [4, 5], [5, 6], [5, 7...",[101],28,20
8,8,8,1,test_js (void) {\n GString *result = g_stri...,CWE-264,test_js (void) {\n GString *result = g_stri...,/* uzbl commands can be run from javascrip...,,test_js (void) {\n GString *result = g_stri...,"[[[[5, 4, 3, 2, 1, 2, 3, 4, 5], [0, 1, 2]], 0]]","[[[[5, 4, 3, 2, 1, 2, 3, 4, 5], [0, 1, 2]], 0]]","[[1, 2, 3, 4, 5]]","{1: [2, 'CursorKind.TRANSLATION_UNIT', 'test.c...","[[1, 2], [2, 3], [3, 4], [4, 5]]","[8, 9, 10, 7]",9,1
9,9,9,1,"eval_js(WebKitWebView * web_view, gchar *scrip...",CWE-264,"eval_js(WebKitWebView * web_view, gchar *scrip...",JSStringRef var_name;\n /* uzbl javascr...,,"eval_js(WebKitWebView * web_view, gchar *scrip...","[[[[5, 4, 3, 2, 1, 2, 3, 6, 7], [0, 1, 2, 3]],...","[[[[27, 26, 25, 23, 22, 20, 3, 2, 1, 2, 3, 20,...","[[1, 2, 3, 4, 5], [1, 2, 3, 6, 7], [1, 2, 3, 8...","{1: [2, 'CursorKind.TRANSLATION_UNIT', 'test.c...","[[1, 2], [2, 3], [3, 4], [4, 5], [3, 6], [6, 7...","[4, 39, 40, 17, 18, 19, 20, 21]",17,6


In [101]:
print(eval(samll_data.iloc[76]["long_path_greedy"]))

[[[[41, 40, 39, 38, 36, 8, 3, 1, 3, 8, 47, 49, 50, 51, 52], [0, 1, 2, 13, 15, 22, 24]], 1], [[[46, 45, 44, 42, 8, 3, 1, 3, 8, 53, 55, 56, 57], [0, 1, 2, 18, 19, 29, 31]], 1], [[[62, 61, 60, 58, 8, 3, 1, 3, 8, 63, 65, 66, 67], [0, 1, 2, 38, 41, 47, 49]], 1], [[[72, 71, 70, 68, 8, 3, 1, 3, 8, 9, 10, 11], [0, 1, 2, 3, 54, 58]], 1], [[[15, 14, 12, 8, 3, 1, 3, 8, 16, 18, 19], [0, 1, 2, 4, 5]], 0], [[[23, 22, 20, 8, 3, 1, 3, 8, 24, 26, 27], [0, 1, 2, 6, 7]], 0], [[[35, 34, 33, 8, 3, 1, 3, 8, 76, 77, 78], [0, 1, 2, 70, 10]], 0], [[[83, 82, 81, 8, 3, 1, 3, 8, 12, 13], [0, 1, 2, 4, 74]], 1], [[[17, 16, 8, 3, 1, 3, 8, 20, 21], [0, 1, 2, 5, 6]], 0], [[[25, 24, 8, 3, 1, 3, 8, 28, 29], [0, 1, 2, 7, 8]], 0], [[[30, 28, 8, 3, 1, 3, 8, 31, 32], [0, 1, 2, 8, 9]], 0], [[[37, 36, 8, 3, 1, 3, 8, 42, 43], [0, 1, 2, 13, 18]], 0], [[[48, 47, 8, 3, 1, 3, 8, 53, 54], [0, 1, 2, 22, 29]], 0], [[[59, 58, 8, 3, 1, 3, 8, 63, 64], [0, 1, 2, 38, 47]], 1], [[[69, 68, 8, 3, 1, 3, 8, 73, 74], [0, 1, 2, 54, 61]], 0], [[[

In [102]:
print(samll_data.iloc[76]["flaw_loc"])

[65, 37, 38, 39, 40, 41, 74, 15, 48, 49, 24, 57, 58, 30, 31]


In [103]:
print(samll_data.iloc[88]["vul_lines"])

				int npages;
				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;



In [104]:
print(samll_data.iloc[170]["func_with_fix"])

static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
//fix_flaw_line_below:
//static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
 {
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
	struct nfs_server       *server = NFS_SERVER(dir);
	struct nfs4_opendata *opendata;
	int status;

	/* Protect against reboot recovery conflicts */
	status = -ENOMEM;
	if (!(sp = nfs4_get_state_owner(server, cred))) {
		dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
		goto out_err;
	}
	status = nfs4_recover_expired_lease(server);
 	if (status != 0)
 		goto err_put_state_owner;
 	if (path->dentry->d_inode != NULL)
//flaw_line_below:
		nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE));
//fix_flaw_line_below:
//		nfs4_return_incompatible_delega

In [105]:
samll_data.to_csv("./data/all_function_long_path.csv")

In [106]:
samll_data.dropna(subset=['long_path'],inplace=True)

In [107]:
len(samll_data)

113299

In [108]:
samll_data.to_csv("./data/all_function_long_path_dropna.csv")

In [109]:
good_data = samll_data.loc[(samll_data["bug"]==0)]
bad_data = samll_data.loc[(samll_data["bug"]==1)]

In [110]:
len(good_data)

106555

In [111]:
len(bad_data)

6744

In [112]:
good_data.to_csv("./data/good_long_path_dropna.csv")
bad_data.to_csv("./data/bad_long_path_dropna.csv")

In [None]:
myList = [[1, 2], [1, 3, 4, 5], [1, 3, 6, 7, 8], [1, 3, 6, 9, 10], [1, 3, 6, 9, 11, 12], [1, 3, 6, 13, 14], [1, 3, 6, 13, 15, 16], [1, 3, 6, 17, 18], [1, 3, 6, 19, 20], [1, 3, 6, 19, 21, 22], [1, 3, 6, 23, 24], [1, 3, 6, 23, 25, 26, 27]]
myList1 = sorted(myList,key = lambda i:len(i),reverse=True)

In [None]:
myList

In [None]:
myList1

In [None]:
def print_ast(cursor, deep=0):
    print(' '.join((deep*"    ",str(deep), str(cursor.kind), str(cursor.spelling), str(cursor.location.line))))
    
    for child in cursor.get_children():
        print_ast(child, deep+1)

In [None]:
print(samll_data.iloc[198]["code"])

In [None]:
print_ast(generate_ast_roots(samll_data.iloc[198]["code"]))

In [None]:
long_path =[[1,2,3],[1,8],[1,4,5,7],[1,9]] 

In [None]:
long_path_greedy = sorted(long_path,key = lambda i:len(i),reverse=True)       
path_len = len(long_path_greedy)
long_path_greedy_2=[]
long_path_2=[]
i = 0
if path_len%2 == 0:
    while i <  path_len:
        start1 = long_path_greedy[i][::-1]
        end1 = long_path_greedy[i+1][1:]
        print(end1)
        path1 = start1+end1
        long_path_greedy_2.append(path1)

        start2 = long_path[i][::-1]
        end2 = long_path[i+1][1:]
        path2 = start2+end2
        long_path_2.append(path2)

        i+=2
else:
    while i <  path_len:
        if i == path_len-1:
            start1 = long_path_greedy[i][::-1]
            end1 = long_path_greedy[i][1:]
            print(end1)
            path1 = start1+end1
            long_path_greedy_2.append(path1)

            start2 = long_path[i][::-1]
            end2 = long_path[i][1:]
            path2 = start2+end2
            long_path_2.append(path2)
            i+=2
        else:    
            start1 = long_path_greedy[i][::-1]
            end1 = long_path_greedy[i+1][1:]
            print(end1)
            path1 = start1+end1
            long_path_greedy_2.append(path1)

            start2 = long_path[i][::-1]
            end2 = long_path[i+1][1:]
            path2 = start2+end2
            long_path_2.append(path2)

            i+=2

In [None]:
long_path_greedy_2

In [None]:
long_path_2

In [None]:
long_path =[[1,2,3],[1,4,5,7],[1,8]] 
aa = long_path[1][::-1] 

In [None]:
print(aa)