In [1]:
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt

from project.data.preprocessed.unsplit import unsplit_data as UNSPLIT
from project.data.preprocessed.split import split_data as SPLIT
from project.utils.tokenize import nltk_tok

TOTAL = []
TOTAL.extend(UNSPLIT.train)
TOTAL.extend(UNSPLIT.valid)
# TOTAL.extend(UNSPLIT.test)


## Documentation Investigations

In [2]:
def print_top_and_bottom(counter_obj, count, name, cols=2, width=35):
    mc = counter_obj.most_common()
    top_x = mc[:count]
    bottom_x = mc[-count:]
    
    top = "\n".join("{}. {}".format(i+1, x) for i,x in enumerate(top_x))
    bottom = "\n".join("{}. {}".format(i+1, x) for i,x in enumerate(reversed(bottom_x)))
    
    s = '''TOP {count} {name}\n{top}\nBOTTOM {count} {name}\n{bottom}'''.format(
        count=count, name=name, top=top, bottom=bottom
    )
    print(to_columns(s, cols, width))
    return mc

def to_columns(string, cols, width):
    lines = string.split("\n")
    lpc = int(len(lines) / cols)
    columns = [lines[i*lpc:(i+1)*lpc] for i in range(cols)]
    
    max_c = max([len(c) for c in columns])
    for c in columns:
        size = len(c)
        for i in range(max_c - size):
            c.append(" ")
    
    final_text = []
    for i in range(len(lines)):
        final_text.append(str.ljust(columns[i % cols][(i // cols)], width, " "))
        if i % cols == cols-1:
            final_text.append('\n')
    return "".join(final_text)

def get_histogram(counter, bins, name):
    counts = [n[1] for n in counter]
    total = sum(counts)
    args = len(counter)
    meta_counter = Counter(counts)
    
    h = np.histogram(counts, bins)

    lines = [ 
        "Histogram: {}".format(name),
        "\n",
        str.ljust("Bin", 10, " "),
        str.rjust("Count", 7, " "),
        str.rjust("% of names", 12, " "),
        str.rjust("% of vars", 10, " "), 
        str.rjust("%-ile vars ", 13, " "), 
        "\n",
    ]
    
    cumulative = 0 
    for i in range(1, len(h[1])-1):
        bucket_min, bucket_max = h[1][i], h[1][i+1] 
        lines.append(str.ljust("{}-{}".format(bucket_min, bucket_max), 10, " "))
        lines.append(str.rjust("{}".format(h[0][i]), 7, " "))
        lines.append(str.rjust("{:.3f}".format(100*h[0][i]/args), 11, " "))
        
        tot = 0
        for i in range(bucket_min, bucket_max):
            if i in meta_counter:
                tot += i * meta_counter[i]
        lines.append(str.rjust("{:.3f}".format(100*tot/total), 11, " "))
        cumulative += 100*tot/total
        lines.append(str.rjust("{:.2f}".format(cumulative), 11, " "))
        
        lines.append("\n")
    return "".join(lines)
        
        

In [3]:
TOP_N = 10
def counts():
    arg_names = Counter(x['arg_name'] for x in TOTAL)
    func_names = Counter(x['name'] for x in TOTAL)
    
    mc_arg_name = print_top_and_bottom(arg_names, TOP_N, "Argument Names")
    mc_func_name = print_top_and_bottom(func_names, TOP_N, "Function Names")
    
    name_bins =  [0,1,2,3,4,5,10,20,50,100,200,500,3000]
    name_h = get_histogram(mc_arg_name, name_bins, "Arg Names")
    func_bins =  [0,1,2,3,4,5,10,20,50,100,200]
    func_h = get_histogram(mc_func_name, name_bins, "Func Names")
    print(to_columns(name_h + '\n' + func_h , 2, 60))

        
counts()

TOP 10 Argument Names              BOTTOM 10 Argument Names           
1. ('name', 1898)                  1. ('std_factor', 1)               
2. ('x', 432)                      2. ('num_parallel_parser_calls', 1)
3. ('kwargs', 299)                 3. ('sketch', 1)                   
4. ('dtype', 260)                  4. ('autoplay', 1)                 
5. ('axis', 260)                   5. ('alignment', 1)                
6. ('a', 227)                      6. ('shear_range', 1)              
7. ('input', 222)                  7. ('main_loss', 1)                
8. ('G', 221)                      8. ('max_lags', 1)                 
9. ('inputs', 218)                 9. ('link_color_func', 1)          
10. ('value', 205)                 10. ('vcs_args', 1)                

TOP 10 Function Names              BOTTOM 10 Function Names           
1. ('fit', 120)                    1. ('exit', 1)                     
2. ('transform', 101)              2. ('get_consuming_ops', 1)        
3. ('

In [4]:
TOP_N = 20
def check_for_duplicates():
    div = "<!!S!!>"
    arg_names = Counter(x['arg_name'] + div + " ".join(nltk_tok(x['arg_desc'])) for x in TOTAL)
    mc = arg_names.most_common()
    print("Check for Unique [Arg, Desc]\n")
    print("{}".format("N. Count  [Arg, Desc]"))
    
    for i, (arg_desc, c) in enumerate(mc[:TOP_N]):
        line = [
            str.ljust("{}".format(i+1), 3, " "),
            str.ljust("{}".format(c), 7, " "),
            "{}".format(arg_desc.split(div)),
            "\n"
        ]
        print("".join(line))
        
    print()
    print()
    name_bins =  [0,1,2,3,4,5,10,20,50,100,200,3000]
    name_h = get_histogram(mc, name_bins, "Unique Names + Desc")
    print(name_h)
    
check_for_duplicates()

Check for Unique [Arg, Desc]

N. Count  [Arg, Desc]
1  1043   ['name', 'a name for the operation ( optional ) .']

2  94     ['retry', 'a retry object used to retry requests . if `` none `` is specified , requests will not be retried .']

3  81     ['timeout', 'the amount of time , in seconds , to wait for the request to complete . note that if `` retry `` is specified , the timeout applies to each individual attempt .']

4  69     ['name', 'optional op name .']

5  59     ['options', 'overrides the default settings for this call , e.g , timeout , retries etc .']

6  54     ['G', 'a networkx graph']

7  44     ['image', 'input image .']

8  40     ['name', 'an optional variable_scope name .']

9  39     ['random_state', 'if int , random_state is the seed used by the random number generator ; if randomstate instance , random_state is the random number generator ; if none , the random number generator is the randomstate instance used by ` np.random ` .']

10 37     ['name', 'a name for t

In [5]:

def count_descs_per_arg():
    ARGS= 10
    TOP_DESC = 5
    
    arg_names = Counter(x['arg_name'] for x in TOTAL)
    mc = arg_names.most_common()
    
    tally = {}
    for d in TOTAL:
        name = d['arg_name']
        if name in tally:
            tally[name]["desc"].append(d['arg_desc'].lower())
            tally[name]["pkg"].append(d['pkg'])
        else:
            tally[name] = {"desc": [d['arg_desc']], "pkg": [d['pkg']]}
    
    tuple_tally = {k: (Counter(v['desc']).most_common(), 
                       Counter(v['pkg']).most_common()) for k,v in tally.items()}
    
    for i, (arg, c) in enumerate(mc[:ARGS]):
        
        line = [
            str.ljust("{}.".format(i+1), 3, " "),
            str.ljust("{}".format(arg), 7, " "),
            str.rjust("{} ".format(c), 5, " "),
            "\n",
            str.ljust("    (TOP PKG) ", 11, " "),
            str.ljust("", 11, " "),
            str.ljust("|  ", 6, " "),
            str.ljust("  (TOP DESC)", 5, " "),
            str.ljust("", 5, " "),
            "\n"
        ]
        
        for (desc, cd), (repo, cr) in list(zip(*tuple_tally[arg]))[:TOP_DESC]:
            trim = 60
            ellipse = " [...]" if len(desc) > trim else ""
            sub_lines = [
                str.ljust("    ({}) ".format(cr), 11, " "),
                str.ljust("{}".format(repo), 14, " "),
                str.ljust("|".format(repo), 3, " "),
                str.ljust(" ({})".format(cd), 7, " "),
                str.ljust("{}".format(desc[:trim]+ellipse), 5, " "),
                "\n"
            ]
            line.extend(sub_lines)
        print("".join(line))
        
#     func_names = Counter(x['name'] for x in TOTAL)
    
count_descs_per_arg()  

1. name   1898 
    (TOP PKG)            |       (TOP DESC)     
    (1636) tensorflow    |   (1043) a name for the operation (optional).
    (51)   google        |   (69)   optional op name.
    (49)   tflearn       |   (40)   an optional variable_scope name.
    (14)   external      |   (37)   a name for this operation (optional).
    (13)   absl          |   (30)   a string, the name of the layer.

2. x       432 
    (TOP PKG)            |       (TOP DESC)     
    (294)  tensorflow    |   (31)   tensor or variable.
    (42)   matplotlib    |   (17)   a tensor or variable.
    (35)   scipy         |   (14)   `bfloat16`, `half`, `float32`, `float64`, `complex64`, `com [...]
    (12)   tflearn       |   (12)   numeric `tensor`.
    (9)    dask          |   (10)   array or sequence containing the data

3. kwargs  299 
    (TOP PKG)            |       (TOP DESC)     
    (60)   tensorflow    |   (12)   additional keyword arguments which will be passed to the ap [...]
    (47)   google 

In [6]:

def count_args_per_desc():
    ARGS= 30
    TOP_DESC = 5
    
    arg_desc = Counter(x['arg_desc'].strip().lower() for x in TOTAL)
    mc = arg_desc.most_common()
    
    tally = {}
    for d in TOTAL:
        desc = d['arg_desc'].strip().lower()
        if desc in tally:
            tally[desc]["name"].append(d['arg_name'].lower())
            tally[desc]["pkg"].append(d['pkg'])
        else:
            tally[desc] = {"name": [d['arg_name']], "pkg": [d['pkg']]}
    
    tuple_tally = {k: (Counter(v['name']).most_common(), 
                       Counter(v['pkg']).most_common()) for k,v in tally.items()}
    
    for i, (arg, c) in enumerate(mc[:ARGS]):
        
        line = [
            str.ljust("{}.".format(i+1), 3, " "),
            str.rjust("({})  ".format(c), 5, " "),
            str.ljust("{}".format(arg), 7, " "),
            "\n",
        ]
        
        for (name, cd), (repo, cr) in list(zip(*tuple_tally[arg]))[:TOP_DESC]:
            sub_lines = [
                str.ljust("            {}".format(name), 30, " "),
                str.ljust("({})".format(cd), 10, " "),
                str.ljust("|", 7, " "),
                
                str.ljust("{}".format(repo), 15, " "),
                str.ljust(" ({}) ".format(cr), 7, " "),

                "\n"
            ]
            line.extend(sub_lines)
        print("".join(line))
        
#     func_names = Counter(x['name'] for x in TOTAL)
    
count_args_per_desc()  

1. (1043)  a name for the operation (optional).
            name              (1043)    |      tensorflow      (1043) 

2. (94)  a retry object used to retry requests. if ``none`` is specified, requests will not be retried.
            retry             (94)      |      google          (94)  

3. (81)  the amount of time, in seconds, to wait for the request to complete. note that if ``retry`` is specified, the timeout applies to each individual attempt.
            timeout           (81)      |      google          (81)  

4. (70)  input tensor.
            labeled_tensor    (20)      |      tensorflow      (70)  

5. (69)  optional op name.
            name              (69)      |      tensorflow      (69)  

6. (59)  overrides the default settings for this call, e.g, timeout, retries etc.
            options           (59)      |      google          (59)  

7. (58)  a `tensor`.
            input             (20)      |      tensorflow      (58)  

8. (58)  an optional `string`. def

## Code Investigations

In [7]:
import project.utils.code_tokenize as ct

# import importlib
# importlib.reload(ct)

<module 'project.utils.code_tokenize' from '/Users/erichambro/Desktop/SummerProject/project/project/utils/code_tokenize.py'>

In [8]:
train = ct.populate_codepath(SPLIT.train)
valid = ct.populate_codepath(SPLIT.valid)

ERROR in 1153: name: node pkg: theano
ERROR in 3059: name: name pkg: theano
ERROR in 5657: name: inputs pkg: theano
ERROR in 6545: name: response pkg: werkzeug
ERROR in 6766: name: symbol pkg: sympy
ERROR in 7915: name: b pkg: sympy
ERROR in 8504: name: mimetype pkg: werkzeug
ERROR in 9881: name: domain pkg: sympy
ERROR in 10503: name: n pkg: tqdm
ERROR in 12025: name: f pkg: sympy
ERROR in 13445: name: outputs pkg: theano
ERROR in 13541: name: sub pkg: theano
ERROR in 13957: name: headers pkg: werkzeug
ERROR in 15677: name: direct_passthrough pkg: werkzeug
ERROR in 17293: name: status pkg: werkzeug
ERROR in 20297: name: content_type pkg: werkzeug
ERROR in 413: name: public_key pkg: asn1crypto
ERROR in 993: name: address_family pkg: asn1crypto
ERROR in 1366: name: value pkg: asn1crypto
ERROR in 2623: name: value pkg: asn1crypto


In [13]:
def get_all_counts(data):
    all_paths = []
    all_counts = []
    for x in data:
        counter = 0
        for p in x['codepaths']:
            if p.path[0][0] == 'keyword' or len(p.path)>17:
                continue
            else:
                all_paths.append(" ".join(c[0] for c in p.path))
                counter += 1
        all_counts.append(counter)
    return all_paths, all_counts

def _alt_histogram(mc, bins):
    tally = [0]
    j =0 
    for i in mc:
        if i[0] > bins[j]:
            j+=1
            tally.append(0)
        tally[-1] += i[1]
    tot = sum(tally)
    lines = [[
            str.ljust("{}.".format("PathsPerPoint"), 10, " "),
            str.ljust("{}  ".format("Count"), 10, " "),
            str.ljust("{}".format("%-ile"), 10, " "),
            "\n",
        ]]
    
    c = 0 
    for b,t in list(zip(bins, tally)):
        c += t
        lines.append([
            str.ljust("{}.".format(b), 10, " "),
            str.ljust("{}  ".format(t), 10, " "),
            str.ljust("{:.5f}".format(100*c/tot), 10, " "),
            "\n",
        ])
    return " ".join(["".join(l) for l in lines])


def most_common_paths(data):
    all_paths, all_counts = get_all_counts(data)

    codepaths = Counter(all_paths)
    mc_codepaths = print_top_and_bottom(codepaths, TOP_N, "Top Paths", cols=1)    
    path_bins =  [0,1,10,100,1000,10000,100000]
    path_histogram = get_histogram(mc_codepaths, path_bins, "Code Paths")
    print(to_columns(path_histogram , 1, 90))
    
    codepaths_per_point = Counter(all_counts)
    
    mc_cpp = sorted(codepaths_per_point.most_common())

    bins = [0,1,2,5,10,20,50,100, 500,1000,10000,100000,1000000]
    print(_alt_histogram(mc_cpp, bins))

most_common_paths(train)

TOP 20 Top Paths                   
1. ('Name <- keyword <- Call -> keyword', 70490)
2. ('Name <- keyword <- Call -> keyword -> Name', 67035)
3. ('Name <- Call -> Name', 66587) 
4. ('Name <- Call -> Attribute', 30429)
5. ('Name <- keyword <- Call <- Assign <- If -> Assign -> Name', 28112)
6. ('Name <- Call -> Attribute -> Name', 24764)
7. ('Name <- Call <- Assign <- Try <- If -> Assign -> Name', 23698)
8. ('Name <- keyword <- Call <- Assign <- If -> Try -> Assign -> Call -> Name', 23693)
9. ('Name <- Call <- Assign <- Try <- If -> Assign -> Call -> keyword', 22707)
10. ('Name <- Call <- Assign <- Try <- If -> Assign -> Call -> keyword -> Name', 22696)
11. ('Name <- keyword <- Call <- Assign <- If -> Try -> Assign -> Call -> Attribute', 18505)
12. ('Name <- keyword <- Call <- Assign -> Tuple -> Name', 16538)
13. ('Name <- keyword <- Call <- Assign <- If -> Expr -> Call -> Name', 16297)
14. ('Name <- Call <- Assign <- Try -> ExceptHandler -> Return -> Call -> Name', 15832)
15. ('Name <- 

In [14]:
most_common_paths(valid)

TOP 20 Top Paths                   
1. ('Name <- Call -> Name', 4998)  
2. ('Name <- keyword <- Call -> keyword', 4941)
3. ('Name <- keyword <- Call -> keyword -> Name', 4707)
4. ('Name <- Assign <- FunctionDef <- ClassDef -> FunctionDef', 4553)
5. ('Name <- Assign <- FunctionDef <- ClassDef -> FunctionDef -> Assign -> Name', 4132)
6. ('Name <- Assign <- FunctionDef <- ClassDef -> FunctionDef -> Assign -> Call -> Name', 2725)
7. ('Name <- Assign <- FunctionDef <- ClassDef -> FunctionDef -> If -> Assign -> Name', 2668)
8. ('Attribute <- Assign <- FunctionDef <- ClassDef -> FunctionDef', 2456)
9. ('Name <- keyword <- Call <- Return <- FunctionDef <- ClassDef -> FunctionDef -> Assign -> Name', 2382)
10. ('Name <- Assign <- FunctionDef <- ClassDef -> FunctionDef -> Assign -> Call -> keyword', 2366)
11. ('Name <- keyword <- Call <- Return <- FunctionDef <- ClassDef -> FunctionDef', 2333)
12. ('Name <- Assign <- FunctionDef <- ClassDef -> FunctionDef -> Assign -> Call -> Attribute', 2229)
13