In [None]:
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt

from project.data.preprocessed.unsplit import unsplit_data as UNSPLIT
from project.data.preprocessed.split import split_data as SPLIT

# Pool the train, valid and test partitions of the unsplit dataset into a
# single flat list; the statistics cells below all iterate over TOTAL.
TOTAL = [*UNSPLIT.train, *UNSPLIT.valid, *UNSPLIT.test]


In [None]:
def print_top_and_bottom(counter_obj, count, name, cols=2, width=35):
    """Print the most and least frequent entries of a counter in columns.

    Args:
        counter_obj: Object exposing ``most_common()`` (e.g. a
            ``collections.Counter``).
        count: Number of entries to list from each end of the ranking.
            A value of zero or less lists nothing.
        name: Label used in the "TOP"/"BOTTOM" headings.
        cols: Number of columns passed on to ``to_columns``.
        width: Column width passed on to ``to_columns``.

    Returns:
        The full ``counter_obj.most_common()`` list.
    """
    ranked = counter_obj.most_common()

    # ``ranked[-0:]`` is the *whole* list, so a non-positive count has to be
    # handled explicitly instead of relying on slicing.
    if count > 0:
        top_x = ranked[:count]
        bottom_x = ranked[-count:]
    else:
        top_x, bottom_x = [], []

    def numbered(entries):
        # "1. entry", "2. entry", ... one per line.
        return "\n".join(
            "{}. {}".format(rank, entry)
            for rank, entry in enumerate(entries, start=1)
        )

    top = numbered(top_x)
    # Reversed so that rank 1 of the bottom list is the least common entry.
    bottom = numbered(reversed(bottom_x))

    s = "TOP {count} {name}\n{top}\nBOTTOM {count} {name}\n{bottom}".format(
        count=count, name=name, top=top, bottom=bottom
    )
    print(to_columns(s, cols, width))
    return ranked

def to_columns(string, cols, width):
    """Lay the lines of ``string`` out in ``cols`` side-by-side columns.

    The lines are split into ``cols`` consecutive groups of equal height
    (the first group becomes the first column, and so on). When the number
    of lines is not a multiple of ``cols`` the last column(s) are padded
    with blank cells instead of raising ``IndexError``.

    Args:
        string: Text whose ``"\\n"``-separated lines are to be arranged.
        cols: Number of columns; must be at least 1.
        width: Each cell is left-justified to this many characters.

    Returns:
        The rendered text; every row is terminated by a newline.

    Raises:
        ValueError: If ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError("cols must be at least 1")

    lines = string.split("\n")
    # Ceiling division so that no line is left without a column.
    per_column = -(-len(lines) // cols)
    columns = [lines[k * per_column:(k + 1) * per_column] for k in range(cols)]

    # Pad short (or empty) trailing columns so every row has ``cols`` cells.
    for column in columns:
        column.extend([""] * (per_column - len(column)))

    rendered = []
    for row_cells in zip(*columns):
        rendered.extend(cell.ljust(width) for cell in row_cells)
        rendered.append("\n")
    return "".join(rendered)

def get_histogram(counter, bins, name):
    """Render a text table of how occurrence counts are distributed over bins.

    Args:
        counter: Sequence of ``(item, occurrences)`` pairs, e.g. the list
            returned by ``Counter.most_common()``.
        bins: Bin specification forwarded to ``numpy.histogram``.
        name: Title printed on the first line of the table.

    Returns:
        A string with a title line, a header line and one row per bin
        (except the first bin). Each row shows the bin range, the number of
        items whose occurrence count falls in the bin, that number as a
        percentage of all items, and the percentage of all occurrences
        contributed by those items.
    """
    counts = [pair[1] for pair in counter]
    total = sum(counts)
    n_items = len(counter)
    # Maps an occurrence count to the number of items having that count.
    occurrences = Counter(counts)

    hist, edges = np.histogram(counts, bins)
    last_bin = len(hist) - 1

    def percent(part, whole):
        # Empty input would otherwise divide by zero.
        return 100 * part / whole if whole else 0.0

    lines = [
        "Histogram: {}".format(name),
        "\n",
        "Bin".ljust(10),
        "Count".rjust(7),
        "% of names".rjust(11),
        "% of vars".rjust(11),
        "\n",
    ]
    # The first bin (index 0) is not reported; with the edge lists used in
    # this notebook that is the 0-1 bucket -- presumably skipped because an
    # occurrence count is never below 1 (confirm before changing).
    for idx in range(1, len(edges) - 1):
        lo, hi = edges[idx], edges[idx + 1]
        in_bin = hist[idx]
        lines.append("{}-{}".format(lo, hi).ljust(10))
        lines.append("{}".format(in_bin).rjust(7))
        lines.append("{:.3f}".format(percent(in_bin, n_items)).rjust(11))

        # Use the same membership rule as numpy.histogram: bins are
        # half-open [lo, hi) except the last one, which also includes hi.
        covered = sum(
            value * freq
            for value, freq in occurrences.items()
            if lo <= value < hi or (idx == last_bin and value == hi)
        )
        lines.append("{:.3f}".format(percent(covered, total)).rjust(11))

        lines.append("\n")
    return "".join(lines)
        
        

In [None]:
TOP_N = 10


def counts():
    """Print frequency rankings and occurrence histograms for names in TOTAL.

    Counts the ``'arg_name'`` and ``'name'`` fields of every record in the
    module-level ``TOTAL`` list, prints the ``TOP_N`` most and least common
    of each, then prints the two occurrence histograms side by side.
    """
    arg_names = Counter(x['arg_name'] for x in TOTAL)
    func_names = Counter(x['name'] for x in TOTAL)

    mc_arg_name = print_top_and_bottom(arg_names, TOP_N, "Argument Names")
    mc_func_name = print_top_and_bottom(func_names, TOP_N, "Function Names")

    name_bins = [0, 1, 2, 3, 4, 5, 10, 20, 50, 100, 200, 500, 3000]
    func_bins = [0, 1, 2, 3, 4, 5, 10, 20, 50, 100, 200]
    name_lines = get_histogram(mc_arg_name, name_bins, "Arg Names").split("\n")
    # The function-name histogram uses its own edges; name_bins was being
    # passed here while func_bins sat unused.
    func_lines = get_histogram(mc_func_name, func_bins, "Func Names").split("\n")

    # to_columns splits its input into halves by line count, so both tables
    # are padded to the same height to keep each one in its own column.
    height = max(len(name_lines), len(func_lines))
    name_lines += [""] * (height - len(name_lines))
    func_lines += [""] * (height - len(func_lines))
    print(to_columns("\n".join(name_lines + func_lines), 2, 50))


counts()

In [None]:
TOP_N = 20


def check_for_duplicates():
    """Report how often identical (arg name, lower-cased description) pairs recur.

    Prints the ``TOP_N`` most frequent pairs found in the module-level
    ``TOTAL`` records, followed by a histogram of the pair frequencies.
    """
    separator = "<!!S!!>"

    # Key each record by its name and lower-cased description glued together
    # with a marker string, so the pair can be split apart again for display.
    pair_counts = Counter()
    for record in TOTAL:
        pair_counts[record['arg_name'] + separator + record['arg_desc'].lower()] += 1
    ranked = pair_counts.most_common()

    print("Check for Unique Arg Desc\n")
    print("N. Count  [Arg, Desc]")

    for position, (joined, occurrences) in enumerate(ranked[:TOP_N], start=1):
        # The trailing "\n" plus print's own newline leaves a blank line
        # between entries.
        print(
            str(position).ljust(3)
            + str(occurrences).ljust(7)
            + str(joined.split(separator))
            + "\n"
        )

    # Two blank lines before the histogram.
    print("\n")
    histogram_edges = [0, 1, 2, 3, 4, 5, 10, 20, 50, 100, 200, 3000]
    print(get_histogram(ranked, histogram_edges, "Unique Names + Desc"))


check_for_duplicates()

In [None]:

def count_descs_per_arg():
    """Print, for the most common argument names, their top packages and descriptions.

    Reads the ``'arg_name'``, ``'arg_desc'`` and ``'pkg'`` fields of every
    record in the module-level ``TOTAL`` list. Descriptions are compared
    case-insensitively (lower-cased) when tallied.
    """
    ARGS = 10       # number of argument names to report
    TOP_DESC = 5    # max rows printed under each argument name
    trim = 60       # descriptions longer than this are cut and marked

    arg_names = Counter(x['arg_name'] for x in TOTAL)
    mc = arg_names.most_common()

    # Per argument name: (description counts, package counts). Every
    # description is lower-cased -- including the first one seen for a name,
    # which was being stored verbatim and so counted separately from
    # otherwise identical later descriptions.
    tally = {}
    for d in TOTAL:
        descs, pkgs = tally.setdefault(d['arg_name'], (Counter(), Counter()))
        descs[d['arg_desc'].lower()] += 1
        pkgs[d['pkg']] += 1

    for i, (arg, c) in enumerate(mc[:ARGS]):
        descs, pkgs = tally[arg]

        line = [
            "{}.".format(i + 1).ljust(3),
            "{}".format(arg).ljust(7),
            "{} ".format(c).rjust(5),
            "\n",
            "    (TOP PKG) ".ljust(11),
            "".ljust(11),
            "|  ".ljust(6),
            "  (TOP DESC)".ljust(5),
            "".ljust(5),
            "\n"
        ]

        # The two rankings are independent; zip only places the n-th most
        # common description next to the n-th most common package for
        # display, and stops at the shorter of the two lists.
        paired = list(zip(descs.most_common(), pkgs.most_common()))[:TOP_DESC]
        for (desc, cd), (repo, cr) in paired:
            ellipse = " [...]" if len(desc) > trim else ""
            line.extend([
                "    ({}) ".format(cr).ljust(11),
                "{}".format(repo).ljust(14),
                "|".ljust(3),
                " ({})".format(cd).ljust(7),
                "{}".format(desc[:trim] + ellipse).ljust(5),
                "\n"
            ])
        print("".join(line))


count_descs_per_arg()

In [None]:

def count_args_per_desc():
    """Print, for the most common descriptions, their top argument names and packages.

    Reads the ``'arg_desc'``, ``'arg_name'`` and ``'pkg'`` fields of every
    record in the module-level ``TOTAL`` list. Descriptions are stripped and
    lower-cased before being compared; argument names are lower-cased.
    """
    ARGS = 30       # number of descriptions to report
    TOP_DESC = 5    # max rows printed under each description

    arg_desc = Counter(x['arg_desc'].strip().lower() for x in TOTAL)
    mc = arg_desc.most_common()

    # Per normalised description: (argument-name counts, package counts).
    # Every name is lower-cased -- including the first one seen for a
    # description, which was being stored verbatim and so counted
    # separately from the same name seen later.
    tally = {}
    for d in TOTAL:
        key = d['arg_desc'].strip().lower()
        names, pkgs = tally.setdefault(key, (Counter(), Counter()))
        names[d['arg_name'].lower()] += 1
        pkgs[d['pkg']] += 1

    for i, (desc, c) in enumerate(mc[:ARGS]):
        names, pkgs = tally[desc]

        line = [
            "{}.".format(i + 1).ljust(3),
            "({})  ".format(c).rjust(5),
            "{}".format(desc).ljust(7),
            "\n",
        ]

        # The two rankings are independent; zip only places the n-th most
        # common name next to the n-th most common package for display, and
        # stops at the shorter of the two lists.
        paired = list(zip(names.most_common(), pkgs.most_common()))[:TOP_DESC]
        for (name, cn), (repo, cr) in paired:
            line.extend([
                "            {}".format(name).ljust(30),
                "({})".format(cn).ljust(10),
                "|".ljust(7),
                "{}".format(repo).ljust(15),
                " ({}) ".format(cr).ljust(7),
                "\n"
            ])
        print("".join(line))


count_args_per_desc()