In [1]:
from collections import Counter

import numpy as np
from matplotlib import pyplot as plt

from project.data.preprocessed.unsplit import unsplit_data as UNSPLIT
from project.data.preprocessed.split import split_data as SPLIT

# Every record from the unsplit train, validation and test partitions,
# concatenated in that order into a single list.
TOTAL = [
    *UNSPLIT.train,
    *UNSPLIT.valid,
    *UNSPLIT.test,
]


In [215]:
def print_top_and_bottom(counter_obj, count, name, cols=2, width=35):
    """Print the most and least common entries of a Counter in columns.

    Args:
        counter_obj: a ``collections.Counter`` (anything exposing
            ``most_common()``).
        count: how many entries to list in each of the TOP and BOTTOM
            sections. Values <= 0 list nothing.
        name: label used in the two section headings.
        cols: number of columns passed to ``to_columns``.
        width: column width passed to ``to_columns``.

    Returns:
        The full ``counter_obj.most_common()`` list of ``(item, count)``
        pairs, most common first.
    """
    mc = counter_obj.most_common()

    # Guard the slice sizes: ``mc[-0:]`` is the *whole* list, so a count of 0
    # would otherwise print every entry in the BOTTOM section.
    n = max(count, 0)
    top_x = mc[:n]
    bottom_x = mc[-n:] if n else []

    top = "\n".join("{}. {}".format(i+1, x) for i, x in enumerate(top_x))
    # The bottom list is reversed so that entry 1 is the least common item.
    bottom = "\n".join("{}. {}".format(i+1, x) for i, x in enumerate(reversed(bottom_x)))

    s = '''TOP {count} {name}\n{top}\nBOTTOM {count} {name}\n{bottom}'''.format(
        count=count, name=name, top=top, bottom=bottom
    )
    print(to_columns(s, cols, width))
    return mc

def to_columns(string, cols, width):
    """Lay the lines of ``string`` out in ``cols`` side-by-side columns.

    The lines are split into ``cols`` consecutive runs (the first run becomes
    the first column, and so on). Each cell is left-justified to ``width``
    characters and every output row ends with a newline.

    Args:
        string: text whose ``"\\n"``-separated lines are to be arranged.
        cols: number of columns; must be >= 1.
        width: minimum width of every cell.

    Returns:
        The formatted text as a single string.

    Raises:
        ValueError: if ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError("cols must be >= 1, got {}".format(cols))

    lines = string.split("\n")
    # Ceiling division: with floor division the trailing lines were dropped
    # from the columns and the output loop then indexed past the end whenever
    # len(lines) was not a multiple of cols.
    lpc = (len(lines) + cols - 1) // cols
    columns = [lines[i*lpc:(i+1)*lpc] for i in range(cols)]

    # Later columns can come up short (or empty); pad them so that every
    # column has exactly lpc cells.
    for c in columns:
        c.extend(" " for _ in range(lpc - len(c)))

    final_text = []
    for row in range(lpc):
        for c in columns:
            final_text.append(c[row].ljust(width, " "))
        final_text.append('\n')
    return "".join(final_text)

def get_histogram(counter, bins, name):
    """Render a text histogram of occurrence counts.

    Args:
        counter: sequence of ``(item, count)`` pairs, e.g. the list returned
            by ``Counter.most_common()``. Only the count (index 1) is used.
        bins: bin edges forwarded to ``np.histogram``.
        name: label printed in the title line.

    Returns:
        A multi-line string with a title, a header row and one row per bin.
        "% of names" is the share of pairs whose count falls in the bin;
        "% of vars" is the share of the summed counts contributed by those
        pairs. The first bin is not printed (the row loop starts at bin
        index 1).
    """
    counts = [n[1] for n in counter]
    total = sum(counts)
    args = len(counter)
    # Maps a count value -> how many pairs have exactly that count.
    meta_counter = Counter(counts)

    bin_counts, edges = np.histogram(counts, bins)
    last_bin = len(bin_counts) - 1

    def percent(part, whole):
        # Empty input would otherwise divide by zero.
        return 100 * part / whole if whole else 0.0

    lines = [
        "Histogram: {}".format(name),
        "\n",
        str.ljust("Bin", 10, " "),
        str.rjust("Count", 7, " "),
        str.rjust("% of names", 11, " "),
        str.rjust("% of vars", 11, " "),
        "\n",
    ]
    for i in range(1, len(edges) - 1):
        bucket_min, bucket_max = edges[i], edges[i+1]
        lines.append(str.ljust("{}-{}".format(bucket_min, bucket_max), 10, " "))
        lines.append(str.rjust("{}".format(bin_counts[i]), 7, " "))
        lines.append(str.rjust("{:.3f}".format(percent(bin_counts[i], args)), 11, " "))

        # np.histogram bins are half-open [min, max) except the last one,
        # which also includes its right edge; mirror that here so both
        # percentage columns describe the same set of values.
        tot = sum(
            value * n
            for value, n in meta_counter.items()
            if bucket_min <= value
            and (value < bucket_max or (i == last_bin and value == bucket_max))
        )
        lines.append(str.rjust("{:.3f}".format(percent(tot, total)), 11, " "))

        lines.append("\n")
    return "".join(lines)
        
        

In [216]:
# Number of entries shown in each top/bottom listing.
TOP_N = 10
def counts():
    """Print frequency summaries of argument names and function names.

    For both the ``'arg_name'`` and ``'name'`` fields of the records in
    ``TOTAL`` this prints the ``TOP_N`` most/least common values, followed by
    the two count histograms side by side.
    """
    arg_names = Counter(x['arg_name'] for x in TOTAL)
    func_names = Counter(x['name'] for x in TOTAL)

    mc_arg_name = print_top_and_bottom(arg_names, TOP_N, "Argument Names")
    mc_func_name = print_top_and_bottom(func_names, TOP_N, "Function Names")

    name_bins = [0,1,2,3,4,5,10,20,50,100,200,500,3000]
    name_h = get_histogram(mc_arg_name, name_bins, "Arg Names")
    func_bins = [0,1,2,3,4,5,10,20,50,100,200]
    # Previously name_bins was passed here and func_bins was never used.
    func_h = get_histogram(mc_func_name, func_bins, "Func Names")

    # The two reports now have different heights. to_columns splits its input
    # at the midpoint, so pad both to the same number of lines to keep each
    # histogram in its own column.
    name_rows = name_h.split("\n")
    func_rows = func_h.split("\n")
    height = max(len(name_rows), len(func_rows))
    name_rows += [""] * (height - len(name_rows))
    func_rows += [""] * (height - len(func_rows))
    print(to_columns("\n".join(name_rows + func_rows), 2, 50))


counts()

TOP 10 Argument Names              BOTTOM 10 Argument Names           
1. ('name', 2387)                  1. ('extra_files', 1)              
2. ('x', 551)                      2. ('data2', 1)                    
3. ('kwargs', 375)                 3. ('power_matrix', 1)             
4. ('axis', 329)                   4. ('transport_kwargs', 1)         
5. ('dtype', 321)                  5. ('py_binary_name', 1)           
6. ('input', 292)                  6. ('show_eta', 1)                 
7. ('G', 289)                      7. ('n_targets', 1)                
8. ('a', 287)                      8. ('fractional', 1)               
9. ('value', 271)                  9. ('param_range', 1)              
10. ('inputs', 269)                10. ('show_cbar', 1)               

TOP 10 Function Names              BOTTOM 10 Function Names           
1. ('fit', 143)                    1. ('load_pkcs1_openssl_der', 1)   
2. ('transform', 130)              2. ('get_executable_path', 1)      
3. ('

In [249]:
def check_for_duplicates():
    """Report how often identical (argument name, description) pairs occur.

    Descriptions are lower-cased before counting. Prints the ``TOP_N`` most
    frequent pairs and then a histogram of the pair counts.
    """
    # Count tuples directly instead of joining the two fields with a sentinel
    # string and splitting later, which would mis-split any text that happens
    # to contain the sentinel.
    pair_counts = Counter((x['arg_name'], x['arg_desc'].lower()) for x in TOTAL)
    mc = pair_counts.most_common()
    print("Check for Unique Arg Desc\n")
    print("{}".format("N. Count  [Arg, Desc]"))

    for i, (arg_desc, c) in enumerate(mc[:TOP_N]):
        line = [
            str.ljust("{}".format(i+1), 3, " "),
            str.ljust("{}".format(c), 7, " "),
            # Shown as a two-element list: [name, description].
            "{}".format(list(arg_desc)),
            "\n"
        ]
        print("".join(line))

    print()
    print()
    name_bins = [0,1,2,3,4,5,10,20,50,100,200,500,3000]
    name_h = get_histogram(mc, name_bins, "Unique Names + Desc")
    print(name_h)

check_for_duplicates()

Check for Unique Arg Desc

N. Count  [Arg, Desc]
1  1316   ['name', ' a name for the operation (optional).']

2  117    ['timeout', ' the amount of time, in seconds, to wait for the request to complete.\nnote that if ``retry`` is specified, the timeout applies to each individual\nattempt.']

3  117    ['retry', ' a retry object used to retry requests. if ``none`` is specified,\nrequests will not be retried.']

4  83     ['name', ' optional op name.']

5  83     ['options', ' overrides the default settings for this call, e.g, timeout, retries\netc.']

6  68     ['G', ' a networkx graph']

7  57     ['image', ' input image.']

8  56     ['random_state', ' if int, random_state is the seed used by the random number generator;\nif randomstate instance, random_state is the random number generator;\nif none, the random number generator is the randomstate instance used\nby `np.random`.']

9  56     ['name', ' an optional variable_scope name.']

10 49     ['name', ' a name for this operation (o