In [286]:
import numpy as np
import pandas as pd
import string
import wiggum as wg
import statistics
import time
import warnings
warnings.filterwarnings('ignore')

In [287]:
def generate_synthetic_scalability_data(data, random_seed, n_view, num_dep_indep, num_splitby):
    np.random.seed(random_seed)
    labeled_df = wg.LabeledDataFrame(data)    
    
    # set dependent and independent for some xi, ignore for the rest
    dep_indep_list = np.random.choice(n_view*2, num_dep_indep)
    #print(dep_indep_list)
    roles = {'x'+str(i+1):['ignore'] if i not in dep_indep_list else ['independent','dependent'] 
             for i in range(n_view*2)}
    
    # vars without 'x' in them are splitbys
    splitby_var_list = [cn for cn in data.columns if not('x' in cn)]
    # set splitby for some variable, ignore for the rest
    splitby_list = np.random.choice(splitby_var_list, num_splitby)
    roles.update( {c:['splitby'] if c in splitby_list else ['ignore'] for c in splitby_var_list})    
    
    count_list = []

    var_types = {'x'+str(i+1):'continuous' for i in range(n_view*2)}
    var_types.update( {c:'categorical' for c in splitby_var_list})
    weighting = {}
    
    labeled_df.set_counts(count_list)
    labeled_df.set_roles(roles)
    labeled_df.set_var_types(var_types)
    labeled_df.meta_df

    all_pearson_obj = wg.All_Pearson()
    time = %timeit -or10 -n10 -q labeled_df.get_subgroup_trends_1lev([all_pearson_obj])

    #data_size = len(data)
    #save_directory = '../data/synthetic_scalability_' + str(data_size) + \
    #                    '_' + str(num_dep_indep) + '_' + str(num_splitby)    
    #labeled_df.to_csvs(save_directory)    
    
    return time

In [288]:
def test_scalability(data, n_view, num_dep_indep, num_splitby):
    time_list = []
    num_round = 10
    
    random_seed_list = np.random.randint(100000, size=(num_round))

    for i in range(num_round):
        random_seed = random_seed_list[i]
        time = generate_synthetic_scalability_data(data, random_seed, n_view, num_dep_indep, num_splitby)
        time_list.append(time.timings)

    # flatten a 2D list to 1D
    time_list = [j for sub in time_list for j in sub]

    m = statistics.mean(time_list)
    stdev = statistics.stdev(time_list)

    row = {'size':len(data), 'num_dep_indep':num_dep_indep, 'number_splitby':num_splitby, 
               'num_round':num_round, 'time_mean':m, 'time_std': stdev}

    return row

# Initialize the result dataframe

In [289]:
# Column layout of the benchmark summary table, which starts out empty.
result_columns = [
    'size',
    'num_dep_indep',
    'number_splitby',
    'cluster',
    'num_round',
    'time_mean',
    'time_std',
]
result = pd.DataFrame(columns=result_columns)

In [290]:
# Display the result frame before the benchmark runs (header only at this point).
result

Unnamed: 0,size,num_dep_indep,number_splitby,cluster,num_round,time_mean,time_std


In [291]:
# Benchmark grid: every combination of data size, cluster count, number of
# dependent/independent variables and number of splitby variables is timed.
data_size_list = [1000, 10000, 100000]
cluster_list = [8, 16, 32]
num_dep_indep_list = [4, 8, 16]
num_splitby_list = [4, 8, 16]

start_time = time.time()

# Collect rows in a plain list and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, making the loop quadratic in the number of rows.
scalability_rows = []

for data_size in data_size_list:
    for cluster in cluster_list:
        file = '../data/scalability_test/synthetic_scalability_' + str(data_size) + \
                    '_cluster' + str(cluster) + '.csv'
        data = pd.read_csv(file)

        # NOTE(review): assumes each view contributes three columns to the
        # file (the benchmark labels 2 * n_view of them as x-variables) -- confirm
        # against the data generator.
        n_view = len(data.columns) // 3

        for num_dep_indep in num_dep_indep_list:
            for num_splitby in num_splitby_list:
                row = test_scalability(data, n_view, num_dep_indep, num_splitby)
                row["cluster"] = cluster
                scalability_rows.append(row)

# Keep the column order of the pre-initialised `result` frame, and preserve any
# rows it already holds (it is empty after a fresh Restart & Run All).
new_results = pd.DataFrame(scalability_rows, columns=result.columns)
result = new_results if result.empty else pd.concat([result, new_results], ignore_index=True)

print("--- %s seconds ---" % (time.time() - start_time))

--- 4566.450034856796 seconds ---


In [292]:
# Display the collected benchmark summary (pandas truncates the middle rows).
result

Unnamed: 0,size,num_dep_indep,number_splitby,cluster,num_round,time_mean,time_std
0,1000.0,4.0,4.0,8.0,10.0,0.009655,0.007589
1,1000.0,4.0,8.0,8.0,10.0,0.011051,0.011993
2,1000.0,4.0,16.0,8.0,10.0,0.014106,0.021296
3,1000.0,8.0,4.0,8.0,10.0,0.013284,0.018819
4,1000.0,8.0,8.0,8.0,10.0,0.019254,0.036301
...,...,...,...,...,...,...,...
76,100000.0,8.0,8.0,32.0,10.0,0.053870,0.140196
77,100000.0,8.0,16.0,32.0,10.0,0.086177,0.236743
78,100000.0,16.0,4.0,32.0,10.0,0.093313,0.258596
79,100000.0,16.0,8.0,32.0,10.0,0.170834,0.484227


In [293]:
# Persist the benchmark summary as CSV, omitting the positional row index.
file = '../data/scalability_test/result3.csv'
result.to_csv(path_or_buf=file, index=False)