In [44]:
import numpy as np
import pandas as pd
import string
import wiggum as wg
import statistics
import time
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [45]:
global date_time
date_time = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")

In [46]:
def add_meta(data, random_seed, n_view, num_dep_indep, num_splitby, trial):
    np.random.seed(random_seed)
    labeled_df = wg.LabeledDataFrame(data)    
    
    # set dependent and independent for some xi, ignore for the rest
    dep_indep_list = np.random.choice(n_view*2, num_dep_indep)
    #print(dep_indep_list)
    roles = {'x'+str(i+1):['ignore'] if i not in dep_indep_list else ['independent','dependent'] 
             for i in range(n_view*2)}
    
    # vars without 'x' in them are splitbys
    splitby_var_list = [cn for cn in data.columns if not('x' in cn)]
    # set splitby for some variable, ignore for the rest
    splitby_list = np.random.choice(splitby_var_list, num_splitby)
    roles.update( {c:['splitby'] if c in splitby_list else ['ignore'] for c in splitby_var_list})    
    
    count_list = []

    var_types = {'x'+str(i+1):'continuous' for i in range(n_view*2)}
    var_types.update( {c:'categorical' for c in splitby_var_list})
    weighting = {}
    
    labeled_df.set_counts(count_list)
    labeled_df.set_roles(roles)
    labeled_df.set_var_types(var_types)
    labeled_df.meta_df

    # save metadata for tracking results
    data_size = len(data) 
    
    directory = '../data/scalability_test/'+ date_time
    if not(os.path.isdir(directory)):
        os.mkdir(directory)
    
    save_directory = directory +'/meta_' + str(data_size) + '_' + str(num_dep_indep) + '_' + str(num_splitby)      
    if not(os.path.isdir(save_directory)):
        os.mkdir(save_directory)

    meta_csv = 'meta' + str(trial) + '.csv'
    meta_file = os.path.join(save_directory, meta_csv)
    labeled_df.meta_df.to_csv(meta_file)    
    
    return labeled_df

In [47]:
def test_scalability(data, n_view, num_dep_indep, num_splitby, num_trial):
    temp_result = pd.DataFrame(columns=['size', 'num_dep_indep', 'number_splitby', 
                                        'cluster', 'trial', 'timings','timings_mean', 'timings_std'])
    
    random_seed_list = np.random.randint(100000, size=(num_trial))

    for i in range(num_trial):
        random_seed = random_seed_list[i]
        labeled_df = add_meta(data, random_seed, n_view, num_dep_indep, num_splitby, i)
        
        all_pearson_obj = wg.All_Pearson()
        
        # timing
        time = %timeit -or10 -n100 -q labeled_df.get_subgroup_trends_1lev([all_pearson_obj])

        m = statistics.mean(time.timings)
        stdev = statistics.stdev(time.timings)
        timeings_round = [round(time, 6) for time in time.timings]

        row = {'size':len(data), 'num_dep_indep':num_dep_indep, 'number_splitby':num_splitby, 'cluster': 0,
                   'trial':i, 'timings': timeings_round,'timings_mean':m, 'timings_std': stdev}

        temp_result = temp_result.append(row, ignore_index=True)

    return temp_result

# Running test

In [48]:
result = pd.DataFrame(columns=['size', 'num_dep_indep', 'number_splitby', 
                               'cluster', 'trial', 'timings', 'timings_mean', 'timings_std'])

data_size_list = [1000, 10000, 100000]
cluster_list = [2, 4, 8, 16, 32]
num_dep_indep_list = [4, 8, 16]
num_splitby_list = [4, 8, 16]
num_trial = 10

#data_size_list = [1000]
#cluster_list = [8]
#num_dep_indep_list = [4]
#num_splitby_list = [4]
#num_trial = 3

start_time = time.time()

for data_size in data_size_list:
    for cluster in cluster_list:      
        file = '../data/scalability_test/synthetic_scalability_' + str(data_size) + \
                    '_cluster' + str(cluster) + '.csv'
        data = pd.read_csv(file)
        
        n_view = int(len(data.columns) / 3)

        for num_dep_indep in num_dep_indep_list:
            for num_splitby in num_splitby_list:  
                
                temp_result = test_scalability(data, n_view, num_dep_indep, num_splitby, num_trial)
                temp_result["cluster"] = cluster
                
                result = result.append(temp_result)
                
print("--- %s seconds ---" % (time.time() - start_time))

--- 14209.820285081863 seconds ---


In [49]:
result

Unnamed: 0,size,num_dep_indep,number_splitby,cluster,trial,timings,timings_mean,timings_std
0,1000,4,4,2,0,"[0.012889, 0.00942, 0.01023, 0.010135, 0.00980...",0.009421,0.001630
1,1000,4,4,2,1,"[0.009812, 0.007949, 0.007829, 0.007805, 0.007...",0.008053,0.000719
2,1000,4,4,2,2,"[0.008158, 0.007471, 0.007526, 0.007, 0.006817...",0.007213,0.000423
3,1000,4,4,2,3,"[0.0091, 0.007101, 0.007433, 0.007128, 0.00710...",0.007365,0.000620
4,1000,4,4,2,4,"[0.008584, 0.006857, 0.007034, 0.007088, 0.007...",0.007144,0.000513
...,...,...,...,...,...,...,...,...
5,100000,16,16,32,5,"[0.281563, 0.013566, 0.013541, 0.013764, 0.013...",0.040438,0.084723
6,100000,16,16,32,6,"[0.316548, 0.014747, 0.014649, 0.014651, 0.014...",0.044857,0.095462
7,100000,16,16,32,7,"[0.246255, 0.012497, 0.012662, 0.012587, 0.012...",0.035947,0.073895
8,100000,16,16,32,8,"[0.366552, 0.016075, 0.015889, 0.016002, 0.015...",0.050993,0.110876


In [50]:
file = '../data/scalability_test/'+ date_time +'/result.csv'
result.to_csv(file ,index=False)

In [51]:
date_time

'2021_06_27-06:54:22_PM'