In [13]:
import pandas as pd

# pandas_timing.log

In [14]:
# Parse logs/pandas_timing.log into a long-format table with one row per timed trial.
# The parser assumes one section per row count (split on "END ROW"), each holding one
# sub-section per benchmarked function (split on "END FUNC"), in the same order as
# the lists below; a shorter log raises IndexError.
with open('logs/pandas_timing.log', 'r') as log_file:
    data = log_file.read()

regex = r'Elapsed time: (\d+\.\d+) ns'

# Collect the per-(nrows, func) frames and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with an
# empty DataFrame makes the resulting dtypes depend on the pandas version.
frames = []

row_samples = data.split('\nEND ROW\n')
for n, nrows in enumerate([1000, 10_000, 100_000, 1_000_000, 10_000_000]):
    func_samples = row_samples[n].split('\nEND FUNC\n')
    for i, func in enumerate(['add', 'mul', 'cumsum', 'abs', 'mean', 'std', 'count', 'unique']):
        s = pd.Series(func_samples[i].split('\n'))
        # extractall returns the captured text, so the timings stay as the
        # strings written in the log (e.g. '389800.00').
        times = s.str.extractall(regex)[0].values

        frames.append(pd.DataFrame({
            'trial': range(len(times)),
            'nrows': nrows,
            'func': func,
            'time (ns)': times,
        }))

# No ignore_index: each group keeps its own 0..k-1 index, as displayed below.
df = pd.concat(frames)

# df.to_csv('logs/pandas_timing.csv', index=False)
df

Unnamed: 0,trial,nrows,func,time (ns)
0,0,1000,add,389800.00
1,1,1000,add,165900.00
2,2,1000,add,199700.00
3,3,1000,add,175600.00
4,4,1000,add,156900.00
...,...,...,...,...
0,0,10000000,unique,949356931.00
1,1,10000000,unique,987428267.00
2,2,10000000,unique,1099841863.00
3,3,10000000,unique,1010230566.00


# annotation_timing_memory.log

In [39]:
# Parse logs/annotation_timing_memory.log into a long-format table with one row per
# timed annotation trial, carrying the unannotated/annotated DataFrame sizes.
# The parser assumes one section per row count (split on "END ROW"), each holding one
# sub-section per bin count (split on "END BIN"), in the same order as the lists
# below; a shorter log raises IndexError.
with open('logs/annotation_timing_memory.log', 'r') as log_file:
    data = log_file.read()

unannotated_regex = r'Unannotated DataFrame size: (\d+) bytes'
annotated_regex = r'Annotated DataFrame size: (\d+) bytes'
time_regex = r'Time to annotate: (\d+\.\d+) ns'

# Collect the per-(nrows, nbins) frames and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with an
# empty DataFrame makes the resulting dtypes depend on the pandas version.
frames = []

row_samples = data.split('\nEND ROW\n')
for n, nrows in enumerate([1000, 10_000, 100_000, 1_000_000, 10_000_000]):
    row_sample = row_samples[n]
    bin_samples = row_sample.split('\nEND BIN\n')

    # The unannotated size is taken from the first match anywhere in the row section.
    bs = pd.Series(row_sample)
    unannotated = bs.str.extractall(unannotated_regex)[0].values[0]

    for b, nbins in enumerate([10, 100, 1000, 10_000, 100_000, 1_000_000]):
        # Configurations with at least as many bins as rows are skipped
        # (presumably they are absent from the log -- confirm against the benchmark).
        if nbins >= nrows:
            continue

        s = pd.Series(bin_samples[b].split('\n'))

        # extractall returns the captured text, so sizes and timings stay as the
        # strings written in the log.
        times = s.str.extractall(time_regex)[0].values
        annotated = s.str.extractall(annotated_regex)[0].values[0]
        print(annotated)

        frames.append(pd.DataFrame({
            'trial': range(len(times)),
            'nrows': nrows,
            'nbins': nbins,
            'unannotated_size': unannotated,
            'annotated_size': annotated,
            'time (ns)': times,
        }))

# No ignore_index: each group keeps its own 0..k-1 index, as displayed below.
df = pd.concat(frames)

df.to_csv('logs/annotation_timing_memory.csv', index=False)
df

47460
52768
470460
475768
529064
4700460
4705768
4849064
5224232
47000460
47005768
48049064
48424232
53713576
470000460
470005768
480049064
480424232
503713576
549816616


Unnamed: 0,trial,nrows,nbins,unannotated_size,annotated_size,time (ns)
0,0,1000,10,46000,47460,5136501.00
1,1,1000,10,46000,47460,1630200.00
2,2,1000,10,46000,47460,1442500.00
3,3,1000,10,46000,47460,1317300.00
4,4,1000,10,46000,47460,1310200.00
...,...,...,...,...,...,...
0,0,10000000,1000000,460000000,549816616,46698905446.00
1,1,10000000,1000000,460000000,549816616,47704930965.00
2,2,10000000,1000000,460000000,549816616,47733633702.00
3,3,10000000,1000000,460000000,549816616,48191669692.00


# filter_timing_equals.log

In [16]:
# Parse logs/filter_timing_equals.log into a long-format table with one row per
# timed filter trial.
# The parser assumes nested sections split on "END ROW" > "END BIN" > "END FILTER" >
# "END SKETCH", each level ordered like the corresponding list below; a shorter log
# raises IndexError.
with open('logs/filter_timing_equals.log', 'r') as log_file:
    data = log_file.read()

regex = r'Time to filter: (\d+\.\d+) ns'

# Collect the per-configuration frames and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with an
# empty DataFrame makes the resulting dtypes depend on the pandas version.
frames = []

row_samples = data.split('\nEND ROW\n')
for n, nrows in enumerate([1000, 10_000, 100_000, 1_000_000, 10_000_000]):
    bin_samples = row_samples[n].split('\nEND BIN\n')

    for b, nbins in enumerate([10, 100, 1000, 10_000, 100_000, 1_000_000]):
        # Configurations with at least as many bins as rows are skipped
        # (presumably they are absent from the log -- confirm against the benchmark).
        if nbins >= nrows:
            continue

        filter_samples = bin_samples[b].split('\nEND FILTER\n')

        # `filter_name` instead of `filter`, to avoid shadowing the builtin.
        for fi, filter_name in enumerate(['filter1', 'filter2', 'filter3']):
            sk_samples = filter_samples[fi].split('\nEND SKETCH\n')

            for sk, sketch_size in enumerate([1, 2, 4, 8]):
                s = pd.Series(sk_samples[sk].split('\n'))
                # extractall returns the captured text, so the timings stay as
                # the strings written in the log.
                times = s.str.extractall(regex)[0].values

                frames.append(pd.DataFrame({
                    'trial': range(len(times)),
                    'nrows': nrows,
                    'nbins': nbins,
                    'filter': filter_name,
                    'sketch_size': sketch_size,
                    'time (ns)': times,
                }))

# No ignore_index: each group keeps its own 0..k-1 index, as displayed below.
df = pd.concat(frames)

df.to_csv('logs/filter_timing_equals.csv', index=False)
df

Unnamed: 0,trial,nrows,nbins,filter,sketch_size,time (ns)
0,0,1000,10,filter1,1,418200.00
1,1,1000,10,filter1,1,379500.00
2,2,1000,10,filter1,1,328100.00
3,3,1000,10,filter1,1,326000.00
4,4,1000,10,filter1,1,322300.00
...,...,...,...,...,...,...
0,0,10000000,1000000,filter3,8,50917601.00
1,1,10000000,1000000,filter3,8,58347502.00
2,2,10000000,1000000,filter3,8,53860201.00
3,3,10000000,1000000,filter3,8,52087901.00


# filter_timing_isin.log

In [17]:
# Parse logs/filter_timing_isin.log into a long-format table with one row per
# timed filter trial.
# The parser assumes nested sections split on "END ROW" > "END BIN" > "END FILTER" >
# "END SKETCH", each level ordered like the corresponding list below; a shorter log
# raises IndexError.
with open('logs/filter_timing_isin.log', 'r') as log_file:
    data = log_file.read()

regex = r'Time to filter: (\d+\.\d+) ns'

# Collect the per-configuration frames and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with an
# empty DataFrame makes the resulting dtypes depend on the pandas version.
frames = []

row_samples = data.split('\nEND ROW\n')
for n, nrows in enumerate([1000, 10_000, 100_000, 1_000_000, 10_000_000]):
    bin_samples = row_samples[n].split('\nEND BIN\n')

    for b, nbins in enumerate([10, 100, 1000, 10_000, 100_000, 1_000_000]):
        # Configurations with at least as many bins as rows are skipped
        # (presumably they are absent from the log -- confirm against the benchmark).
        if nbins >= nrows:
            continue

        filter_samples = bin_samples[b].split('\nEND FILTER\n')

        # `filter_name` instead of `filter`, to avoid shadowing the builtin.
        for fi, filter_name in enumerate(['filter1', 'filter2', 'filter3']):
            sk_samples = filter_samples[fi].split('\nEND SKETCH\n')

            for sk, sketch_size in enumerate([1, 2, 4, 8]):
                s = pd.Series(sk_samples[sk].split('\n'))
                # extractall returns the captured text, so the timings stay as
                # the strings written in the log.
                times = s.str.extractall(regex)[0].values

                frames.append(pd.DataFrame({
                    'trial': range(len(times)),
                    'nrows': nrows,
                    'nbins': nbins,
                    'filter': filter_name,
                    'sketch_size': sketch_size,
                    'time (ns)': times,
                }))

# No ignore_index: each group keeps its own 0..k-1 index, as displayed below.
df = pd.concat(frames)

df.to_csv('logs/filter_timing_isin.csv', index=False)
df

Unnamed: 0,trial,nrows,nbins,filter,sketch_size,time (ns)
0,0,1000,10,filter1,1,505700.00
1,1,1000,10,filter1,1,431300.00
2,2,1000,10,filter1,1,413700.00
3,3,1000,10,filter1,1,423500.00
4,4,1000,10,filter1,1,395300.00
...,...,...,...,...,...,...
0,0,10000000,1000000,filter3,8,71631001.00
1,1,10000000,1000000,filter3,8,71989601.00
2,2,10000000,1000000,filter3,8,70270701.00
3,3,10000000,1000000,filter3,8,71443602.00


# filter_timing_multi.log

In [18]:
# Parse logs/filter_timing_multi.log into a long-format table with one row per
# timed filter trial.
# The parser assumes nested sections split on "END ROW" > "END BIN" > "END FILTER" >
# "END SKETCH", each level ordered like the corresponding list below; a shorter log
# raises IndexError.
with open('logs/filter_timing_multi.log', 'r') as log_file:
    data = log_file.read()

regex = r'Time to filter: (\d+\.\d+) ns'

# Collect the per-configuration frames and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop is quadratic, and seeding it with an
# empty DataFrame makes the resulting dtypes depend on the pandas version.
frames = []

row_samples = data.split('\nEND ROW\n')
for n, nrows in enumerate([1000, 10_000, 100_000, 1_000_000, 10_000_000]):
    bin_samples = row_samples[n].split('\nEND BIN\n')

    for b, nbins in enumerate([10, 100, 1000, 10_000, 100_000, 1_000_000]):
        # Configurations with at least as many bins as rows are skipped
        # (presumably they are absent from the log -- confirm against the benchmark).
        if nbins >= nrows:
            continue

        filter_samples = bin_samples[b].split('\nEND FILTER\n')

        # `filter_name` instead of `filter`, to avoid shadowing the builtin.
        for fi, filter_name in enumerate(['filter1', 'filter2', 'filter3', 'filter4']):
            sk_samples = filter_samples[fi].split('\nEND SKETCH\n')

            for sk, sketch_size in enumerate([1, 2, 4, 8]):
                s = pd.Series(sk_samples[sk].split('\n'))
                # extractall returns the captured text, so the timings stay as
                # the strings written in the log.
                times = s.str.extractall(regex)[0].values

                frames.append(pd.DataFrame({
                    'trial': range(len(times)),
                    'nrows': nrows,
                    'nbins': nbins,
                    'filter': filter_name,
                    'sketch_size': sketch_size,
                    'time (ns)': times,
                }))

# No ignore_index: each group keeps its own 0..k-1 index, as displayed below.
df = pd.concat(frames)

df.to_csv('logs/filter_timing_multi.csv', index=False)
df

Unnamed: 0,trial,nrows,nbins,filter,sketch_size,time (ns)
0,0,1000,10,filter1,1,369800.00
1,1,1000,10,filter1,1,288700.00
2,2,1000,10,filter1,1,310100.00
3,3,1000,10,filter1,1,270400.00
4,4,1000,10,filter1,1,265900.00
...,...,...,...,...,...,...
0,0,10000000,1000000,filter4,8,1830700.00
1,1,10000000,1000000,filter4,8,1442800.00
2,2,10000000,1000000,filter4,8,1460600.00
3,3,10000000,1000000,filter4,8,1431400.00
