In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Experiment selection. Exactly one `datasets` assignment is active; the
# commented-out lines are alternative selections kept for convenience.
#datasets = ['a9a', 'mushrooms', 'ijcnn1', 'cod-rna', 'covtype', 'w8a', 'protein', 'quantum', 'SUSY', 'alpha']
reg_type = 'l2'
#datasets = ['a9a', 'covtype', 'protein']
#datasets = ['SUSY', 'alpha', 'mnist8m']
#datasets = ['dna18m']
#datasets = ['a9a', 'covtype']
datasets = ['alpha', 'mnist8m']

# NIM mini-batch size depends on the regularizer: 100 for 'l2', 5000 otherwise.
nim_minibatch_size = 5000 if reg_type != 'l2' else 100

# Methods compared in the table. LBFGS is only included for 'l2', and SFO only
# when none of 'alpha', 'mnist8m', 'dna18m' is among the selected datasets.
methods = ['NIM', 'SAG', 'newton']
if reg_type == 'l2':
    methods.append('LBFGS')
    sfo_excluded_datasets = ('alpha', 'mnist8m', 'dna18m')
    if not any(name in datasets for name in sfo_excluded_datasets):
        methods.append('SFO')

def construct_fname(reg_type, dataset, method):
    """Return the relative path of the results file for one run.

    The path has the form
    ``<reg_type>/<dataset>/<reg_type>.<dataset>.<method>.<suffix>``.
    The suffix is method specific: NIM and SAG encode their mini-batch size,
    'newton' encodes ``exact=0``, and any other method uses plain ``dat``.
    """
    method_suffixes = {
        'NIM': 'minibatch_size=%d.dat' % (100 if reg_type == 'l2' else 5000),
        'SAG': 'minibatch_size=10.dat',
        'newton': 'exact=0.dat',
    }
    suffix = method_suffixes.get(method, 'dat')
    path_parts = (reg_type, dataset, reg_type, dataset, method, suffix)
    return '%s/%s/%s.%s.%s.%s' % path_parts

def get_nd(dataset):
    """Return the ``(n, d)`` pair recorded for a dataset name.

    Unknown dataset names yield ``(-1, -1)``.
    """
    known_sizes = {
        'a9a': (32561, 123),
        'mushrooms': (8124, 112),
        'ijcnn1': (49990, 22),
        'cod-rna': (59535, 8),
        'covtype': (581012, 54),
        'w8a': (49749, 300),
        'protein': (145751, 74),
        'quantum': (50000, 65),
        'SUSY': (5000000, 18),
        'alpha': (500000, 500),
        'mnist8m': (8100000, 784),
        'dna18m': (18000000, 800),
    }
    return known_sizes.get(dataset, (-1, -1))

# Accuracy levels (residual f - f_opt) at which elapsed time is reported.
grid = np.logspace(-10, -1, num=10)
print(grid)

# Prepare
results = {dataset: {} for dataset in datasets}
# Fill results
for dataset in datasets:
    # Load every method's table exactly once and keep only the two columns
    # that are used: column 1 (read as elapsed time) and the objective column
    # (index 2 for 4-column tables, index 3 otherwise).
    # ndmin=2 keeps a file with a single data row two-dimensional.
    traces = {}
    for method in methods:
        res_table = np.loadtxt(construct_fname(reg_type, dataset, method),
                               skiprows=1, ndmin=2)
        f_opt_idx = 2 if res_table.shape[1] == 4 else 3
        traces[method] = (res_table[:, 1], res_table[:, f_opt_idx])

    # Find optimal value: the smallest objective seen by any method.
    f_opt = min(np.min(f_vals) for _, f_vals in traces.values())

    # Make table
    for method in methods:
        elaps_t, f_vals = traces[method]
        residual_f = f_vals - f_opt
        # Clean a little bit: drop points that are numerically at the optimum.
        mask = residual_f > 1e-14
        residual_f = residual_f[mask]
        elaps_t = elaps_t[mask]

        if residual_f.size == 0:
            # Every recorded point is within 1e-14 of f_opt, i.e. below every
            # grid level. This matches the "grid > largest residual" rule
            # below, so the time is reported as 0 instead of crashing in
            # np.interp / residual_f[-1].
            results[dataset][method] = np.zeros_like(grid)
            continue

        # Sort by residual: np.interp requires increasing x-coordinates.
        idx = np.argsort(residual_f)
        residual_f = residual_f[idx]
        elaps_t = elaps_t[idx]
        # Interpolate elapsed time at each accuracy level.
        interp_t = np.interp(grid, residual_f, elaps_t)
        # Correct a little bit: np.interp clamps outside the data range.
        # Levels above the largest residual get time 0.
        mask = grid > residual_f[-1]
        interp_t[mask] = 0

        # Levels below the smallest residual were never reached: mark as NaN.
        mask = grid < residual_f[0]
        interp_t[mask] = np.nan

        # Save it
        results[dataset][method] = interp_t

        # Plot (debug)
#         plt.figure()
#         plt.semilogx(residual_f, elaps_t, linewidth=2)
#         plt.semilogx(grid, interp_t, linewidth=2, marker='o')
#         plt.grid()
#         plt.title('ds=%s, method=%s' % (dataset, method))

# Print table
# Header
def tmethod(method):
    """Return the display name of a method for the table header.

    Only 'newton' is renamed (to 'Newton'); every other name is returned
    unchanged.
    """
    return 'Newton' if method == 'newton' else method
# Column layout: one leading column for the accuracy level, then a group of
# len(methods) centred columns per dataset, with '||' between groups.
print(r'\begin{tabular}{%s}' % ('c||' + '||'.join(['c' * len(methods) for _ in datasets])))
# First header row: one multi-column cell per dataset showing its name and
# the (n, d) pair from get_nd. Every cell except the last carries a trailing
# '||' rule. Raw strings keep the LaTeX backslashes literal ('\m' and '\e'
# are invalid escape sequences in ordinary string literals).
print('& ' + ' & '.join([
    r'\multicolumn{%d}{c%s}{\emph{%s (n=%d, d=%d)}}' %
            (len(methods), ('||' if idx < len(datasets)-1 else ''), dataset, *get_nd(dataset))
    for idx, dataset in enumerate(datasets)]
), end=r' \\ \hline' + '\n')
# Second header row: the method names, repeated for every dataset.
print('Res & ' + ' & '.join([
    ' & '.join([tmethod(method) for method in methods])
    for _ in datasets
]), end=r' \\ \hline' + '\n')

# Body
def form_msg(t, method, best_method):
    """Format one table cell from a time value ``t``.

    NaN is rendered as '-'. Zero is rendered as '0'. Values below 60 are
    printed with an 's' suffix and two decimals (leading zeros stripped),
    values below 3600 are divided by 60 and printed with an 'm' suffix, and
    anything larger is divided by 3600 and printed with an 'h' suffix.
    When ``method`` equals ``best_method`` the text is wrapped in \\textbf{}.
    """
    if np.isnan(t):
        return '-'

    if t == 0:
        text = '0'
    elif t >= 3600:  # hours
        text = '%0.1fh' % (t/3600)
    elif t >= 60:  # minutes
        text = '%0.1fm' % (t/60)
    else:  # less than a minute
        text = ('%.2fs' % t).lstrip('0')

    return r'\textbf{%s}' % text if method == best_method else text
    
# One table row per accuracy level, from the largest grid value to the smallest.
for idx, accuracy in enumerate(reversed(grid)):
    # Position in the (ascending) grid that corresponds to this row.
    col = -(idx+1)

    # Pick the method with the smallest time for each dataset. Ties go to the
    # method listed first. NaN never compares smaller, so a dataset whose
    # entries are all NaN keeps None here and nothing is bolded (previously
    # the missing key raised KeyError below).
    best_method = dict()
    for dataset in datasets:
        best_result = np.inf
        best_method[dataset] = None
        for method in methods:
            if results[dataset][method][col] < best_result:
                best_result = results[dataset][method][col]
                best_method[dataset] = method

    add_str = ' & '.join([
        ' & '.join([form_msg(results[dataset][method][col], method, best_method[dataset])
                    for method in methods])
        for dataset in datasets
    ])

    # Round the exponent to the nearest integer: '%d' on a float truncates
    # toward zero, so a log10 result such as -9.999... would print as -9.
    exponent = int(np.rint(np.log10(accuracy)))
    print(r'$10^{%d}$ & %s' % (exponent, add_str), end=r' \\' + '\n')
# Raw string: '\e' is an invalid escape sequence in an ordinary literal.
print(r'\end{tabular}')