In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os

In [2]:
def score(reduc_time, accuracy, train_time, red_w = 200, ac_w = 1, train_w = 20):
    return (accuracy ** ac_w)*(red_w/int(reduc_time + 1) + train_w/int(train_time + 1))

def analyse(path, reduction_weight, accuracy_weight, training_weight):
    df_each = []
    for filename in os.listdir(path):
        f = os.path.join(path, filename)
        if os.path.isfile(f) and ".DS" not in f:
            temp = pd.read_csv(f)
            temp = temp[temp.name.apply(lambda x: x != "name")]
            df_each.append((temp, filename))

    included_cols = ["name", "filename", "original_shape", "transformed_shape", "params", "reduction_time", "accuracy", "train_time", "score_series"]

    out_df = pd.DataFrame(columns=included_cols)
    collected_all_df = pd.DataFrame(columns=included_cols)

    for res in df_each:
        res_copy = res[0].copy()
        res_copy["filename"] = res[1]
        score_series = res_copy[res_copy.name != "Nothing"][["reduction_time", "train_time", "accuracy"]].apply(lambda x: score(x["reduction_time"], x["accuracy"], reduction_weight, accuracy_weight, training_weight), axis=1)
        res_copy_score = res_copy.copy()
        res_copy_score["score_series"] = score_series
        out_df = pd.concat([out_df, res_copy_score])
        max_ind = res_copy_score.groupby(by=["original_shape"])["score_series"].idxmax()
        collected = res_copy_score.iloc[max_ind][["name", "filename", "original_shape", "transformed_shape", "params", "reduction_time", "accuracy", "train_time", "score_series"]]
        collected_all_df = pd.concat([collected_all_df, collected])

    all = out_df.groupby(by="filename", group_keys=True).apply(lambda x: x[:])
    best = collected_all_df.groupby(["filename", "name"]).count().sort_values(by="original_shape").groupby(level=0).tail(1).sort_values(by="filename")

    return all, best

In [3]:
reduction_weight = 10
accuracy_weight = 10
training_weight = 10

In [4]:
path = "../output/kmeans/"
kmeans_all, kmeans_best = analyse(path, reduction_weight, accuracy_weight, training_weight)

In [5]:
kmeans_best

Unnamed: 0_level_0,Unnamed: 1_level_0,original_shape,transformed_shape,params,reduction_time,accuracy,train_time,score_series
filename,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"1000_0.33_(-100, 100)",extremely sparse JL transform,5,5,5,5,5,5,5
"1000_0.33_(-1000, 1000)",extremely sparse JL transform,5,5,5,5,5,5,5
"1000_0.33_(-500, 500)",extremely sparse JL transform,5,5,5,5,5,5,5
"1000_0.66_(-100, 100)",extremely sparse JL transform,6,6,6,6,6,6,6
"1000_0.66_(-1000, 1000)",extremely sparse JL transform,5,5,5,5,5,5,5
"1000_0.66_(-500, 500)",extremely sparse JL transform,4,4,4,4,4,4,4
"1000_0.99_(-100, 100)",extremely sparse JL transform,4,4,4,4,4,4,4
"1000_0.99_(-1000, 1000)",JL transform,5,5,5,5,5,5,5
"1000_0.99_(-500, 500)",extremely sparse JL transform,3,3,3,3,3,3,3
"1000_0_(-100, 100)",PCA,3,3,3,3,3,3,3


In [6]:
path = "../output/news/"
news_all, news_best = analyse(path, reduction_weight, accuracy_weight, training_weight)

In [12]:
kmeans_all

Unnamed: 0_level_0,Unnamed: 1_level_0,name,filename,original_shape,transformed_shape,params,reduction_time,accuracy,train_time,score_series,run_num,...,original_std_max,original_std_min,original_sparsity,transformed_std_sum,transformed_std_mean,transformed_std_median,transformed_std_max,transformed_std_min,transformed_sparsity,characteristics
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"1000_0.33_(-100, 100)",0,extremely sparse JL transform,"1000_0.33_(-100, 100)","(1000, 1000)","(1110, 2351)","{'ep': 0.05, 'de': 0.05}",0.016116,0.97,0.638783,8.715012e+00,0.0,...,918.64390,740.220146,0.3264,5.877160e+05,529.473862,534.137286,595.385337,29.568283,0.330328,"{'n': 1000, 'd': 1000, 'a': -100, 'b': 100, 'c..."
"1000_0.33_(-100, 100)",1,extremely sparse JL transform,"1000_0.33_(-100, 100)","(1000, 1000)","(1110, 2255)","{'ep': 0.05, 'de': 0.1}",0.036851,0.43,0.621037,2.554084e-03,0.0,...,918.64390,740.220146,0.3264,5.993802e+05,539.982187,543.664054,617.805612,29.742874,0.330776,"{'n': 1000, 'd': 1000, 'a': -100, 'b': 100, 'c..."
"1000_0.33_(-100, 100)",2,extremely sparse JL transform,"1000_0.33_(-100, 100)","(1000, 1000)","(1110, 1175)","{'ep': 0.1, 'de': 0.05}",0.022616,0.58,0.458997,5.091322e-02,0.0,...,918.64390,740.220146,0.3264,8.352706e+05,752.496038,758.337473,855.201144,41.661847,0.324000,"{'n': 1000, 'd': 1000, 'a': -100, 'b': 100, 'c..."
"1000_0.33_(-100, 100)",3,extremely sparse JL transform,"1000_0.33_(-100, 100)","(1000, 1000)","(1110, 1127)","{'ep': 0.1, 'de': 0.1}",0.012347,0.63,0.319385,1.164009e-01,0.0,...,918.64390,740.220146,0.3264,8.507589e+05,766.449455,772.980137,874.955679,41.452892,0.326974,"{'n': 1000, 'd': 1000, 'a': -100, 'b': 100, 'c..."
"1000_0.33_(-100, 100)",4,extremely sparse JL transform,"1000_0.33_(-100, 100)","(1000, 1000)","(1110, 235)","{'ep': 0.5, 'de': 0.05}",0.008589,0.16,0.259311,1.299423e-07,0.0,...,918.64390,740.220146,0.3264,1.854843e+06,1671.030011,1683.179978,2197.208318,86.038302,0.328511,"{'n': 1000, 'd': 1000, 'a': -100, 'b': 100, 'c..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"500_0_(-500, 500)",255,JL transform,"500_0_(-500, 500)","(20000, 20000)","(20110, 221)","{'ep': 0.5, 'de': 0.1}",2.269467,1.00,0.362224,5.151515e+00,0.0,...,588.87356,565.236553,0.0000,1.099559e+08,5467.724793,5465.043124,6446.234433,2614.413619,0.000000,"{'n': 20000, 'd': 20000, 'a': -500, 'b': 500, ..."
"500_0_(-500, 500)",256,JL transform,"500_0_(-500, 500)","(20000, 20000)","(20110, 88)","{'ep': 0.9, 'de': 0.05}",1.476547,0.97,0.227300,5.027892e+00,0.0,...,588.87356,565.236553,0.0000,1.741242e+08,8658.590071,8650.564557,11352.537164,3882.987261,0.000000,"{'n': 20000, 'd': 20000, 'a': -500, 'b': 500, ..."
"500_0_(-500, 500)",257,JL transform,"500_0_(-500, 500)","(20000, 20000)","(20110, 68)","{'ep': 0.9, 'de': 0.1}",1.098905,0.98,0.171278,5.570951e+00,0.0,...,588.87356,565.236553,0.0000,1.949242e+08,9692.900054,9671.847978,14208.475178,4015.480514,0.000000,"{'n': 20000, 'd': 20000, 'a': -500, 'b': 500, ..."
"500_0_(-500, 500)",258,PCA,"500_0_(-500, 500)","(20000, 20000)","(20110, 201)","{'n_components': 201, 'svd_solver': 'auto'}",37.293692,1.00,0.307707,2.081340e+00,0.0,...,588.87356,565.236553,0.0000,5.783543e+07,2875.953911,2875.549821,3030.694452,2697.310689,0.000000,"{'n': 20000, 'd': 20000, 'a': -500, 'b': 500, ..."


In [7]:
news_best

Unnamed: 0_level_0,Unnamed: 1_level_0,original_shape,transformed_shape,params,reduction_time,accuracy,train_time,score_series
filename,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,extremely sparse JL transform,1,1,1,1,1,1,1
3,extremely sparse JL transform,1,1,1,1,1,1,1
5,extremely sparse JL transform,1,1,1,1,1,1,1


In [8]:
path = "../output/lin_reg/"

In [9]:
reg_all, reg_best = analyse(path, reduction_weight, accuracy_weight, training_weight)

In [10]:
reg_all.loc["1000_0"]

Unnamed: 0,name,filename,original_shape,transformed_shape,params,reduction_time,accuracy,train_time,score_series,run_num,...,original_std_max,original_std_min,original_sparsity,transformed_std_sum,transformed_std_mean,transformed_std_median,transformed_std_max,transformed_std_min,transformed_sparsity,characteristics
0,extremely sparse JL transform,1000_0,"(800, 999)","(1000, 2322)","{'ep': 0.05, 'de': 0.05}",0.081309,0.991170,0.634847,1.081515e+01,0.0,...,153814.246660,53734.561934,0.0,6.185990e+07,6.185990e+04,6.009268e+04,8.898140e+04,33624.755765,0.0,"{'n': 1000, 'd': 1000, 'x_range': 10000, 'coef..."
1,extremely sparse JL transform,1000_0,"(800, 999)","(1000, 2226)","{'ep': 0.05, 'de': 0.1}",0.018250,0.973709,0.482785,9.054006e+00,0.0,...,153814.246660,53734.561934,0.0,5.224789e+07,5.224789e+04,5.228416e+04,6.815595e+04,33609.354230,0.0,"{'n': 1000, 'd': 1000, 'x_range': 10000, 'coef..."
2,extremely sparse JL transform,1000_0,"(800, 999)","(1000, 1161)","{'ep': 0.1, 'de': 0.05}",0.034796,0.766579,0.354266,8.281655e-01,0.0,...,153814.246660,53734.561934,0.0,1.395197e+08,1.395197e+05,1.343591e+05,2.323182e+05,50924.808713,0.0,"{'n': 1000, 'd': 1000, 'x_range': 10000, 'coef..."
3,extremely sparse JL transform,1000_0,"(800, 999)","(1000, 1113)","{'ep': 0.1, 'de': 0.1}",0.011325,0.181235,0.293531,4.518333e-07,0.0,...,153814.246660,53734.561934,0.0,8.852732e+07,8.852732e+04,8.607918e+04,1.330149e+05,48303.996614,0.0,"{'n': 1000, 'd': 1000, 'x_range': 10000, 'coef..."
4,extremely sparse JL transform,1000_0,"(800, 999)","(1000, 232)","{'ep': 0.5, 'de': 0.05}",0.007037,0.655796,0.025002,1.738758e-01,0.0,...,153814.246660,53734.561934,0.0,3.396663e+08,3.396663e+05,3.310628e+05,5.922629e+05,75558.109383,0.0,"{'n': 1000, 'd': 1000, 'x_range': 10000, 'coef..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,JL transform,1000_0,"(16000, 9999)","(20000, 221)","{'ep': 0.5, 'de': 0.1}",1.607363,0.877256,0.218929,1.840488e+00,0.0,...,224458.311993,99529.900779,0.0,2.134304e+10,1.067152e+06,1.067538e+06,1.530444e+06,616712.864739,0.0,"{'n': 20000, 'd': 10000, 'x_range': 10000, 'co..."
204,JL transform,1000_0,"(16000, 9999)","(20000, 88)","{'ep': 0.9, 'de': 0.05}",1.630216,0.736913,0.144630,3.219803e-01,0.0,...,224458.311993,99529.900779,0.0,3.482116e+10,1.741058e+06,1.734343e+06,2.665841e+06,968958.014219,0.0,"{'n': 20000, 'd': 10000, 'x_range': 10000, 'co..."
205,JL transform,1000_0,"(16000, 9999)","(20000, 68)","{'ep': 0.9, 'de': 0.1}",0.816336,0.717685,0.079260,4.284379e-01,0.0,...,224458.311993,99529.900779,0.0,3.831971e+10,1.915986e+06,1.909033e+06,3.023940e+06,981091.407398,0.0,"{'n': 20000, 'd': 10000, 'x_range': 10000, 'co..."
206,PCA,1000_0,"(16000, 9999)","(20000, 200)","{'n_components': 200, 'svd_solver': 'auto'}",36.636848,0.935635,0.316000,1.073712e+00,0.0,...,224458.311993,99529.900779,0.0,2.213328e+10,1.106664e+06,1.108816e+06,1.569929e+06,656056.444796,0.0,"{'n': 20000, 'd': 10000, 'x_range': 10000, 'co..."


In [11]:
reg_best

Unnamed: 0_level_0,Unnamed: 1_level_0,original_shape,transformed_shape,params,reduction_time,accuracy,train_time,score_series
filename,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1000_0,extremely sparse JL transform,4,4,4,4,4,4,4
1000_0.1,extremely sparse JL transform,5,5,5,5,5,5,5
1000_0.3,extremely sparse JL transform,4,4,4,4,4,4,4
1000_0.5,extremely sparse JL transform,5,5,5,5,5,5,5
1000_0.9,extremely sparse JL transform,5,5,5,5,5,5,5
100_0,extremely sparse JL transform,4,4,4,4,4,4,4
100_0.1,extremely sparse JL transform,4,4,4,4,4,4,4
100_0.3,extremely sparse JL transform,5,5,5,5,5,5,5
100_0.5,extremely sparse JL transform,5,5,5,5,5,5,5
100_0.9,extremely sparse JL transform,4,4,4,4,4,4,4
