In [258]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

In [259]:
# Column names used when building the per-run statistics frames.
SITE = "website"
MEAN = "mean"
STD = "standard deviation"
# NOTE(review): VARIANCE and MAD are not referenced by any code visible in
# this notebook — confirm whether they are still needed.
VARIANCE = "variance"
MAD = "median absolute deviation"

# Column that keeps the raw measurements recorded for each site.
TIMES = "times"

In [260]:
def readResults(path):
    """Load a JSON results file and return the decoded object.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's top-level value.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to
    # the platform locale, which can mis-decode non-ASCII keys.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

## Collect results in DataFrames

In [263]:
# Load the raw results of every run into dictionaries.
# NOTE(review): the trailing _0/_1 in the fastor file names is not uniform
# across runs — confirm these are the intended files.

vanilla_r = readResults('vanilla.results')

# Runs scored by mean
mean_0_r, mean_05_r, mean_1_r = map(readResults, (
    'score_mean/fastor_PA0.0_POOL5_0.results',
    'score_mean/fastor_PA0.5_POOL5_0.results',
    'score_mean/fastor_PA1_POOL5_1.results',
))

# Runs scored by variance
var_0_r, var_05_r, var_1_r = map(readResults, (
    'score_variance/fastor_PA0.0_POOL5_0.results',
    'score_variance/fastor_PA0.5_POOL5_0.results',
    'score_variance/fastor_PA1_POOL5_1.results',
))

# Runs scored by MAD
mad_0_r, mad_05_r, mad_1_r = map(readResults, (
    'score_mad/fastor_PA0.0_POOL5_0.results',
    'score_mad/fastor_PA0.5_POOL5_1.results',
    'score_mad/fastor_PA1_POOL5_0.results',
))

# Same order as the frame lists built later: vanilla, mean, variance, MAD.
all_results = [
    vanilla_r,
    mean_0_r, mean_05_r, mean_1_r,
    var_0_r, var_05_r, var_1_r,
    mad_0_r, mad_05_r, mad_1_r,
]

In [264]:
# Calculate statistics in dataframes

def getStats(result_dict):
    """Build a per-site summary frame from one run's results.

    Parameters
    ----------
    result_dict : dict
        Maps each site to the sequence of measurements recorded for it
        (assumed numeric — np.mean / np.std are applied to each sequence).

    Returns
    -------
    pandas.DataFrame
        One row per key of ``result_dict``, in the dict's own order, with
        columns SITE, TIMES (the raw sequence), 'reps' (its length),
        MEAN and STD. The raw TIMES column is kept in the result.
    """
    # Materialise the keys: a list is accepted by every pandas version,
    # whereas assigning a dict_keys view into an empty frame is not.
    df = pd.DataFrame({SITE: list(result_dict)})
    df[TIMES] = df[SITE].map(result_dict)
    df['reps'] = df[TIMES].apply(len)
    df[MEAN] = df[TIMES].apply(np.mean)
    # np.std defaults to ddof=0, i.e. the population standard deviation.
    df[STD] = df[TIMES].apply(np.std)
    return df

# Vanilla run
vanilla_df = getStats(vanilla_r)

# Fastor, mean score: one frame per PA value (0, 0.5, 1)
fastor_mean_0_df, fastor_mean_05_df, fastor_mean_1_df = map(
    getStats, (mean_0_r, mean_05_r, mean_1_r)
)

# Fastor, variance score
fastor_var_0_df, fastor_var_05_df, fastor_var_1_df = map(
    getStats, (var_0_r, var_05_r, var_1_r)
)

# Fastor, MAD score
fastor_mad_0_df, fastor_mad_05_df, fastor_mad_1_df = map(
    getStats, (mad_0_r, mad_05_r, mad_1_r)
)

In [265]:
# Visualize number of repetitions for each website (to figure out minimum filter)
def plotReps(df, t):
    """Scatter-plot the 'reps' column of `df` against row position.

    `t` is used as the figure title. The figure is shown immediately and
    nothing is returned.
    """
    row_positions = range(len(df))
    plt.scatter(row_positions, df['reps'])
    plt.title(t)
    plt.show()

# plotReps(vanilla_df, 'vanilla')

# plotReps(fastor_mean_0_df, 'mean_0')
# plotReps(fastor_mean_05_df, 'mean_05')
# plotReps(fastor_mean_1_df, 'mean_1')

# plotReps(fastor_var_0_df, 'var_0')
# plotReps(fastor_var_05_df, 'var_05')
# plotReps(fastor_var_1_df, 'var_1')

# plotReps(fastor_mad_0_df, 'mad_0')
# plotReps(fastor_mad_05_df, 'mad_05')
# plotReps(fastor_mad_1_df, 'mad_1')

In [266]:
# Repetition thresholds, one per run (chosen by eye from the plotReps scatter plots)

vanilla_reps_lim = 25

fastor_mean_0_lim = 30
fastor_mean_05_lim = 30
fastor_mean_1_lim = 20

fastor_var_0_lim = 25
fastor_var_05_lim = 30
fastor_var_1_lim = 15

fastor_mad_0_lim = 30
fastor_mad_05_lim = 30
fastor_mad_1_lim = 30


def _moreRepsThan(frame, limit):
    """Return the rows of `frame` whose 'reps' value is strictly greater than `limit`."""
    enough = frame['reps'] > limit
    return frame[enough]


# Keep only sites with more repetitions than the run's threshold
# vanilla
vanilla_df = _moreRepsThan(vanilla_df, vanilla_reps_lim)

# fastor mean
fastor_mean_0_df = _moreRepsThan(fastor_mean_0_df, fastor_mean_0_lim)
fastor_mean_05_df = _moreRepsThan(fastor_mean_05_df, fastor_mean_05_lim)
fastor_mean_1_df = _moreRepsThan(fastor_mean_1_df, fastor_mean_1_lim)

# fastor variance
fastor_var_0_df = _moreRepsThan(fastor_var_0_df, fastor_var_0_lim)
fastor_var_05_df = _moreRepsThan(fastor_var_05_df, fastor_var_05_lim)
fastor_var_1_df = _moreRepsThan(fastor_var_1_df, fastor_var_1_lim)

# fastor mad
fastor_mad_0_df = _moreRepsThan(fastor_mad_0_df, fastor_mad_0_lim)
fastor_mad_05_df = _moreRepsThan(fastor_mad_05_df, fastor_mad_05_lim)
fastor_mad_1_df = _moreRepsThan(fastor_mad_1_df, fastor_mad_1_lim)


In [267]:
# Leave only common websites in each dataframe

# Every per-run frame, in the order vanilla, mean, variance, MAD.
all_dfs = [
    vanilla_df,
    fastor_mean_0_df,
    fastor_mean_05_df,
    fastor_mean_1_df,
    fastor_var_0_df,
    fastor_var_05_df,
    fastor_var_1_df,
    fastor_mad_0_df,
    fastor_mad_05_df,
    fastor_mad_1_df,
]


# Get all common sites
def siteInAllDFs(site, list_dfs):
    """Tell whether `site` occurs in the SITE column of every frame.

    Parameters
    ----------
    site : object
        Value to look for.
    list_dfs : iterable of pandas.DataFrame
        Frames to check; each must have a SITE column.

    Returns
    -------
    bool
        True if every frame contains `site` (also True for an empty
        `list_dfs`), False as soon as one frame lacks it.
    """
    # Compare the column directly instead of building a unique() array for
    # every lookup; all() stops at the first frame that misses the site.
    return all(df[SITE].eq(site).any() for df in list_dfs)

def getCommonSites(results=None, dfs=None):
    """Return the sites that are present in every statistics frame.

    Parameters
    ----------
    results : iterable of dict, optional
        Result dictionaries whose keys form the candidate sites. Defaults
        to the notebook-level ``all_results``.
    dfs : iterable of pandas.DataFrame, optional
        Frames with a SITE column that a site must appear in. Defaults to
        the notebook-level ``all_dfs``.

    Returns
    -------
    list
        Candidate sites found in all frames, sorted so the result does not
        depend on set iteration order.
    """
    if results is None:
        results = all_results
    if dfs is None:
        dfs = all_dfs

    # Start from every site seen in any run, then narrow frame by frame.
    # One set intersection per frame replaces a full column scan per site.
    common_sites = set().union(*results)
    for frame in dfs:
        common_sites.intersection_update(frame[SITE])
    return sorted(common_sites)

intersection_sites = getCommonSites()
# Short pause before printing — presumably so the summary line does not
# interleave with progress output.
time.sleep(0.2)
print(f"Found {len(intersection_sites)} common sites!\n")


def _onlyCommonSites(frame):
    """Return an independent copy of `frame` restricted to `intersection_sites`."""
    is_common = frame[SITE].isin(intersection_sites)
    return frame[is_common].copy()


# Restrict every frame to the common sites

# vanilla
vanilla_df = _onlyCommonSites(vanilla_df)

# fastor mean
fastor_mean_0_df = _onlyCommonSites(fastor_mean_0_df)
fastor_mean_05_df = _onlyCommonSites(fastor_mean_05_df)
fastor_mean_1_df = _onlyCommonSites(fastor_mean_1_df)

# fastor variance
fastor_var_0_df = _onlyCommonSites(fastor_var_0_df)
fastor_var_05_df = _onlyCommonSites(fastor_var_05_df)
fastor_var_1_df = _onlyCommonSites(fastor_var_1_df)

# fastor mad
fastor_mad_0_df = _onlyCommonSites(fastor_mad_0_df)
fastor_mad_05_df = _onlyCommonSites(fastor_mad_05_df)
fastor_mad_1_df = _onlyCommonSites(fastor_mad_1_df)


100%|██████████| 487/487 [00:00<00:00, 890.60it/s]


Found 460 common sites!



In [268]:
# Save to csvs
# vanilla_df.to_csv('vanilla_results.csv', index=False)

# fastor_mean_0_df.to_csv('fastor_mean_0_results.csv', index=False)
# fastor_mean_05_df.to_csv('fastor_mean_05_results.csv', index=False)
# fastor_mean_1_df.to_csv('fastor_mean_1_results.csv', index=False)

# fastor_var_0_df.to_csv('fastor_var_0_results.csv', index=False)
# fastor_var_05_df.to_csv('fastor_var_05_results.csv', index=False)
# fastor_var_1_df.to_csv('fastor_var_1_results.csv', index=False)

# fastor_mad_0_df.to_csv('fastor_mad_0_results.csv', index=False)
# fastor_mad_05_df.to_csv('fastor_mad_05_results.csv', index=False)
# fastor_mad_1_df.to_csv('fastor_mad_1_results.csv', index=False)


## Combine all dfs into one

In [269]:
# Combine to 1 csv

def _tagRun(frame, scheme, score, pa, pool_size):
    """Add the run-configuration columns to `frame` in place and return it.

    Each value is a scalar that pandas broadcasts to every row, so no
    per-row list has to be built.
    """
    frame["scheme"] = scheme
    frame["score"] = score
    frame['PA'] = pa
    frame['Pool Size'] = pool_size
    return frame


# One entry per run: (frame, scheme, score, PA, pool size).
# The vanilla run is stored with '-' and -1 — presumably placeholders for
# settings that do not apply to it.
_RUN_LABELS = [
    (vanilla_df, 'vanilla', '-', -1, -1),
    # fastor mean
    (fastor_mean_0_df, 'fastor', 'mean', 0, 5),
    (fastor_mean_05_df, 'fastor', 'mean', 0.5, 5),
    (fastor_mean_1_df, 'fastor', 'mean', 1, 5),
    # fastor variance
    (fastor_var_0_df, 'fastor', 'variance', 0, 5),
    (fastor_var_05_df, 'fastor', 'variance', 0.5, 5),
    (fastor_var_1_df, 'fastor', 'variance', 1, 5),
    # fastor mad
    (fastor_mad_0_df, 'fastor', 'mad', 0, 5),
    (fastor_mad_05_df, 'fastor', 'mad', 0.5, 5),
    (fastor_mad_1_df, 'fastor', 'mad', 1, 5),
]

for _run in _RUN_LABELS:
    _tagRun(*_run)

# Rebuild the list so it refers to the labelled frames, in the same order.
all_dfs = [_run[0] for _run in _RUN_LABELS]

In [270]:
# Stack every run into one long table ordered by site.
# - mergesort is stable, so rows of the same site keep the order of all_dfs.
# - drop=True discards the pre-sort row positions; without it they are kept
#   as an extra 'index' column that would also be written to the CSV export.
df = (
    pd.concat(all_dfs, ignore_index=True)
    .sort_values(SITE, kind='mergesort')
    .reset_index(drop=True)
)

In [271]:
# Export the combined table; the row index is not written.
df.to_csv(
    'evaluation.csv',
    index=False,
    quotechar='"',
)