In [8]:
import pandas as pd
from glob import glob
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [19]:
# Collect every CSV in the working directory. glob() yields paths in an
# arbitrary, filesystem-dependent order, so sort them to make the row order
# (and therefore the index of the table built from them) reproducible.
files = sorted(glob('*.csv'))

def get_ci(a, confidence=0.95):
    """Return the two-sided Student-t confidence interval for the mean of ``a``.

    Parameters
    ----------
    a : 1-D array-like of numbers
        Sample values (list, NumPy array or pandas Series). NaN entries are
        dropped before any statistic is computed, so the interval is based on
        the same observations that a NaN-skipping mean (e.g. pandas
        ``Series.mean()``) uses.
    confidence : float, default 0.95
        Confidence level of the interval, as a fraction in (0, 1).

    Returns
    -------
    tuple of float
        ``(lower, upper)`` bounds of the interval. ``(nan, nan)`` is returned
        when fewer than two non-NaN observations are available, because the
        standard error is undefined in that case.
    """
    values = np.asarray(a, dtype=float)
    # Use the effective sample size: NaNs must not count towards n, and they
    # would otherwise turn the standard error (and the whole interval) into NaN.
    values = values[~np.isnan(values)]
    n = values.size
    if n < 2:
        # sem() needs n - 1 > 0 degrees of freedom; avoid divide-by-zero warnings.
        return np.nan, np.nan
    m, se = values.mean(), stats.sem(values)
    # Half-width of the interval: t critical value times the standard error.
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m - h, m + h
    

# Build one summary record per CSV file. The experimental condition is read
# from substrings of the file name; each metric is reported as
# "mean (CI lower, CI upper)" formatted to two decimals.
model_performance = []
for csv_name in files:
    # Decode the condition flags from the file name.
    model = 'Substitution' if 'sub' in csv_name else 'Tree'
    resolution = 'Full tree' if 'fulltree' in csv_name else 'Interserotype'
    normalization = 'No' if 'nonorm' in csv_name else 'Yes'

    performance = pd.read_csv(csv_name)

    record = {
        'Model': model,
        'Resolution': resolution,
        r'$v_a$ and $p_b$': normalization,
    }
    # Same summary recipe for both metrics: (table header, CSV column).
    for header, column in (('RMSE', 'rms_error'), (r'Pearson R^2', 'r_squared')):
        scores = performance[column]
        score_mean = scores.mean()
        ci_low, ci_high = get_ci(scores)
        record[header] = '%.2f (%.2f, %.2f)'%(score_mean, ci_low, ci_high)

    model_performance.append(record)

In [20]:
# Turn the list of per-file records into a table: fix the column order, sort
# the rows by the three condition columns, and render the result as LaTeX.
summary_columns = ['Model', 'Resolution', r'$v_a$ and $p_b$', 'RMSE', r'Pearson R^2']
condition_columns = summary_columns[:3]

model_performance = (
    pd.DataFrame(model_performance)
    .reindex(columns=summary_columns)
    .sort_values(condition_columns)
    .round(2)  # kept from the original pipeline
)
# NOTE(review): whether to_latex escapes '$', '_' and '^' by default differs
# between pandas versions - confirm the headers render as intended.
table = model_performance.to_latex()



In [21]:
# Save the LaTeX table. The context manager flushes and closes the file handle
# as soon as the write finishes instead of leaving that to garbage collection.
with open('./model_performance_summary.txt', 'w') as summary_file:
    summary_file.write(table)