In [1]:
import re
import pandas as pd
import random
import os
import numpy as np

In [2]:
# Root directory that is walked further down; each of its subdirectories is
# read for `results_<dataset>_<train_split>.txt` files and receives a
# `results.csv` summary.
path = './data/results/train_sizes'

In [3]:
def calc_f1(p, r):
    """Return the F1 score (harmonic mean) of precision ``p`` and recall ``r``.

    Args:
        p: Precision value.
        r: Recall value.

    Returns:
        ``2 * p * r / (p + r)``, or ``0.0`` when ``p + r`` is zero.
    """
    denominator = p + r
    if denominator == 0:
        # e.g. precision and recall are both 0: report F1 as 0 instead of
        # letting the formula raise ZeroDivisionError.
        return 0.0
    return 2 * ((p * r) / denominator)

In [4]:
def calc_macro(p, r, f):
    """Macro-average three sequences of scores.

    Args:
        p: Sequence of precision values.
        r: Sequence of recall values.
        f: Sequence of F1 values.

    Returns:
        Tuple ``(precision, recall, f1)`` holding the arithmetic mean of each
        sequence, rounded to four decimal places.
    """
    averages = [sum(scores) / len(scores) for scores in (p, r, f)]
    macro_p, macro_r, macro_f1 = (round(avg, 4) for avg in averages)
    return macro_p, macro_r, macro_f1
    
def calc_micro(tp, fp, fn):
    """Micro-average precision, recall and F1 from pooled counts.

    Args:
        tp: Sequence of true-positive counts.
        fp: Sequence of false-positive counts.
        fn: Sequence of false-negative counts.

    Returns:
        Tuple ``(precision, recall, f1)`` rounded to four decimal places.
        A score whose denominator is zero is reported as ``0.0``.
    """
    total_tp = sum(tp)
    predicted = total_tp + sum(fp)   # everything that was predicted positive
    actual = total_tp + sum(fn)      # everything that is actually positive

    # Guard both ratios: with no predictions / no positives the plain
    # division would raise ZeroDivisionError.
    m_p = total_tp / predicted if predicted else 0.0
    m_r = total_tp / actual if actual else 0.0

    # Only delegate to calc_f1 when its denominator (p + r) is non-zero.
    m_f1 = calc_f1(m_p, m_r) if (m_p + m_r) else 0.0
    return round(m_p, 4), round(m_r, 4), round(m_f1, 4)

In [5]:
def get_results(path):
    """Parse one results file and return micro- and macro-averaged scores.

    Lines containing ``'micro'`` or ``'macro'`` are skipped; the averages are
    recomputed here instead. Every other line containing ``'tp'`` is split on
    ``':'`` and read as tp, fp, fn, precision, recall and F1, in that order.
    NOTE(review): this layout is inferred from the field indices used below --
    confirm against the tool that writes these files.

    Args:
        path: Path of the results text file.

    Returns:
        List ``[micro_p, micro_r, micro_f1, macro_p, macro_r, macro_f1]`` as
        produced by ``calc_micro`` and ``calc_macro``.
    """
    precisions, recalls, f1_scores = [], [], []
    tps, fps, fns = [], [], []

    def leading_number(field):
        # Fields 1-5 look like " <value> - <next label>"; keep the text before
        # the first "-" and convert it.
        return float(field.split("-")[0].strip())

    # The context manager closes the handle even if parsing raises.
    with open(path, "r") as results_file:
        for line in results_file:
            if 'micro' in line or 'macro' in line:
                # Summary lines are not used; averages are recomputed below.
                continue
            if 'tp' not in line:
                continue

            fields = line.split(":")

            # Scores used for the macro average.
            precisions.append(leading_number(fields[4]))
            recalls.append(leading_number(fields[5]))
            f1_scores.append(float(fields[6].strip()))

            # Counts used for the micro average.
            tps.append(leading_number(fields[1]))
            fps.append(leading_number(fields[2]))
            fns.append(leading_number(fields[3]))

    micro_p, micro_r, micro_f1 = calc_micro(tps, fps, fns)
    macro_p, macro_r, macro_f1 = calc_macro(precisions, recalls, f1_scores)

    return [micro_p, micro_r, micro_f1, macro_p, macro_r, macro_f1]

In [6]:
# Write one `results.csv` into every directory below `path`, with one row per
# (train size, dataset) combination.

# Loop-invariant configuration, built once instead of once per directory.
headers = ['Train size', 'Dataset', 'P-micro', 'R-micro', 'F1-micro', 'P-macro', 'R-macro', 'F1-macro']
datasets = list(range(1, 6))             # 1, 2, 3, 4, 5
train_splits = list(range(20, 100, 20))  # 20, 40, 60, 80

for subdir, dirs, files in os.walk(path):
    # Skip the root itself; compare against `path` rather than repeating the
    # literal so the check keeps working if `path` is changed.
    if subdir == path:
        continue

    print(subdir)
    temp = []
    for train_split in train_splits:
        for dataset in datasets:
            cur_file = 'results_' + str(dataset) + '_' + str(train_split) + '.txt'
            results = get_results(os.path.join(subdir, cur_file))
            temp.append([train_split, dataset] + results)

    df = pd.DataFrame(data=temp, columns=headers)
    df.to_csv(os.path.join(subdir, 'results.csv'), index=False)

./data/results/train_sizes/bert-elmo-flair
./data/results/train_sizes/bert-flair
./data/results/train_sizes/flair
./data/results/train_sizes/elmo
./data/results/train_sizes/elmo-flair
./data/results/train_sizes/bert-elmo
./data/results/train_sizes/bert
