# Rouge + Accuracy (Precision & Recall)

In [1]:
import evaluate

import pandas as pd

In [6]:
rouge = evaluate.load("rouge")

checkpoint_list = [step for step in range(0, 30860+6172, 6172)]

## Train

In [7]:
metrics_train = []
for checkpoint in checkpoint_list:
    print("checkpoint {}: ".format(checkpoint))

    df_train_comp = pd.read_csv("./Completions/{}_train.csv".format(checkpoint))

    df_train_comp["precision"] = None
    df_train_comp["recall"] = None

    for idx in range(len(df_train_comp)):
        comp_true = list(df_train_comp["comp_true"])[idx].replace("[", "").replace("]", "").replace("'", "").split(",")
        comp_true = [mat.strip() for mat in comp_true]
        comp_pred = list(df_train_comp["comp_pred"])[idx].replace("[", "").replace("]", "").replace("'", "").split(",")
        comp_pred = [mat.strip() for mat in comp_pred]
        
        count_shared = len(set(comp_true).intersection(comp_pred))
        df_train_comp["precision"][idx] = count_shared/len(comp_true)
        df_train_comp["recall"][idx]    = count_shared/len(comp_pred)
    
    # rouge
    rouge_results = rouge.compute(predictions=list(df_train_comp["comp_pred"]), 
                                  references=list(df_train_comp["comp_true"]))
    print(rouge_results)

    # accuracy
    accuracy = {"precision": df_train_comp["precision"].mean(),
                "recall": df_train_comp["recall"].mean()}
    print(accuracy)
    print("\n")
    
    df_train_comp.to_csv("./Completions/{}_train_metric.csv".format(checkpoint), index=False)

    metrics = dict(rouge_results)
    metrics["checkpoint"] = checkpoint
    metrics["precision"] = df_train_comp["precision"].mean()
    metrics["precall"] = df_train_comp["recall"].mean()
    metrics_train.append(metrics)

df_metrics_train = pd.DataFrame.from_records(metrics_train)
df_metrics_train.to_csv("./Completions/metrics_train.csv")


checkpoint 0: 
{'rouge1': 0.018881514770593598, 'rouge2': 0.0, 'rougeL': 0.018933011521027786, 'rougeLsum': 0.01892426778391159}
{'precision': 0.03, 'recall': 0.0016612903225806453}


checkpoint 6172: 
{'rouge1': 0.7616666666666666, 'rouge2': 0.006666666666666666, 'rougeL': 0.7581666666666667, 'rougeLsum': 0.7590000000000001}
{'precision': 0.5091666666666667, 'recall': 0.565}


checkpoint 12344: 
{'rouge1': 0.7667142857142857, 'rouge2': 0.014666666666666668, 'rougeL': 0.765452380952381, 'rougeLsum': 0.7635238095238095}
{'precision': 0.5208333333333334, 'recall': 0.555}


checkpoint 18516: 
{'rouge1': 0.85975, 'rouge2': 0.005, 'rougeL': 0.8583333333333333, 'rougeLsum': 0.8606666666666669}
{'precision': 0.7791666666666667, 'recall': 0.845}


checkpoint 24688: 
{'rouge1': 0.8213896103896103, 'rouge2': 0.005, 'rougeL': 0.8175129870129869, 'rougeLsum': 0.8194740259740261}
{'precision': 0.6575, 'recall': 0.685}


checkpoint 30860: 
{'rouge1': 0.7394458874458874, 'rouge2': 0.01666666666666666

## Val

In [8]:
metrics_val = []
for checkpoint in checkpoint_list:
    print("checkpoint {}: ".format(checkpoint))
    
    df_val_comp = pd.read_csv("./Completions/{}_val.csv".format(checkpoint))

    df_val_comp["precision"] = None
    df_val_comp["recall"] = None

    for idx in range(len(df_val_comp)):
        comp_true = list(df_val_comp["comp_true"])[idx].replace("[", "").replace("]", "").replace("'", "").split(",")
        comp_true = [mat.strip() for mat in comp_true]
        comp_pred = list(df_val_comp["comp_pred"])[idx].replace("[", "").replace("]", "").replace("'", "").split(",")
        comp_pred = [mat.strip() for mat in comp_pred]

        count_shared = len(set(comp_true).intersection(comp_pred))
        df_val_comp["precision"][idx] = count_shared/len(comp_true)
        df_val_comp["recall"][idx]    = count_shared/len(comp_pred)
    
    # rouge
    rouge_results = rouge.compute(predictions=list(df_val_comp["comp_pred"]), 
                                  references=list(df_val_comp["comp_true"]))
    print(rouge_results)

    # accuracy
    accuracy = {"precision": df_val_comp["precision"].mean(),
                "recall": df_val_comp["recall"].mean()}
    print(accuracy)
    print("\n")
    
    df_val_comp.to_csv("./Completions/{}_val_metric.csv".format(checkpoint), index=False)

    metrics = dict(rouge_results)
    metrics["checkpoint"] = checkpoint
    metrics["precision"] = df_val_comp["precision"].mean()
    metrics["recall"] = df_val_comp["recall"].mean()
    metrics_val.append(metrics)

df_metrics_val = pd.DataFrame.from_records(metrics_val)
df_metrics_val.to_csv("./Completions/metrics_val.csv")

checkpoint 0: 
{'rouge1': 0.02078449042210682, 'rouge2': 0.0, 'rougeL': 0.02023851337637743, 'rougeLsum': 0.020493459853571712}
{'precision': 0.06, 'recall': 0.0034780423280423274}


checkpoint 6172: 
{'rouge1': 0.7629696969696971, 'rouge2': 0.0, 'rougeL': 0.7619393939393937, 'rougeLsum': 0.7604696969696969}
{'precision': 0.545, 'recall': 0.6}


checkpoint 12344: 
{'rouge1': 0.7699999999999998, 'rouge2': 0.016666666666666666, 'rougeL': 0.7659999999999998, 'rougeLsum': 0.7645}
{'precision': 0.6091666666666667, 'recall': 0.635}


checkpoint 18516: 
{'rouge1': 0.8228888888888889, 'rouge2': 0.0, 'rougeL': 0.8162222222222223, 'rougeLsum': 0.8222777777777778}
{'precision': 0.7275, 'recall': 0.77}


checkpoint 24688: 
{'rouge1': 0.7157554112554112, 'rouge2': 0.01, 'rougeL': 0.714034632034632, 'rougeLsum': 0.7063571428571427}
{'precision': 0.54, 'recall': 0.54}


checkpoint 30860: 
{'rouge1': 0.7219754130845679, 'rouge2': 0.03, 'rougeL': 0.7206844399731722, 'rougeLsum': 0.7113551307847081}
{'p