In [1]:
import sys
sys.path.insert(0, "../src")
import re
from pathlib import Path
from functools import partial
from collections import defaultdict

import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from sklearn.metrics import classification_report, accuracy_score

import constants
from gen.util import read_data, write_jsonl
from rte.aggregate import agg_predict_proba, agg_predict

# Init

In [2]:
def print_report(df, digits=4):
    """Print a per-class classification report followed by overall accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a "labels" column (gold labels) and a "predicted_label"
        column (model predictions).
    digits : int, default 4
        Number of decimal places used for the report and for the accuracy
        value.
    """
    gold = df["labels"]
    predicted = df["predicted_label"]

    # Only the two verdict classes are listed in the report; accuracy below is
    # still computed over every row of ``df``.
    print(classification_report(y_true=gold, y_pred=predicted, digits=digits, labels=["REFUTES", "SUPPORTS"]))
    # Round with ``digits`` (previously hard-coded to 4) so both printed
    # sections honour the caller's requested precision.
    print("Accuracy: ", round(accuracy_score(y_true=gold, y_pred=predicted), digits))

In [3]:
# Build one row per claim: how many evidence entries it has and its gold label.
# "max" picks the largest label id among a claim's rows before ids are mapped
# to label names; claims carrying the label stored under
# constants.LOOKUP["label"]["nei"] are then dropped.
sf_all = (
    pd.DataFrame(
        read_data(Path("/users/k21190024/study/fact-check-transfer-learning/scratch/dumps/bert-data-sent-evidence/scifact.all.test.jsonl"))
    )
    .groupby("claim_id")
    .agg({"evidence": "count", "labels": "max"})
    .assign(labels=lambda per_claim: per_claim["labels"].map(constants.ID2LABEL))
    .loc[lambda per_claim: per_claim["labels"] != constants.LOOKUP["label"]["nei"]]
)
# Distribution of evidence counts across the remaining claims.
sf_all["evidence"].value_counts()

1     355
2     200
3      79
4      35
5       8
6       5
8       4
7       3
9       2
11      2
Name: evidence, dtype: int64

In [4]:
# Split claims into buckets by evidence count.
# Note: ev3 is the open-ended bucket (three or more), not exactly three.
ev1 = sf_all.loc[sf_all["evidence"].eq(1)]
ev2 = sf_all.loc[sf_all["evidence"].eq(2)]
ev3 = sf_all.loc[sf_all["evidence"].ge(3)]

# Oracle IR

In [5]:
# Document-level predictions, indexed by claim so they can be joined with the
# evidence-count buckets.
best_doc = pd.DataFrame(
    read_data(Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/predictions/doc/scifact/fever-climatefeverpure-bert-base-uncased.all.jsonl"))
).set_index("claim_id")

# Sentence-level predictions; label names are converted to ids before the
# per-claim aggregation below.
best_sent = pd.DataFrame(
    read_data(Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/predictions/sent/scifact/fever-climatefever-xlnet-base-cased.all.jsonl"))
).assign(predicted_label=lambda sent: sent["predicted_label"].map(constants.LABEL2ID))

# Per-claim verdict from the predicted label ids
# (agg_predict is presumably a majority vote -- confirm in rte.aggregate).
best_maj = (
    best_sent.groupby("claim_id")
    .agg({"predicted_label": agg_predict})
    .assign(predicted_label=lambda claim: claim["predicted_label"].map(constants.ID2LABEL))
)

# Per-claim verdict from the predicted probabilities
# (agg_predict_proba presumably averages them -- confirm in rte.aggregate).
best_mean = (
    best_sent.groupby("claim_id")
    .agg({"predicted_proba": agg_predict_proba})
    .rename(columns={"predicted_proba": "predicted_label"})
    .assign(predicted_label=lambda claim: claim["predicted_label"].map(constants.ID2LABEL))
)

In [6]:
# Inner-join every prediction table with each evidence-count bucket
# (index-on-index join), keeping only claims present on both sides.
d_ev1, d_ev2, d_ev3 = (
    best_doc.join(bucket, how="inner") for bucket in (ev1, ev2, ev3)
)

smaj_ev1, smaj_ev2, smaj_ev3 = (
    best_maj.join(bucket, how="inner") for bucket in (ev1, ev2, ev3)
)

smean_ev1, smean_ev2, smean_ev3 = (
    best_mean.join(bucket, how="inner") for bucket in (ev1, ev2, ev3)
)

## Doc

In [7]:
# Oracle IR, document-level predictions: claims with an evidence count of 1.
print_report(d_ev1)

              precision    recall  f1-score   support

     REFUTES     0.6506    0.4821    0.5538       112
    SUPPORTS     0.7897    0.8807    0.8327       243

   micro avg     0.7571    0.7549    0.7560       355
   macro avg     0.7201    0.6814    0.6933       355
weighted avg     0.7458    0.7549    0.7447       355

Accuracy:  0.7549


In [8]:
# Oracle IR, document-level predictions: claims with an evidence count of 2.
print_report(d_ev2)

              precision    recall  f1-score   support

     REFUTES     0.7292    0.4795    0.5785        73
    SUPPORTS     0.7534    0.8661    0.8059       127

   micro avg     0.7474    0.7250    0.7360       200
   macro avg     0.7413    0.6728    0.6922       200
weighted avg     0.7446    0.7250    0.7229       200

Accuracy:  0.725


In [9]:
# Oracle IR, document-level predictions: claims with an evidence count of 3 or more.
print_report(d_ev3)

              precision    recall  f1-score   support

     REFUTES     0.7500    0.2885    0.4167        52
    SUPPORTS     0.7340    0.8023    0.7667        86

   micro avg     0.7368    0.6087    0.6667       138
   macro avg     0.7420    0.5454    0.5917       138
weighted avg     0.7401    0.6087    0.6348       138

Accuracy:  0.6087


## Sent - Majority

In [10]:
# Oracle IR, sentence-level predictions aggregated with agg_predict: evidence count of 1.
print_report(smaj_ev1)

              precision    recall  f1-score   support

     REFUTES     0.6400    0.4286    0.5134       112
    SUPPORTS     0.8294    0.7202    0.7709       243

   micro avg     0.7797    0.6282    0.6958       355
   macro avg     0.7347    0.5744    0.6421       355
weighted avg     0.7696    0.6282    0.6897       355

Accuracy:  0.6282


In [11]:
# Oracle IR, sentence-level predictions aggregated with agg_predict: evidence count of 2.
print_report(smaj_ev2)

              precision    recall  f1-score   support

     REFUTES     0.8182    0.4932    0.6154        73
    SUPPORTS     0.8302    0.6929    0.7554       127

   micro avg     0.8267    0.6200    0.7086       200
   macro avg     0.8242    0.5930    0.6854       200
weighted avg     0.8258    0.6200    0.7043       200

Accuracy:  0.62


In [12]:
# Oracle IR, sentence-level predictions aggregated with agg_predict: evidence count of 3 or more.
print_report(smaj_ev3)

              precision    recall  f1-score   support

     REFUTES     0.7692    0.3846    0.5128        52
    SUPPORTS     0.7553    0.8256    0.7889        86

   micro avg     0.7583    0.6594    0.7054       138
   macro avg     0.7623    0.6051    0.6509       138
weighted avg     0.7606    0.6594    0.6849       138

Accuracy:  0.6594


## Sent - Mean Probability

In [13]:
# Oracle IR, sentence-level predictions aggregated with agg_predict_proba: evidence count of 1.
print_report(smean_ev1)

              precision    recall  f1-score   support

     REFUTES     0.6400    0.4286    0.5134       112
    SUPPORTS     0.8294    0.7202    0.7709       243

   micro avg     0.7797    0.6282    0.6958       355
   macro avg     0.7347    0.5744    0.6421       355
weighted avg     0.7696    0.6282    0.6897       355

Accuracy:  0.6282


In [14]:
# Oracle IR, sentence-level predictions aggregated with agg_predict_proba: evidence count of 2.
print_report(smean_ev2)

              precision    recall  f1-score   support

     REFUTES     0.8205    0.4384    0.5714        73
    SUPPORTS     0.8198    0.7165    0.7647       127

   micro avg     0.8200    0.6150    0.7029       200
   macro avg     0.8202    0.5774    0.6681       200
weighted avg     0.8201    0.6150    0.6942       200

Accuracy:  0.615


In [15]:
# Oracle IR, sentence-level predictions aggregated with agg_predict_proba: evidence count of 3 or more.
print_report(smean_ev3)

              precision    recall  f1-score   support

     REFUTES     0.7500    0.2885    0.4167        52
    SUPPORTS     0.7846    0.5930    0.6755        86

   micro avg     0.7765    0.4783    0.5919       138
   macro avg     0.7673    0.4407    0.5461       138
weighted avg     0.7716    0.4783    0.5780       138

Accuracy:  0.4783


# Pipeline

In [16]:
# Pipeline setting: reload the prediction tables from the "scifactpipeline"
# dumps. This rebinds best_doc / best_sent / best_maj / best_mean, replacing
# whatever those names held before this cell ran.
best_doc = pd.DataFrame(
    read_data(Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/predictions/doc/scifactpipeline/fever-climatefeverpure-xlnet-base-cased.all.jsonl"))
).set_index("claim_id")

# Sentence-level predictions; label names are converted to ids before the
# per-claim aggregation below.
best_sent = pd.DataFrame(
    read_data(Path("/users/k21190024/study/fact-check-transfer-learning/scratch/thesis/predictions/sent/scifactpipeline/fever-climatefever-xlnet-base-cased.all.jsonl"))
).assign(predicted_label=lambda sent: sent["predicted_label"].map(constants.LABEL2ID))

# Per-claim verdict from the predicted label ids
# (agg_predict is presumably a majority vote -- confirm in rte.aggregate).
best_maj = (
    best_sent.groupby("claim_id")
    .agg({"predicted_label": agg_predict})
    .assign(predicted_label=lambda claim: claim["predicted_label"].map(constants.ID2LABEL))
)

# Per-claim verdict from the predicted probabilities
# (agg_predict_proba presumably averages them -- confirm in rte.aggregate).
best_mean = (
    best_sent.groupby("claim_id")
    .agg({"predicted_proba": agg_predict_proba})
    .rename(columns={"predicted_proba": "predicted_label"})
    .assign(predicted_label=lambda claim: claim["predicted_label"].map(constants.ID2LABEL))
)

In [17]:
# Inner-join every pipeline prediction table with each evidence-count bucket
# (index-on-index join). These names are rebound here, so the report cells
# that follow describe the pipeline predictions.
d_ev1, d_ev2, d_ev3 = (
    best_doc.join(bucket, how="inner") for bucket in (ev1, ev2, ev3)
)

smaj_ev1, smaj_ev2, smaj_ev3 = (
    best_maj.join(bucket, how="inner") for bucket in (ev1, ev2, ev3)
)

smean_ev1, smean_ev2, smean_ev3 = (
    best_mean.join(bucket, how="inner") for bucket in (ev1, ev2, ev3)
)

## Doc

In [18]:
# Pipeline, document-level predictions: claims with an evidence count of 1.
print_report(d_ev1)

              precision    recall  f1-score   support

     REFUTES     0.5581    0.2143    0.3097       112
    SUPPORTS     0.7789    0.6379    0.7014       243

   micro avg     0.7397    0.5042    0.5997       355
   macro avg     0.6685    0.4261    0.5055       355
weighted avg     0.7092    0.5042    0.5778       355

Accuracy:  0.5042


In [19]:
# Pipeline, document-level predictions: claims with an evidence count of 2.
print_report(d_ev2)

              precision    recall  f1-score   support

     REFUTES     0.7308    0.2603    0.3838        73
    SUPPORTS     0.7876    0.7008    0.7417       127

   micro avg     0.7770    0.5400    0.6372       200
   macro avg     0.7592    0.4805    0.5628       200
weighted avg     0.7669    0.5400    0.6111       200

Accuracy:  0.54


In [20]:
# Pipeline, document-level predictions: claims with an evidence count of 3 or more.
print_report(d_ev3)

              precision    recall  f1-score   support

     REFUTES     0.7059    0.2308    0.3478        52
    SUPPORTS     0.7634    0.8256    0.7933        86

   micro avg     0.7545    0.6014    0.6694       138
   macro avg     0.7347    0.5282    0.5706       138
weighted avg     0.7418    0.6014    0.6254       138

Accuracy:  0.6014


## Sent - Majority

In [21]:
# Pipeline, sentence-level predictions aggregated with agg_predict: evidence count of 1.
print_report(smaj_ev1)

              precision    recall  f1-score   support

     REFUTES     0.6136    0.4821    0.5400       112
    SUPPORTS     0.8194    0.7654    0.7915       243

   micro avg     0.7619    0.6761    0.7164       355
   macro avg     0.7165    0.6238    0.6657       355
weighted avg     0.7545    0.6761    0.7121       355

Accuracy:  0.6761


In [22]:
# Pipeline, sentence-level predictions aggregated with agg_predict: evidence count of 2.
print_report(smaj_ev2)

              precision    recall  f1-score   support

     REFUTES     0.6604    0.4795    0.5556        73
    SUPPORTS     0.7463    0.7874    0.7663       127

   micro avg     0.7219    0.6750    0.6977       200
   macro avg     0.7033    0.6334    0.6609       200
weighted avg     0.7149    0.6750    0.6894       200

Accuracy:  0.675


In [23]:
# Pipeline, sentence-level predictions aggregated with agg_predict: evidence count of 3 or more.
print_report(smaj_ev3)

              precision    recall  f1-score   support

     REFUTES     0.6923    0.3462    0.4615        52
    SUPPORTS     0.7449    0.8488    0.7935        86

   micro avg     0.7339    0.6594    0.6947       138
   macro avg     0.7186    0.5975    0.6275       138
weighted avg     0.7251    0.6594    0.6684       138

Accuracy:  0.6594


## Sent - Mean Probability

In [24]:
# Pipeline, sentence-level predictions aggregated with agg_predict_proba: evidence count of 1.
print_report(smean_ev1)

              precision    recall  f1-score   support

     REFUTES     0.5676    0.3750    0.4516       112
    SUPPORTS     0.8225    0.5720    0.6748       243

   micro avg     0.7449    0.5099    0.6054       355
   macro avg     0.6950    0.4735    0.5632       355
weighted avg     0.7421    0.5099    0.6044       355

Accuracy:  0.5099


In [25]:
# Pipeline, sentence-level predictions aggregated with agg_predict_proba: evidence count of 2.
print_report(smean_ev2)

              precision    recall  f1-score   support

     REFUTES     0.6316    0.3288    0.4324        73
    SUPPORTS     0.7727    0.6693    0.7173       127

   micro avg     0.7365    0.5450    0.6264       200
   macro avg     0.7022    0.4990    0.5749       200
weighted avg     0.7212    0.5450    0.6133       200

Accuracy:  0.545


In [26]:
# Pipeline, sentence-level predictions aggregated with agg_predict_proba: evidence count of 3 or more.
print_report(smean_ev3)

              precision    recall  f1-score   support

     REFUTES     0.7500    0.4038    0.5250        52
    SUPPORTS     0.7875    0.7326    0.7590        86

   micro avg     0.7778    0.6087    0.6829       138
   macro avg     0.7688    0.5682    0.6420       138
weighted avg     0.7734    0.6087    0.6708       138

Accuracy:  0.6087
