In [1]:
import evaluate

In [2]:
# Load the metric registered under the name "accuracy"; the first load
# fetches its builder script (see the download progress bar in the output).
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [4]:
# print() rather than a bare expression so the multi-line description is
# rendered with real line breaks instead of as an escaped string repr.
print(accuracy.description)


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative



In [5]:
# Show the BibTeX entry attached to the metric (printed so the multi-line
# entry keeps its formatting).
print(accuracy.citation)


@article{scikit-learn,
  title={Scikit-learn: Machine Learning in {P}ython},
  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal={Journal of Machine Learning Research},
  volume={12},
  pages={2825--2830},
  year={2011}
}



In [6]:
# Inspect the input schema the metric declares: the output lists the
# 'predictions' and 'references' fields and their dtypes.
print(accuracy.features)

{'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}


In [7]:
# One-shot scoring: pass all references and predictions in a single call.
# Only the last two positions agree (0 == 0 and 1 == 1), i.e. 2 of 4 correct.
accuracy.compute(references=[0,1,0,1], predictions=[1,0,0,1])

{'accuracy': 0.5}

In [8]:
# Incremental scoring, one example at a time: each tuple is a
# (reference, prediction) pair handed to add(); the pairs are scored later
# by an argument-less compute() call.
for ref, pred in ((0, 1), (1, 0), (0, 0), (1, 1)):
    accuracy.add(references=ref, predictions=pred)

In [9]:
# No arguments: score the pairs that were handed to the metric via add()
# in the previous cell.
accuracy.compute()

{'accuracy': 0.5}

In [12]:
# Incremental scoring, one mini-batch at a time: each tuple holds a list of
# references and the matching list of predictions passed to add_batch().
for refs, preds in (([0, 1], [1, 0]), ([0, 1], [0, 1])):
    accuracy.add_batch(references=refs, predictions=preds)

In [13]:
# No arguments: score the batches that were handed to the metric via
# add_batch() in the previous cell.
accuracy.compute()

{'accuracy': 0.5}

In [14]:
# Bundle four classification metrics into one object so that a single
# compute() call reports all of them together.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [15]:
# Score three examples with every bundled metric at once. Two of the three
# predictions match their reference; the one miss predicts 0 where the
# reference is 1.
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

# SQuAD v2 metric

In [17]:
# Load the metric registered as "squad_v2"; the first load downloads its
# builder script plus extra modules (see the progress bars in the output).
squad_metric = evaluate.load("squad_v2")

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [19]:
# Minimal example: one question whose predicted text is identical to its
# only reference answer. The prediction and the reference share the same id.
predictions = [
    dict(
        prediction_text='1976',
        id='56e10a3be3433e1400422b22',
        no_answer_probability=0.,
    ),
]
references = [
    dict(
        answers=dict(answer_start=[97], text=['1976']),
        id='56e10a3be3433e1400422b22',
    ),
]

In [20]:
# Score the single-example prediction/reference lists built in the previous
# cell and keep the returned dictionary for inspection.
results = squad_metric.compute(predictions=predictions, references=references)

In [21]:
# Bare expression: display the score dictionary computed in the previous cell.
results

{'exact': 100.0,
 'f1': 100.0,
 'total': 1,
 'HasAns_exact': 100.0,
 'HasAns_f1': 100.0,
 'HasAns_total': 1,
 'best_exact': 100.0,
 'best_exact_thresh': 0.0,
 'best_f1': 100.0,
 'best_f1_thresh': 0.0}

In [33]:
# Five-example batch mixing answerable and unanswerable questions.
# Each prediction is described by an (id, predicted text) pair; every
# prediction uses no_answer_probability 0.
predictions = [
    {'prediction_text': text, 'id': qid, 'no_answer_probability': 0.}
    for qid, text in (
        ('56e10a3be3433e1400422b22', ''),
        ('56d2051ce7d4791d0090260b', 'Beyonce'),
        ('5733b5344776f419006610e1', 'climate change in world'),
        ('5733b5344776f419006610e2', 'jakarta'),
        ('5733b5344776f419006610e3', 'bandung'),
    )
]
# Each reference is described by (id, answer_start list, answer text list).
# The last two entries have empty answer lists, i.e. no gold answer.
# NOTE: the ids appear in a different order than in `predictions`.
references = [
    {'answers': {'answer_start': starts, 'text': texts}, 'id': qid}
    for qid, starts, texts in (
        ('5733b5344776f419006610e1', [891], ['climate change in other world']),
        ('5733b5344776f419006610e2', [891], ['jakarta']),
        ('5733b5344776f419006610e3', [891], ['bandung']),
        ('56e10a3be3433e1400422b22', [], []),
        ('56d2051ce7d4791d0090260b', [], []),
    )
]

In [34]:
# Score the five-example batch defined in the previous cell, then display the
# returned dictionary (the trailing bare `results` is what renders the output).
results = squad_metric.compute(predictions=predictions, references=references)
results

{'exact': 60.0,
 'f1': 77.77777777777777,
 'total': 5,
 'HasAns_exact': 66.66666666666667,
 'HasAns_f1': 96.29629629629629,
 'HasAns_total': 3,
 'NoAns_exact': 50.0,
 'NoAns_f1': 50.0,
 'NoAns_total': 2,
 'best_exact': 60.0,
 'best_exact_thresh': 0.0,
 'best_f1': 77.77777777777777,
 'best_f1_thresh': 0.0}