In [0]:
import pandas as pd
import numpy as np
import re

### Interpreting HANS results
This is a notebook in which we interpret the results obtained from testing BERT with the HANS. 

In [0]:
# defining a function to convert logit output to csv with labels
def build_label_csv(hans_output):
  predictions = hans_output['gold_label']
  label_predictions = []
  for prediction in predictions:
    # since output is in a string '[x, y]'
    index0 = float(re.split(',', prediction.replace('[', '').replace(']', '').replace(' ',''))[0])
    index1 = float(re.split(',', prediction.replace('[', '').replace(']', '').replace(' ',''))[1])
    if index0 > index1:
      label_predictions.append('entailment')
    else:
      label_predictions.append('non-entailment')
    
  return label_predictions

------
#### After training BERT on examples from HANS for one epoch
Must upload the ```hans_predictions_trained.csv``` file.

In [0]:
hans_output = pd.read_csv('hans_predictions_trained.csv')

In [0]:
hans_output.head(5)

Unnamed: 0,pairID,gold_label
0,ex19939,"[3.734972, -4.0110707]"
1,ex2510,"[-3.7118118, 3.902585]"
2,ex6977,"[3.688711, -4.031746]"
3,ex12480,"[-3.645817, 3.9012985]"
4,ex27201,"[3.7094097, -3.9501584]"


In [0]:
# getting label predictions
label_predictions = build_label_csv(hans_output)

In [0]:
df = pd.DataFrame() 
df['pair_id'] = hans_output['pairID']
df['gold_label'] = label_predictions
df.to_csv('hans_output_with_labels_e1.csv', index=False)

In [0]:
!python evaluate_heur_output.py hans_output_with_labels_e1.csv

Heuristic entailed results:
lexical_overlap: 1.0
subsequence: 1.0
constituent: 1.0

Heuristic non-entailed results:
lexical_overlap: 0.9996
subsequence: 1.0
constituent: 1.0

Subcase results:
ln_subject/object_swap: 1.0
ln_preposition: 1.0
ln_relative_clause: 0.998
ln_passive: 1.0
ln_conjunction: 1.0
le_relative_clause: 1.0
le_around_prepositional_phrase: 1.0
le_around_relative_clause: 1.0
le_conjunction: 1.0
le_passive: 1.0
sn_NP/S: 1.0
sn_PP_on_subject: 1.0
sn_relative_clause_on_subject: 1.0
sn_past_participle: 1.0
sn_NP/Z: 1.0
se_conjunction: 1.0
se_adjective: 1.0
se_understood_object: 1.0
se_relative_clause_on_obj: 1.0
se_PP_on_obj: 1.0
cn_embedded_under_if: 1.0
cn_after_if_clause: 1.0
cn_embedded_under_verb: 1.0
cn_disjunction: 1.0
cn_adverb: 1.0
ce_embedded_under_since: 1.0
ce_after_since_clause: 1.0
ce_embedded_under_verb: 1.0
ce_conjunction: 1.0
ce_adverb: 1.0

Template results:
temp1: 1.0
temp5: 1.0
temp7: 1.0
temp3: 1.0
temp2: 1.0
temp4: 1.0
temp6: 1.0
temp11: 1.0
temp9: 1.0


-----
#### After two epochs
Must upload the ```hans_predictions_trained_2.csv``` file.

In [0]:
hans_output_2 = pd.read_csv('/content/hans_predictions_trained_2.csv')

In [0]:
label_predictions_2 = build_label_csv(hans_output_2)

In [0]:
df2 = pd.DataFrame()
df2['pair_id'] = hans_output_2['pairID']
df2['gold_label'] = label_predictions_2
df.to_csv('hans_output_with_labels_e2.csv', index=False)

In [0]:
!python evaluate_heur_output.py hans_output_with_labels_e2.csv

Heuristic entailed results:
lexical_overlap: 1.0
subsequence: 1.0
constituent: 1.0

Heuristic non-entailed results:
lexical_overlap: 0.9996
subsequence: 1.0
constituent: 1.0

Subcase results:
ln_subject/object_swap: 1.0
ln_preposition: 1.0
ln_relative_clause: 0.998
ln_passive: 1.0
ln_conjunction: 1.0
le_relative_clause: 1.0
le_around_prepositional_phrase: 1.0
le_around_relative_clause: 1.0
le_conjunction: 1.0
le_passive: 1.0
sn_NP/S: 1.0
sn_PP_on_subject: 1.0
sn_relative_clause_on_subject: 1.0
sn_past_participle: 1.0
sn_NP/Z: 1.0
se_conjunction: 1.0
se_adjective: 1.0
se_understood_object: 1.0
se_relative_clause_on_obj: 1.0
se_PP_on_obj: 1.0
cn_embedded_under_if: 1.0
cn_after_if_clause: 1.0
cn_embedded_under_verb: 1.0
cn_disjunction: 1.0
cn_adverb: 1.0
ce_embedded_under_since: 1.0
ce_after_since_clause: 1.0
ce_embedded_under_verb: 1.0
ce_conjunction: 1.0
ce_adverb: 1.0

Template results:
temp1: 1.0
temp5: 1.0
temp7: 1.0
temp3: 1.0
temp2: 1.0
temp4: 1.0
temp6: 1.0
temp11: 1.0
temp9: 1.0


-----
After training on only 3,000 HANS examples

In [0]:
hans_output_3000 = pd.read_csv('/content/hans_predictions_trained_3000.csv')

In [0]:
label_predictions_3000 = build_label_csv(hans_output_3000)

In [0]:
df3 = pd.DataFrame() 
df3['pair_id'] = hans_output_3000['pairID']
df3['gold_label'] = label_predictions_3000
df3.to_csv('hans_output_with_labels_3000.csv', index=False)

In [12]:
!python evaluate_heur_output.py hans_output_with_labels_3000.csv

Heuristic entailed results:
lexical_overlap: 0.9528
subsequence: 0.9938
constituent: 0.9898

Heuristic non-entailed results:
lexical_overlap: 0.9752
subsequence: 0.9536
constituent: 0.9982

Subcase results:
ln_subject/object_swap: 0.978
ln_preposition: 0.984
ln_relative_clause: 0.965
ln_passive: 1.0
ln_conjunction: 0.949
le_relative_clause: 0.776
le_around_prepositional_phrase: 1.0
le_around_relative_clause: 0.997
le_conjunction: 0.992
le_passive: 0.999
sn_NP/S: 0.948
sn_PP_on_subject: 0.986
sn_relative_clause_on_subject: 0.999
sn_past_participle: 0.999
sn_NP/Z: 0.836
se_conjunction: 0.979
se_adjective: 1.0
se_understood_object: 1.0
se_relative_clause_on_obj: 0.99
se_PP_on_obj: 1.0
cn_embedded_under_if: 1.0
cn_after_if_clause: 1.0
cn_embedded_under_verb: 0.991
cn_disjunction: 1.0
cn_adverb: 1.0
ce_embedded_under_since: 0.95
ce_after_since_clause: 1.0
ce_embedded_under_verb: 0.999
ce_conjunction: 1.0
ce_adverb: 1.0

Template results:
temp1: 0.978
temp5: 0.9490445859872612
temp7: 1.0
tem