In [102]:
import numpy as np
import pandas as pd
import re
import sklearn.metrics as sm

In [146]:
def get_acc(df):
    return sm.accuracy_score(df.target, df.pred)

def gen_length(df):
    df['s1_length'] = df.sentence1.apply(lambda x: len(re.findall("[a-z]+", x)))
    df['s2_length'] = df.sentence2.apply(lambda x: len(re.findall("[a-z]+", x)))
    return df
    
def load_pred(f, ds = 'rte'):
    df = pd.read_csv(f)
    ds = gen_length(pd.read_csv('RTE/dev.tsv', sep = '\t'))
    print(df.describe())
    return pd.merge(df, ds, left_on = 'idx', right_on = 'index')

def length_s(df, pos = 1, positive_only = True):
    if pos == 1:
        len_field = 's1_length'
    else:
        len_field = 's2_length'
    
    if positive_only:
        return df[df.pred == df.target][len_field].mean()
    else:
        return df[len_field].mean()
    
def get_tf_tn_rates(df):
    cm = sm.confusion_matrix(df.target, df.pred)
    tn = cm[0][0]
    fn = cm[1][0]
    tp = cm[1][1]
    fp = cm[0][1]
    print("True positives: ", tp)
    print("Recall: ", tp / (tp + fn))
    print("True negatives: ", tn)


In [152]:
s = pd.read_csv('error_analysis_rte.csv')
sum(s.pred == s.target) / len(s)

0.628158844765343

In [153]:
s = pd.read_csv('error_analysis_AP_RTE.csv')
sum(s.pred == s.target) / len(s)

0.5812274368231047

In [136]:
msr = load_pred('error_analysis_rte.csv')
get_acc(msr)
#67.5% responses are predicted to be positive

             pred      target        idx
count  277.000000  277.000000  277.00000
mean     0.675090    0.527076  138.00000
std      0.469189    0.500170   80.10722
min      0.000000    0.000000    0.00000
25%      0.000000    0.000000   69.00000
50%      1.000000    1.000000  138.00000
75%      1.000000    1.000000  207.00000
max      1.000000    1.000000  276.00000


0.628158844765343

In [137]:
msr_ap = load_pred('error_analysis_AP_RTE.csv')
get_acc(msr_ap)
#59.2% responses are predicted to be positive

             pred      target        idx
count  277.000000  277.000000  277.00000
mean     0.592058    0.527076  138.00000
std      0.492342    0.500170   80.10722
min      0.000000    0.000000    0.00000
25%      0.000000    0.000000   69.00000
50%      1.000000    1.000000  138.00000
75%      1.000000    1.000000  207.00000
max      1.000000    1.000000  276.00000


0.5812274368231047

In [80]:
get_tf_tn_rates(msr)

True positives:  115
Recall:  0.7876712328767124
True negatives:  59


In [81]:
get_tf_tn_rates(msr_ap)

True positives:  97
Recall:  0.6643835616438356
True negatives:  64


In [82]:
64/sum(msr.target == 0)

0.48854961832061067

In [83]:
59/sum(msr.target == 0)

0.45038167938931295

In [143]:
length_s(msr, 1), length_s(msr, 2)

(41.60919540229885, 8.189655172413794)

In [144]:
length_s(msr_ap, 1), length_s(msr_ap, 2)

(41.577639751552795, 8.248447204968944)

In [148]:
length_s(msr, 1, False), length_s(msr, 2, False)

(41.56678700361011, 8.155234657039712)

In [84]:
msr = load_pred('error_analysis_msr.csv')
get_acc(msr)
#67.5% responses are predicted to be positive

              pred       target           idx
count  1725.000000  1725.000000  1.725000e+03
mean      0.691014     0.664928  1.659233e+06
std       0.462209     0.472153  9.922661e+05
min       0.000000     0.000000  2.670000e+02
25%       0.000000     0.000000  7.802010e+05
50%       1.000000     1.000000  1.629440e+06
75%       1.000000     1.000000  2.500731e+06
max       1.000000     1.000000  3.454292e+06


0.8057971014492754

In [85]:
msr_ap = load_pred('error_analysis_AP_MSR.csv')
get_acc(msr_ap)
#59.2% responses are predicted to be positive

              pred       target           idx
count  1725.000000  1725.000000  1.725000e+03
mean      0.707246     0.664928  1.659233e+06
std       0.455158     0.472153  9.922661e+05
min       0.000000     0.000000  2.670000e+02
25%       0.000000     0.000000  7.802010e+05
50%       1.000000     1.000000  1.629440e+06
75%       1.000000     1.000000  2.500731e+06
max       1.000000     1.000000  3.454292e+06


0.7953623188405797

In [86]:
get_tf_tn_rates(msr)

True positives:  1002
Recall:  0.8735832606800349
True negatives:  388


In [87]:
get_tf_tn_rates(msr_ap)

True positives:  1007
Recall:  0.8779424585876199
True negatives:  365


In [90]:
388/sum(msr.target == 0)

0.671280276816609

In [91]:
365/sum(msr.target == 0)

0.6314878892733564