In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Per-token class predictions to inspect.
pred_file = "../output/class_pred_16_fixed.v2.5.csv"
# Baseline output; presumably the source of the `after` column used in the merge below.
baseline_file = "../output/baseline.csv"
# Raw test tokens (sentence_id, token_id, before).
test_file = "../input/en_test_2.csv"

In [3]:
# Load the three input tables with pandas' default CSV parsing.
pred_df = pd.read_csv(pred_file)
# NOTE(review): `baselin_df` is missing an "e"; later cells reference this exact
# spelling, so renaming it has to be done everywhere at once.
baselin_df = pd.read_csv(baseline_file)
test_df = pd.read_csv(test_file)

In [6]:
# Preview the first rows of the prediction frame.
pred_df.head()

Unnamed: 0,id,before,class_pred
0,0_0,Last,0.0
1,0_1,modified,0.0
2,0_2,2016-03-31,2.0
3,0_3,.,1.0
4,1_0,There's,0.0


In [4]:
# Class names in id order: a label's position in this list is its numeric class id.
labels = [
    'PLAIN', 'PUNCT', 'DATE', 'LETTERS',
    'CARDINAL', 'VERBATIM', 'DECIMAL', 'MEASURE',
    'MONEY', 'ORDINAL', 'TIME', 'ELECTRONIC',
    'DIGIT', 'FRACTION', 'TELEPHONE', 'ADDRESS',
]
# Forward (name -> id) and reverse (id -> name) lookups, both derived from list order.
label2index = {name: position for position, name in enumerate(labels)}
index2label = dict(enumerate(labels))

In [5]:
# Number of tokens per predicted class id.
pred_df["class_pred"].value_counts()

0.0     688246
1.0     193491
2.0      26822
3.0      17320
4.0      13443
5.0       9920
7.0       2391
9.0       1113
11.0       815
6.0        793
8.0        557
12.0       462
14.0       353
10.0       152
13.0       106
15.0        62
Name: class_pred, dtype: int64

In [8]:
# Tokens predicted as DECIMAL.
# The mask compares against the numeric class id directly: it is vectorised and
# treats a missing (NaN) or unknown prediction as "no match", whereas a per-row
# index2label[i] lookup raises KeyError on such values.
dec_df = pred_df.loc[pred_df["class_pred"] == label2index["DECIMAL"]]
dec_df.shape

(897, 3)

In [20]:
def has_measure_string(token):
    """Return True when ``str(token)`` contains any known measure word.

    The check is a plain, case-sensitive substring test, so a word embedded in
    a longer one also counts (for example ``'day'`` inside ``'Monday'``).
    """
    measure_words = (
        'years', 'year', 'months', 'month', 'weeks', 'week', 'days', 'day',
        'hours', 'minutes', 'min', 'seconds', 'nautical', 'acres', 'hectares',
        'tonnes', 'tons', 'kilobytes', 'nanobarns', 'square', 'miles',
        'kilometres', 'kilometre', 'kilometers', 'kilometer', 'meters', 'meter',
        'metres', 'metre', 'feet', 'yards', 'barrels', 'calories', 'watts',
        'knots', 'inches', 'pounds', 'degrees', 'bar', 'Cellos',
    )
    text = str(token)
    return any(word in text for word in measure_words)

In [21]:
def has_string(token):
    """Return True when ``str(token)`` contains at least one ASCII letter (A-Z or a-z)."""
    return any(
        "A" <= ch <= "Z" or "a" <= ch <= "z"
        for ch in str(token)
    )

In [23]:
# DECIMAL-predicted tokens that contain a letter, skipping the first 150 matches.
dec_df.loc[dec_df["before"].apply(has_string)][150:]

Unnamed: 0,id,before,class_pred
746706,54810_6,6.5 million,6.0
751992,55194_8,8 million,6.0
758054,55639_4,3 million,6.0
763611,56047_10,4 million,6.0
772009,56662_1,26 years,6.0
774771,56866_3,1.4 million,6.0
774784,56866_16,1.3 million,6.0
775609,56926_6,57 years,6.0
776569,56997_2,40 years,6.0
779328,57189_17,50 years,6.0


In [24]:
# DECIMAL-predicted tokens whose text contains one of the measure words.
dec_df.loc[dec_df["before"].apply(has_measure_string)]

Unnamed: 0,id,before,class_pred
1573,137_7,28 years,6.0
9170,737_15,45 minutes,6.0
20520,1593_3,22 years,6.0
21990,1702_12,15 years,6.0
57684,4462_6,33 years,6.0
63761,4909_3,10 years,6.0
77368,5943_9,32 years,6.0
81366,6243_1,20 years,6.0
91520,7017_18,40 years,6.0
99978,7649_6,20 years,6.0


In [33]:
# Tokens predicted as MONEY. Matching on the numeric class id is vectorised and
# skips NaN/unknown predictions instead of raising KeyError from a per-row
# index2label[i] lookup.
pred_df.loc[pred_df["class_pred"] == label2index["MONEY"]]

Unnamed: 0,id,before,class_pred
1096,95_8,£415 million,8.0
1163,101_7,1.5 centimetres,8.0
1704,148_11,$203 million,8.0
3387,282_5,"$210,000",8.0
3389,282_7,"US$610,408",8.0
3391,282_9,2016 dollars,8.0
8311,666_6,$3M,8.0
9014,725_11,$150 billion,8.0
15331,1206_3,$20.5m,8.0
15960,1252_20,US$12.5 million,8.0


In [26]:
# Tokens predicted as ELECTRONIC.
# The mask compares against the numeric class id directly: it is vectorised and
# treats a missing (NaN) or unknown prediction as "no match", whereas a per-row
# index2label[i] lookup raises KeyError on such values.
elec_df = pred_df.loc[pred_df["class_pred"] == label2index["ELECTRONIC"]]
elec_df.shape

(948, 3)

In [29]:
# Two runs of ASCII letters joined by exactly one hyphen, e.g. "off-season".
das_pat = re.compile(r"^[a-zA-Z]+-[a-zA-Z]+$")

def has_dash(token):
    """Return True when ``str(token)`` matches ``das_pat`` (letters-hyphen-letters)."""
    return das_pat.search(str(token)) is not None

In [30]:
# ELECTRONIC-predicted tokens that are a plain letters-hyphen-letters word.
elec_df[elec_df["before"].apply(has_dash)]

Unnamed: 0,id,before,class_pred
4282,353_17,off-season,11.0
13186,1043_11,semi-finalist,11.0
16754,1315_14,full-time,11.0
22394,1735_11,semi-final,11.0
36197,2801_8,anti-war,11.0
41524,3201_6,one-time,11.0
45729,3537_9,two-seater,11.0
50584,3904_8,semi-finals,11.0
50727,3916_7,counter-terrorism,11.0
53672,4143_4,semi-arid,11.0


In [6]:
# First 50 rows of the raw test tokens.
test_df[:50]

Unnamed: 0,sentence_id,token_id,before
0,0,0,Another
1,0,1,religious
2,0,2,family
3,0,3,is
4,0,4,of
5,0,5,Hazrat
6,0,6,Sayyed
7,0,7,Ahmad
8,0,8,and
9,0,9,his


In [7]:
# Build one "<sentence_id>_<token_id>" key per test row, in row order.
ids_a = np.array([
    str(sent_id) + "_" + str(tok_id)
    for sent_id, tok_id in zip(test_df["sentence_id"].values,
                               test_df["token_id"].values)
])

In [8]:
# Attach the composite key as an `id` column so test_df can be merged on it.
# NOTE: this adds the column to test_df in place.
test_df["id"] = ids_a

In [9]:
# Join the test tokens with the baseline output and the class predictions on `id`
# (default inner join: ids missing from any of the three tables are dropped).
compare_df = pd.merge(test_df, baselin_df, on=["id"])
# Take only the key and the prediction from pred_df. Its preview shows a `before`
# column as well; merging that in would make pandas rename the overlapping columns
# to before_x / before_y, and the "before" selection below would raise KeyError.
compare_df = pd.merge(compare_df, pred_df[["id", "class_pred"]], on=["id"])
compare_df = compare_df[["id", "before", "after", "class_pred"]]

In [10]:
# Flag rows whose `after` text is identical to `before`.
compare_df["equal"] = compare_df["before"].eq(compare_df["after"])

In [11]:
# First 100 rows of the joined frame.
compare_df[:100]

Unnamed: 0,id,before,after,class_pred,equal
0,0_0,Another,Another,0.0,True
1,0_1,religious,religious,0.0,True
2,0_2,family,family,0.0,True
3,0_3,is,is,0.0,True
4,0_4,of,of,0.0,True
5,0_5,Hazrat,Hazrat,0.0,True
6,0_6,Sayyed,Sayyed,0.0,True
7,0_7,Ahmad,Ahmad,0.0,True
8,0_8,and,and,0.0,True
9,0_9,his,his,0.0,True


In [12]:
# Keep rows where class_pred differs from the "token changed" flag written as
# 0.0 / 1.0 (1.0 when `equal` is False).
diff_df = compare_df.iloc[
    compare_df["class_pred"].values != (~compare_df["equal"].values).astype(float)
]
diff_df

Unnamed: 0,id,before,after,class_pred,equal
54,3_9,-,-,1.0,True
57,3_12,metre,meter,0.0,False
66,4_2,D,D,1.0,True
70,4_6,Dn,d n,0.0,False
383,27_8,p,p,1.0,True
434,31_9,11.8 MB,11.8 MB,1.0,True
571,40_7,metres,meters,0.0,False
575,40_11,-,-,1.0,True
704,49_1,http://www.britannica.com/EBchecked/topic/2852...,http://www.britannica.com/EBchecked/topic/2852...,1.0,True
742,52_9,http://search.eb.com/eb/article-9061377,http://search.eb.com/eb/article-9061377,1.0,True


In [13]:
# Share of joined rows that ended up in diff_df.
len(diff_df) / len(compare_df)

0.013198121561984412