In [1]:
# coding=utf-8
# @author: cer
# use python3
from __future__ import print_function
import os
import operator
from num2words import num2words  # 这个包不支持中文
import gc
import pandas as pd
import numpy as np
import time
import pickle as pkl

In [2]:
# Input/output locations, all relative to the notebook's working directory.
# train_file_name: primary training CSV (parsed line by line inside train()).
train_file_name = "../input/en_train.csv"
# test_file: loaded into test_df.
test_file = '../input/en_test_2.csv'
# baseline_file: not referenced by any cell visible in this notebook section.
baseline_file = '../output/baseline.csv'
# debug_file: loaded into debug_df; presumably per-token model predictions
# (its columns include class_pred and max_prob) - confirm against the producer.
debug_file = "../output/res_16.v2.5.debug.csv"
# diff_file: where diff_df is written with to_csv.
diff_file = "../output/diff_with_shot.csv"

In [3]:
# Load the CSVs into DataFrames. debug_df takes the file's first column as its index.
# Note that train() re-reads train_file_name itself and does not use train_df.
train_df = pd.read_csv(train_file_name)
test_df = pd.read_csv(test_file)
debug_df = pd.read_csv(debug_file, index_col=0)

In [4]:
def _split_before_after(line):
    """Extract the ``before`` and ``after`` fields from one stripped CSV line.

    The text following the first ``","`` separator is treated as
    ``"before","after"``.

    Returns:
        ``(before, after)``, or ``None`` when the text after the separator
        itself starts with ``","`` (such lines are skipped by the caller).

    Raises:
        IndexError: if fewer than two fields are found after the separator.
    """
    pos = line.find('","')
    text = line[pos + 2:]
    if text[:3] == '","':
        return None
    fields = text[1:-1].split('","')
    return fields[0], fields[1]


def _count_pair(table, before, after):
    """Increment the count of ``after`` under ``before`` in the nested dict ``table``."""
    counts = table.setdefault(before, dict())
    counts[after] = counts.get(after, 0) + 1


def train(train_path=None, extra_dir=None, extra_files=None):
    """Count how often each ``before`` token was written as each ``after`` string.

    Reads the primary training CSV, then the additional files from
    https://www.kaggle.com/google-nlu/text-normalization, and accumulates the
    counts into one table. Blank lines are skipped (they are not counted and
    no longer end the read early). Progress totals are printed after each file
    and are cumulative across files.

    Args:
        train_path: primary training CSV. Defaults to the module-level
            ``train_file_name``.
        extra_dir: directory holding the additional files. Defaults to
            ``../input/tn``.
        extra_files: file names inside ``extra_dir``. Defaults to the seven
            ``output_*.csv`` files used originally.

    Returns:
        dict mapping ``before`` -> dict mapping ``after`` -> occurrence count.

    Raises:
        IndexError: if a non-blank line yields fewer than two fields.
    """
    if train_path is None:
        train_path = train_file_name
    if extra_dir is None:
        extra_dir = os.path.join("../input", 'tn')
    if extra_files is None:
        extra_files = ['output_1.csv', 'output_6.csv', 'output_11.csv', 'output_16.csv',
                       'output_21.csv', 'output_91.csv', 'output_96.csv']

    print('Train start...')

    res = dict()
    total = 0
    not_same = 0

    # Work with primary dataset
    with open(train_path, encoding='UTF8') as train_file:
        next(train_file, None)  # discard the header line
        for raw_line in train_file:
            line = raw_line.strip()
            if line == '':
                continue
            total += 1
            pair = _split_before_after(line)
            if pair is None:
                continue
            before, after = pair
            if before != after:
                not_same += 1
            _count_pair(res, before, after)
    print('{}:\tTotal: {} Have diff value: {}'.format(train_path, total, not_same))

    # Work with additional dataset from https://www.kaggle.com/google-nlu/text-normalization
    for add_file_name in extra_files:
        with open(os.path.join(extra_dir, add_file_name), encoding='UTF8') as extra_file:
            next(extra_file, None)  # discard the header line
            for raw_line in extra_file:
                line = raw_line.strip()
                if line == '':
                    continue
                # Quote bare NA fields so the '","' separator search still works.
                line = line.replace(',NA,', ',"NA",')
                total += 1
                pair = _split_before_after(line)
                if pair is None:
                    continue
                before, after = pair
                if before == '<eos>':
                    continue
                # 'sil' is counted as a differing value here even though it is
                # mapped back to the token itself just below.
                if after != '<self>':
                    not_same += 1
                if after == '<self>' or after == 'sil':
                    after = before
                _count_pair(res, before, after)
        print(add_file_name + ':\tTotal: {} Have diff value: {}'.format(total, not_same))

    return res

In [5]:
# Build the before -> {after: count} lookup table from all training files.
# The totals printed below are cumulative: the counters are not reset between files.
res = train()

Train start...
../input/en_train.csv:	Total: 9918441 Have diff value: 659793
output_1.csv:	Total: 67101629 Have diff value: 12723310
output_6.csv:	Total: 124336086 Have diff value: 24795377
output_11.csv:	Total: 181494302 Have diff value: 36848944
output_16.csv:	Total: 238523401 Have diff value: 48886815
output_21.csv:	Total: 295674425 Have diff value: 60943615
output_91.csv:	Total: 352830815 Have diff value: 72996182
output_96.csv:	Total: 410013747 Have diff value: 85056247


In [7]:
# Preview only the first 20 entries: res holds one key per distinct token seen
# during training, so echoing the whole dict floods the notebook output.
{before: res[before] for _, before in zip(range(20), res)}

{'': {'': 37, 'n a': 5},
 'ZIGA': {'ZIGA': 1},
 '583.4/km²': {'five hundred eighty three point four per square kilometers': 1},
 'Mitropoliya': {'Mitropoliya': 4},
 'shivhads': {'shivhads': 1},
 'malade': {'malade': 9},
 'Nordperd': {'Nordperd': 1},
 'Iongicaudatus': {'Iongicaudatus': 1},
 'Aeruginascin': {'Aeruginascin': 2},
 'Pitekan': {'Pitekan': 1},
 'vellalore': {'vellalore': 1},
 'laberintoentreteje': {'laberintoentreteje': 1},
 'delusion': {'delusion': 204},
 'TPub.com': {'t_letter  _letter p_letter  _letter u_letter  _letter b_letter dot c_letter o_letter m_letter': 1},
 'Wreda': {'Wreda': 1},
 'ComplicationsTissue': {'ComplicationsTissue': 1},
 "Mészaros'": {"Mészaros'": 1},
 'http://www.justice.gouv.qc.ca/english/publications/generale/maria-a.htm#namesGovernment': {'h_letter  _letter t_letter  _letter t_letter  _letter p_letter  _letter c_letter o_letter l_letter o_letter n_letter  _letter s_letter l_letter a_letter s_letter h_letter  _letter s_letter l_letter a_letter s_lett

In [30]:
# Save the lookup table to disk; the following cell reloads it from this file.
with open("../output/res_dict.pkl", "wb") as f:
    pkl.dump(res, f)

In [5]:
# Reload the lookup table from the pickle written by the previous cell.
# NOTE: unpickling can execute arbitrary code - only load a res_dict.pkl
# that was produced by this notebook.
with open("../output/res_dict.pkl", "rb") as f:
    res = pkl.load(f)

In [6]:
# (rows, columns) of the frame being compared against the lookup table.
debug_df.shape

(956046, 5)

In [7]:
# Compare every row of debug_df with the lookup table:
#   diffs    - rows whose "after" differs from the most frequent recorded
#              normalization of "before"; each gains "res" (that normalization)
#              and "hit_num" (its count).
#   not_hits - rows whose "before" token is absent from the table.
diffs = []
not_hits = []

for _, token_row in debug_df.iterrows():
    before = token_row["before"]
    if before not in res:
        not_hits.append(token_row)
        continue
    # max() returns the first of several equally frequent entries, the same
    # entry a stable descending sort would place first.
    top_after, top_count = max(res[before].items(), key=operator.itemgetter(1))
    if token_row["after"] != top_after:
        token_row["res"] = top_after
        token_row["hit_num"] = top_count
        diffs.append(token_row)
    

In [7]:
# First five rows whose "before" token is missing from the lookup table.
not_hits[:5]

[id                                                      4_0
 after         the twenty first of february twenty seventeen
 before                                     21 February 2017
 class_pred                                             DATE
 max_prob                                           0.999994
 Name: 45, dtype: object, id              5_0
 after             "
 before            "
 class_pred    PUNCT
 max_prob          1
 Name: 47, dtype: object, id              5_5
 after             "
 before            "
 class_pred    PUNCT
 max_prob          1
 Name: 52, dtype: object, id              6_0
 after             "
 before            "
 class_pred    PUNCT
 max_prob          1
 Name: 54, dtype: object, id             6_12
 after             "
 before            "
 class_pred    PUNCT
 max_prob          1
 Name: 66, dtype: object]

In [8]:
# Assemble the unmatched rows (a list of Series) into a DataFrame and show its size.
not_hit_df = pd.DataFrame(not_hits)
not_hit_df.shape

(93765, 5)

In [9]:
def filter_not_hit_func(row):
    """Return True unless the row's predicted class is PUNCT, PLAIN or DATE."""
    predicted_class = row["class_pred"]
    return predicted_class not in ("PUNCT", "PLAIN", "DATE")

In [10]:
# Row-wise mask of the unmatched rows accepted by filter_not_hit_func.
not_hit_keep_mask = not_hit_df.apply(filter_not_hit_func, axis=1)
filter_not_hit_df = not_hit_df.loc[not_hit_keep_mask]
filter_not_hit_df.shape

(3230, 5)

In [11]:
# Display the unmatched rows that passed filter_not_hit_func.
filter_not_hit_df

Unnamed: 0,id,after,before,class_pred,max_prob
724,59_10,seventy six point two per square miles,76.2/sq mi,MEASURE,0.931052
958,82_13,six years,6 years,MEASURE,0.930047
1074,93_6,o two one seven nine eight,021798,DIGIT,0.982155
1163,101_7,one point five centimetres,1.5 centimetres,MEASURE,0.771172
1573,137_7,twenty eight years,28 years,MEASURE,0.657223
1784,153_3,i e e e,IEEE,LETTERS,0.595068
2214,188_11,seventeen point four feet,17.4 feet,MEASURE,0.997561
2331,197_11,i q t r e e,IQTREE,LETTERS,0.868032
2632,220_11,a n d slash o r,and/or,ELECTRONIC,0.978925
2697,224_6,m d c c x l i,MDCCXLI,LETTERS,0.988097


In [12]:
# Number of filtered unmatched rows per predicted class.
filter_not_hit_df["class_pred"].value_counts()

MEASURE       1182
LETTERS        610
ELECTRONIC     467
CARDINAL       296
TELEPHONE      254
MONEY          121
DECIMAL         75
VERBATIM        71
DIGIT           58
FRACTION        38
TIME            29
ADDRESS         26
ORDINAL          3
Name: class_pred, dtype: int64

In [28]:
# Filtered unmatched rows whose predicted class is MONEY.
filter_not_hit_df.loc[filter_not_hit_df["class_pred"] == "MONEY"]

Unnamed: 0.1,Unnamed: 0,id,after,before,class_pred
3389,3389,282_7,six hundred ten thousand four hundred eight do...,"US$610,408",MONEY
19388,19388,1512_12,one hundred eighty two thousand two hundred fi...,"US$182,250",MONEY
19390,19390,1512_14,forty two thousand seven hundred fifty dollars,"US$42,750",MONEY
19393,19393,1512_17,thirty seven thousand five hundred dollars,"US$37,500",MONEY
22273,22273,1725_9,twenty three point two billion dollars,$23.2 billion,MONEY
27356,27356,2129_5,sixteen point eight billion dollars,$ 16.8 billion,MONEY
29878,29878,2317_8,one thousand six hundred ninety five dollars,$1695,MONEY
39518,39518,3052_14,fourbillion dollars,$4billion,MONEY
70884,70884,5451_11,six hundred fifty million pounds,£650 million,MONEY
70886,70886,5451_13,eight hundred two million dollars,$802 million,MONEY


In [23]:
# Filtered unmatched rows whose predicted class is MEASURE.
filter_not_hit_df.loc[filter_not_hit_df["class_pred"] == "MEASURE"]

Unnamed: 0.1,Unnamed: 0,id,after,before,class_pred
724,724,59_10,seventy six point two per square miles,76.2/sq mi,MEASURE
958,958,82_13,six years,6 years,MEASURE
1163,1163,101_7,one point five centimetres,1.5 centimetres,MEASURE
1573,1573,137_7,twenty eight years,28 years,MEASURE
2214,2214,188_11,seventeen point four feet,17.4 feet,MEASURE
2747,2747,230_3,seven thousand acres,"7,000 acres",MEASURE
2749,2749,230_5,two thousand eight hundred hectares,"2,800 hectares",MEASURE
3034,3034,254_4,ninety thousand tons,"90,000 tons",MEASURE
4852,4852,398_15,forty five years,45 years,MEASURE
6442,6442,527_12,point one five kilometers,.15 km,MEASURE


In [8]:
# Assemble the rows collected in diffs (those whose "after" disagrees with the
# most frequent recorded normalization) into a DataFrame.
diff_df = pd.DataFrame(diffs)

In [10]:
# First rows of the disagreement table.
diff_df.head()

Unnamed: 0,id,after,before,class_pred,max_prob,res,hit_num
1092,95_4,two thousand twenty,2020,CARDINAL,0.994577,twenty twenty,2347
1246,108_11,vWD,vWD,VERBATIM,0.815306,v w d,13
1558,136_1,to,-,PLAIN,0.994886,-,501208
1759,151_17,to,-,PLAIN,0.993562,-,501208
2506,211_1,to,:,PLAIN,0.99161,:,2575555


In [11]:
# (rows, columns) of the disagreement table.
diff_df.shape

(2390, 7)

In [68]:
# Disagreements among rows whose predicted class is LETTERS.
diff_df.loc[diff_df["class_pred"] == "LETTERS"]

Unnamed: 0,id,after,before,class_pred,res,hit_num
4197,348_12,o s t i,OSTI,LETTERS,OSTI,7
7347,598_1,g a t a,GATA,LETTERS,GATA,102
7822,630_8,o p a l s,OPALS,LETTERS,OPALS,13
7980,639_9,e p u b,EPUB,LETTERS,EPUB,49
11865,947_5,n a k a m u r a,NAKAMURA,LETTERS,NAKAMURA,7
15089,1189_11,u l m,ULM,LETTERS,ULM,86
15995,1255_6,i v a,IVA,LETTERS,IVA,83
16723,1313_7,c a v,CAV,LETTERS,CAV,57
18736,1466_20,f e,FE,LETTERS,FE,512
19036,1488_16,i r b's,IRBs,LETTERS,IRBs,8


In [None]:
# Index labels of the disagreement rows.
diff_df.index

In [79]:
# CARDINAL disagreements whose "before" string is exactly four characters long.
# NOTE(review): len() counts every character, so a padded token would also
# match - the "203" row in the output below suggests that happens; confirm.
diff_df.loc[diff_df.apply(lambda row: row["class_pred"] == "CARDINAL" and len(row["before"]) == 4, axis=1)]

Unnamed: 0,id,after,before,class_pred,res,hit_num
1092,95_4,two thousand twenty,2020,CARDINAL,twenty twenty,2347
7676,621_6,two thousand sixteen,2016,CARDINAL,twenty sixteen,19734
19461,1518_15,two thousand fifteen,2015,CARDINAL,twenty fifteen,74827
27922,2172_6,two hundred three,203,CARDINAL,two o three,25
30775,2385_13,two thousand thirteen,2013,CARDINAL,twenty thirteen,107979
34825,2701_8,two thousand ten,2010,CARDINAL,twenty ten,192031
40301,3105_4,two thousand fifteen,2015,CARDINAL,twenty fifteen,74827
40844,3150_12,two thousand fourteen,2014,CARDINAL,twenty fourteen,99787
49127,3798_7,one thousand nine hundred ninety five,1995,CARDINAL,nineteen ninety five,69540
59249,4576_12,one thousand nine hundred eighty,1980,CARDINAL,nineteen eighty,39431


In [60]:
# Every normalization recorded for the token "2020", with its count.
res["2020"]

{'twenty twenty': 2347, 'two o two o': 35, 'two thousand twenty': 23}

In [86]:
def filter_func(row):
    """Decide whether a disagreement row is kept for manual review.

    A row is dropped when its "before" token is "-", ":" or "~", when its
    predicted class is LETTERS, DIGIT or CARDINAL, or when its "res" text
    contains "_letter". The checks run in that order and stop at the first
    one that rejects the row.
    """
    if row["before"] in ["-", ":", "~"]:
        return False
    if row["class_pred"] in ("LETTERS", "DIGIT", "CARDINAL"):
        return False
    return "_letter" not in row["res"]

In [87]:
# Row-wise mask of the disagreements accepted by filter_func.
diff_keep_mask = diff_df.apply(filter_func, axis=1)
filter_df = diff_df.loc[diff_keep_mask]
filter_df.shape

(329, 6)

In [89]:
# Rows at positions 100-199 of the filtered disagreements.
filter_df[100:200]

Unnamed: 0,id,after,before,class_pred,res,hit_num
318406,23702_13,hash tag osl-3,#OSL-3,ELECTRONIC,hash osl dash three,1
321971,23967_10,the fourteenth,XIV,ORDINAL,fourteen,1260
322853,24027_5,CsSK,CsSK,VERBATIM,c s s k,1
326600,24310_1,76 b c,76 BC,DATE,seventy six b c,2
329479,24521_20,one o o sil one nine nine two,100 - 1992,TELEPHONE,one hundred sil one nine nine two,17
331560,24678_8,the second of november twenty fifteen,2.11.2015,DATE,the eleventh of february twenty fifteen,2
332567,24755_3,the first,I.,ORDINAL,i,11087
337520,25124_4,eight zeros,800s,DATE,eight hundreds,28
340889,25374_9,one thousand nine hundred seventy one dolla ru...,1971 dollars,MONEY,one thousand nine hundred seventy one dollars,2
340893,25374_13,two thousand sixteen dolla rupees,2016 dollars,MONEY,two thousand sixteen dollars,63


In [63]:
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 100)

In [9]:
# Write the disagreement table (index included) to diff_file.
diff_df.to_csv(diff_file)

In [97]:
# Inspect a single disagreement by its index label.
diff_df.loc[532707]

id                        39330_10
after                 five six b c
before                      506 BC
class_pred                    DATE
res           five hundred six b c
hit_num                          3
Name: 532707, dtype: object

In [92]:
# Read the saved disagreement table back, using its first column as the index.
pd.read_csv(diff_file, index_col=0)

Unnamed: 0,id,after,before,class_pred,res,hit_num
1092,95_4,two thousand twenty,2020,CARDINAL,twenty twenty,2347
1246,108_11,vWD,vWD,VERBATIM,v w d,13
1558,136_1,to,-,PLAIN,-,501208
1759,151_17,to,-,PLAIN,-,501208
2506,211_1,to,:,PLAIN,:,2575555
2591,217_13,to,-,PLAIN,-,501208
2595,217_17,to,-,PLAIN,-,501208
2599,217_21,to,-,PLAIN,-,501208
2603,217_25,to,-,PLAIN,-,501208
2607,217_29,to,-,PLAIN,-,501208
