In [20]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import inflect
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict, Counter
import feather
from num2words import num2words 
import re
import pickle

In [None]:
def get_classify_train_data(np_file,csv_file):
    if os.path.exists(np_file) == True:
       temp = np.load(np_file)
       return temp['x_train'],temp['y_train'],temp['label']
    else:
        num_features = 9 #每个 word 取前 5 后 4 个字符来编码
        train=pd.read_csv(csv_file)
        tmp=pd.factorize(train['class'])
        y_train,label=tmp[0].astype(np.int8),tmp[1].values
        num_train=len(y_train)
        train['before']=train['before'].astype(np.str)
        x_train=np.zeros([num_train,num_features],np.int8)
        feature=np.zeros([num_train,7],np.int8)# 人工提取的特征
        list1=('a','e','i','o','u')# 元音
        list2=('+','-','*','//','%')# 数学运算符
        for word,row in zip(train['before'].values,range(num_train)):
            if(len(word)>=num_features):
                for c,col in zip(word[:5],range(5)):
                    x_train[row,col]=ord(c)
                for c,col in zip(word[-4:],range(5,9)):
                    x_train[row,col]=ord(c)
            else:
                for c,col in zip(word,range(num_features)):
                    x_train[row,col]=ord(c)
            feature[row, 3] =len(word) # 统计字符串的长度
            dotflag=0
            for c in word:
                if c.isdigit():feature[row,0]+=1# 统计数字的个数
                if c.isupper():feature[row,1]+=1# 统计大写字母的个数
                if c.isalnum()!=True:feature[row,2]+=1# 统计非字母和数字的个数
                if c in list1:feature[row,4]+=1# 统计元音的个数
                if c=='.': dotflag=1
                elif dotflag==1:#  . 后面跟字母置 1 ，数字置 2，其他置 3
                    dotflag = 0
                    if c.isdigit():feature[row,5]+=10
                    elif c.isalpha():feature[row,5]+=100
                    else:feature[row,5]+=1000
                if c in list2:feature[row,6]+=1# 统计数学运算符的个数

        # 掐头去尾，结合上文 2 单词，下文 1 个单词
        num_train-=3
        y_train=y_train[2:-1]
        x_train=np.concatenate((x_train[:-3],x_train[1:-2],x_train[2:-1],x_train[3:],feature[2:-1]),axis=1)
        np.savez(np_file,x_train=x_train, y_train=y_train, label=label)
        return x_train, y_train, label

def get_classify_test_data(np_file,csv_file):
    test=pd.read_csv(csv_file)
    if os.path.exists(np_file) == True:
       temp = np.load(np_file)
       x_test=temp['x_test']
    else:
        num_features = 9 #每个 word 取前 5 后 4 个字符来编码
        human_feature=7 #人工提取7个特征
        num_test=len(test)
        test['before']=test['before'].astype(np.str)
        x_test=np.zeros([num_test,num_features],np.int8)
        feature=np.zeros([num_test,human_feature],np.int8)# 人工提取的特征
        list1=('a','e','i','o','u')# 元音
        list2=('+','-','*','//','%')# 数学运算符
        for word,row in zip(test['before'].values,range(num_test)):
            if(len(word)>=num_features):
                for c,col in zip(word[:5],range(5)):
                    x_test[row,col]=ord(c)
                for c,col in zip(word[-4:],range(5,9)):
                    x_test[row,col]=ord(c)
            else:
                for c,col in zip(word,range(num_features)):
                    x_test[row,col]=ord(c)
            feature[row, 3] =len(word) # 统计字符串的长度
            dotflag=0
            for c in word:
                if c.isdigit():feature[row,0]+=1# 统计数字的个数
                if c.isupper():feature[row,1]+=1# 统计大写字母的个数
                if c.isalnum()!=True:feature[row,2]+=1# 统计非字母和数字的个数
                if c in list1:feature[row,4]+=1# 统计元音的个数
                if c=='.': dotflag=1
                elif dotflag==1:#  . 后面跟字母置 1 ，数字置 2，其他置 3
                    dotflag = 0
                    if c.isdigit():feature[row,5]+=10
                    elif c.isalpha():feature[row,5]+=100
                    else:feature[row,5]+=1000
                if c in list2:feature[row,6]+=1# 统计数学运算符的个数

        # 开头补上2个单词,结尾补上1个单词，结合上文 2 单词，下文 1 个单词
        x_test = np.concatenate((np.zeros([2,num_features],np.int8),x_test,np.zeros([1,num_features],np.int8)),axis=0)
        feature = np.concatenate((np.zeros([2,human_feature],np.int8),feature,np.zeros([1,human_feature],np.int8)),axis=0)
        x_test=np.concatenate((x_test[:-3],x_test[1:-2],x_test[2:-1],x_test[3:],feature[2:-1]),axis=1)
        np.savez(np_file,x_test=x_test)
    return test, x_test

In [None]:
if __name__=='__main__':
    prehead=''
    train_data_csv='en_train.csv'
    classify_train_file='classify_train.npz'
    xgb_model='xgb_model.dat'
    test_data_csv='en_test_2.csv'
    classify_test_file='classify_test.npz'
    xgb_model2='xgb_model2.dat'
    classify_test_file2='classify_test2.npz'

    # 训练模型
    x_train,y_train,label=get_classify_train_data(prehead+classify_train_file,prehead+train_data_csv)
    print(x_train.shape)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    watchlist = [(dtrain, 'train')]
    param = {
        'eta': 0.3,
        'max_depth':10,
        'objective':'multi:softmax',
        'num_class':len(label),
        'eval_metric':'merror',
        'subsample': 1,
        'colsample_bytree': 1,
        'silent':1,
        'seed':0,
    }
    num_boost_rounds=10
    model = xgb.train(param, dtrain, num_boost_rounds, watchlist,verbose_eval=1)
    print('save model ',xgb_model2)
    pickle.dump(model,open(xgb_model2,'wb'))# 保存模型
    del x_train,y_train
    #gc.collect()

In [None]:
train.head()

In [None]:
# xgb
params = {'eta': 0.1, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'accuracy', 'silent': False}

X = train[['class', 'before']]
X['class'] = X['class'].astype('category')
X['before'] = X['before'].astype('category')
y = train['after'].values
#y = y.astype('category')
test['class'] = ''
test['class'] = test['class'].astype('category')
test['before'] = test['before'].astype('category')

sub = (test.sentence_id.astype(str) + '_' + test.token_id.astype(str)).to_frame()
sub['target']=''

sub_train = (train.sentence_id.astype(str) + '_' + train.token_id.astype(str)).to_frame()
sub_train['target']=''
nrounds= 200 #10**6  # need to change to 2000
kfold = 5  # need to change to 5

d_train = xgb.DMatrix(X, y) 
d_valid = xgb.DMatrix(test) 
xgb_model = xgb.train(params, d_train, nrounds, early_stopping_rounds=10, 
                       maximize=True, verbose_eval=10)
sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values), 
                    ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)

sub_train['target'] += xgb_model.predict(xgb.DMatrix(train[features].values), 
                    ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)

gc.collect()
sub.head(2)

In [None]:
prehead=''
train_data_csv='en_train.csv'
classify_train_file='classify_train.npz'
xgb_model='xgb_model.dat'
test_data_csv='en_test_2.csv'
classify_test_file='classify_test.npz'
xgb_model2='xgb_model2.dat'
classify_test_file2='classify_test2.npz'
train = pd.read_csv('en_train.csv')
model = pickle.load(open('xgb_model2.dat', 'rb'))
test_data_csv = 'en_test_2.csv'


In [None]:
    # 预测 test 上的 class
    #model = pickle.load(open(xgb_model2, "rb"))
    test,x_test=get_classify_test_data(prehead+classify_test_file,prehead+test_data_csv)
    print(x_test.shape)
    dtest = xgb.DMatrix(x_test)
    pred = model.predict(dtest)
    pred = [label[int(x)] for x in pred]
    test['class']=pred
    test.to_csv(os.path.join(prehead, 'test_pred_class_new.csv'))
    

In [None]:
print(test.head())
test['class'].unique()

In [None]:
sub.to_csv(base_path+'test_sub_xgb.csv', index=False)
sub_train.to_csv(base_path+'train_sub_xgb_new.csv')

In [None]:
X = train[['class', 'before']]
X['class'] = X['class'].astype('category')
X['before'] = [''.join(str(ord(c)) for c in s) for s in X['before']]
y = train['after'].values
y = y.map(ord)

test['class'] = test['class'].astype('category')
test['before'] = test['before'].map(ord)



# Defining dictionary, rules and running predictions

Read the train file, google dataset, test file with predicted classes

In [16]:
train = pd.read_csv('en_train.csv')
test = pd.read_csv('test_pred_class_new.csv')

In [17]:
#Kerem
# d = train.groupby(['before', 'after']).size()
# d = d.reset_index().sort_values(0, ascending=False)
# d = d.loc[d['before'].drop_duplicates(keep='first').index]
# d = d.loc[d['before'] != d['after']]
# d = d.set_index('before')['after'].to_dict()
# test = pd.read_csv('test_class_kerem.csv')
# test.head()

In [18]:
# #Kerem
# test['random'] = ''
# test = test[['random','sentence_id', 'token_id', 'class']]
# test.head()

In [21]:
d = defaultdict(list)
train_list = [(train.iloc[i,3],train.iloc[i,4]) for i in range(train.shape[0])]
for k,v in train_list:
    d[k].append(v)
    
train_dict = {}
for key in d:
    c = Counter(d[key]).most_common(1)[0][0]
    train_dict[key] = c


In [22]:
train_dict['.']

'.'

In [23]:
# d1 = defaultdict(list)

# google_dataset = os.listdir('en_with_types/')
# for file in google_dataset:
#     data = pd.read_csv('en_with_types/' + file, sep = '\t', error_bad_lines= False, quoting=3)
#     print(file)
#     data = data[data.iloc[:,0] != 'PUNCT']
#     data = data[data.iloc[:,0] != 'ELECTRONIC']    
#     data = data[data.iloc[:,0] != '<eos>']   
#     data = data.drop(data.columns[0], axis=1)
#     #print(data.head())
    
#     data_list = [(data.iloc[i,0],data.iloc[i,1]) for i in range(data.shape[0])]
#     for k,v in data_list:
#         #print(data_list)[1:10]
#         #print(k)
#         #print(v)
#         d1[k].append(v)
#         #print(d1[k])
        
# counter_dict = {}
# for key in d1:
#     c = Counter(d1[key]).most_common(1)[0][0]
#     counter_dict[key] = c


In [24]:
# import pickle

# f = open("dict.pkl","wb")
# pickle.dump(counter_dict,f)
# f.close()

In [26]:
test[test['class'] == 'ELECTRONIC'].tail(10)

Unnamed: 0.1,Unnamed: 0,sentence_id,token_id,before,class
943352,943352,69066,0,BioLib.cz,ELECTRONIC
943563,943563,69081,0,Barbados.gov.bb,ELECTRONIC
943922,943922,69104,7,Snopes.com,ELECTRONIC
947664,947664,69379,6,Reginagallery.com,ELECTRONIC
950459,950459,69589,3,foreverastro.com,ELECTRONIC
951175,951175,69639,2,Fussballdaten.de,ELECTRONIC
952168,952168,69714,5,Historywc.rootsweb.com,ELECTRONIC
952475,952475,69737,3,::,ELECTRONIC
954139,954139,69854,19,http://dos.myflorida.com/elections/data-statis...,ELECTRONIC
954640,954640,69895,0,Changsha.com,ELECTRONIC


In [27]:
# f = open("dict.pkl","wb")
# pickle.dump(counter_dict,f)
# f.close()

p1 = pickle.load(open('dict.pkl', 'rb'))

p1['the']

'<self>'

In [28]:
# import os

# p = {} # scores is an empty dict already
# target = 'dict.pkl'
# if os.path.getsize(target) > 0:      
#     with open(target, "rb") as f:
#         unpickler = pickle.Unpickler(f)
#         # if file is not empty scores will be equal
#         # to the value unpickled
#         p = unpickler.load()

In [29]:
def rom_to_int(string):

    table=[['M',1000],['CM',900],['D',500],['CD',400],['C',100],['XC',90],['L',50],['XL',40],['X',10],['IX',9],['V',5],['IV',4],['I',1]]
    returnint=0
    for pair in table:


        continueyes=True

        while continueyes:
            if len(string)>=len(pair[0]):

                if string[0:len(pair[0])]==pair[0]:
                    returnint+=pair[1]
                    string=string[len(pair[0]):]

                else: continueyes=False
            else: continueyes=False

    return returnint

In [32]:
## For classes:
p = inflect.engine()
def plain(x):
    return x

def punct(x):
    return x

def cardinal(x):
    try:
        if re.match('.*[A-Za-z]+.*', x):
            return x
        x = re.sub(',', '', x, count = 10)

        if(re.match('.+\..*', x)):
            x = p.number_to_words(float(x))
        elif re.match('\..*', x): 
            x = p.number_to_words(float(x))
            x = x.replace('zero ', '', 1)
        else:
            x = p.number_to_words(int(x))
        x = x.replace('zero', 'o')    
        x = re.sub('-', ' ', x, count=10)
        x = re.sub(' and','',x, count = 10)
        return x
    except:
        return x

def verbatim(x):
    return(x)


dict_mon = {'jan': "January", "feb": "February", "mar ": "march", "apr": "april", "may": "may ","jun": "june", "jul": "july", "aug": "august","sep": "september",
            "oct": "october","nov": "november","dec": "december", "january":"January", "february":"February", "march":"march","april":"april", "may": "may", 
            "june":"june","july":"july", "august":"august", "september":"september", "october":"october", "november":"november", "december":"december"}
def date(key):
    dict_mon = {'jan': "January", "feb": "February", "mar ": "march", "apr": "april", "may": "may ","jun": "june", "jul": "july", "aug": "august","sep": "september",
            "oct": "october","nov": "november","dec": "december", "january":"January", "february":"February", "march":"march","april":"april", "may": "may", 
            "june":"june","july":"july", "august":"august", "september":"september", "october":"october", "november":"november", "december":"december"}


    v =  key.split('-')
    if len(v)==3:
        if v[1].isdigit():
            try:
                date = datetime.strptime(key , '%Y-%m-%d')
                text = 'the '+ p.ordinal(p.number_to_words(int(v[2]))).replace('-',' ')+' of '+datetime.date(date).strftime('%B')
                if int(v[0])>=2000 and int(v[0]) < 2010:
                    text = text  + ' '+cardinal(v[0])
                else: 
                    text = text + ' ' + cardinal(v[0][0:2]) + ' ' + cardinal(v[0][2:])
            except:
                text = key
            return text.lower()    
    else:   
        v = re.sub(r'[^\w]', ' ', key).split()
        if v[0].isalpha():
            try:
                if len(v)==3:
                    text = dict_mon[v[0].lower()] + ' '+ p.ordinal(p.number_to_words(int(v[1]))).replace('-',' ')
                    if int(v[2])>=2000 and int(v[2]) < 2010:
                        text = text  + ' '+cardinal(v[2])
                    else: 
                        text = text + ' ' + cardinal(v[2][0:2]) + ' ' + cardinal(v[2][2:])   
                elif len(v)==2:

                    if int(v[1])>=2000 and int(v[1]) < 2010:
                        text = dict_mon[v[0].lower()]  + ' '+ cardinal(v[1])
                    else: 
                        if len(v[1]) <=2:
                            text = dict_mon[v[0].lower()] + ' ' + cardinal(v[1])
                        else:
                            text = dict_mon[v[0].lower()] + ' ' + cardinal(v[1][0:2]) + ' ' + cardinal(v[1][2:])
                else: text = key
            except: text = key
            return text.lower()
        else: 
            key = re.sub(r'[^\w]', ' ', key)
            v = key.split()
            try:
                date = datetime.strptime(key , '%d %b %Y')
                text = 'the '+ p.ordinal(p.number_to_words(int(v[0]))).replace('-',' ')+' of '+ dict_mon[v[1].lower()]
                if int(v[2])>=2000 and int(v[2]) < 2010:
                    text = text  + ' '+cardinal(v[2])
                else: 
                    text = text + ' ' + cardinal(v[2][0:2]) + ' ' + cardinal(v[2][2:])
            except:
                try:
                    date = datetime.strptime(key , '%d %B %Y')
                    text = 'the '+ p.ordinal(p.number_to_words(int(v[0]))).replace('-',' ')+' of '+ dict_mon[v[1].lower()]
                    if int(v[2])>=2000 and int(v[2]) < 2010:
                        text = text  + ' '+cardinal(v[2])
                    else: 
                        text = text + ' ' + cardinal(v[2][0:2]) + ' ' + cardinal(v[2][2:])
                except:
                    try:
                        date = datetime.strptime(key , '%d %m %Y')
                        text = 'the '+ p.ordinal(p.number_to_words(int(v[0]))).replace('-',' ')+' of '+datetime.date(date).strftime('%B')
                        if int(v[2])>=2000 and int(v[2]) < 2010:
                            text = text  + ' '+cardinal(v[2])
                        else: 
                            text = text + ' ' + cardinal(v[2][0:2]) + ' ' + cardinal(v[2][2:])
                    except:
                        try:
                            date = datetime.strptime(key , '%d %m %y')
                            text = 'the '+ p.ordinal(p.number_to_words(int(v[0]))).replace('-',' ')+' of '+datetime.date(date).strftime('%B')
                            v[2] = datetime.date(date).strftime('%Y')
                            if int(v[2])>=2000 and int(v[2]) < 2010:
                                text = text  + ' '+cardinal(v[2])
                            else: 
                                text = text + ' ' + cardinal(v[2][0:2]) + ' ' + cardinal(v[2][2:])
                        except:text = key
            return text.lower() 




# def date(x):
#     try:
#         x = re.sub('\.','',x )
#         if re.match('^[0-9]+$', x):
#             return re.sub('-', ' ', p.number_to_words(int(x))) 
#         elif re.match('.*[A-Za-z]+.*',x):
#             y = re.sub(',','',x)
#             y = y.split(' ')
#             result_string = 'the '
#             two_dig = [w for w in y if len(w) <= 2]
#             if len(two_dig)>0:
#                 num = int(str(two_dig[0]))
#                 num = re.sub('-', ' ',  num2words(num, ordinal=True))
#                 result_string = result_string + num + ' of '
#             month = [w for w in y if re.match('^[A-Za-z]+$', w)]
#             result_string = result_string  + str(month[0])
#             y = [w for w in y if w not in month]
#             year = [w for w in y if len(w) == 4]
#             year1 = (str(year[0][0:2]))
#             year2 = (str(year[0][2:4]))
#             year1 = re.sub('-', ' ', p.number_to_words(int(year1)))
#             year2 = re.sub('-', ' ', p.number_to_words(int(year2)))
#             result_string = result_string + ' ' + year1 + ' ' + year2
#             return result_string
#         else:
#             result_string = 'the '
#             y = re.sub('-', ' ', x)
#             y = y.split(' ')
#             num = int(str(y[0]))
#             num = re.sub('-', ' ',  num2words(num, ordinal=True))
#             result_string = result_string + num + ' of '
#             month = y[1]
#             result_string = result_string  + str(month[0])
#             year = [w for w in y if len(w) == 4]
#             year1 = (str(year[0][0:2]))
#             year2 = (str(year[0][2:4]))
#             year1 = re.sub('-', ' ', p.number_to_words(int(year1)))
#             year2 = re.sub('-', ' ', p.number_to_words(int(year2)))
#             result_string = result_string + ' ' + year1 + ' ' + year2
#             return result_string
#     except:
#         return x
    
def measure(x):
    try:
        x = re.sub(',', '', count=10)
        x = x.split(' ')
        replacement = {'MB': 'megabyte', 'MW': 'megawatt','ft':'feet', 'km':'kilometers', 'mm': 'millimeters', 'ha':'hectares', '"':'inches', 'cm':'centimeters', '/day' : 'per day', 'nm' : 'nanometers', '/s':'per second', 'm2':'square meters', 'km2': 'square kilometers', 'percent': 'percent'}
        #Comprehensive list of all measures
        replacement = {'"': 'inches', "'": 'feet', 'km/s': 'kilometers per second', 'AU': 'units', 'BAR': 'bars', 'CM': 'centimeters', 'mm': 'millimeters', 'FT': 'feet', 'G': 'grams', 
     'GAL': 'gallons', 'GB': 'gigabytes', 'GHZ': 'gigahertz', 'HA': 'hectares', 'HP': 'horsepower', 'HZ': 'hertz', 'KM':'kilometers', 'km3': 'cubic kilometers',
     'KA':'kilo amperes', 'KB': 'kilobytes', 'KG': 'kilograms', 'KHZ': 'kilohertz', 'KM²': 'square kilometers', 'KT': 'knots', 'KV': 'kilo volts', 'M': 'meters',
      'KM2': 'square kilometers','Kw':'kilowatts', 'KWH': 'kilo watt hours', 'LB': 'pounds', 'LBS': 'pounds', 'MA': 'mega amperes', 'MB': 'megabytes',
     'KW': 'kilowatts', 'MPH': 'miles per hour', 'MS': 'milliseconds', 'MV': 'milli volts', 'kJ':'kilojoules', 'km/h': 'kilometers per hour',  'V': 'volts', 
     'M2': 'square meters', 'M3': 'cubic meters', 'MW': 'megawatts', 'M²': 'square meters', 'M³': 'cubic meters', 'OZ': 'ounces',  'MHZ': 'megahertz', 'MI': 'miles',
     'MB/S': 'megabytes per second', 'MG': 'milligrams', 'ML': 'milliliters', 'YD': 'yards', 'au': 'units', 'bar': 'bars', 'cm': 'centimeters', 'ft': 'feet', 'g': 'grams', 
     'gal': 'gallons', 'gb': 'gigabytes', 'ghz': 'gigahertz', 'ha': 'hectares', 'hp': 'horsepower', 'hz': 'hertz', 'kWh': 'kilo watt hours', 'ka': 'kilo amperes', 'kb': 'kilobytes', 
     'kg': 'kilograms', 'khz': 'kilohertz', 'km': 'kilometers', 'km2': 'square kilometers', 'km²': 'square kilometers', 'kt': 'knots','kv': 'kilo volts', 'kw': 'kilowatts', 
     'lb': 'pounds', 'lbs': 'pounds', 'm': 'meters', 'm2': 'square meters','m3': 'cubic meters', 'ma': 'mega amperes', 'mb': 'megabytes', 'mb/s': 'megabytes per second', 
     'mg': 'milligrams', 'mhz': 'megahertz', 'mi': 'miles', 'ml': 'milliliters', 'mph': 'miles per hour','ms': 'milliseconds', 'mv': 'milli volts', 'mw': 'megawatts', 'm²': 'square meters',
     'm³': 'cubic meters', 'oz': 'ounces', 'v': 'volts', 'yd': 'yards', 'µg': 'micrograms', 'ΜG': 'micrograms', 'kg/m3': 'kilograms per meter cube'}
        result_string = ''
        if re.match('.*%$',x[0]):
            x = re.sub('%','',x[0])
            x = cardinal(x)
            result_string = result_string + x + ' percent' 
        elif re.match('.*\"$',x[0]):
            x = re.sub('\"','',x[0])
            x = cardinal(x)
            result_string = result_string + x + ' inches' 
        elif len(x)<2:
            return x
        elif re.match('.*ft$', x[0]):
            x = re.sub('ft','',x[0])
            x = cardinal(x)
            result_string = result_string + x1 + ' ' + replacement['ft']
        elif x[1] in replacement:
            x1 = cardinal(x[0])

            result_string = result_string + x1 + ' ' + replacement[x[1]] 
        else:
            result_string = x
        return(result_string)
    except:
        return x

def letters(x):
    try:
        x = re.sub('[^a-zA-Z]', '', x)
        x = x.lower()
        result_string = ''
        for i in range(len(x)):
            result_string = result_string + x[i] + ' '
        return(result_string.strip())  
    except:
        return x
    
    
def decimal(x):
    try:
        x = re.sub(',', '', count=10)
        x = x.split(' ')
        if len(x) == 1:
            result_string = cardinal(x[0])
        else:
            result_string = cardinal(x[0]) + ' ' + x[1]
        return result_string   
    except:
        return x

def ordinal(x):
    try:
        result_string = ''
        x = x.replace(',', '')
        x = x.replace('[\.]$', '')
        if re.match('^[0-9]+$',x):
            x = num2words(int(x), ordinal=True)
            return(x.replace('-', ' '))
        if re.match('.*V|X|I|L|D',x):
            if re.match('.*th|st|nd|rd',x):
                x = x[0:len(x)-2]
                x = rom_to_int(x)
                result_string = re.sub('-', ' ',  num2words(x, ordinal=True))
            else:
                x = rom_to_int(x)
                result_string = 'the '+ re.sub('-', ' ',  num2words(x, ordinal=True))
        else:
            x = x[0:len(x)-2]
            result_string = re.sub('-', ' ',  num2words(float(x), ordinal=True))
        return(result_string)  
    except:
        return x

def electronic(x):
    try:
        replacement = {'.' : 'dot', ':' : 'colon', '/':'slash', '-' : 'dash', '#' : 'hash tag', }
        result_string = ''
        if re.match('.*[A-Za-z].*', x):
            for char in x:
                if re.match('[A-Za-z]', char):
                    result_string = result_string + letters(char) + ' '
                elif char in replacement:
                    result_string = result_string + replacement[char] + ' '
                elif re.match('[0-9]', char):
                    if char == 0:
                        result_string = result_string + 'o '
                    else:
                        number = cardinal(char)
                        for n in number:
                            result_string = result_string + n + ' ' 
            return result_string.strip()                
        else:
            return(x)
    except:    
        return(x)

def address(x):
    try:
        x = re.sub('[^0-9a-zA-Z]+', '', x)
        result_string = ''
        for i in range(0,len(x)):
            if re.match('[A-Z]|[a-z]',x[i]):
                result_string = result_string + plain(x[i]).lower() + ' '
            else:
                result_string = result_string + cardinal(x[i]) + ' '
                
        return(result_string.strip())        
    except:    
        return(x)

def telephone(x):
    try:
        result_string = ''
        for i in range(0,len(x)):
            if re.match('[0-9]+', x[i]):
                result_string = result_string + cardinal(x[i]) + ' '
            else:
                result_string = result_string + 'sil '
        return result_string.strip()    
    except:    
        return(x)

def time(x):
    return(x)

def money(x):
    try:
        if re.match('^\$', x):
            x = x.replace('$','')
            text = cardinal(x)
            x = text + ' dollars'
            return x.lower()

        elif re.match('^£', x):
            x = x.replace('£','')
            text = cardinal(x)
            x = text+ ' pounds'
            return x.lower()   
            
        elif re.match('^€', x):
            x = x.replace('€','')
            text = cardinal(x)
            x = text+ ' euros'
            return x.lower()        
    except:    
        return(x)

def fraction(x):
    try:
        y = x.split('/')
        result_string = ''
        y[0] = cardinal(y[0])
        y[1] = ordinal(y[1])
        if y[1] == 4:
            result_string = y[0] + ' quarters'
        else:    
            result_string = y[0] + ' ' + y[1] + 's'
        return(result_string)
    except:    
        return(x)
    
    
def digit(x): 
    try:
        x = re.sub('[^0-9]', '',x)
        result_string = ''
        for i in x:
            result_string = result_string + cardinal(i) + ' '
        result_string = result_string.strip()
        return result_string
    except:
        return(x)

In [33]:
 x = '#LifetimeBiopics123.com'
electronic(x)


'hash tag l i f e t i m e b i o p i c s o n e t w o t h r e e dot c o m'

In [34]:
after = []
for i in range(test.shape[0]):
#    print(test.iloc[i,3])
#     print(after)
    if test.iloc[i,3] in train_dict.keys():
        after.append(train_dict[test.iloc[i,3]])
    elif test.iloc[i,3] in p1.keys() and test.iloc[i,3] != '-' and test.iloc[i,3] != '.' and test.iloc[i,3] != '"' and test.iloc[i,3] != '/' and test.iloc[i,3] != '\\' and test.iloc[i,3] != ',':
        if p1[test.iloc[i,3]] == '<self>':
            after.append(test.iloc[i,3])
        else:
            after.append(p1[test.iloc[i,3]]) 
    else:    
        class_name = test.iloc[i,4]
        if class_name == 'PLAIN':
            after.append(plain(test.iloc[i,3]))
        elif class_name == 'PUNCT':
            after.append(punct(test.iloc[i,3]))
        elif class_name  == 'CARDINAL':
            after.append(cardinal(test.iloc[i,3]))
        elif class_name == 'VERBATIM':
            after.append(verbatim(test.iloc[i,3]))
        elif class_name == 'DATE':
            after.append(date(test.iloc[i,3]))
        elif class_name == 'MEASURE':
            after.append(measure(test.iloc[i,3]))
        elif class_name == 'LETTERS':
            after.append(letters(test.iloc[i,3]))
        elif class_name == 'DECIMAL':
            after.append(decimal(test.iloc[i,3]))
        elif class_name == 'ORDINAL':
            after.append(ordinal(test.iloc[i,3]))
        elif class_name == 'ELECTRONIC':
            after.append(electronic(test.iloc[i,3]))
        elif class_name == 'ADDRESS':
            after.append(address(test.iloc[i,3]))
        elif class_name == 'DIGIT':
            after.append(digit(test.iloc[i,3]))
        elif class_name == 'MONEY':
            after.append(money(test.iloc[i,3]))
        elif class_name == 'TIME':
            after.append(time(test.iloc[i,3]))
        elif class_name == 'TELEPHONE':
            after.append(telephone(test.iloc[i,3]))
        elif class_name == 'FRACTION':
            after.append(fraction(test.iloc[i,3]))

In [35]:
after

['Last',
 'modified',
 'the thirty first of march twenty sixteen',
 '.',
 "There's",
 'More',
 'to',
 'Clear',
 'Channel',
 'Than',
 "'",
 'The',
 'Larry',
 'King',
 'Show',
 "'",
 '.',
 'Roberto',
 'Chiti',
 ';',
 'Roberto',
 'Poppi',
 ';',
 'Enrico',
 'Lancia',
 '.',
 'The',
 'party',
 'applied',
 'to',
 'register',
 'this',
 'with',
 'the',
 'Electoral',
 'Commission',
 'in',
 'april twenty seventeen',
 'and',
 'it',
 'was',
 'approved',
 'in',
 'may twenty seventeen',
 '.',
 '21 february 2017',
 '.',
 '"',
 'Passport',
 'and',
 'visa',
 'requirements',
 '"',
 '.',
 '"',
 'Main',
 'Street',
 'Electrical',
 'Parade',
 'Extended',
 'by',
 'Popular',
 'Demand',
 'at',
 'Disneyland',
 'Park',
 '"',
 '.',
 'International',
 'Air',
 'Transport',
 'Association',
 '(',
 'i a t a',
 ')',
 'through',
 'Olympic',
 'Air',
 '.',
 'He',
 'was',
 'buried',
 'in',
 'the',
 'chancel',
 'of',
 'his',
 'former',
 'church',
 'in',
 'Raleigh',
 ',',
 'which',
 'was',
 'rebuilt',
 'several',
 'years',
 '

In [36]:
test['after'] = after
test.head()

Unnamed: 0.1,Unnamed: 0,sentence_id,token_id,before,class,after
0,0,0,0,Last,PLAIN,Last
1,1,0,1,modified,PLAIN,modified
2,2,0,2,2016-03-31,DATE,the thirty first of march twenty sixteen
3,3,0,3,.,PUNCT,.
4,4,1,0,There's,PLAIN,There's


In [37]:
test['id'] = test.sentence_id.astype(str) + '_' + test.token_id.astype(str)

print(test[['id', 'after']])
test[['id', 'after']].to_csv('output6.csv', index=False)

              id                                     after
0            0_0                                      Last
1            0_1                                  modified
2            0_2  the thirty first of march twenty sixteen
3            0_3                                         .
4            1_0                                   There's
5            1_1                                      More
6            1_2                                        to
7            1_3                                     Clear
8            1_4                                   Channel
9            1_5                                      Than
10           1_6                                         '
11           1_7                                       The
12           1_8                                     Larry
13           1_9                                      King
14          1_10                                      Show
15          1_11                                        

# Predictions:

In [10]:
# -*- coding: utf-8 -*-
# @Time    : 2017/9/30 8:53
# @Author  : LiYun
# @File    : main_v2.py
'''description:
this method is a simple extention of BingQing Wei's XGboost With Context Label Data (ACC: 99.637%)
it's accuracy is 99.81% when 10% of the training data is used as validtion data
and finally, the whole data is used for training
'''
import os
import gc
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [11]:
def get_classify_train_data(np_file,csv_file):
    if os.path.exists(np_file) == True:
       temp = np.load(np_file)
       return temp['x_train'],temp['y_train'],temp['label']
    else:
        num_features = 9 #每个 word 取前 5 后 4 个字符来编码
        train=pd.read_csv(csv_file)
        tmp=pd.factorize(train['class'])
        y_train,label=tmp[0].astype(np.int8),tmp[1].values
        num_train=len(y_train)
        train['before']=train['before'].astype(np.str)
        x_train=np.zeros([num_train,num_features],np.int8)
        feature=np.zeros([num_train,7],np.int8)# 人工提取的特征
        list1=('a','e','i','o','u')# 元音
        list2=('+','-','*','//','%')# 数学运算符
        for word,row in zip(train['before'].values,range(num_train)):
            if(len(word)>=num_features):
                for c,col in zip(word[:5],range(5)):
                    x_train[row,col]=ord(c)
                for c,col in zip(word[-4:],range(5,9)):
                    x_train[row,col]=ord(c)
            else:
                for c,col in zip(word,range(num_features)):
                    x_train[row,col]=ord(c)
            feature[row, 3] =len(word) # 统计字符串的长度
            dotflag=0
            for c in word:
                if c.isdigit():feature[row,0]+=1# 统计数字的个数
                if c.isupper():feature[row,1]+=1# 统计大写字母的个数
                if c.isalnum()!=True:feature[row,2]+=1# 统计非字母和数字的个数
                if c in list1:feature[row,4]+=1# 统计元音的个数
                if c=='.': dotflag=1
                elif dotflag==1:#  . 后面跟字母置 1 ，数字置 2，其他置 3
                    dotflag = 0
                    if c.isdigit():feature[row,5]+=10
                    elif c.isalpha():feature[row,5]+=100
                    else:feature[row,5]+=1000
                if c in list2:feature[row,6]+=1# 统计数学运算符的个数

        # 掐头去尾，结合上文 2 单词，下文 1 个单词
        num_train-=3
        y_train=y_train[2:-1]
        x_train=np.concatenate((x_train[:-3],x_train[1:-2],x_train[2:-1],x_train[3:],feature[2:-1]),axis=1)
        np.savez(np_file,x_train=x_train, y_train=y_train, label=label)
        return x_train, y_train, label

def get_classify_test_data(np_file,csv_file):
    test=pd.read_csv(csv_file)
    if os.path.exists(np_file) == True:
       temp = np.load(np_file)
       x_test=temp['x_test']
    else:
        num_features = 9 #每个 word 取前 5 后 4 个字符来编码
        human_feature=7 #人工提取7个特征
        num_test=len(test)
        test['before']=test['before'].astype(np.str)
        x_test=np.zeros([num_test,num_features],np.int8)
        feature=np.zeros([num_test,human_feature],np.int8)# 人工提取的特征
        list1=('a','e','i','o','u')# 元音
        list2=('+','-','*','//','%')# 数学运算符
        for word,row in zip(test['before'].values,range(num_test)):
            if(len(word)>=num_features):
                for c,col in zip(word[:5],range(5)):
                    x_test[row,col]=ord(c)
                for c,col in zip(word[-4:],range(5,9)):
                    x_test[row,col]=ord(c)
            else:
                for c,col in zip(word,range(num_features)):
                    x_test[row,col]=ord(c)
            feature[row, 3] =len(word) # 统计字符串的长度
            dotflag=0
            for c in word:
                if c.isdigit():feature[row,0]+=1# 统计数字的个数
                if c.isupper():feature[row,1]+=1# 统计大写字母的个数
                if c.isalnum()!=True:feature[row,2]+=1# 统计非字母和数字的个数
                if c in list1:feature[row,4]+=1# 统计元音的个数
                if c=='.': dotflag=1
                elif dotflag==1:#  . 后面跟字母置 1 ，数字置 2，其他置 3
                    dotflag = 0
                    if c.isdigit():feature[row,5]+=10
                    elif c.isalpha():feature[row,5]+=100
                    else:feature[row,5]+=1000
                if c in list2:feature[row,6]+=1# 统计数学运算符的个数

        # 开头补上2个单词,结尾补上1个单词，结合上文 2 单词，下文 1 个单词
        x_test = np.concatenate((np.zeros([2,num_features],np.int8),x_test,np.zeros([1,num_features],np.int8)),axis=0)
        feature = np.concatenate((np.zeros([2,human_feature],np.int8),feature,np.zeros([1,human_feature],np.int8)),axis=0)
        x_test=np.concatenate((x_test[:-3],x_test[1:-2],x_test[2:-1],x_test[3:],feature[2:-1]),axis=1)
        np.savez(np_file,x_test=x_test)
    return test, x_test

In [12]:
if __name__=='__main__':
    prehead=''
    train_data_csv='en_train.csv'
    classify_train_file='classify_train.npz'
    xgb_model='xgb_model.dat'
    test_data_csv='en_test_2.csv'
    classify_test_file='classify_test.npz'
    xgb_model2='xgb_model2.dat'
    classify_test_file2='classify_test2.npz'

    # 训练模型
    x_train,y_train,label=get_classify_train_data(prehead+classify_train_file,prehead+train_data_csv)
    print(x_train.shape)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    watchlist = [(dtrain, 'train')]
    param = {
        'eta': 0.3,
        'max_depth':10,
        'objective':'multi:softmax',
        'num_class':len(label),
        'eval_metric':'merror',
        'subsample': 1,
        'colsample_bytree': 1,
        'silent':1,
        'seed':0,
    }
    num_boost_rounds=12
    model = xgb.train(param, dtrain, num_boost_rounds, watchlist,verbose_eval=1)
    print('save model ',xgb_model2)
    pickle.dump(model,open(xgb_model2,'wb'))# 保存模型
    del x_train,y_train
    gc.collect()

    # 预测 test 上的 class
    model = pickle.load(open(xgb_model2, "rb"))
    test,x_test=get_classify_test_data(prehead+classify_test_file,prehead+test_data_csv)
    print(x_test.shape)
    dtest = xgb.DMatrix(x_test)
    pred = model.predict(dtest)
    pred = [label[int(x)] for x in pred]
    test['class']=pred
    test.to_csv(os.path.join(prehead, 'test_pred_class_new.csv'))

(9918438, 43)
[0]	train-merror:0.006201
[1]	train-merror:0.005233
[2]	train-merror:0.004759
[3]	train-merror:0.004471
[4]	train-merror:0.004253
[5]	train-merror:0.004053
[6]	train-merror:0.003818
[7]	train-merror:0.003616
[8]	train-merror:0.003515
[9]	train-merror:0.003356
[10]	train-merror:0.003241
[11]	train-merror:0.003142
save model  xgb_model2.dat
(956046, 43)


In [13]:
len(pred)

956046

In [14]:
len(test)

956046

In [15]:
len(x_test)

956046

In [39]:
test1 = pd.read_csv('en_test_2.csv')
test1.head()

Unnamed: 0,sentence_id,token_id,before
0,0,0,Last
1,0,1,modified
2,0,2,2016-03-31
3,0,3,.
4,1,0,There's
