In [5]:
import numpy as np
import pandas as pd

df = pd.read_csv("../../keras/CONLL_dataset.csv")

In [7]:


tags = np.unique(df.Tag)

tags

transform_tag_mapping = {
    'B-art': ['Object'],
    'B-eve': ['Event'],
    'B-geo': ['Location', 'Party'],
    'B-gpe': ['Race'],
    'B-nat': ['SpecialTerm'],
    'B-org': ['Party'],
    'B-per': ['Party'],
    'B-tim': ['Time'],
    'I-art': ['Object'],
    'I-eve': ['Event'],
    'I-geo': ['Location', 'Party'],
    'I-gpe': ['Race', "Party"],
    'I-nat': ['SpecialTerm'],
    'I-org': ['Party'],
    'I-per': ['Party'],
    'I-tim': ['TemporalUnit'],
    'O': [],
}

def transform_tag(tag):
    out = transform_tag_mapping[tag]
    out = [tag] + out
    out = "|".join(out)
    return out

df["newTag"] = df.Tag.apply(transform_tag)

df.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag,newTag
0,Sentence: 1,Thousands,NNS,O,O
1,,of,IN,O,O
2,,demonstrators,NNS,O,O
3,,have,VBP,O,O
4,,marched,VBN,O,O
5,,through,IN,O,O
6,,London,NNP,B-geo,B-geo|Location|Party
7,,to,TO,O,O
8,,protest,VB,O,O
9,,the,DT,O,O


In [8]:
from label_functions.basicTypes import is_Float, is_Integer
from label_functions.CountryCode import is_CountryCode
from label_functions.CryptoCurrencyCode import is_CryptoCurrencyCode
from label_functions.CurrencyCode import is_CurrencyCode
from label_functions.TemporalUnit import is_TemporalUnit
from label_functions.Timezone import is_TimeZone
from label_functions.US_States import is_US_States
from label_functions.isMonth import isMonth

In [9]:
text = "usd"

In [10]:
is_CurrencyCode(text)

True

In [11]:
class label_function:
    def __init__(self, function, tags = [], seperator = "|"):
        self.label_function = function
        self.tags = seperator.join(tags)
        self.seperator = seperator
    def label(self, text, origin_label):
        if self.label_function(text):
            return f"{origin_label}{self.seperator}{self.tags}"
        else:
            return origin_label

In [12]:
labels = [
    [is_Float, ["Float"]],
    [is_Integer, ["Integer"]],
    [is_CountryCode, ["CountryCode"]],
    [is_CryptoCurrencyCode, ["CryptoCurrencyCode"]],
    [is_CurrencyCode, ["CurrencyCode"]],
    [is_TemporalUnit, ["TemporalUnit"]],
    [is_TimeZone, ["Timezone"]],
    [is_US_States, ["US_States"]],
    [isMonth, ['TemporalUnit', 'Month']]
]

In [13]:
label_functions = []
for label in labels:
    label_functions.append(label_function(label[0], label[1]))

In [14]:
def label_with_functions(row):
    for function in label_functions:
        row["newTag"] = function.label(row.Word, row.newTag)
    return row

In [16]:
import swifter

In [17]:
%%time
df2 = df.swifter.apply(label_with_functions, axis = 1)

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

CPU times: user 7.15 s, sys: 159 ms, total: 7.31 s
Wall time: 1min 39s


In [18]:
#remove O if it have other value
df2.newTag = df2.newTag.apply(lambda x: x[2:] if "O|" == x[0:2] else x)

In [23]:
def remove_repeat_tag(text):
    return "|".join(list(set(text.split("|"))))

In [24]:
df2.newTag = df2.newTag.swifter.apply(remove_repeat_tag)

Pandas Apply:   0%|          | 0/1048575 [00:00<?, ?it/s]

In [25]:
np.unique(df2.newTag)

array(['B-art|CountryCode|Object', 'B-art|Integer|Object', 'B-art|Object',
       'B-eve|CurrencyCode|Event', 'B-eve|Event', 'B-eve|Integer|Event',
       'B-eve|TemporalUnit|Month|Event',
       'B-geo|Location|CountryCode|Party', 'B-geo|Location|Party',
       'B-geo|Location|Party|CurrencyCode', 'B-geo|Location|Party|Float',
       'B-geo|Location|Party|US_States', 'B-geo|Location|Timezone|Party',
       'B-org|CountryCode|Party', 'B-org|CountryCode|Party|US_States',
       'B-org|CryptoCurrencyCode|Party', 'B-org|Party',
       'B-org|Party|CurrencyCode', 'B-org|Party|Float',
       'B-org|Party|Integer', 'B-org|Party|US_States',
       'B-org|Timezone|Party', 'B-tim|CurrencyCode|Time',
       'B-tim|Float|Time', 'B-tim|Integer|Time',
       'B-tim|TemporalUnit|Month|Time', 'B-tim|TemporalUnit|Time',
       'B-tim|Time', 'B-tim|Timezone|Time', 'CountryCode',
       'CountryCode|I-org|Party', 'CountryCode|I-per|Party',
       'CountryCode|US_States', 'CryptoCurrencyCode', 'CurrencyC

In [26]:
df2.to_csv("./NER_multilabel_data_v2.csv")