In [1]:
import sys
from pathlib import Path

# Put the parent directory on the import path so the project-local imports
# in the next cell can resolve (presumably utils / config / data_utils live
# one level up -- confirm against the repository layout).
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries on sys.path.
_parent_dir = str(Path('../'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
import pandas as pd
import numpy as np
from utils import get_tokenizer, get_jaccard_from_df
from config import Config
from data_utils import AlbertDataGenerator, XLNetDataGenerator, BertDataGenerator, RobertaDataGenerator

In [3]:
# Collects one entry per model: name -> value returned by get_jaccard_from_df.
tjs = dict()
# Training frame with every row that contains a missing value removed;
# all generator cells below are fed this same frame.
data_df = pd.read_csv(Config.train_path).dropna()

In [4]:
# Collect the ALBERT generator's start/end target vectors for every row and
# hand their argmax positions to get_jaccard_from_df.
target_shape = (data_df.shape[0], Config.Train.max_len)
st = np.zeros(target_shape)
et = np.zeros(target_shape)
g = AlbertDataGenerator(data_df).generate()
for row, (_, targets) in enumerate(g):
    # Each vector fills the leading len(...) slots; the remaining slots stay zero.
    st[row, :len(targets['sts'])] = targets['sts']
    et[row, :len(targets['ets'])] = targets['ets']
# Reduce every padded row to the index of its largest entry.
st, et = st.argmax(axis=-1), et.argmax(axis=-1)
tjs['albert'] = get_jaccard_from_df(data_df, st, et, 'albert', 'albert_results.csv')

In [5]:
# Collect the XLNet generator's start/end target vectors for every row and
# hand their argmax positions to get_jaccard_from_df.
target_shape = (data_df.shape[0], Config.Train.max_len)
st = np.zeros(target_shape)
et = np.zeros(target_shape)
g = XLNetDataGenerator(data_df).generate()
for row, (_, targets) in enumerate(g):
    # Each vector fills the leading len(...) slots; the remaining slots stay zero.
    st[row, :len(targets['sts'])] = targets['sts']
    et[row, :len(targets['ets'])] = targets['ets']
# Reduce every padded row to the index of its largest entry.
st, et = st.argmax(axis=-1), et.argmax(axis=-1)
tjs['xlnet'] = get_jaccard_from_df(data_df, st, et, 'xlnet', 'xlnet_results.csv')

In [6]:
# Collect the BERT generator's start/end target vectors for every row and
# hand their argmax positions to get_jaccard_from_df.
target_shape = (data_df.shape[0], Config.Train.max_len)
st = np.zeros(target_shape)
et = np.zeros(target_shape)
g = BertDataGenerator(data_df).generate()
for row, (_, targets) in enumerate(g):
    # Each vector fills the leading len(...) slots; the remaining slots stay zero.
    st[row, :len(targets['sts'])] = targets['sts']
    et[row, :len(targets['ets'])] = targets['ets']
# Reduce every padded row to the index of its largest entry.
st, et = st.argmax(axis=-1), et.argmax(axis=-1)
tjs['bert'] = get_jaccard_from_df(data_df, st, et, 'bert', 'bert_results.csv')

In [7]:
# Collect start/end target vectors for the 'electra' entry and hand their
# argmax positions to get_jaccard_from_df.
# NOTE(review): this cell uses BertDataGenerator and the 'bert' key rather
# than ELECTRA-specific ones -- presumably because both share a tokenizer;
# confirm this is intentional and not a copy-paste leftover.
# NOTE(review): the output file is 'electra_result.csv' (singular), unlike
# the '*_results.csv' names used by the other cells -- confirm.
target_shape = (data_df.shape[0], Config.Train.max_len)
st = np.zeros(target_shape)
et = np.zeros(target_shape)
g = BertDataGenerator(data_df).generate()
for row, (_, targets) in enumerate(g):
    # Each vector fills the leading len(...) slots; the remaining slots stay zero.
    st[row, :len(targets['sts'])] = targets['sts']
    et[row, :len(targets['ets'])] = targets['ets']
# Reduce every padded row to the index of its largest entry.
st, et = st.argmax(axis=-1), et.argmax(axis=-1)
tjs['electra'] = get_jaccard_from_df(data_df, st, et, 'bert', 'electra_result.csv')

In [8]:
# Collect the RoBERTa generator's start/end target vectors for every row and
# hand their argmax positions to get_jaccard_from_df.
target_shape = (data_df.shape[0], Config.Train.max_len)
st = np.zeros(target_shape)
et = np.zeros(target_shape)
g = RobertaDataGenerator(data_df).generate()
for row, (_, targets) in enumerate(g):
    # Each vector fills the leading len(...) slots; the remaining slots stay zero.
    st[row, :len(targets['sts'])] = targets['sts']
    et[row, :len(targets['ets'])] = targets['ets']
# Reduce every padded row to the index of its largest entry.
st, et = st.argmax(axis=-1), et.argmax(axis=-1)
tjs['roberta'] = get_jaccard_from_df(data_df, st, et, 'roberta', 'roberta_results.csv')

In [9]:
# Summary table: one row per model, in the order the entries were added to `tjs`.
tj_df = pd.DataFrame({
    'model type': [*tjs],
    'Jaccard score': [*tjs.values()],
})
tj_df

Unnamed: 0,model type,Jaccard score
0,albert,0.956906
1,xlnet,0.956679
2,bert,0.975771
3,electra,0.975771
4,roberta,0.9597


In [10]:
# Tokenizer read (as a module-level name) by create_albert_data below.
tokenizer = get_tokenizer('albert')
# Id placed between the two trailing `3` ids of the input for each sentiment
# label (presumably the ALBERT vocabulary id of the sentiment word -- TODO confirm).
sentiment_ids = dict(positive=2221, negative=3682, neutral=8387)
def create_albert_data(text, selected_text, sentiment):
    """Build ALBERT-style inputs and span targets for a single example.

    Both strings are lower-cased and have every whitespace run collapsed to
    one space; ``selected_text`` is then located inside ``text`` and mapped
    onto the tokens produced by the module-level ``tokenizer``.

    ``tokenizer`` and ``sentiment_ids`` are looked up as module-level names
    at call time, so the result depends on what they are bound to when the
    function runs.

    Args:
        text: Full input string.
        selected_text: Span to label; must occur in ``text`` after
            normalisation.
        sentiment: Key into ``sentiment_ids``.

    Returns:
        A 4-tuple of dicts:

        * ``ids`` / ``att`` / ``tti`` -- ``[2] + text ids + [3, sentiment
          id, 3]``, an all-ones mask of the same length, and token type ids
          that are 0 everywhere except the last two positions (1).
        * ``start_token`` / ``end_token`` -- index of the first and last
          text token overlapping the selection.
        * ``sts`` / ``ets`` -- one-hot lists as long as ``ids``, set at
          those two indices.
        * ``tok_text`` / ``pred_selected_text`` -- the tokenised text and
          the string decoded from tokens ``start_token..end_token``.

    Raises:
        ValueError: If ``selected_text`` is empty, does not occur in
            ``text``, or overlaps no token.
        KeyError: If ``sentiment`` is not a key of ``sentiment_ids``.
    """
    # Normalise case and collapse every whitespace run to a single space.
    text = ' '.join(text.lower().split())
    selected_text = ' '.join(selected_text.lower().split())

    # Locate the selection; fail loudly instead of letting find() == -1 be
    # used as a slice bound further down.
    idx_start = text.find(selected_text)
    if not selected_text or idx_start < 0:
        raise ValueError(
            'selected_text {!r} not found in text {!r}'.format(selected_text, text))

    # Tokenise once and reuse the result everywhere below.
    text_tokens = tokenizer.tokenize(text)
    selected_len = len(''.join(tokenizer.tokenize(selected_text)))

    # Character mask over the concatenated token strings.
    # NOTE(review): idx_start is a character offset in `text` but is applied
    # to the concatenated token strings; this assumes the two line up --
    # confirm for tokenizers that add or alter characters.
    chars = np.zeros(sum(len(tok) for tok in text_tokens), dtype=bool)
    chars[idx_start:idx_start + selected_len] = True

    # A token is a target when any character inside its span is masked.
    target_idx = []
    cursor = 0
    for i, tok in enumerate(text_tokens):
        span_end = cursor + len(tok)
        if chars[cursor:span_end].any():
            target_idx.append(i)
        cursor = span_end
    if not target_idx:
        raise ValueError(
            'selected_text {!r} overlaps no token of text {!r}'.format(selected_text, text))
    first_tok, last_tok = target_idx[0], target_idx[-1]

    input_ids_orig = tokenizer.encode(text, add_special_tokens=False)
    input_ids = [2] + input_ids_orig + [3, sentiment_ids[sentiment], 3]
    token_type_ids = [0] * (len(input_ids_orig) + 2) + [1, 1]
    attention_mask = [1] * len(input_ids)

    # NOTE(review): the one-hot positions are text-token indices and are not
    # shifted for the leading special id in `input_ids` -- confirm this is
    # what consumers of 'sts' / 'ets' expect.
    np_start_tokens = np.zeros(len(input_ids), dtype='int')
    np_start_tokens[first_tok] = 1
    np_end_tokens = np.zeros(len(input_ids), dtype='int')
    np_end_tokens[last_tok] = 1
    # argmax recovers the same indices as NumPy integers (the returned type).
    a = np.argmax(np_start_tokens, axis=-1)
    b = np.argmax(np_end_tokens, axis=-1)

    pred_selected_text = tokenizer.convert_tokens_to_string(text_tokens[a: b + 1])
    return ({'ids': input_ids, 'att': attention_mask, 'tti': token_type_ids},
            {'start_token': a, 'end_token': b},
            {'sts': np_start_tokens.tolist(), 'ets': np_end_tokens.tolist()},
            {'tok_text': text_tokens, 'pred_selected_text': pred_selected_text})

In [11]:
# Try the builder on a sample whose text carries leading whitespace that the
# selection does not.
text = '  I`m sorry.'
selected_text = 'I`m sorry.'
sentiment = 'negative'
create_albert_data(text=text, selected_text=selected_text, sentiment=sentiment)

({'ids': [2, 31, 1, 79, 1875, 9, 3, 3682, 3],
  'att': [1, 1, 1, 1, 1, 1, 1, 1, 1],
  'tti': [0, 0, 0, 0, 0, 0, 0, 1, 1]},
 {'start_token': 0, 'end_token': 4},
 {'sts': [1, 0, 0, 0, 0, 0, 0, 0, 0], 'ets': [0, 0, 0, 0, 1, 0, 0, 0, 0]},
 {'tok_text': ['▁i', '`', 'm', '▁sorry', '.'],
  'pred_selected_text': 'i`m sorry.'})

In [12]:
# Tokenizer read (as a module-level name) by the builder defined below.
tokenizer_xlnet = get_tokenizer('xlnet')
# Rebinds the module-level `sentiment_ids`, replacing the values assigned
# earlier in the notebook. The id is placed between the two `4` ids of the
# input (presumably the XLNet vocabulary id of the sentiment word -- TODO confirm).
sentiment_ids = dict(positive=1654, negative=2981, neutral=9201)
def create_xlnet_data(text, selected_text, sentiment):
    """Build XLNet-style inputs and span targets for a single example.

    Both strings are lower-cased and have every whitespace run collapsed to
    one space; ``selected_text`` is then located inside ``text`` and mapped
    onto the tokens produced by the module-level ``tokenizer_xlnet``.

    ``tokenizer_xlnet`` and ``sentiment_ids`` are looked up as module-level
    names at call time, so the result depends on what they are bound to
    when the function runs.

    Args:
        text: Full input string.
        selected_text: Span to label; must occur in ``text`` after
            normalisation.
        sentiment: Key into ``sentiment_ids``.

    Returns:
        A 4-tuple of dicts:

        * ``ids`` / ``att`` / ``tti`` -- ``text ids + [4, sentiment id, 4,
          3]``, an all-ones mask of the same length, and token type ids
          that are 0 for the text and first ``4``, then ``1, 1, 2``.
        * ``start_token`` / ``end_token`` -- index of the first and last
          text token overlapping the selection.
        * ``sts`` / ``ets`` -- one-hot lists as long as ``ids``, set at
          those two indices.
        * ``tok_text`` / ``pred_selected_text`` -- the tokenised text and
          the string decoded from tokens ``start_token..end_token``.

    Raises:
        ValueError: If ``selected_text`` is empty, does not occur in
            ``text``, or overlaps no token.
        KeyError: If ``sentiment`` is not a key of ``sentiment_ids``.
    """
    # Normalise case and collapse every whitespace run to a single space.
    text = ' '.join(text.lower().split())
    selected_text = ' '.join(selected_text.lower().split())

    # Locate the selection; fail loudly instead of letting find() == -1 be
    # used as a slice bound further down.
    idx_start = text.find(selected_text)
    if not selected_text or idx_start < 0:
        raise ValueError(
            'selected_text {!r} not found in text {!r}'.format(selected_text, text))

    # Tokenise once and reuse the result everywhere below.
    text_tokens = tokenizer_xlnet.tokenize(text)
    selected_len = len(''.join(tokenizer_xlnet.tokenize(selected_text)))

    # Character mask over the concatenated token strings.
    # NOTE(review): idx_start is a character offset in `text` but is applied
    # to the concatenated token strings; this assumes the two line up --
    # confirm for tokenizers that add or alter characters.
    chars = np.zeros(sum(len(tok) for tok in text_tokens), dtype=bool)
    chars[idx_start:idx_start + selected_len] = True

    # A token is a target when any character inside its span is masked.
    target_idx = []
    cursor = 0
    for i, tok in enumerate(text_tokens):
        span_end = cursor + len(tok)
        if chars[cursor:span_end].any():
            target_idx.append(i)
        cursor = span_end
    if not target_idx:
        raise ValueError(
            'selected_text {!r} overlaps no token of text {!r}'.format(selected_text, text))
    first_tok, last_tok = target_idx[0], target_idx[-1]

    input_ids_orig = tokenizer_xlnet.encode(text, add_special_tokens=False)
    input_ids = input_ids_orig + [4, sentiment_ids[sentiment], 4, 3]
    token_type_ids = [0] * (len(input_ids_orig) + 1) + [1, 1, 2]
    attention_mask = [1] * len(input_ids)

    np_start_tokens = np.zeros(len(input_ids), dtype='int')
    np_start_tokens[first_tok] = 1
    np_end_tokens = np.zeros(len(input_ids), dtype='int')
    np_end_tokens[last_tok] = 1
    # argmax recovers the same indices as NumPy integers (the returned type).
    a = np.argmax(np_start_tokens, axis=-1)
    b = np.argmax(np_end_tokens, axis=-1)

    pred_selected_text = tokenizer_xlnet.convert_tokens_to_string(text_tokens[a: b + 1])
    return ({'ids': input_ids, 'att': attention_mask, 'tti': token_type_ids},
            {'start_token': a, 'end_token': b},
            {'sts': np_start_tokens.tolist(), 'ets': np_end_tokens.tolist()},
            {'tok_text': text_tokens, 'pred_selected_text': pred_selected_text})


# Backward-compatible alias: this builder was originally defined under the
# name `create_albert_data` (shadowing the ALBERT version defined earlier),
# and the following cell still calls it by that name.
create_albert_data = create_xlnet_data

In [13]:
# Try the builder (the definition from the previous cell, which uses
# tokenizer_xlnet) on a sample whose selection is mixed-case.
text = 'My Sharpie is running DANGERously low on ink'
selected_text = 'DANGERously'
sentiment = 'negative'
create_albert_data(text=text, selected_text=selected_text, sentiment=sentiment)

({'ids': [94, 3346, 950, 27, 926, 25550, 599, 31, 17691, 4, 2981, 4, 3],
  'att': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'tti': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2]},
 {'start_token': 5, 'end_token': 5},
 {'sts': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  'ets': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]},
 {'tok_text': ['▁my',
   '▁sharp',
   'ie',
   '▁is',
   '▁running',
   '▁dangerously',
   '▁low',
   '▁on',
   '▁ink'],
  'pred_selected_text': 'dangerously'})

In [14]:
# Show the XLNet tokenisation of the lower-cased sample sentence on its own.
tokenizer_xlnet.tokenize('My Sharpie is running DANGERously low on ink'.lower())

['▁my',
 '▁sharp',
 'ie',
 '▁is',
 '▁running',
 '▁dangerously',
 '▁low',
 '▁on',
 '▁ink']