In [1]:
import numpy as np
import pandas as pd

from transformers import ElectraConfig as Config
from transformers import RobertaConfig as Config
# from transformers import RobertaTokenizer as Tokenizer

from tokenizers import ByteLevelBPETokenizer as Tokenizer
PATH = '/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/roberta-base/'
roberta_tokenizer = Tokenizer(
    vocab_file=PATH+"vocab.json",
    merges_file=PATH+"merges.txt",
    lowercase=False,
    add_prefix_space=True
)
from tokenizers import BertWordPieceTokenizer as Tokenizer
PATH = '/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/electra-large/'
electra_tokenizer = Tokenizer(PATH+'vocab.txt', lowercase=True, add_special_tokens=False)

MAX_SEQUENCE_LENGTH = 20



In [2]:
text = "Hello there, u little shit!!!"
roberta = roberta_tokenizer.encode(" " + text)
electra = electra_tokenizer.encode(text)


In [4]:
electra.tokens

['hello', 'there', ',', 'u', 'little', 'shit', '!', '!', '!']

In [75]:
roberta_ids, roberta_offsets = roberta.ids, roberta.offsets
electra_ids, electra_offsets = electra.ids, electra.offsets

roberta_preds_start = [0.5, 0.3, 0.7, 0.2, 0.4, 0.3, 0.1]
electra_preds_start = [0.4, 0.2, 0.5, 0.7, 0.3, 0.6, 0.1, 0.2, 0.4]

roberta_preds_end =   [0.3, 0.1, 0.3, 0.6, 0.9, 1.4, 2.5]
electra_preds_end =   [0.3, 0.3, 0.3, 0.2, 0.7, 1.5, 0.8, 0.8, 0.6]

print(roberta.tokens)
print(electra.tokens)
print(roberta_ids, roberta_offsets)
print(electra_ids, electra_offsets)

['ĠHello', 'Ġthere', ',', 'Ġu', 'Ġlittle', 'Ġshit', '!!!']
['hello', 'there', ',', 'u', 'little', 'shit', '!', '!', '!']
[20920, 89, 6, 1717, 410, 15328, 16506] [(0, 6), (6, 12), (12, 13), (13, 15), (15, 22), (22, 27), (27, 30)]
[7592, 2045, 1010, 1057, 2210, 4485, 999, 999, 999] [(0, 5), (6, 11), (11, 12), (13, 14), (15, 21), (22, 26), (26, 27), (27, 28), (28, 29)]


In [147]:
preds_end = np.array([[.5,1], [3,8]])
preds_end = preds_end.T
preds_end = np.exp(preds_end) / np.sum(np.exp(preds_end), axis=0)
preds_end.T

array([[0.37754067, 0.62245933],
       [0.00669285, 0.99330715]])

In [148]:
import tensorflow as tf
preds_end = np.array([[.5,1], [3,8]], dtype=np.float32)
tf.nn.softmax(preds_end, axis=-1)
#tf.math.exp(preds_end) / tf.math.reduce_sum(tf.math.exp(preds_end), -1)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.37754068, 0.62245935],
       [0.00669285, 0.9933072 ]], dtype=float32)>

In [151]:
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [None]:
class Ensemble:
    
    def __init__(self, texts):
        
        # this should be all the text that will be predicted on
        self.__texts = []
        for text in texts:
            self.__texts.append(" ".join(str(text).split()))
        
        # initialize char preds arrays for later
        self.char_preds_start = [
            np.zeros(len(text), dtype=np.float32) for text in self.__texts
        ]
        self.char_preds_end = [
            np.zeros(len(text), dtype=np.float32) for text in self.__texts
        ]
        
        # initialize accumulators:
        # each item in these lists corresponds to
        # a fold prediction of some model
        self.__preds_start = []
        self.__preds_end = []
        self.__weights = []
        self.__offsets = []

    def add(self, preds_start, preds_end, offsets, weight):
        
        # makes sure preds_start and preds_end has been passed to a softmax
        if not np.isclose(np.sum(preds_start), 1, atol=1e-3):
            preds_start = preds_start.T
            preds_start = np.exp(preds_start) / np.sum(np.exp(preds_start), axis=0)
            preds_start = preds_start.T
        if not np.isclose(np.sum(preds_end), 1, atol=1e-3):
            preds_end = preds_end.T
            preds_end = np.exp(preds_end) / np.sum(np.exp(preds_end), axis=0)
            preds_end = preds_end.T
            
        # because self.__texts won't have a preceeding blankspace
        # we need to subtract ByteLevel tokenizers offsets by one
        if offsets[-1][-1] > len(text):
            corrected_offsets = []
            for o1, o2 in offsets:
                corrected_offsets.append((o1-1 if o1 != 0 else o1, o2-1))
            offsets = corrected_offsets
            
        self.__preds_start.append(preds_start)
        self.__preds_end.append(preds_end)
        self.__weights.append(weight)
        self.__offsets.append(offsets)
        
    
    def _token_logits_to_char_logits(self):
        
        # num_models x num_folds long lists
        for preds_start, preds_end, weight, offsets in zip(
            self.__preds_start, self.__preds_end, self.__weights, self.__offsets):
            
            # loop over each example (or ID)
            for i, pred_start, pred_end, (o1, o2) in enumerate(zip(preds_start, preds_end, offsets)):
                self.char_preds_start[i][o1:o2] += pred_start * weight
                self.char_preds_end[i][o1:o2] += pred_end * weight
    
            
    def _char_logits_to_word_logits(self):
        
        def argmax(a, take='min'):
            if take == 'min':
                return np.where(a == a.max())[0][0]
            elif take == 'max':
                return np.where(a == a.max())[0][-1]
        
        self.selected_text_preds = []
        for text, char_pred_start, char_pred_end in zip(self.__texts, self.char_preds_start, self.char_preds_end):
            pos_start = argmax(char_pred_start, take='min')
            pos_end = argmax(char_pred_end, take='max')
            
            self.selected_text_preds.append(text[pos_start:pos_end+1])
        
    
    def compute_predictions(self):
        self._token_logits_to_char_logits()
        self._char_logits_to_word_logits()
        return self.selected_text_preds
    

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold

import common._settings
import common.model_utils as model_utils
import common.prediction_utils as prediction_utils

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--transformer', type=str, default='roberta')
parser.add_argument('--fold', type=int, default=0)
parser.add_argument('--dropout_rate', type=float, default=0.1)
parser.add_argument('--rnn_units', type=int, default=512)
parser.add_argument('--num_hidden_states', type=int, default=4)
args = parser.parse_args()

import importlib
transformer = importlib.import_module(f'{args.transformer}.transformer')
dataset = importlib.import_module(f'{args.transformer}.dataset')



# PATH
INPUT_PATH = '/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/tweet-sentiment-extraction/'

# read files
test_df = pd.read_csv(INPUT_PATH+'test.csv')
test_df.loc[:, "selected_text"] = test_df.text.values
submission_df = pd.read_csv(INPUT_PATH+'sample_submission.csv')

fold_num = args.fold

transformer.Model.NUM_HIDDEN_STATES = args.num_hidden_states
transformer.Model.DROPOUT_RATE = args.dropout_rate
transformer.Model.RNN_UNITS = args.rnn_units

config = dataset.Config.from_pretrained(dataset.PATH, output_hidden_states=True)
model = transformer.Model.from_pretrained(dataset.PATH, config=config)

model(np.ones((1, 8), dtype=np.int32))

print("\nfold %02d" % (fold_num))

test_dataset = dataset.Generator.create(
    test_df, batch_size=32, shuffle_buffer_size=-1)

model.load_weights(dataset.PATH+'fine-tuned/' + f'model-{fold_num}.h5')

# predict test set
preds_start, preds_end, text, _, sentiment, offset = \
    model_utils.predict(model, test_dataset, dataset.MAX_SEQUENCE_LENGTH)



# decode test set and add to submission file
selected_text_pred = prediction_utils.transform_to_text(
    preds_start, preds_end, text, offset, sentiment)
submission_df.loc[:, 'selected_text'] = selected_text_pred
submission_df.to_csv(dataset.PATH + "submission.csv", index=False)


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold

import common._settings
import common.model_utils as model_utils
import common.prediction_utils as prediction_utils

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--transformer', type=str, default='roberta')
parser.add_argument('--fold', type=int, default=0)
parser.add_argument('--dropout_rate', type=float, default=0.1)
parser.add_argument('--rnn_units', type=int, default=512)
parser.add_argument('--num_hidden_states', type=int, default=4)
args = parser.parse_args()

import importlib
transformer = importlib.import_module(f'{args.transformer}.transformer')
dataset = importlib.import_module(f'{args.transformer}.dataset')



# PATH
INPUT_PATH = '/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/tweet-sentiment-extraction/'

# read files
test_df = pd.read_csv(INPUT_PATH+'test.csv')
test_df.loc[:, "selected_text"] = test_df.text.values
submission_df = pd.read_csv(INPUT_PATH+'sample_submission.csv')

fold_num = args.fold

transformer.Model.NUM_HIDDEN_STATES = args.num_hidden_states
transformer.Model.DROPOUT_RATE = args.dropout_rate
transformer.Model.RNN_UNITS = args.rnn_units

config = dataset.Config.from_pretrained(dataset.PATH, output_hidden_states=True)
model = transformer.Model.from_pretrained(dataset.PATH, config=config)

model(np.ones((1, 8), dtype=np.int32))

print("\nfold %02d" % (fold_num))

test_dataset = dataset.Generator.create(
    test_df, batch_size=32, shuffle_buffer_size=-1)

model.load_weights(dataset.PATH+'fine-tuned/' + f'model-{fold_num}.h5')

# predict test set
preds_start, preds_end, text, _, sentiment, offset = \
    model_utils.predict(model, test_dataset, dataset.MAX_SEQUENCE_LENGTH)



# decode test set and add to submission file
selected_text_pred = prediction_utils.transform_to_text(
    preds_start, preds_end, text, offset, sentiment)
submission_df.loc[:, 'selected_text'] = selected_text_pred
submission_df.to_csv(dataset.PATH + "submission.csv", index=False)


In [76]:

    



        
def char_preds_to_word_preds(preds_char_start, preds_char_end, text):
    start_pos = argmax(preds_char_start, take='min')
    print(start_pos)
    end_pos = argmax(preds_char_end, take='max')
    return text[start_pos:end_pos+1]
    

new_offsets = make_offsets_compatible([roberta_offsets, electra_offsets], text)
print(new_offsets)
    
char_preds_start, char_preds_end = token_preds_to_char_preds(
                    [roberta_preds_start, electra_preds_start],
                    [roberta_preds_end,   electra_preds_end],
                    new_offsets)
print(char_preds_start)
print()
print(char_preds_end)

char_preds_to_word_preds(char_preds_start, char_preds_end, text)

# def subtract_one(offset):
#     corrected_offset = []
#     for x, y in offset:
#         corrected_offset.append((x-1 if x != 0 else x, y-1))
#     return corrected_offset

# subtract_one(roberta_offsets)

# def make_offsets_compatible(offsets_list, text):
#     new_offsets = []
#     for offset in offsets_list:
#         if offset[-1][-1] > len(text):
#             new_offsets.append(subtract_one(offset))
#         else:
#             new_offsets.append(offset)
#     return new_offsets

# def token_preds_to_char_preds(preds_start_list, preds_end_list, offsets_list):
    
    
#     for preds_start, preds_end, offsets in zip(preds_start_list, preds_end_list, offsets_list):
        
#         for pred_start, pred_end, (o1, o2) in zip(preds_start, preds_end, offsets):
#             char_preds_start[o1:o2] += pred_start
#             char_preds_end[o1:o2] += pred_end
    
#     return char_preds_start, char_preds_end


# @staticmethod
# def argmax(a, take='min'):
#     if take == 'min':
#         return np.where(a == a.max())[0][0]
#     elif take == 'max':
#         return np.where(a == a.max())[0][-1]

[[(0, 5), (5, 11), (11, 12), (12, 14), (14, 21), (21, 26), (26, 29)], [(0, 5), (6, 11), (11, 12), (13, 14), (15, 21), (22, 26), (26, 27), (27, 28), (28, 29)]]
[0.9        0.9        0.9        0.9        0.9        0.3
 0.5        0.5        0.5        0.5        0.5        1.2
 0.2        0.9        0.4        0.70000005 0.70000005 0.70000005
 0.70000005 0.70000005 0.70000005 0.3        0.90000004 0.90000004
 0.90000004 0.90000004 0.2        0.3        0.5       ]

[0.6       0.6       0.6       0.6       0.6       0.1       0.4
 0.4       0.4       0.4       0.4       0.6       0.6       0.8
 0.9       1.5999999 1.5999999 1.5999999 1.5999999 1.5999999 1.5999999
 1.4       2.9       2.9       2.9       2.9       3.3       3.3
 3.1      ]
11


', u little shit!!'

In [72]:
np.where(np.array([1,2,3,4,6]) == 6)[0][-1]

4

In [176]:
from collections import Counter

def text_voting(texts, threshold=0.5):
    word_counter = Counter()
    for text in texts:
        text_split = set(text.lower().split()) # follow jaccard metric
        for word in text_split:
            word_counter[word] += 1
    
    words_to_keep = []
    for word, count in word_counter.items():
        # keep words that has been voted for at a rate of more than threshold
        if count >= len(texts) * threshold: 
            words_to_keep.append(word)
    
    print("word counts:", word_counter)
    print("words to keep:", words_to_keep)
        
    return words_to_keep

text1 = "Hello, there world"
text2 = ", there world"
text3 = "there world"
text4 = "Hello, "
text5 = "world"
text6 = "Hello, ther"
text7 = "hello, there, worl"

texts = [text1, text2, text3, text4, text5, text6, text7]

text_voting(texts)




word counts: Counter({'hello,': 4, 'world': 4, 'there': 3, ',': 1, 'ther': 1, 'there,': 1, 'worl': 1})
words to keep: ['hello,', 'world']


['hello,', 'world']

['hello', 'Ġthere', 'Ġyou', 'Ġlittle', 'Ġshit', 'Ġ#####', '#']

In [180]:
a = {'foo': 'bar', "foobar": 'foobar'}


len(a)

2

In [181]:
sub = pd.read_csv('/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/tweet-sentiment-extraction/sample_submission.csv')

In [185]:
sub.iloc[0, 1] = 5

In [186]:
sub

Unnamed: 0,textID,selected_text
0,11aa4945ff,5.0
1,fd1db57dc0,
2,2524332d66,
3,0fb19285b2,
4,e6c9e5e3ab,
...,...,...
3530,2f8444db6c,
3531,11de8c0456,
3532,08f6036add,
3533,27d6472b81,


In [156]:
for _, row in train_df.iterrows():
    if row.sentiment == "positive":
        print(row.text)
        print(row.selected_text)
        print('------------------------')

 Oh! Good idea about putting them on ice cream
Good
------------------------
 haha better drunken tweeting you mean?
better
------------------------
had an awsome salad! I recommend getting the Spicey buffalo chicken salad!
had an awsome salad!
------------------------
 fine! Going to do my big walk today 20 or so miles
fine!
------------------------
 Thank a yoou  how are you? #TwitterTakeover
Thank
------------------------
Have just bought a TV tuner for my laptop.  He he.  I deserve a present
Have just bought a TV tuner for my laptop.  He he.  I deserve a present
------------------------
12:46AM. HAppy birthday little sister of mine. Also, good night Priscilla
HAppy birthday little sister of mine.
------------------------
 aaawww  no worries fresh start to work on growing it out again
aaawww  no worries
------------------------
is sooo tired and too busy to tweet  im glad the weekend is here... yay 4 day-weekend
glad
------------------------
Just watched &quot;Marley &amp; Me.&quot;

Just watched the movie The Holiday. I had forgotten what a feel good movie it was!  Lovely evening.
feel good movie it was!  Lovely evening.
------------------------
 That's a great point... but I'm not **** in ANY! woods, Sara... you should know better.  Haha
great
------------------------
why on twitter does no one talk 2 me i know your all celebs and i'm nt but it's nice to be spoken to once in a while
nice
------------------------
 tut tut. Isn't charging just fun. Thr new palm pre charging platform looks nifty tho.
fun.
------------------------
THANKS GUYS! 200 FOLLOWERS
THANKS
------------------------
Relaxing
Relaxing
------------------------
 tell everybody i said happy mothers day!  love ya! #1 fan love you miley you rock
d happy mothers day!  love ya! #1 fan love you miley you rock
------------------------
 I really hope you see my tweets. Sent you so much, I swear. Do a tour in the Philippines, please?  *prays*
please?  *prays*
------------------------
Another one popped up 

_dog Lucky you! it's drizzling again. Mommy says the puddles are big enough 4 me to swim in, so no dog park today..
Lucky you!
------------------------
hoping to hear from you all soon
hoping to hear from you all soon
------------------------
 awe thanks
awe thanks
------------------------
 And so very you.  You know I say it w/ nothing but love, dude.
t love
------------------------
Oh yeaah.  we'll still be bffs  aha _marie.
Oh yeaah.  we'll still be bffs  aha _marie.
------------------------
Wish I was @ wembley
Wish
------------------------
 i did, thanks laura
thanks
------------------------
just got home from the show opening.  fantastic. thanks to everyone who came out
fantastic.
------------------------
A very exciting week
exciting
------------------------
 will be following you tomorrow!!  sleep well
sleep well
------------------------
  Well I hope you have had a good weekend and even more so have a good day at work 2moro its already monday here,so far so good
e,so far so go

decided that wolf in a future Star Trek Logo game would be much cooler than Chewy in Star Wars
cooler
------------------------
 looks to be a great morning already!
great
------------------------
 aww i hope it does fly by because JT episodes are usually really good (and it's early but so far this ep hassn't disappointed)
aww i hope it does fly by because JT episodes are usually really good (and it's early but so far this ep hassn't disappointed)
------------------------
just woke up, having coffee, listening to Music, reading RSS...Sunday feels great
Sunday feels great
------------------------
  Thank you so much. That was so nice of you and I was happy to hear you voice  You've really started something Good!! xo
d I was happy to hear you voice  You've really started something Good!
------------------------
its 35 now.   ilycecily &lt;3
ts 35 now.   ilycecily &l
------------------------
 Hmm.  My VPN works fine.    (Oh.. wait.. I don't need VPN anymore.) http://tinyurl.com/cao6tu
s fi

i'm gonna eat some chips??!! anybody, want some??? hahaha!
hahaha!
------------------------
 Great job!
Great job!
------------------------
 this is very true about ! but you do have to admit it was pret-ty funny! im bout to go you tube it!  lol
funny!
------------------------
y do i only have 2 people following me  people follow me please x
please
------------------------
 Thanks!!!!  Happy Mother's day to you too!!
Happy
------------------------
_ Oh ok then, I've been there before for a wedding. It was heaps nice there. I'm a big fan of Qld
It was heaps nice there.
------------------------
 everythings fine now
fine
------------------------
Its relaxing time..a movie and some treats
relaxing
------------------------
 I wake up at 4:30 during the week - and funny, Deathcab makes me think of you!  (&amp; I use your calendar to track work hours)
funny,
------------------------
 Yep. I hope my lame attempt with the spaces helped me to hinder them, lol. I know tomorrow I guess.
helped
--

 hahaha, as cartoon mascots go, the virl.com monster has some very nice details... a good shadow + realistic tonsils.
a good shadow + realistic tonsils.
------------------------
 I'm sure It will be repeated soon, seems to be on  TV quite a lot lately. Glad you mentioned it last night, watched it again
. Glad you mentioned it last night, watched it again
------------------------
 BTW I STILL can't believe how Awesome the NEWJABBAKIDZ performance was...U in the masks..I screamed at my pc
Awesome
------------------------
It's time for school, y'all !1!! HAPPY STARWARS DAY
HAPPY
------------------------
 Ur welcome
Ur welcome
------------------------
I'm so excited for Mothers Day!  This has been a big year for me &amp; Olivia, and she's finally old enough to be excited &amp; understand!
I'm so excited for Mothers Day!
------------------------
Had a long day at work. Stood home. Now sleeping to have another long day at work tomorrow &amp; happy mothers day to all mothers
mp; happy mothers

------------------------
Best show of my life. Guess I'm going to sc this week
Best show of my life.
------------------------
 Morning, dude! Ivy has decided she wants to go on a boat today. Are you guys around this PM? Would be nice to see you.
Would be nice to see you.
------------------------
 welcome aboard my friend... the fans are gonna LOVE ya here
LOVE
------------------------
 I would love to see the sun again.  It has been raining here for 3 days now.
love
------------------------
 awww wish I was there! Have a brew for me B!
awww wish I was there!
------------------------
 well, maybe you can always head for CNT. be sure to save some for me!
be sure to save some for me!
------------------------
 Hii,I freaking love you  I would be the happiest 13 year old girl alive if you replied to this.
I freaking love
------------------------
 Ashlie, thank you.  You made me feel a little better.
Ashlie, thank you.  You made me feel a little better.
------------------------
 Aww that's s

Fun webcamming with caroooo
Fun
------------------------
Happy Mother's Day to all the Ladies... With all the moments we cherish with our children, today let those moments cherish you in return.
Happy
------------------------
  maybe a good night's sleep for everyone?
a good
------------------------
 I love how Alex Pardee colours his picturs, so differnt
I love
------------------------
 weird as usual, but ok... that's why we like it
that's why we like it
------------------------
in the words of liana corber: Moreover, WIAIH was a humbling experience... i surprisingly had a decent night
decent
------------------------
So happy. Great, glowy, ravey, beery night. Now smokey, pizzary night with cool friends. And I love liz  x
happy.
------------------------
 hale yeahhh that's the coolest part about it ;) hahaha. And I can match grieco!
that's the coolest part
------------------------
Little brothers being funny  Congrats on the engagement http://tinyurl.com/cv5nw8
funny
----------------

Mahalo  great show!   aloha, thx for makin it out to Kauai, we enjoyed it!  Safe trip
we enjoye
------------------------
 Last night was awesome! Thanks for all the hard work you put into it! Off to get some coffee now...
Last night was awesome!
------------------------
Off.  Have a good star wars day, fonz day, dancing taco day, whatever you celebrate, it's all good!
good
------------------------
is wishing all of the mother's out there a very Happy Mother's Day!! Love ya!
Love
------------------------
has just finished reading Twilight and thought it was ****...okay, it was alright...okay, i love it and really wanna read the next one
has just finished reading Twilight and thought it was ****...okay, it was alright...okay, i love it and really wanna read the next one
------------------------
 glad its nice there. Gray skies and rainy in wisconsin.
glad its nice there.
------------------------
omds! holly steel, bless her!  x
bless her!
------------------------
Awesome sunday service  

Heading off to Hollywood Studios today  Manta and Kraken were both awesome yesterday, feeling like doing a few more rides!!
awesome
------------------------
 lol. just don't ever forget me
lol.
------------------------
The 10 Coolest Foreign Words The English Language Needs. Check the number one. Amazing!  http://is.gd/s9B1
Amazing!
------------------------
You know funny thing about everyone packing for E3 is that I am packing to go to mexico... not E3
funny
------------------------
Starting my internship today, pretty excited!
excited!
------------------------
had a nice time with her little sister...baby is all grown up
had a nice time
------------------------
second song is coming along nicley
nicley
------------------------
Bumper Sticker: &quot;If you don't want to stand up for our troops, feel free to stand in front of them&quot; Thank you to all in the military
them&quot;
------------------------
 lol hell yes i'm keen. WE'RE GOING SKIING AT TREBLE CONE SOMETIME THIS WINTER
hel

  Happy Monday!!  Hope you have a great week!
Happy Monday!!  Hope you have a great week!
------------------------
 omg! shannon Happy Birthday!!!! I'm celebrating for you!
shannon Happy Birthday!!!!
------------------------
is enjoying the night with her folks.
is enjoying the night with her folks.
------------------------
 hehe i found ya
hehe
------------------------
a successful shopping day
successful
------------------------
 Nako! Umuulan pa naman!  Anyway, enjoy the bike rides!
, enjoy
------------------------
 Yeahhh... thx. I figured it out
thx.
------------------------
 http://twitpic.com/4j6kc - man i love your shelves! way to go diy diva!
- man i love your shelves! way to go diy diva!
------------------------
  lure her in with a foot massage!
lure her in
------------------------
Going to the Star Trek premier tomorrow night, uber stoked. I don't care if I'm a nerd, Star Trek is amazecore!!
amazecore!!
------------------------
Not feeling very good at all, why does this ha

watching hoping for haley
hoping
------------------------
 Hi, I hope Ray is o.k. Please give him our love and hope he returns soon. Hope its not too bad
love
------------------------
Yay, I won at the  Bags #Giveaway from . And those bags are so pretty!! Or maybe I'll have a belt? We'll see!
won
------------------------
Ok that **** duet was hysterical LOL
hysterical
------------------------
 i'm trying to figure that out right now
i'm trying to figure that out right now
------------------------
Richelle Mead - Succubus Blues-fabulous read! very entertaining, lots of fun and interesting too!! next please..
! very entertaining, lots of fun and interesting too!! next please..
------------------------
long weekend. thank god i can sleep in on monday
thank
------------------------
 Europe sounds gr8! Will finish my exam on Teus and then we'll  talk about it
gr8!
------------------------
 LOL! I liked the log ride reference, but all I think about when I think of that ride is how wet you ge

having a great mothers day
great
------------------------
 loving your music , had a good recommendation from someone who knows his stuff
loving
------------------------
Just got out of the VAST show &amp; IT KICKED ****! Just as mindblowing live as on CD. Sang my fave songs. Im n awe!
as mindblowi
------------------------
Luv you sissy
Luv
------------------------
 good!  now let's go get this done..... www.HennesseyBlack.com
good!  now let's go get this done
------------------------
 can't wait to crack it open and no doubt will learn from it as well as support my evangelizing cubicle-freedom-ness!
can't wait to crack it open and no doubt will learn from it as well as support my evangelizing cubicle-freedom-ness!
------------------------
Off work in half an hour. Getting an MRI after that. Yay me
Yay me
------------------------
 but cover your sneezes with a tissue for love of God! My germaphobe-ness has significantly increased since joining healthcare
love
------------------------
 

 I wish I was getting pancakes  booooooooo
wish
------------------------
 Oh wow they are really good  I think Im going to use one of them lol.
Oh wow they are really good  I think Im going to use one of them lol.
------------------------
goodnight all in the twitterverse
goodnight
------------------------
It's Mother's Day! Went out to lunch to celebrate and then went shopping for a gift for my mom
celebrate
------------------------
 Since you travel quite often and use online travel sites we would love some feedback on iXiGO.com from you
love
------------------------
going to church  god bless you all.
going to church  god bless you all.
------------------------
 music is always there when noone else understands  i agree it's very important, a single song can change your whole day
s  i agree it's very important,
------------------------
 Enjoy the ride
Enjoy
------------------------
I wish yesterday was Friday
I wish
------------------------
Happy Star Wars Day everyone! May the 4th 

 LOL  I know what ya mean. Watching everyone else act a fool is much better the  not remembering acting a fool yourself
. Watching everyone else act a fool is much better the  not remembering acting a fool yoursel
------------------------
_Ross if you look at the js code in facebook you will notice a few js functions...nahh just kidding, a buddy told me
just kidding,
------------------------
_Cool yep here's hoping
hoping
------------------------
Good morning to another too late sleeper .  One of my excellent designers is Valerie (see ) picture
One of my excellent designers
------------------------
awwww bless her  she needs another chance.
awwww bless her
------------------------
 yeah well a deadline is in T-9 hours, that's architecture for you, oh well... mmm that coffee sounds like a good idea
good
------------------------
 you're welcome char, glad you liked the cake, it's pretty simple you should give it a try
welcome
------------------------
 i wnt there yesterday!  wish u could

 Cheers!  I'd like to thank zBrush for making it all possible
o thank
------------------------
Rockstar photographer shoot went great tonight. A little different than the usual stuff - very nice.
great
------------------------
this thing was more fun with my itouch
fun
------------------------
 It would be amazing if we could meet us in Germany! I have been to Germany twice.
It would be amazing
------------------------
 yeah that's trueee  i can't wait till the tour dvd comes out, that tour was so epic (: and the backstage material is hilarious
s hilarious
------------------------
its nice to have no assignments for the night
nice
------------------------
Right, is too glorious a day to be sat inside....must go and do something more active! Have a good day all
glorious
------------------------
 Hey Luis, thanks!  I have Flash and my prof's code to thank for that.  *cough*
thanks!
------------------------
#f1 soon  good luck brawn and mclaren fix up look sharp :p
good luck
-------------

Lovin  the clean shaven Mr Flowers!  He looks so young!!!  Reminds me that I'm getting old
Lovin  the clean shaven Mr Flowers!  He looks so young!!
------------------------
 aw its okay tht happened wid me too..am so glad thts OVER now!am not helpin here am i?!?lol thnx for postin WMIAD  loved it
?lol thnx for postin WMIAD  loved it
------------------------
 noboby's behind. I think I lead with $14,8 so far
lead
------------------------
watching W. with jas. i love my brother he's my best friend
i love my brother he's my best friend
------------------------
_au my pleasure... hey it was played at my going away from nz party back in 87 lots   so it is burned into my brain
my pleasure..
------------------------
live long and prosper
live long
------------------------
 what I do now  anyhoo thanks for the *spank* :-O x
o thanks
------------------------
_Perez That's great! I hope I'll study with my bff too and we'll live together. well..we will see
great!
------------------------
  Yes, y

_at_work agree totally, think though if we can take a point off Everton and Liverpool beat Man City we are happy days
happy days
------------------------
 Awww, thank you!  &amp; you're welcome lol
Awww, thank you!
------------------------
 i always have a good day! i make it good all the time but you make it better w/ ur morning greetings!! thank you!!
good
------------------------
 well if you're ever headed back to the west coast, hit me up. I have a tiny place, but we can hack and hang out at the beach
d hang out at the beach
------------------------
Nine o'clock on a Sunday morning - Outside, sat around the table enjoying the early rays of sun, reading Twitter article in Sunday Times
enjoying the early rays of sun,
------------------------
 Ah good. Glad to be useful.
Glad
------------------------
Thundershowers plus baseball equals awwww
awwww
------------------------
2am feedings for the baby are fun when he is all smiles and coos
fun
------------------------
 I envy everyone on

Goood Afternooon
Goood
------------------------
 bahaha.. Weekends are just too short.. Esp when it's nice out! I wanna stop time like evie on outta this world! Morning!
bahaha.. Weekends are just too short.. Esp when it's nice out! I wanna stop time like evie on outta this world! Morning!
------------------------
loves the way the sky looks now i.e. no cloud at all.  but that would only mean that it's ultra hot outside.... http://plurk.com/p/rpb7t
loves th
------------------------
 Lol - we're like polar opposites, Ben! Maybe that's why we get on so well
that's why we get on so well
------------------------
Had fried oreos downtown vegas, went to the park with her cousins. BEST DAY EVERRRR.  Happy mommy's day!
BEST DAY EVERRRR.
------------------------
just about to say goodbye to  and  two of the best interns that Elevate have ever had
e best
------------------------
Went to see Wolverine with the husband (who is off work today) - was pretty good
good
------------------------
Thanks 

In [139]:
from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer

tokenizer2 = BertTokenizer('/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/bert-base-uncased/vocab.txt')
tokenizer = RobertaTokenizer('/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/roberta-base/vocab.json', 
                             '/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/roberta-base/merges.txt')
tokenizer3 = XLNetTokenizer('/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/xlnet-base-cased/spiece.model')

In [143]:
tokenizer3.tokenize('&lt;-----bored to death')

['▁&', 'lt', ';', '-', '-', '-', '-', '-', 'bor', 'ed', '▁to', '▁death']

In [101]:
import sys
sys.path.insert(0, '../tweet-sentiment-extraction/')

import pandas as pd
import numpy as np
import tensorflow as tf

from roberta.tokenizer import preprocess
import common.dataset as dataset

INPUT_PATH = '/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/tweet-sentiment-extraction/'
ROBERTA_PATH  = '/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/roberta-base/'
def to_numpy(*args):
    out = []
    for arg in args:
        if arg.dtype == tf.string:
            arg = [s.decode('utf-8') for s in arg.numpy()]
            out.append(arg)
        else:
            arg = arg.numpy()
            out.append(arg.squeeze(0))
    return out


df = pd.read_csv(INPUT_PATH + 'train.csv')
df = df.iloc[0:1]


dataset.PREPROCESS = preprocess

generator = dataset.Generator.create(df, 1, -1)

def jaccard(str1, str2):
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

for example in generator:

    ids, mask, types, offset, tstart, tend, text, selected_text, sentiment = to_numpy(*example)
    
    
    print(ids)
    print(mask)
    print(types)

[    0  7974     2     2  1240     5  1445   662    11    10   529   885
    73    10 13323     6     8   127  3504    21    45  1372   885    73
   106     4  3739     9  1531     4   939    56    97   708    13   127
   662     2     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1   

In [123]:
import numpy as np
import tensorflow as tf
from transformers import *

MAX_SEQUENCE_LENGTH = 128
TOKENIZER = RobertaTokenizer.from_pretrained(ROBERTA_PATH)

def preprocess(tweet, selected_text, sentiment):
    """
    Will be used in tf.data.Dataset.from_generator(...)

    """

    # The original strings have been converted to
    # byte strings, so we need to decode it
    tweet = tweet.decode('utf-8')
    selected_text = selected_text.decode('utf-8')
    sentiment = sentiment.decode('utf-8')
    

    # Clean up the strings a bit
    tweet         = " ".join(str(tweet).split())
    selected_text = " ".join(str(selected_text).split())

    # find the intersection between text and selected text
    idx_start, idx_end = None, None
    for index in (i for i, c in enumerate(tweet) if c == selected_text[0]):
        if tweet[index:index+len(selected_text)] == selected_text:
            idx_start = index
            idx_end = index + len(selected_text) - 1
            break
    
    word_tokens = []
    char_to_word_offset = []
    for i in range(len(tweet)):
        if i == 0 or tweet[i-1] == " ":
            word_tokens.append(tweet[i])
        else:
            word_tokens[-1] += tweet[i]
        char_to_word_offset.append(len(word_tokens) - 1)
        
    target_start = char_to_word_offset[idx_start]
    target_end = char_to_word_offset[idx_end]
    print(target_end)
    print(char_to_word_offset)
    sentiment_token = TOKENIZER.tokenize(sentiment)

    token_to_word_offset = []
    word_to_token_offset = []
    tokens = []
    for (i, word_token) in enumerate(word_tokens):
        word_to_token_offset.append(len(tokens))
        sub_tokens = TOKENIZER.tokenize(word_token)
        for sub_token in sub_tokens:
            token_to_word_offset.append(i)
            tokens.append(sub_token)
    print(word_to_token_offset)
    tok_start_position = word_to_token_offset[target_start]
    if target_end < (len(word_to_token_offset) - 1):
        tok_end_position = word_to_token_offset[target_end + 1] -1
    else:
        tok_end_position = len(tokens) - 1
    print(tok_end_position)
    sentiment_ids = TOKENIZER.convert_tokens_to_ids(sentiment_token)
    input_ids_orig = TOKENIZER.convert_tokens_to_ids(tokens)
    
    input_ids = (
        [TOKENIZER.cls_token_id] + sentiment_ids + [TOKENIZER.sep_token_id] 
        + input_ids_orig + [TOKENIZER.sep_token_id]
    )
    token_type_ids = [0, 0, 0] + [1] * (len(input_ids_orig) + 1)
    attention_mask = [1] * len(token_type_ids)
    target_start = tok_start_position + 3
    target_end = tok_end_position + 3
    token_to_word_offset = [-1, -1, -1] + token_to_word_offset + [-1]
    
    #print(target_start)
    #print(target_end)
    print(target_end)
    print(token_to_word_offset)
    # let check token start/end to word start/end
    word_start = token_to_word_offset[target_start] # don't forget - 3
    word_end = token_to_word_offset[target_end]
    print(word_end)
    print(tweet.split(" "))
    # simple preprocess, note that doc_token is the same for all tokenizer
    # (all_doc_tokens is different but we don't use it for ensemble),
    # so with token start/end to word start/end we can easily ensemble on
    # word level rather than token level.
    pred_based_word = " ".join(tweet.split(" ")[word_start:word_end+1])
    print('full texttt:', tweet)
    print('predicteddd:', pred_based_word)
    print('groundtruth:', selected_text)
    
    
    padding_length = MAX_SEQUENCE_LENGTH - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([TOKENIZER.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        token_to_word_offset = token_to_word_offset + ([-1] * padding_length)
    else:
        # not yet implemented
        pass

    # maybe u want return tok_to_orig_index for each samples.            
    return (
        input_ids, attention_mask, token_type_ids, token_to_word_offset,
        target_start, target_end, 
        tweet, 
        selected_text, 
        sentiment,
    )


class TweetSentimentDataset(tf.data.Dataset):

    OUTPUT_TYPES = (
        tf.dtypes.int32,  tf.dtypes.int32,   tf.dtypes.int32, 
        tf.dtypes.int32,  tf.dtypes.float32, tf.dtypes.float32,
        tf.dtypes.string, tf.dtypes.string,  tf.dtypes.string,
    )

    OUTPUT_SHAPES = (
        (MAX_SEQUENCE_LENGTH,),   (MAX_SEQUENCE_LENGTH,), (MAX_SEQUENCE_LENGTH,), 
        (MAX_SEQUENCE_LENGTH,),   (),                     (),
        (),                       (),                     (),
    )

    # AutoGraph will automatically convert Python code to
    # Tensorflow graph code. You could also wrap 'preprocess'
    # in tf.py_function(..) for arbitrary python code
    def _generator(tweet, selected_text, sentiment):
        for tw, st, se in zip(tweet, selected_text, sentiment):
            yield preprocess(tw, st, se)

    # This dataset object will return a generator
    def __new__(cls, tweet, selected_text, sentiment):
        return tf.data.Dataset.from_generator(
            cls._generator,
            output_types=cls.OUTPUT_TYPES,
            output_shapes=cls.OUTPUT_SHAPES,
            args=(tweet, selected_text, sentiment)
        )

    @staticmethod
    def create(dataframe, batch_size, shuffle_buffer_size=-1):
        dataset = TweetSentimentDataset(
            dataframe.text.values,
            dataframe.selected_text.values,
            dataframe.sentiment.values
        )

        dataset = dataset.cache()
        if shuffle_buffer_size != -1:
            dataset = dataset.shuffle(shuffle_buffer_size)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset



import pandas as pd
train_df = pd.read_csv(INPUT_PATH+'train.csv')
train_df.dropna(inplace=True)
train_dataset = TweetSentimentDataset.create(
    train_df.iloc[258:259], 1, shuffle_buffer_size=2048)

for data in train_dataset:
    print(data)

0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
[0, 3]
2
5
[-1, -1, -1, 0, 0, 0, 1, 1, -1]
0
['wow.????', '??????']
full texttt: wow.???? ??????
predicteddd: wow.????
groundtruth: wow.
(<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[    0, 12516,     2, 34798,     4, 27282, 27282, 28749,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,

In [103]:
TOKENIZER.tokenize('work...too')

['work', '...', 'too']

In [104]:
from tokenizers import ByteLevelBPETokenizer as Tokenizer
PATH = '/home/alex/Projects/kaggle/tweet-sentiment-extraction/input/roberta-base/'
TOKENIZER = Tokenizer(
    vocab_file=PATH+"vocab.json",
    merges_file=PATH+"merges.txt",
    lowercase=True,
    add_prefix_space=True
)


def decode(pred_start, pred_end, text, offset):
    decoded_text = ""
    for i in range(pred_start, pred_end+1):
        decoded_text += text[offset[i][0]:offset[i][1]]
        if (i+1) < len(offset) and offset[i][1] < offset[i+1][0]:
            decoded_text += " "
    return decoded_text

class Generator(tf.data.Dataset):

    OUTPUT_TYPES = (
        tf.dtypes.int32,  tf.dtypes.int32,   tf.dtypes.int32,
        tf.dtypes.int32,  tf.dtypes.float32, tf.dtypes.float32,
        tf.dtypes.string, tf.dtypes.string,  tf.dtypes.string,
    )

    # AutoGraph will automatically convert Python code to
    # Tensorflow graph code. You could also wrap 'preprocess'
    # in tf.py_function(..) for arbitrary python code
    def _generator(tweet, selected_text, sentiment):
        for tw, st, se in zip(tweet, selected_text, sentiment):
            yield preprocess(tw, st, se)

    # This dataset object will return a generator
    def __new__(cls, tweet, selected_text, sentiment):
        return tf.data.Dataset.from_generator(
            cls._generator,
            output_types=cls.OUTPUT_TYPES,
            args=(tweet, selected_text, sentiment)
        )

    @staticmethod
    def create(dataframe, batch_size, shuffle_buffer_size=-1):
        dataset = Generator(
            dataframe.text.values,
            dataframe.selected_text.values,
            dataframe.sentiment.values
        )

        dataset = dataset.cache()
        if shuffle_buffer_size != -1:
            dataset = dataset.shuffle(shuffle_buffer_size)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset


def preprocess(tweet, selected_text, sentiment):
    """
    Will be used in tf.data.Dataset.from_generator(...)

    """

    # The original strings have been converted to
    # byte strings, so we need to decode it
    tweet = tweet.decode('utf-8')
    selected_text = selected_text.decode('utf-8')
    sentiment = sentiment.decode('utf-8')

    # Clean up the strings a bit
    tweet         = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    # find the intersection between text and selected text
    idx_start, idx_end = None, None
    for index in (i for i, c in enumerate(tweet) if c == selected_text[1]):
        if " " + tweet[index:index+len(selected_text)-1] == selected_text:
            idx_start = index
            idx_end = index + len(selected_text) - 2
            break

    intersection = [0] * len(tweet)
    if idx_start != None and idx_end != None:
        for char_idx in range(idx_start, idx_end+1):
            intersection[char_idx] = 1

    # tokenize with offsets
    enc = TOKENIZER.encode(tweet)
    input_ids_orig, offsets = enc.ids, enc.offsets

    # compute targets, one-hot encoding
    target_idx = []
    for i, (o1, o2) in enumerate(offsets):
        if sum(intersection[o1: o2]) > 0:
            target_idx.append(i)

    target_start = target_idx[0]
    target_end = target_idx[-1]

    sentiment_map = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }

    input_ids = [0] + [sentiment_map[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    attention_mask = [1] * len(token_type_ids)
    offsets = [(0, 0)]*4 + offsets + [(0, 0)]
    target_start += 4
    target_end += 4
    
    decoded_text = ""
    for i in range(target_start, target_end+1):
        decoded_text += tweet[offsets[i][0]:offsets[i][1]]
        if (i+1) < len(offsets) and offsets[i][1] < offsets[i+1][0]:
            decoded_text += " "
            
    print('full texttt:', tweet)
    print('predicteddd:', decoded_text)
    print('groundtruth:', selected_text)
    
    padding_length = MAX_SEQUENCE_LENGTH - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        offsets = offsets + ([(0, 0)] * padding_length)


    return (
        input_ids, attention_mask, token_type_ids, offsets,
        target_start, target_end, tweet, selected_text, sentiment,
    )

import pandas as pd
train_df = pd.read_csv(INPUT_PATH+'train.csv')
train_df.dropna(inplace=True)
train_dataset = Generator.create(
    train_df.iloc[244:344], 1, shuffle_buffer_size=2048)

for data in train_dataset:
    pass

full texttt:  will be doing all the chores just for her mama
predicteddd:  will be doing all the chores just for her mama
groundtruth:  will be doing all the chores just for her mama
full texttt:  well, the requirements are quite high as well
predicteddd:  well, the requirements are quite high as well
groundtruth:  well, the requirements are quite high as well
full texttt:  going to bed its late and I have headache
predicteddd:  headache
groundtruth:  headache
full texttt:  Awwwwwww holly
predicteddd:  Awwwwwww holly
groundtruth:  Awwwwwww holly
full texttt:  enjoying and exploring my n96 phone.. grr.. it's awesome.. but i wanted the samsung omnia..
predicteddd:  it's awesome..
groundtruth:  it's awesome..
full texttt:  my pleasure, its a great recipe
predicteddd:  pleasure,
groundtruth:  pleasure,
full texttt:  yay, you get to ride on the tow truck. How cool is that?
predicteddd:  yay, you get to ride on the tow truck. How cool is that?
groundtruth:  yay, you get to ride on the tow tr