In [None]:
import math
import glob
import sys
import random
import gc
import os
import re
from math import floor, ceil

import pandas as pd
import numpy as np
#TF&K
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, losses, models, callbacks
from tensorflow.keras.initializers import *
from tensorflow.keras.regularizers import *
from tensorflow.keras.constraints import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.layers import *
from tensorflow.keras.losses import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
print(tf.__version__)

from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers

import warnings
warnings.filterwarnings('ignore')

2.1.0


In [None]:
def seed_everything(framework,SEED):
    """Seed the random number generators used by the notebook.

    Parameters
    ----------
    framework : str
        'Pytorch' or 'Tensorflow'. Any other value seeds only the
        framework-independent generators (``random`` and NumPy).
    SEED : int
        Seed value handed to every generator.
    """
    random.seed(SEED)
    # NumPy's global generator drives the per-epoch batch shuffling
    # (np.random.randint / np.random.permutation), so it must be seeded too.
    np.random.seed(SEED)
    os.environ["PYTHONHASHSEED"] = str(SEED)
    if framework == 'Pytorch':
        # Local import: torch is not imported at the top of the notebook.
        import torch
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
    elif framework == 'Tensorflow':
        tf.random.set_seed(SEED)
        
# Reproducibility settings; SEED is also reused for the cross-validation split.
SEED = 88888
framework = 'Tensorflow'
seed_everything(framework=framework, SEED=SEED)

In [None]:
# --- Hyper-parameters ---------------------------------------------------
MAX_LEN = 96             # width of every encoded input row
EPOCHS = 3
BATCH_SIZE = 32
LABEL_SMOOTHING = 0.1    # forwarded to the cross-entropy loss
PAD_ID = 1               # value that fills the unused tail of input_ids

# Token id inserted into the model input for each sentiment label.
sentiment_id = {
    'positive': 1313,
    'negative': 2430,
    'neutral': 7974,
}

# --- Tokenizer ----------------------------------------------------------
PATH = '../input/tf-roberta/'
_tokenizer_files = {
    'vocab_file': PATH + 'vocab-roberta-base.json',
    'merges_file': PATH + 'merges-roberta-base.txt',
}
tokenizer = tokenizers.ByteLevelBPETokenizer(
    lowercase=True,
    add_prefix_space=True,
    **_tokenizer_files
)

In [None]:
# Load the training set (missing values become empty strings) and allocate
# one fixed-width row per tweet for the model inputs and span targets.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')
ct = train.shape[0]
input_ids = np.ones((ct, MAX_LEN), dtype='int32')
attention_mask, token_type_ids, start_tokens, end_tokens = (
    np.zeros((ct, MAX_LEN), dtype='int32') for _ in range(4)
)

for row in range(ct):

    # Whitespace-normalise both strings and mark which characters of the
    # tweet belong to the selected text (plus the space right before it).
    tweet = " " + " ".join(train.loc[row, 'text'].split())
    selection = " ".join(train.loc[row, 'selected_text'].split())
    begin = tweet.find(selection)
    char_mask = np.zeros((len(tweet)))
    char_mask[begin:begin + len(selection)] = 1
    if tweet[begin - 1] == ' ':
        char_mask[begin - 1] = 1
    enc = tokenizer.encode(tweet)
    token_ids = enc.ids

    # Walk the tokens once: the decoded length of each token gives its
    # character span, and a token is selected when its span overlaps the mask.
    cursor = 0
    selected = []
    for pos, token_id in enumerate(token_ids):
        span_end = cursor + len(tokenizer.decode([token_id]))
        if np.sum(char_mask[cursor:span_end]) > 0:
            selected.append(pos)
        cursor = span_end

    # Row layout: [0, sentiment token, tweet tokens..., 2]; the rest stays padding.
    width = len(token_ids) + 3
    input_ids[row, :width] = [0, sentiment_id[train.loc[row, 'sentiment']]] + token_ids + [2]
    attention_mask[row, :width] = 1
    if selected:
        # +2 accounts for the two tokens placed in front of the tweet.
        start_tokens[row, selected[0] + 2] = 1
        end_tokens[row, selected[-1] + 2] = 1

In [None]:
# Load the test set (missing values become empty strings) and encode it with
# the same row layout as the training inputs.
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv').fillna('')

ct = test.shape[0]
input_ids_t = np.ones((ct, MAX_LEN), dtype='int32')
attention_mask_t, token_type_ids_t = (
    np.zeros((ct, MAX_LEN), dtype='int32') for _ in range(2)
)

for row in range(ct):
    # Normalise whitespace, tokenize, then write [0, sentiment, tokens..., 2].
    tweet = " " + " ".join(test.loc[row, 'text'].split())
    token_ids = tokenizer.encode(tweet).ids
    width = len(token_ids) + 3
    input_ids_t[row, :width] = [0, sentiment_id[test.loc[row, 'sentiment']]] + token_ids + [2]
    attention_mask_t[row, :width] = 1

In [None]:
import pickle

def save_weights(model, dst_fn):
    """Pickle the result of ``model.get_weights()`` into the file `dst_fn`."""
    snapshot = model.get_weights()
    with open(dst_fn, mode='wb') as sink:
        pickle.dump(snapshot, sink)


def load_weights(model, weight_fn):
    """Unpickle weights from `weight_fn`, apply them with ``model.set_weights``
    and return the same model object.

    The file is read with ``pickle.load``, so it should only be pointed at
    files this notebook wrote itself.
    """
    with open(weight_fn, mode='rb') as source:
        restored = pickle.load(source)
    model.set_weights(restored)
    return model

In [None]:
def loss_fn(y_true, y_pred):
    """Mean label-smoothed categorical cross-entropy.

    The targets are cut to the number of columns of `y_pred` first, because
    the model output can be narrower than the stored targets (sequence
    bucketing).
    """
    width = tf.shape(y_pred)[1]
    per_example = tf.keras.losses.categorical_crossentropy(
        y_true[:, :width],
        y_pred,
        from_logits=False,
        label_smoothing=LABEL_SMOOTHING,
    )
    return tf.reduce_mean(per_example)

def scheduler(epoch):
    """Learning rate for `epoch`: 3e-5 scaled by 0.2 raised to the epoch index."""
    base_lr, decay = 3e-5, 0.2
    return base_lr * decay ** epoch

# Keras callback that asks `scheduler` for the learning rate of each epoch.
lrate = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler, verbose=1)

def build_model():
    """Build the RoBERTa-based start/end span predictor.

    Returns
    -------
    (model, padded_model)
        Two Keras models over the same three inputs and the same layers.
        `model` is compiled with `loss_fn` and Adam; its two softmax outputs
        are as wide as the longest non-padded row of the batch.
        `padded_model` zero-pads those outputs back to MAX_LEN columns.
    """
    def new_input():
        return tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    ids, att, tok = new_input(), new_input(), new_input()

    # Sequence bucketing: trim the batch to its longest row, where a row's
    # length is MAX_LEN minus the number of PAD_ID entries.
    is_pad = tf.cast(tf.equal(ids, PAD_ID), tf.int32)
    row_lengths = MAX_LEN - tf.reduce_sum(is_pad, -1)
    max_len = tf.reduce_max(row_lengths)
    ids_, att_, tok_ = (tensor[:, :max_len] for tensor in (ids, att, tok))

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    sequence_output = bert_model(ids_,attention_mask=att_,token_type_ids=tok_)[0]

    def span_head(features):
        # dropout -> conv -> leaky relu -> conv -> one score per position -> softmax
        h = tf.keras.layers.Dropout(0.1)(features)
        h = tf.keras.layers.Conv1D(128, 3, padding='same')(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Conv1D(64, 3, padding='same')(h)
        h = tf.keras.layers.Dense(1)(h)
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    start_probs = span_head(sequence_output)
    end_probs = span_head(sequence_output)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[start_probs, end_probs])
    model.compile(loss=loss_fn, optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5))

    def pad_to_full_width(probs):
        # `model.predict` needs a fixed output size, so fill back up to MAX_LEN with zeros
        return tf.pad(probs, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.)

    padded_outputs = [pad_to_full_width(start_probs), pad_to_full_width(end_probs)]
    padded_model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=padded_outputs)
    return model, padded_model

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings, ignoring case.

    Each string is lower-cased and split on whitespace into a set of words.
    Returns 0.5 when both sets are empty, otherwise
    |intersection| / |union| as a float.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)

In [None]:
# 5-fold cross-validated training. For every fold: build a fresh model, train
# it epoch by epoch on length-bucketed batches, predict the held-out rows
# (OOF) and the test set, then score the decoded OOF spans with `jaccard`.
jac = []; VER='v2'; DISPLAY=1 # USE display=1 FOR INTERACTIVE
# oof_*: per-position start/end predictions for each training row, filled
# from the fold in which the row is held out.
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
# preds_*: test-set predictions, accumulated as the mean over all folds.
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))

# Folds are stratified on the sentiment label.
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)
    
    # Reset the Keras session before building this fold's model.
    K.clear_session()
    model, padded_model = build_model()
        
    #sv = tf.keras.callbacks.ModelCheckpoint(
    #    '%s-roberta-%i.h5'%(VER,fold), monitor='val_loss', verbose=1, save_best_only=True,
    #    save_weights_only=True, mode='auto', save_freq='epoch')
    # NOTE(review): every `model.fit` call below runs a single epoch
    # (epochs=epoch, initial_epoch=epoch - 1), so a patience of 2 can
    # presumably never be reached inside one call - TODO confirm whether this
    # callback is meant to have any effect.
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='auto', patience=2, 
                                          restore_best_weights=True, verbose=1)
    inpT = [input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]]
    targetT = [start_tokens[idxT,], end_tokens[idxT,]]
    inpV = [input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]]
    targetV = [start_tokens[idxV,], end_tokens[idxV,]]
    # sort the validation data: rows with the most PAD_ID entries come first
    shuffleV = np.int32(sorted(range(len(inpV[0])), key=lambda k: (inpV[0][k] == PAD_ID).sum(), reverse=True))
    inpV = [arr[shuffleV] for arr in inpV]
    targetV = [arr[shuffleV] for arr in targetV]
    # NOTE: despite the .h5 suffix this file is written by `save_weights`,
    # i.e. it is a pickle of the weight list.
    weight_fn = '%s-sequence-bucketing-roberta-%i.h5'%(VER,fold)
    for epoch in range(1, EPOCHS + 1):
        # sort and shuffle: We add random numbers to not have the same order in each epoch
        shuffleT = np.int32(sorted(range(len(inpT[0])), key=lambda k: (inpT[0][k] == PAD_ID).sum() + np.random.randint(-3, 3), reverse=True))
        # shuffle in batches, otherwise short batches will always come in the beginning of each epoch
        num_batches = math.ceil(len(shuffleT) / BATCH_SIZE)
        batch_inds = np.random.permutation(num_batches)
        shuffleT_ = []
        for batch_ind in batch_inds:
            shuffleT_.append(shuffleT[batch_ind * BATCH_SIZE: (batch_ind + 1) * BATCH_SIZE])
        shuffleT = np.concatenate(shuffleT_)
        # reorder the input data (inputs and targets are permuted with the same index)
        inpT = [arr[shuffleT] for arr in inpT]
        targetT = [arr[shuffleT] for arr in targetT]
        model.fit(inpT, targetT, 
            epochs=epoch, initial_epoch=epoch - 1, batch_size=BATCH_SIZE, verbose=DISPLAY, callbacks=[lrate,es],
            validation_data=(inpV, targetV), shuffle=False)  # don't shuffle in `fit`
        # The same file is overwritten after every epoch, so it ends up
        # holding the weights of the last epoch that ran.
        save_weights(model, weight_fn)

    print('Loading model...')
    # model.load_weights('%s-roberta-%i.h5'%(VER,fold))
    load_weights(model, weight_fn)

    # `padded_model` is used for prediction because it returns MAX_LEN columns.
    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = padded_model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)
    
    print('Predicting Test...')
    preds = padded_model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    # each fold contributes 1/n_splits, so the final arrays hold the fold mean
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits
    
    # DISPLAY FOLD JACCARD
    # NOTE: `all` shadows the builtin of the same name from here on.
    all = []
    for k in idxV:
        # a / b: most probable start / end position in the input row
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        text1 = " "+" ".join(train.loc[k,'text'].split())
        
        if train.loc[k,'sentiment'] == 'neutral':
            if a>b:
                # invalid span: fall back to the whole tweet
                st = train.loc[k,'text']
            else:
                enc = tokenizer.encode(text1)
                # input rows carry two tokens in front of the tweet tokens,
                # so positions a..b map to enc.ids[a-2 : b-1]
                st = tokenizer.decode(enc.ids[a-2:b-1])
        else:
            if a>b:
                print('index :' + str(k))
                print('text :' + train.loc[k,'text'])
                print('selected_text :' + train.loc[k,'selected_text'])
                # Invalid span: try the 2nd / 3rd most probable start and end
                # positions and join the two resulting candidate spans.
                st_a , st_b = '' , ''
                start_sort = np.argsort(oof_start[k,])[::-1] 
                end_sort = np.argsort(oof_end[k,])[::-1]
                a1 = start_sort[1]
                b1 = end_sort[1]
                a2 = start_sort[2]
                b2 = end_sort[2]
                #print('a: ' + str(a) + ' a proba: ' + str(oof_start[k,][a]) + '\na1: ' + str(a1) + ' a1 proba: ' + str(oof_start[k,][a1]) + '\na2: '+ str(a2) + ' a2 proba: ' + str(oof_start[k,][a2]))
                #print('b: ' + str(b) + ' b proba: ' + str(oof_end[k,][b]) + '\nb1: ' + str(b1) + ' b1 proba: ' + str(oof_end[k,][b1]) + '\nb2: '+ str(b2) + ' b2 proba: ' + str(oof_end[k,][b2]))
                
                # st_a: keep the best end b, use an alternative start
                if a1 < b:
                    enc = tokenizer.encode(text1)
                    st_a = tokenizer.decode(enc.ids[a1-2:b-1])
                    #print('model_select a1<b :' + st_a)
                elif a2 < b:
                    enc = tokenizer.encode(text1)
                    st_a = tokenizer.decode(enc.ids[a2-2:b-1])
                    #print('model_select a2<b :' + st_a)
                    
                # st_b: use the alternative end b1 with the best or 2nd-best start
                if a < b1:
                    enc = tokenizer.encode(text1)
                    st_b = tokenizer.decode(enc.ids[a-2:b1-1])
                    #print('model_select a<b1 :' + st_b)
                elif a1 < b1:
                    enc = tokenizer.encode(text1)
                    st_b = tokenizer.decode(enc.ids[a1-2:b1-1])
                    #print('model_select a1<b1 :' + st_b)
                    
                st = st_a + ' ' + st_b
                print('model_select :' + st)
                # neither candidate produced any word: fall back to the whole tweet
                if len(st.split()) == 0:
                    st = train.loc[k,'text']

            else:
                enc = tokenizer.encode(text1)
                st = tokenizer.decode(enc.ids[a-2:b-1])
            # Punctuation clean-up for single-word predictions; at this
            # indentation it runs for non-neutral rows only.
            st = st.replace('!!!!', '!') if len(st.split())==1 else st
            st = st.replace('..', '.') if len(st.split())==1 else st
            st = st.replace('...', '.') if len(st.split())==1 else st
        all.append(jaccard(st,train.loc[k,'selected_text']))
        #a = np.argmax(oof_start[k,])
        #b = np.argmax(oof_end[k,])
        #if a>b: 
        #    st = train.loc[k,'text'] # IMPROVE CV/LB with better choice here
        #else:
        #    text1 = " "+" ".join(train.loc[k,'text'].split())
        #    enc = tokenizer.encode(text1)
        #    st = tokenizer.decode(enc.ids[a-2:b-1])
        #all.append(jaccard(st,train.loc[k,'selected_text']))
    # mean Jaccard over this fold's held-out rows
    jac.append(np.mean(all))
    print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(all))
    print()

#########################
### FOLD 1
#########################
Train on 21984 samples, validate on 5497 samples

Epoch 00001: LearningRateScheduler reducing learning rate to 3e-05.
Train on 21984 samples, validate on 5497 samples

Epoch 00002: LearningRateScheduler reducing learning rate to 6e-06.
Epoch 2/2
Train on 21984 samples, validate on 5497 samples

Epoch 00003: LearningRateScheduler reducing learning rate to 1.2000000000000004e-06.
Epoch 3/3
Loading model...
Predicting OOF...
Predicting Test...
index :407
text : yep infact she is popular, miss india 99, talented film actress .... and lot more
selected_text :she is popular,
model_select : popular,  popular, miss india 99, talented
index :2187
text :New work wellness challenge not going well.  I committed to not check email between 10 pm and 6 am.  Failed on first day.  Twice
selected_text :Failed on first day.  Twic
model_select : not going well.  not going well. i committed to not check email between 10 pm and 6 am. failed
inde

In [None]:
# Per-fold Jaccard scores, followed by their mean.
print(jac)
overall_jaccard = np.mean(jac)
print('>>>> OVERALL 5Fold CV Jaccard =', overall_jaccard)

[0.7096777971288136, 0.7205401777315689, 0.7108880016902843, 0.707352406599804, 0.7072035540222091]
>>>> OVERALL 5Fold CV Jaccard = 0.7111323874345359


In [None]:
# Turn the fold-averaged start/end predictions into one text span per test row.
# NOTE: the name `all` shadows the builtin; it is kept because the submission
# cell reads the decoded spans from this name.
all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    # Positions 0 and 1 of every input row hold the leading 0 id and the
    # sentiment token, so a usable span has to end at position 2 or later.
    # An end on one of those two positions is treated like a > b: the span is
    # invalid and the whole tweet is used instead.
    if a > b or b < 2:
        st = test.loc[k,'text']
    else:
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        # Shift by the two prefix tokens. The start is clamped at 0 so that a
        # start predicted on a prefix token does not become a negative index,
        # which would slice from the end of the token list.
        st = tokenizer.decode(enc.ids[max(a-2, 0):b-1])
    all.append(st)

In [None]:
def _collapse_punctuation(span):
    # Apply the three replacements in order; each one runs only while the
    # current string consists of exactly one whitespace-separated word.
    for old, new in (('!!!!', '!'), ('..', '.'), ('...', '.')):
        if len(span.split()) == 1:
            span = span.replace(old, new)
    return span

# Attach the decoded spans, clean them up and write the submission file.
test['selected_text'] = all
test['selected_text'] = test['selected_text'].apply(_collapse_punctuation)
test[['textID', 'selected_text']].to_csv('submission.csv', index=False)

# Show a random sample of rows with truncated text columns.
pd.set_option('max_colwidth', 60)
test.sample(25)

Unnamed: 0,textID,text,sentiment,selected_text
1080,a2819eaa7a,we`ll be visiting my grandparents later. BTW i just hear...,positive,whew so cool i love
940,261a5bbd14,tell me what you think of Pride Prejudice and Zombies ....,neutral,tell me what you think of pride prejudice and zombies ....
2826,769c8775bf,so technically....i havent really gotten out of bed. so...,neutral,so technically....i havent really gotten out of bed. so...
1993,204d6510c6,**** awake at 2 am ! i hate sickness,negative,i hate sickness
416,e892068fd3,"lol that just totally made me laugh, which made my day",positive,"laugh,"
2332,43ee369d34,Following new other #sanctuary fans! See the wonders of ...,positive,good
1483,9932b9c033,if i wasnt workin in 5 hours id be gettin ratarsed to th...,negative,**** i feel
1240,2cd5a21fbc,Sending blessings out to,positive,sending blessings
2443,1c5ca67cf9,"Good Grief! I can`t say much, I was driving home from M...",neutral,"good grief! i can`t say much, i was driving home from m..."
3025,6030362588,_flo well thats not nice... hope ur all good now,neutral,well thats not nice... hope ur all good now


In [None]:
# Display the first 50 test rows together with their predicted spans.
test.head(n=50)

Unnamed: 0,textID,text,sentiment,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -- skyscrap...,positive,exciting
2,eee518ae67,"Recession hit Veronique Branquinho, she has to quit her ...",negative,such a shame!
3,01082688c6,happy bday!,positive,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,i like it!!
5,726e501993,that`s great!! weee!! visitors!,positive,that`s great!!
6,261932614e,I THINK EVERYONE HATES ME ON HERE lol,negative,hates
7,afa11da83f,"soooooo wish i could, but im in school and myspace is c...",negative,blocked
8,e64208b4ef,and within a short time of the last clue all of them,neutral,and within a short time of the last clue all of them
9,37bcad24ca,What did you get? My day is alright.. haven`t done any...,neutral,what did you get? my day is alright.. haven`t done anyt...
