In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
from transformers import *
import tokenizers
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold



In [2]:
MAX_LEN = 192
PATH = '../input/tf-roberta/'
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'vocab-roberta-base.json', 
    merges_file=PATH+'merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)

In [3]:
df = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
df = df.drop(['textID'], axis =1)
df.dropna(axis = 0, inplace = True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,text,selected_text,sentiment
0,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,my boss is bullying me...,bullying me,negative
3,what interview! leave me alone,leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...
27475,wish we could come see u on Denver husband l...,d lost,negative
27476,I`ve wondered about rake to. The client has ...,", don`t force",negative
27477,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27478,But it was worth it ****.,But it was worth it ****.,positive


In [4]:
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

In [5]:
length = df.shape[0]

input_ids = np.ones((length,192),dtype='int32')
attention_mask = np.zeros((length,192),dtype='int32')
token_type_ids = np.zeros((length,192),dtype='int32')
start_tokens = np.zeros((length,192),dtype='int32')
end_tokens = np.zeros((length,192),dtype='int32')

In [6]:
input_ids = np.ones((length,192),dtype='int32')
attention_mask = np.zeros((length,192),dtype='int32')
token_type_ids = np.zeros((length,192),dtype='int32')
start_tokens = np.zeros((length,192),dtype='int32')
end_tokens = np.zeros((length,192),dtype='int32')

for k in range(length):
    
    text1 = str(df.loc[k,'text'])
    text2 = str(df.loc[k,'selected_text'])
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx+len(text2)]=1
    if text1[idx-1]==' ': chars[idx-1] = 1 
    enc = tokenizer.encode(text1) 
        
    offsets = []; idx=0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx,idx+len(w)))
        idx += len(w)
    
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i) 
        
    s_tok = sentiment_id[df.loc[k,'sentiment']]
    input_ids[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask[k,:len(enc.ids)+5] = 1
    if len(toks)>0:
        start_tokens[k,toks[0]+1] = 1
        end_tokens[k,toks[-1]+1] = 1

In [7]:
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv').fillna('')

input_ids_t = np.ones((test.shape[0],192),dtype='int32')
attention_mask_t = np.zeros((test.shape[0],192),dtype='int32')
token_type_ids_t = np.zeros((test.shape[0],192),dtype='int32')

for k in range(test.shape[0]):
        
    text1 = str(test.loc[k,'text'])
    enc = tokenizer.encode(text1)                
    s_tok = sentiment_id[test.loc[k,'sentiment']]
    input_ids_t[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask_t[k,:len(enc.ids)+5] = 1

In [8]:
def build_model():
    ids = tf.keras.layers.Input((192,), dtype=tf.int32)
    att = tf.keras.layers.Input((192,), dtype=tf.int32)
    tok = tf.keras.layers.Input((192,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    x = bert_model(ids,attention_mask=att,token_type_ids=tok)
    
    x1 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x1 = tf.keras.layers.Conv1D(1,1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x2 = tf.keras.layers.Conv1D(1,1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    return model

In [9]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [10]:
jac = []; 
oof_start = np.zeros((input_ids.shape[0],192))
oof_end = np.zeros((input_ids.shape[0],192))
preds_start = np.zeros((input_ids_t.shape[0],192))
preds_end = np.zeros((input_ids_t.shape[0],192))

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=777)
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,df.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)
    
    K.clear_session()
    model = build_model()
       
    model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]], 
        epochs=3, batch_size=32, verbose=1,
        validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]], 
        [start_tokens[idxV,], end_tokens[idxV,]]))
    
    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],
                                                    verbose=1)
    
    print('Predicting Test...')
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=1)
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits
    
    # DISPLAY FOLD JACCARD
    all = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b: 
            st = df.loc[k,'text'] 
        else:
            text1 = " "+" ".join(df.loc[k,'text'].split())
            enc = tokenizer.encode(text1)
            st = tokenizer.decode(enc.ids[a-1:b])
        all.append(jaccard(st,df.loc[k,'selected_text']))
    jac.append(np.mean(all))
    print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(all))
    print()

#########################
### FOLD 1
#########################
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predicting OOF...
Predicting Test...
>>>> FOLD 1 Jaccard = 0.7069887840564903

#########################
### FOLD 2
#########################
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predicting OOF...
Predicting Test...
>>>> FOLD 2 Jaccard = 0.7051891641875885

#########################
### FOLD 3
#########################
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predicting OOF...
Predicting Test...
>>>> FOLD 3 Jaccard = 0.7049067896964941

#########################
### FOLD 4
#########################
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predicting OOF...
Predicting Test...
>>>> FOLD 4 Jaccard = 0.6898373544975882

#########################
### FOLD 5
#########################
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predicting OOF...
Predicting Test...
>>>> FOLD 5 Jaccard = 0.7019315174475471



In [11]:
print('OVERALL Jaccard =',np.mean(jac))

OVERALL Jaccard = 0.7017707219771416


In [13]:
all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b: 
        st = test.loc[k,'text']
    else:
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        st = tokenizer.decode(enc.ids[a-1:b])
    all.append(st)
    
    
test['selected_text'] = all
test[['textID','sentiment','selected_text']]

Unnamed: 0,textID,sentiment,selected_text
0,f87dea47db,neutral,last session of the day
1,96d74cb729,positive,really exciting
2,eee518ae67,negative,such a shame!
3,01082688c6,positive,happy bday!
4,33987a8ee5,positive,i like it!!
...,...,...,...
3529,e5f0e6ef4b,negative,tired
3530,416863ce47,positive,for
3531,6332da480c,negative,depression...
3532,df1baec676,positive,i love
