# LB 0.703

In [1]:
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
print('TF version',tf.__version__)

TF version 2.2.0


# 데이터 불러오기

In [2]:
# Fixed length of every encoded row fed to the model.
MAX_LEN = 96
# Directory containing the RoBERTa vocab/merges/config/weights files.
PATH = './input/tf-roberta/'

# Byte-level BPE tokenizer built from the RoBERTa-base vocabulary and merge rules.
vocab_path = PATH+'vocab-roberta-base.json'
merges_path = PATH+'merges-roberta-base.txt'
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=vocab_path,
    merges_file=merges_path,
    lowercase=True,
    add_prefix_space=True,
)

# Integer id appended to each encoded row to tell the model which sentiment applies.
# NOTE(review): presumably the vocabulary ids of the three sentiment words -- confirm against the vocab file.
sentiment_id = {
    'positive': 1313,
    'negative': 2430,
    'neutral': 7974,
}

# Missing values become '' so the later .split() calls on text columns work.
train = pd.read_csv('train_split.csv').fillna('')
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,b7797a8477,I created my account just to get a chance to ...,: I ADMIRE YOU! you`re amazing! you inspire me...,positive
1,cbdb23868f,too kind Jim too kind brother,kind,positive
2,69879e6134,"Oh, I`ve just watched the third episode of JON...",it`s awesome.,positive
3,169ca458b0,GrimeStopper loss client lost their job so we...,GrimeStopper loss client lost their job so we...,neutral
4,0dbaca8e07,goood,goood,positive


# 학습 데이터 전처리
전처리 및 모델 코드는 https://www.kaggle.com/cdeotte/tensorflow-roberta-0-705 노트북의 자료를 사용하였습니다.

In [3]:
# Encode every training row into fixed-length arrays:
#   input_ids      - [0] + text tokens + [2,2] + sentiment token + [2], padded with 1
#   attention_mask - 1 over the filled positions, 0 over padding
#   token_type_ids - all zeros
#   start_tokens / end_tokens - one-hot position of the first / last token of selected_text
# NOTE(review): ids 0, 2 and 1 are presumably RoBERTa's <s>, </s> and <pad> -- confirm against the vocab.
ct = train.shape[0]
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

# Five positions per row are taken by the special/sentiment tokens, so longer
# texts are truncated instead of raising a shape-mismatch error on assignment.
max_text_tokens = MAX_LEN - 5

for k in range(ct):

    # FIND OVERLAP
    # Whitespace is normalised on both strings so the substring search is reliable.
    text1 = " "+" ".join(train.loc[k,'text'].split())
    text2 = " ".join(train.loc[k,'selected_text'].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    # Mark characters only for a real, non-empty match. With idx == -1 (not found)
    # or idx == 0 (empty selected_text) the slice and text1[idx-1] would wrap
    # around via negative indexing and mark unrelated characters; such rows are
    # left without start/end labels.
    if text2 and idx >= 0:
        chars[idx:idx+len(text2)] = 1
        # Include the preceding space, which belongs to the first token of the span.
        if text1[idx-1]==' ': chars[idx-1] = 1
    enc = tokenizer.encode(text1)
    ids = enc.ids[:max_text_tokens]

    # ID_OFFSETS
    # Character span of each token, rebuilt from the decoded length of every token.
    offsets = []; pos = 0
    for t in ids:
        w = tokenizer.decode([t])
        offsets.append((pos,pos+len(w)))
        pos += len(w)

    # START END TOKENS
    # A token belongs to the answer when it overlaps any marked character.
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i)

    s_tok = sentiment_id[train.loc[k,'sentiment']]
    input_ids[k,:len(ids)+5] = [0] + ids + [2,2] + [s_tok] + [2]
    attention_mask[k,:len(ids)+5] = 1
    if len(toks)>0:
        # +1 accounts for the leading [0] token placed before the text tokens.
        start_tokens[k,toks[0]+1] = 1
        end_tokens[k,toks[-1]+1] = 1

# test 데이터 나누기

In [4]:
# Load the held-out split; missing values become '' so later .split() calls on the text column work.
test = pd.read_csv('test_split.csv').fillna('')

In [5]:
# Detach the ground-truth spans from the test frame so the model inputs contain
# only textID/text/sentiment; the answers are kept aside for the final scoring.
ans_st = test.pop('selected_text')
test

Unnamed: 0,textID,text,sentiment
0,85a02242e3,"Its been a slow day at home, one of my kids is...",neutral
1,a0e0ed4311,about to head to Starbucks. was gonna take the...,negative
2,f5f2a709ca,I like them. I mean I like what`s left of them.,positive
3,064985c587,Grad present photos on facebook,neutral
4,5ab37ce023,"That`s kinda cute, to be honest",positive
...,...,...,...
5491,9e951e5565,Aw! Tear! I feel special to da family. Haha t...,positive
5492,ebc1e3bcb6,Yes PM PM you are right,positive
5493,eefc505b2b,self-portrait week http://unbecominglily.blogs...,neutral
5494,7f392f9c06,The time is not my friend today,negative


In [6]:
ans_st

0       Its been a slow day at home, one of my kids is...
1                                                  missed
2                                            I like them.
3                         Grad present photos on facebook
4                                kinda cute, to be honest
                              ...                        
5491    Aw! Tear! I feel special to da family.  Haha t...
5492                                        you are right
5493    self-portrait week http://unbecominglily.blogs...
5494                      The time is not my friend today
5495    Finally got a call for marriage counseling 3 d...
Name: selected_text, Length: 5496, dtype: object

In [7]:
# Encode every test row with the same layout as the training rows:
# [0] + text tokens + [2,2] + sentiment token + [2], padded with 1 up to MAX_LEN.
ct = test.shape[0]
input_ids_t = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask_t = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids_t = np.zeros((ct,MAX_LEN),dtype='int32')

# Five positions per row are taken by the special/sentiment tokens, so longer
# texts are truncated instead of raising a shape-mismatch error on assignment.
max_text_tokens = MAX_LEN - 5

for k in range(ct):

    # INPUT_IDS
    text1 = " "+" ".join(test.loc[k,'text'].split())
    enc = tokenizer.encode(text1)
    ids = enc.ids[:max_text_tokens]
    s_tok = sentiment_id[test.loc[k,'sentiment']]
    input_ids_t[k,:len(ids)+5] = [0] + ids + [2,2] + [s_tok] + [2]
    attention_mask_t[k,:len(ids)+5] = 1

# 모델

In [8]:
def build_model():
    """Build and compile the RoBERTa span-extraction model.

    The model takes three int32 inputs of length MAX_LEN (token ids, attention
    mask, token type ids) and returns two softmax distributions over the
    MAX_LEN positions: one for the start token and one for the end token of
    the selected span.

    Returns:
        A compiled tf.keras Model (Adam, learning rate 3e-5, categorical
        cross-entropy applied to both outputs).
    """
    def position_softmax(sequence_output):
        # Dropout -> 1x1 convolution to a single channel per position ->
        # flatten to (batch, MAX_LEN) -> softmax across positions.
        head = tf.keras.layers.Dropout(0.1)(sequence_output)
        head = tf.keras.layers.Conv1D(1,1)(head)
        head = tf.keras.layers.Flatten()(head)
        return tf.keras.layers.Activation('softmax')(head)

    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    # Pretrained RoBERTa backbone loaded from the local config and weight files.
    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    backbone_outputs = bert_model(ids,attention_mask=att,token_type_ids=tok)

    # The first backbone output is the per-token hidden-state sequence; the start
    # head is created before the end head, each with its own layers.
    start_probs = position_softmax(backbone_outputs[0])
    end_probs = position_softmax(backbone_outputs[0])

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[start_probs,end_probs])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    return model

In [9]:
# Build one model instance up front just to print its layer summary;
# the cross-validation loop further down builds a fresh model for every fold.
model = build_model()
model.summary()

All model checkpoint weights were used when initializing TFRobertaModel.

All the weights of TFRobertaModel were initialized from the model checkpoint at ./input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 96)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 96)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 96)]         0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 96, 768), (N 124645632   input_1[0][0]                    
______________________________________________________________________________________________

# 자카드 스코어

In [10]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the shared word set divided by the size of the union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0, 1]; 0.5 when neither string contains any word.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    # Both empty: the ratio would be 0/0, so a fixed 0.5 is returned instead.
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)

# 모델 학습

In [11]:
# Stratified K-fold training. For every fold: train a fresh model, reload the
# best checkpoint (lowest val_loss), store out-of-fold predictions, add this
# fold's share to the averaged test predictions, and report the fold Jaccard.
jac = []; VER='v0'; DISPLAY=1 # USE display=1 FOR INTERACTIVE
# Out-of-fold start/end probabilities for every training row, filled fold by fold.
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
# Test-set start/end probabilities averaged over all folds.
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))

skf = StratifiedKFold(n_splits=3,shuffle=True,random_state=777)
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)

    # Drop the previous fold's graph/state before building a new model.
    K.clear_session()
    model = build_model()

    # One checkpoint file per fold; only the best epoch (by val_loss) is kept.
    weights_file = '%s-roberta-%i.h5'%(VER,fold)
    sv = tf.keras.callbacks.ModelCheckpoint(
        weights_file, monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')

    model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]],
        epochs=3, batch_size=32, verbose=DISPLAY, callbacks=[sv],
        validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],
        [start_tokens[idxV,], end_tokens[idxV,]]))

    print('Loading model...')
    model.load_weights(weights_file)

    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)

    print('Predicting Test...')
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits

    # DISPLAY FOLD JACCARD
    # (named fold_scores rather than `all` so the builtin all() is not shadowed)
    fold_scores = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b:
            st = train.loc[k,'text'] # IMPROVE CV/LB with better choice here
        else:
            text1 = " "+" ".join(train.loc[k,'text'].split())
            enc = tokenizer.encode(text1)
            # Positions are shifted by one because of the leading [0] token.
            # Clamp at 0: with a == 0 the slice start would be -1 and select
            # from the end of the token list instead of from its beginning.
            st = tokenizer.decode(enc.ids[max(a-1,0):b])
        fold_scores.append(jaccard(st,train.loc[k,'selected_text']))
    jac.append(np.mean(fold_scores))
    print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(fold_scores))
    print()

#########################
### FOLD 1
#########################


All model checkpoint weights were used when initializing TFRobertaModel.

All the weights of TFRobertaModel were initialized from the model checkpoint at ./input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.69476, saving model to v0-roberta-0.h5
Epoch 2/3
Epoch 00002: val_loss improved from 1.69476 to 1.64124, saving model to v0-roberta-0.h5
Epoch 3/3
Epoch 00003: val_loss improved from 1.64124 to 1.63105, saving model to v0-roberta-0.h5
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 1 Jaccard = 0.708979100753134

#########################
### FOLD 2
#########################


All model checkpoint weights were used when initializing TFRobertaModel.

All the weights of TFRobertaModel were initialized from the model checkpoint at ./input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.80993, saving model to v0-roberta-1.h5
Epoch 2/3
Epoch 00002: val_loss improved from 1.80993 to 1.68999, saving model to v0-roberta-1.h5
Epoch 3/3
Epoch 00003: val_loss did not improve from 1.68999
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 2 Jaccard = 0.696239611372558

#########################
### FOLD 3
#########################


All model checkpoint weights were used when initializing TFRobertaModel.

All the weights of TFRobertaModel were initialized from the model checkpoint at ./input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.72688, saving model to v0-roberta-2.h5
Epoch 2/3
Epoch 00002: val_loss improved from 1.72688 to 1.66374, saving model to v0-roberta-2.h5
Epoch 3/3
Epoch 00003: val_loss did not improve from 1.66374
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 3 Jaccard = 0.6921691970050207



In [12]:
# Report the mean of the per-fold Jaccard scores. The fold count in the label is
# taken from the number of collected scores so it cannot drift from the CV setup.
print('>>>> OVERALL %iFold CV Jaccard ='%len(jac),np.mean(jac))

>>>> OVERALL 5Fold CV Jaccard = 0.6991293030435708


# Kaggle Submission

In [14]:
# Turn the fold-averaged start/end probabilities into predicted text spans.
# The result list keeps the name `all` because the following cell reads it.
all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b:
        # Inconsistent span (start after end): fall back to the whole tweet.
        st = test.loc[k,'text']
    else:
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        # Positions are shifted by one because of the leading [0] token.
        # Clamp at 0: with a == 0 the slice start would be -1 and select
        # from the end of the token list instead of from its beginning.
        st = tokenizer.decode(enc.ids[max(a-1,0):b])
    all.append(st)

In [15]:
# Attach the predicted spans and write the two-column submission file.
test['selected_text'] = all
# The file was previously written as 'submission.csv1' (stray trailing character).
test[['textID','selected_text']].to_csv('submission.csv',index=False)
# Widen the column display so tweets are readable in the sample below.
pd.set_option('max_colwidth', 60)
test.sample(25)

Unnamed: 0,textID,text,sentiment,selected_text
5151,1635b82ee7,did your car get towed? Thats what happened to redgie`s...,neutral,did your car get towed? thats what happened to redgie`s...
4159,71d78dd6c4,"Back at home,11 hours till work",neutral,"back at home,11 hours till work"
2759,dce4480092,And a good morning to you! Up early with a bad conscie...,neutral,and a good morning to you! up early with a bad conscien...
649,77d4a716dd,OO YAY 39 FOLLOWERS I WANT 100 HELP ME,positive,help me
4175,5d0aff483b,@_Mintyfresh Thanks very much!!,positive,thanks very much!!
2543,bf73c02eb3,Is listening to the new jonas brothers song,neutral,is listening to the new jonas brothers song
542,b068ec2560,Wow - the dude said I was better than Bobbi Lewis.. neve...,positive,wow - the dude said i was better than bobbi lewis.. nev...
2713,c70c228685,They just admitted my grandma to the hospital... Shes my...,negative,shes my only last one please pray for her
5455,c8e658737f,Installed office for Mac & messed up all my fonts on saf...,negative,"messed up all my fonts on safari, **** on a stick! help..."
5017,248497cbac,signed up for broadband today could take 4-6 weeks,neutral,signed up for broadband today could take 4-6 weeks


## 스코어 비교

In [20]:
# Jaccard score of every predicted span against its held-out ground-truth span.
scores = [jaccard(test["selected_text"][i], ans_st[i]) for i in range(test.shape[0])]
print("average jaccard score:", sum(scores) / len(scores))

average jaccard score: 0.7032586629289909


In [36]:
# Show the source tweet, the predicted span and the ground-truth span for one test row.
example_row = 0
for label, column in (("원본 :", test["text"]), ("예측 :", test["selected_text"]), ("정답 :", ans_st)):
    print(label, column[example_row])

원본 : Its been a slow day at home, one of my kids is sick  .  This little picture cheered me up http://is.gd/JrLa
예측 :  its been a slow day at home, one of my kids is sick . this little picture cheered me up
정답 : Its been a slow day at home, one of my kids is sick  .  This little picture cheered me up


In [38]:
# Show the source tweet, the predicted span and the ground-truth span for one test row.
example_row = 2421
for label, column in (("원본 :", test["text"]), ("예측 :", test["selected_text"]), ("정답 :", ans_st)):
    print(label, column[example_row])

원본 : Scrubs tonight (8.00pm). woo!
예측 :  woo!
정답 : woo!


In [43]:
# Show the source tweet, the predicted span and the ground-truth span for one test row.
example_row = 87
for label, column in (("원본 :", test["text"]), ("예측 :", test["selected_text"]), ("정답 :", ans_st)):
    print(label, column[example_row])

원본 : Im sad right now becuz of my lady
예측 :  im sad
정답 : sad


얼추 비슷하게 예측을 한다.