In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

import warnings 
import gc
import os

import tokenizers
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaConfig



In [2]:
# Register pandas' date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Cap how many rows / columns of a DataFrame are rendered in cell output.
pd.options.display.max_rows=50
pd.options.display.max_columns=100
# Switch matplotlib to its hand-drawn "xkcd" sketch style.
plt.xkcd()
# NOTE(review): this hides every warning for the rest of the session,
# including deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

%matplotlib inline
# Applied after plt.xkcd(), so these settings take precedence where they overlap.
sns.set_style('darkgrid')
plt.rcParams.update({'font.size':18, 
                    'font.family':'Humor Sans'})

In [3]:
# Directory holding the competition CSV files (trailing slash is required
# because the file names are appended directly below).
data = '../input/tweet-sentiment-extraction/'

# Training rows; any row with a missing value in any column is dropped.
train_df_main=pd.read_csv(f'{data}train.csv').dropna()

# Submission template; its `selected_text` column is overwritten later on.
submit=pd.read_csv(f'{data}sample_submission.csv')

In [4]:
class TweetData(Dataset):
    """Torch dataset that turns tweet rows into fixed-length RoBERTa inputs.

    Each item is a dict with:

    * ``ids``     - token ids laid out as ``0, <sentiment>, 2, 2, <tweet>, 2``
      and then padded with ``1`` up to ``max_len``;
    * ``masks``   - 1 for every non-padding position, 0 for padding;
    * ``tweet``   - lower-cased, whitespace-normalised text with one leading
      space;
    * ``offsets`` - per-token ``(start, end)`` character spans into ``tweet``
      (``(0, 0)`` for special and padding tokens).

    When the frame has a ``selected_text`` column the item also carries
    ``start_idx`` / ``end_idx``: the first and last token positions that
    overlap the selected text.

    Args:
        df: DataFrame with ``text`` and ``sentiment`` columns and, for
            labelled data, a ``selected_text`` column.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self,df,max_len=96):
        self.df=df
        self.max_len=max_len
        # Targets are only produced when the frame actually has labels.
        self.label='selected_text' in df
        roberta_files='../input/roberta-base/'
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file = f'{roberta_files}vocab.json',
            merges_file = f'{roberta_files}merges.txt',
            lowercase=True,
            add_prefix_space=True)

    def __getitem__(self, idx):
        """Return the model inputs (and targets, if labelled) for row ``idx``."""
        data={}
        row=self.df.iloc[idx]

        ids, masks, tweet, offsets = self.get_input_data(row)
        data['ids']=ids
        data['masks']=masks
        data['tweet']=tweet
        data['offsets']=offsets

        if self.label:
            start_idx, end_idx = self.get_target_idx(row,tweet,offsets)
            data['start_idx']=start_idx
            data['end_idx']=end_idx

        return data

    def __len__(self):
        return len(self.df)

    def get_input_data(self,row):
        """Encode one row into ``(ids, masks, tweet, offsets)``.

        ``ids``, ``masks`` and ``offsets`` always have exactly ``max_len``
        entries so that items can be stacked into a batch.
        """
        tweet = " " + " ".join(row.text.lower().split())
        encoding = self.tokenizer.encode(tweet)
        sentiment_id = self.tokenizer.encode(row.sentiment).ids

        prefix = [0] + sentiment_id + [2,2]
        ids = prefix + encoding.ids + [2]
        # One (0, 0) per prefix token keeps ids and offsets aligned even if the
        # sentiment is ever encoded as more than one token.
        offsets = [(0,0)]*len(prefix) + list(encoding.offsets) + [(0,0)]

        # Over-long inputs are cut to max_len (keeping the closing token);
        # otherwise items of different length cannot be collated into a batch.
        if len(ids) > self.max_len:
            ids = ids[:self.max_len-1] + [2]
            offsets = offsets[:self.max_len-1] + [(0,0)]

        pad_len = self.max_len - len(ids)
        if pad_len>0:
            ids+=[1]*pad_len
            offsets += [(0,0)] * pad_len

        ids = torch.tensor(ids)
        # Everything that is not the padding id (1) is attended to.
        masks = (ids != 1).long()
        offsets = torch.tensor(offsets)

        return ids, masks, tweet, offsets

    def get_target_idx(self, row, tweet, offsets):
        """Return ``(start_idx, end_idx)`` token positions of the selected text.

        The normalised ``selected_text`` is located at its first occurrence in
        ``tweet``; every token whose character span overlaps that occurrence
        is a target token, and the first / last of them are returned.

        If the selected text is empty, cannot be found in ``tweet``, or was
        cut off by truncation, the span of all tweet tokens is returned
        instead (or ``(0, 0)`` when no token carries any text), rather than
        raising ``IndexError``.
        """
        selected = " ".join(row.selected_text.lower().split())
        char_start = tweet.find(selected) if selected else -1
        char_end = char_start + len(selected)

        spans = [(int(begin), int(end)) for begin, end in offsets]

        target_idx = []
        if char_start >= 0:
            # A token is a target when [begin, end) overlaps [char_start, char_end).
            target_idx = [j for j, (begin, end) in enumerate(spans)
                          if max(begin, char_start) < min(end, char_end)]

        if not target_idx:
            # Fallback: every token that maps to some text of the tweet.
            target_idx = [j for j, (begin, end) in enumerate(spans) if end > begin]
        if not target_idx:
            return 0, 0

        return target_idx[0], target_idx[-1]

In [5]:
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Build the training and validation loaders for one fold.

    Args:
        df: Full labelled DataFrame.
        train_idx: Positional row indices of the training split.
        val_idx: Positional row indices of the validation split.
        batch_size: Batch size used by both loaders.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to ``DataLoader``s.
        Only the training loader shuffles and drops its last incomplete batch.
    """
    frames = (('train', df.iloc[train_idx]), ('val', df.iloc[val_idx]))

    loaders = {}
    for split, frame in frames:
        training = split == 'train'
        loaders[split] = DataLoader(
            TweetData(frame),
            batch_size=batch_size,
            shuffle=training,
            num_workers=0,
            drop_last=training)
    return loaders

def get_testloader(df, batch_size=32):
    """Wrap ``df`` in a ``TweetData`` dataset and return an in-order loader.

    Rows are served unshuffled, so predictions come out in the order of ``df``.
    """
    dataset = TweetData(df)
    return DataLoader(dataset, batch_size=batch_size, num_workers=0, shuffle=False)

In [6]:
class TweetNet(nn.Module):
    """RoBERTa encoder with a linear head that scores span start / end tokens.

    The head is applied to the average of the encoder's last four hidden
    states and produces two logits per token: one for "span starts here" and
    one for "span ends here".
    """

    def __init__(self):
        super().__init__()
        roberta_files='../input/roberta-base/'
        # output_hidden_states=True makes the encoder return every layer's
        # hidden states, which forward() needs.
        config=RobertaConfig.from_pretrained(
            f'{roberta_files}config.json', output_hidden_states=True)
        self.roberta = RobertaModel.from_pretrained(
            f'{roberta_files}pytorch_model.bin', config=config)
        self.dropout=nn.Dropout(0.5)
        self.fc=nn.Linear(config.hidden_size, 2)

        nn.init.normal_(self.fc.weight, std=0.02)
        # The bias adds the same constant to every token position, so it
        # cancels out in a softmax taken over the sequence.
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, one logit per input token."""
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Integer indexing works both for the legacy tuple output and for the
        # dict-like ModelOutput of newer transformers releases; unpacking the
        # latter would iterate over its keys instead of its values.
        # Position 2 holds the per-layer hidden states.
        hs = outputs[2]
        x = torch.stack([hs[-1], hs[-2], hs[-3], hs[-4]])
        x = torch.mean(x,0)
        x = self.dropout(x)
        x = self.fc(x)
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits=start_logits.squeeze(-1)
        end_logits=end_logits.squeeze(-1)

        return start_logits, end_logits

In [7]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Sum of the cross-entropy losses for the start and the end position.

    Each logits tensor is scored against its target positions with a
    mean-reduced ``nn.CrossEntropyLoss``; the two results are added.
    """
    criterion = nn.CrossEntropyLoss()
    return criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

In [8]:
def get_selected_text(text, start_idx, end_idx, offsets):
    """Rebuild the text covered by tokens ``start_idx`` .. ``end_idx`` inclusive.

    Each token contributes ``text[start:end]`` for its offset pair. A single
    space is appended after a token whenever the following token's span
    starts later than this token's span ends.
    """
    pieces = []
    n_tokens = len(offsets)
    for pos in range(start_idx, end_idx + 1):
        begin, end = offsets[pos][0], offsets[pos][1]
        pieces.append(text[begin:end])
        following = pos + 1
        if following < n_tokens and end < offsets[following][0]:
            pieces.append(" ")
    return "".join(pieces)

def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Returns:
        A float in ``[0, 1]``. Two inputs that both contain no words are
        treated as identical and score ``1.0`` (the usual convention for two
        empty sets) instead of raising ``ZeroDivisionError``.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def Jaccard_Score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Jaccard score between the predicted and the reference span of one tweet.

    The predicted span runs from the argmax of ``start_logits`` to the argmax
    of ``end_logits``; if the predicted start lies after the predicted end,
    the whole ``text`` is used as the prediction. The reference span is
    rebuilt from ``start_idx`` / ``end_idx``.
    """
    best_start = np.argmax(start_logits)
    best_end = np.argmax(end_logits)

    # An inverted span cannot be decoded, so fall back to the full text.
    pred = text if best_start > best_end else get_selected_text(
        text, best_start, best_end, offsets)
    true = get_selected_text(text, start_idx, end_idx, offsets)

    return jaccard(true, pred)

In [9]:
def train_model(model, dl_dict, criterion, optimizer, n_epochs, filename):
    """Train and validate ``model`` on the GPU, then save its weights.

    Every epoch runs a ``'train'`` phase (with gradient updates) followed by a
    ``'val'`` phase (no gradients) and prints the average loss and Jaccard
    score of each phase.

    Args:
        model: Network returning ``(start_logits, end_logits)`` for
            ``(ids, masks)``.
        dl_dict: Dict with ``'train'`` and ``'val'`` data loaders whose batches
            provide ``ids``, ``masks``, ``tweet``, ``offsets``, ``start_idx``
            and ``end_idx``.
        criterion: Callable ``(start_logits, end_logits, start_idx, end_idx)``
            returning a scalar loss.
        optimizer: Optimizer over ``model``'s parameters.
        n_epochs: Number of epochs to run.
        filename: Path the final ``state_dict`` is written to.
    """
    model.cuda()
    for epoch in range(n_epochs):
        for phase in ['train','val']:
            if phase=='train':
                model.train()
            else:
                model.eval()

            epoch_loss=0.0
            epoch_jaccard=0.0
            # Count the samples actually processed: a loader that drops its
            # last incomplete batch sees fewer samples than len(dataset).
            n_samples=0

            for data in (dl_dict[phase]):
                torch.cuda.empty_cache()
                ids = data['ids'].cuda()
                masks = data['masks'].cuda()
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].cuda()
                end_idx = data['end_idx'].cuda()
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase=='train'):
                    start_logits, end_logits = model(ids, masks)
                    loss = criterion(start_logits, end_logits, start_idx, end_idx)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    # The loss is a batch mean, so weight it by the batch size.
                    epoch_loss += loss.item() * len(ids)
                    n_samples += len(ids)

                    start_idx = start_idx.cpu().detach().numpy()
                    end_idx = end_idx.cpu().detach().numpy()
                    start_logits = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                    end_logits = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

                    for i in range(len(ids)):
                        jaccard_score = Jaccard_Score(tweet[i], start_idx[i], end_idx[i],
                                                     start_logits[i], end_logits[i], offsets[i])
                        epoch_jaccard+=jaccard_score

            # Average over the loader passed in (not a notebook-level global);
            # max() guards against an empty loader.
            denom = max(n_samples, 1)
            epoch_loss = epoch_loss / denom
            epoch_jaccard = epoch_jaccard / denom

            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                        epoch+1, n_epochs, phase, epoch_loss, epoch_jaccard))

    torch.save(model.state_dict(), filename)

In [10]:
# Training hyper-parameters used by the cross-validation loop.
n_epochs=3
batch_size=32
# 5 stratified folds, shuffled with a fixed seed so the splits are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)

In [11]:
%%time

# Make sure both text columns are plain strings before they reach the dataset.
train_df_main['text']=train_df_main['text'].astype(str)
train_df_main['selected_text'] = train_df_main['selected_text'].astype(str)

# One freshly initialised model per fold; folds are stratified by sentiment.
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df_main, train_df_main.sentiment)):
    print(f'Fold: {fold}')
    
    model = TweetNet()
    optimizer = optim.AdamW(model.parameters(),lr = 5e-5, betas = (0.99, 0.999))
    criterion = loss_fn
    dataloaders_dict = get_train_val_loaders(train_df_main, train_idx, val_idx, batch_size)
    
    # Trains for n_epochs and writes the fold's weights to roberta_fold<fold>.pth.
    train_model(model, dataloaders_dict, criterion, optimizer,
                    n_epochs, f'roberta_fold{fold}.pth')
    


Fold: 0
Epoch 1/3 | train | Loss: 2.2496 | Jaccard: 0.6503
Epoch 1/3 |  val  | Loss: 1.7535 | Jaccard: 0.7047
Epoch 2/3 | train | Loss: 1.6749 | Jaccard: 0.7110
Epoch 2/3 |  val  | Loss: 1.6268 | Jaccard: 0.7236
Epoch 3/3 | train | Loss: 1.5180 | Jaccard: 0.7310
Epoch 3/3 |  val  | Loss: 1.6795 | Jaccard: 0.7190
Fold: 1
Epoch 1/3 | train | Loss: 2.2329 | Jaccard: 0.6477
Epoch 1/3 |  val  | Loss: 1.6584 | Jaccard: 0.7076
Epoch 2/3 | train | Loss: 1.6548 | Jaccard: 0.7155
Epoch 2/3 |  val  | Loss: 1.5870 | Jaccard: 0.7187
Epoch 3/3 | train | Loss: 1.4913 | Jaccard: 0.7344
Epoch 3/3 |  val  | Loss: 1.6266 | Jaccard: 0.7200
Fold: 2
Epoch 1/3 | train | Loss: 2.1375 | Jaccard: 0.6559
Epoch 1/3 |  val  | Loss: 1.6915 | Jaccard: 0.7033
Epoch 2/3 | train | Loss: 1.5964 | Jaccard: 0.7230
Epoch 2/3 |  val  | Loss: 1.6705 | Jaccard: 0.7057
Epoch 3/3 | train | Loss: 1.4137 | Jaccard: 0.7439
Epoch 3/3 |  val  | Loss: 1.7126 | Jaccard: 0.7120
Fold: 3
Epoch 1/3 | train | Loss: 2.1739 | Jaccard: 0.6587

In [12]:
# Unlabelled test tweets to predict on.
test_df = pd.read_csv(f'../input/tweet-sentiment-extraction/test.csv')

In [13]:
%%time

# Cast to str so every row supports the string methods the dataset calls.
test_df['text'] = test_df['text'].astype(str)
test_loader = get_testloader(test_df)
predictions = []
models = []

# Load the saved weights of every fold; all models stay on the GPU in eval mode.
for fold in range(skf.n_splits):
    model = TweetNet()
    model.cuda()
    model.load_state_dict(torch.load(f'roberta_fold{fold}.pth'))
    model.eval()
    models.append(model)

for data in test_loader:
    ids = data['ids'].cuda()
    masks = data['masks'].cuda()
    tweet = data['tweet']
    offsets = data['offsets'].numpy()

    # Despite the names, these lists collect softmax probabilities
    # (one array per fold model), not raw logits.
    start_logits = []
    end_logits = []
    for model in models:
        with torch.no_grad():
            output = model(ids, masks)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

    # Ensemble by averaging the per-model probabilities.
    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)
    for i in range(len(ids)):    
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        # An inverted span cannot be decoded, so the whole tweet is predicted.
        if start_pred > end_pred:
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        predictions.append(pred)

CPU times: user 1min 5s, sys: 21.5 s, total: 1min 26s
Wall time: 1min 26s


In [14]:
def _collapse_repeated_punctuation(text):
    """Shorten punctuation runs in single-word predictions.

    Multi-word (or empty) predictions are returned unchanged. For a
    single-word prediction the replacements are applied in this exact order:
    '!!!!' -> '!', then '..' -> '.', then '...' -> '.'.

    NOTE(review): because '..' is replaced before '...', a run of exactly
    three dots ends up as '..'. The order is kept as it was.
    """
    if len(text.split()) != 1:
        return text
    for run, single in (('!!!!', '!'), ('..', '.'), ('...', '.')):
        text = text.replace(run, single)
    return text


submit['selected_text'] = predictions
submit['selected_text'] = submit['selected_text'].apply(_collapse_repeated_punctuation)

submit.to_csv('submission.csv', index=False)
submit

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy bday!
4,33987a8ee5,i like it!!
...,...,...
3529,e5f0e6ef4b,tired
3530,416863ce47,thanks
3531,6332da480c,sinking into depression...
3532,df1baec676,i love your videos!
