In [None]:
!nvidia-smi

Sat Jul  4 04:40:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Tweet Sentiment Extraction Final

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Tweet Sentiment Extraction Final


In [None]:
!pip install transformers



# Utils

In [None]:
import os
import random

import torch
import numpy as np

class utils:
    """Namespace of training helpers: seeding, metrics, meters, early stopping.

    Every helper is a static method, so it is called as ``utils.name(...)``
    without instantiating the class.
    """

    @staticmethod
    def seed_everything(seed):
        """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

        Args:
            seed: Integer seed applied to every generator.
        """
        random.seed(seed)
        # Exported for child processes; the hash seed of the running
        # interpreter is fixed at start-up and is not changed by this.
        os.environ['PYTHONHASHSEED'] = str(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)

    @staticmethod
    def token_level_to_char_level(text, offsets, preds):
        """Spread per-token predictions over the characters each token covers.

        Args:
            text: Original string; the result has one entry per character.
            offsets: Sequence of (start, end) character offsets, one per token.
            preds: Sequence of per-token values aligned with ``offsets``.

        Returns:
            ``np.ndarray`` of length ``len(text)``; characters not covered by
            any token stay at 0.
        """
        probas_char = np.zeros(len(text))
        for i, offset in enumerate(offsets):
            # Tokens whose offsets are (0, 0) carry no text span and are skipped.
            if offset[0] or offset[1]:
                probas_char[offset[0]:offset[1]] = preds[i]

        return probas_char

    @staticmethod
    def jaccard(str1, str2):
        """Original metric implementation.

        Word-level Jaccard similarity of two strings, case-insensitive.
        Raises ZeroDivisionError when both strings contain no words.
        """
        a = set(str1.lower().split())
        b = set(str2.lower().split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))

    @staticmethod
    def get_best_start_end_idx(start_logits, end_logits,
                               orig_start, orig_end):
        """Return best start and end indices following BERT paper.

        Searches every (start, end) pair with ``start <= end`` inside
        ``[orig_start, orig_end]`` and keeps the pair with the highest
        ``start_logit + end_logit``.

        Returns:
            Tuple ``(start_idx, end_idx)`` in the original index space, or
            ``None`` when the window is empty.
        """
        best_logit = -np.inf
        best_idxs = None
        start_logits = start_logits[orig_start:orig_end + 1]
        end_logits = end_logits[orig_start:orig_end + 1]
        for start_idx, start_logit in enumerate(start_logits):
            # Only ends at or after the start are considered.
            for end_idx, end_logit in enumerate(end_logits[start_idx:]):
                logit_sum = start_logit + end_logit
                if logit_sum > best_logit:
                    best_logit = logit_sum
                    best_idxs = (orig_start + start_idx,
                                 orig_start + start_idx + end_idx)
        return best_idxs

    @staticmethod
    def calculate_jaccard(original_tweet, target_string,
                          start_logits, end_logits,
                          orig_start, orig_end,
                          offsets,
                          verbose=False):
        """Calculates final Jaccard score using predictions.

        Args:
            original_tweet: Full tweet text.
            target_string: Ground-truth selected text.
            start_logits, end_logits: Per-token scores.
            orig_start, orig_end: Inclusive token window to search.
            offsets: Per-token (start, end) character offsets into the tweet.
            verbose: Unused; kept for interface compatibility.

        Returns:
            Tuple ``(jaccard, predicted_text)``.
        """
        # Sibling helpers must be reached through the class: names bound in a
        # class body are not visible as bare names inside its functions.
        start_idx, end_idx = utils.get_best_start_end_idx(
            start_logits, end_logits, orig_start, orig_end)

        filtered_output = ''
        for ix in range(start_idx, end_idx + 1):
            filtered_output += original_tweet[offsets[ix][0]:offsets[ix][1]]
            # Re-insert a space when there is a gap before the next token.
            if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
                filtered_output += ' '

        # Return orig tweet if it has less than 2 words
        if len(original_tweet.split()) < 2:
            filtered_output = original_tweet

        if len(filtered_output.split()) == 1:
            filtered_output = filtered_output.replace('!!!!', '!')
            filtered_output = filtered_output.replace('..', '.')
            # NOTE(review): after the '..' replacement above no '...' run can
            # remain, so this line never matches; order kept to preserve the
            # existing outputs.
            filtered_output = filtered_output.replace('...', '.')

        filtered_output = filtered_output.replace('ïï', 'ï')
        filtered_output = filtered_output.replace('¿¿', '¿')

        jac = utils.jaccard(target_string.strip(), filtered_output.strip())
        return jac, filtered_output

    class AverageMeter:
        """Computes and stores the average and current value."""

        def __init__(self):
            self.reset()

        def reset(self):
            """Clear the last value, running sum, count and average."""
            self.val = 0
            self.avg = 0
            self.sum = 0
            self.count = 0

        def update(self, val, n=1):
            """Record ``val`` as observed ``n`` times and refresh the average."""
            self.val = val
            self.sum += val * n
            self.count += n
            self.avg = self.sum / self.count

    class EarlyStopping:
        """Tracks a validation score, checkpoints improvements, flags stalls.

        Args:
            patience: Number of non-improving calls before ``early_stop`` is set.
            mode: ``"max"`` if a larger score is better, ``"min"`` otherwise.
            delta: Minimum improvement over the best score that counts.
        """

        def __init__(self, patience=7, mode="max", delta=0.001):
            self.patience = patience
            self.counter = 0
            self.mode = mode
            self.best_score = None
            self.early_stop = False
            self.delta = delta
            # np.inf (lower-case) is the spelling that survives NumPy 2.0.
            if self.mode == "min":
                self.val_score = np.inf
            else:
                self.val_score = -np.inf

        def __call__(self, epoch_score, model, model_path):
            """Register one epoch's score; save the model if it improved."""
            # Internally a larger score is always better.
            if self.mode == "min":
                score = -1.0 * epoch_score
            else:
                score = np.copy(epoch_score)

            if self.best_score is None:
                self.best_score = score
                self.save_checkpoint(epoch_score, model, model_path)
            elif score < self.best_score + self.delta:
                self.counter += 1
                print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
                if self.counter >= self.patience:
                    self.early_stop = True
            else:
                self.best_score = score
                self.save_checkpoint(epoch_score, model, model_path)
                self.counter = 0

        def save_checkpoint(self, epoch_score, model, model_path):
            """Write ``model.state_dict()`` to ``model_path`` for finite scores."""
            # A membership test against [.., np.nan] cannot detect NaN because
            # NaN never compares equal; isfinite covers both inf and NaN.
            if np.isfinite(epoch_score):
                print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
                torch.save(model.state_dict(), model_path)
            self.val_score = epoch_score

# Config


```
MODEL_PATHS = {
    'bert-base-uncased': './input/bert-configs/uncased_L-12_H-768_A-12',
    'bert-large-uncased-whole-word-masking-finetuned-squad': './input/bert-configs/wwm_uncased_L-24_H-1024_A-16',
    'albert-large-v2': './input/albert-configs/albert-large-v2',
    'albert-base-v2': './input/albert-configs/albert-base-v2',
    'distilbert': './input/albert-configs/distilbert',
}
```



```
TRANSFORMERS = {   
    "bert-base-uncased": (BertModel, "bert-base-uncased", BertConfig),
    "bert-large-uncased-whole-word-masking-finetuned-squad": (BertModel, "bert-large-uncased-whole-word-masking-finetuned-squad", BertConfig),
    'albert-base-v2': (AlbertModel, 'albert-base-v2', AlbertConfig),
    'albert-large-v2': (AlbertModel, 'albert-large-v2', AlbertConfig),
    "distilbert": (DistilBertModel, "distilbert-base-uncased-distilled-squad", DistilBertConfig),
}

```






In [None]:
import tokenizers

class config:
  """Settings shared by the dataset, model, training and inference cells."""

  # ---- Pretrained assets ----
  # Tokenizer vocab, pretrained weights and model config all point at the
  # same directory.
  TOKENIZER_PATH = './input/bert-configs/wwm_uncased_L-24_H-1024_A-16'
  PRETRAINED_MODEL_PATH = TOKENIZER_PATH
  MODEL_CONFIG = TOKENIZER_PATH

  # ---- Data files ----
  TRAIN_FILE = './input/tweet-sentiment-extraction/train_folds.csv'
  TEST_FILE = './input/tweet-sentiment-extraction/test.csv'
  SAMPLE_SUBMISSION_FILE = './input/tweet-sentiment-extraction/sample_submission.csv'

  # ---- Outputs ----
  # Both values name the same directory: per-fold weights are written as
  # model_<fold>.bin and the submission as submission.csv inside it.
  SAVE_WEIGHTS_PATH = './output/Bert Large'
  FINAL_SUBMISSION_FILE = './output/Bert Large'

  # ---- Run setup ----
  SEED = 25
  N_FOLDS = 5
  EPOCHS = 5
  TRAIN_BATCH_SIZE = 32
  VALID_BATCH_SIZE = 16
  MAX_LEN = 128  # every encoded sample is padded or truncated to this length

  # ---- Optimisation ----
  LEARNING_RATE = 4e-5
  WARMUP_RATIO = 0.25
  WEIGHT_DECAY = 0.001
  # Both are None here — presumably placeholders; verify before relying on them.
  PATIENCE = None
  EARLY_STOPPING_DELTA = None
  USE_SWA = False
  SWA_RATIO = 0.9
  SWA_FREQ = 30

  # ---- Model head / labels ----
  HIDDEN_SIZE = 1024    # sizes the classifier input (2 * HIDDEN_SIZE) in TweetModel
  N_LAST_HIDDEN = 24    # number of trailing hidden states pooled in TweetModel
  HIGH_DROPOUT = 0.5    # dropout rate of the multi-sample dropout head
  SOFT_ALPHA = 0.6      # weight kept on the exact start/end token in the soft labels

  # ---- Tokenizer ----
  # Built once at class-definition time from vocab.txt in TOKENIZER_PATH.
  TOKENIZER = tokenizers.BertWordPieceTokenizer(
        TOKENIZER_PATH + "/vocab.txt",
        lowercase=True
    )


# Dataset

In [None]:
import numpy as np
import torch

def jaccard_array(a, b):
    """Jaccard index of two iterables, each treated as a set of items.

    Returns ``|A & B| / |A | B|`` as a float; raises ZeroDivisionError when
    both inputs are empty.
    """
    left, right = set(a), set(b)
    n_shared = len(left & right)
    n_union = len(left) + len(right) - n_shared
    return float(n_shared) / n_union

def process_data(tweet, selected_text, sentiment,
                 tokenizer, max_len, soft_alpha=None):
    """Preprocesses one data sample and returns a dict
    with targets and other useful info.

    Args:
        tweet: Full tweet text.
        selected_text: Substring of ``tweet`` that is the extraction target.
        sentiment: One of ``'positive'``, ``'negative'``, ``'neutral'``.
        tokenizer: Object whose ``encode(text)`` result exposes ``ids`` and
            ``offsets`` with one leading and one trailing special token.
        max_len: Length every returned sequence is padded or truncated to.
        soft_alpha: Weight kept on the exact start/end token in the soft
            labels; the remaining ``1 - soft_alpha`` is spread according to
            Jaccard overlap. Defaults to ``config.SOFT_ALPHA``.

    Returns:
        Dict with ``ids``, ``mask``, ``token_type_ids``, ``start_labels``,
        ``end_labels``, ``offsets`` (all of length ``max_len``) plus the
        untouched ``orig_tweet``, ``orig_selected`` and ``sentiment``.

    Raises:
        ValueError: If no token of ``tweet`` overlaps ``selected_text``
            (empty or unmatched selection, or a tweet with no tokens).
        KeyError: If ``sentiment`` is not one of the three known labels.
    """
    if soft_alpha is None:
        soft_alpha = config.SOFT_ALPHA

    # First occurrence of selected_text inside the tweet (-1 when absent).
    idx_0 = tweet.find(selected_text) if selected_text else -1

    # Assign 1 as target for each char in sel_text
    char_targets = [0] * len(tweet)
    if idx_0 >= 0:
        idx_1 = idx_0 + len(selected_text)
        char_targets[idx_0:idx_1] = [1] * len(selected_text)

    # Check the example - https://github.com/huggingface/tokenizers
    tokenized_tweet = tokenizer.encode(tweet)
    # Vocab ids, without the leading/trailing special tokens
    input_ids_original = tokenized_tweet.ids[1:-1]
    # Start and end char of every remaining token
    tweet_offsets = tokenized_tweet.offsets[1:-1]

    # Get ids within tweet of words that have target char
    target_ids = [
        i for i, (offset_0, offset_1) in enumerate(tweet_offsets)
        if sum(char_targets[offset_0:offset_1]) > 0
    ]
    if not target_ids:
        raise ValueError(
            'selected_text {!r} does not overlap any token of tweet {!r}'
            .format(selected_text, tweet))

    targets_start = target_ids[0]
    targets_end = target_ids[-1]

    # Sentiment 'word' id in vocab
    sentiment_id = {'positive': 3893,
                    'negative': 4997,
                    'neutral': 8699}

    # Soft Jaccard labels
    # ----------------------------------
    n = len(input_ids_original)
    sentence = np.arange(n)
    answer = sentence[targets_start:targets_end + 1]

    # Candidate starts: every span [i, targets_end] scored against the answer.
    start_labels = np.zeros(n)
    for i in range(targets_end + 1):
        jac = jaccard_array(answer, sentence[i:targets_end + 1])
        start_labels[i] = jac + jac ** 2
    start_labels = (1 - soft_alpha) * start_labels / start_labels.sum()
    start_labels[targets_start] += soft_alpha

    # Candidate ends: every span [targets_start, i] scored against the answer.
    end_labels = np.zeros(n)
    for i in range(targets_start, n):
        jac = jaccard_array(answer, sentence[targets_start:i + 1])
        end_labels[i] = jac + jac ** 2
    end_labels = (1 - soft_alpha) * end_labels / end_labels.sum()
    end_labels[targets_end] += soft_alpha

    # Three prefix tokens and one closing token get zero label mass.
    start_labels = [0, 0, 0] + list(start_labels) + [0]
    end_labels = [0, 0, 0] + list(end_labels) + [0]
    # ----------------------------------

    # Input for BERT: [101] sentiment [102] tweet tokens [102]
    input_ids = [101] + [sentiment_id[sentiment]] + [102] + input_ids_original + [102]
    token_type_ids = [0, 0, 0] + [1] * (len(input_ids_original) + 1)
    # Mask of input without padding
    mask = [1] * len(token_type_ids)
    # Start and end char ids for each word including new tokens
    tweet_offsets = [(0, 0)] * 3 + tweet_offsets + [(0, 0)]
    # Ids within tweet of words that have target char including new tokens
    targets_start += 3
    targets_end += 3

    # Input padding: new mask, token type ids, tweet offsets
    padding_len = max_len - len(input_ids)
    if padding_len > 0:
        # NOTE(review): padding uses id 1; confirm this is the intended pad
        # id for the BERT vocab in use (positions are masked out either way).
        input_ids = input_ids + ([1] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_len)
        start_labels = start_labels + ([0] * padding_len)
        end_labels = end_labels + ([0] * padding_len)
    else:
        # Truncation drops everything past max_len, including any label
        # mass that falls beyond it.
        input_ids = input_ids[:max_len]
        mask = mask[:max_len]
        token_type_ids = token_type_ids[:max_len]
        tweet_offsets = tweet_offsets[:max_len]
        start_labels = start_labels[:max_len]
        end_labels = end_labels[:max_len]

    return {'ids': input_ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'start_labels': start_labels,
            'end_labels': end_labels,
            'orig_tweet': tweet,
            'orig_selected': selected_text,
            'sentiment': sentiment,
            'offsets': tweet_offsets}


class TweetDataset:
    """Map-style dataset that encodes one tweet per index via ``process_data``.

    Tokenizer and maximum length are taken from ``config`` at construction.
    """

    def __init__(self, tweets, sentiments, selected_texts):
        self.tweets, self.sentiments, self.selected_texts = (
            tweets, sentiments, selected_texts)
        self.max_len = config.MAX_LEN
        self.tokenizer = config.TOKENIZER

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item):
        """Encode sample ``item`` and return it as a dict.

        Numeric fields become tensors (``long`` for ids/mask/offsets,
        ``float`` for the soft labels); text fields are passed through.
        """
        sample = process_data(
            self.tweets[item],
            self.selected_texts[item],
            self.sentiments[item],
            self.tokenizer,
            self.max_len,
        )

        def as_tensor(key, dtype):
            return torch.tensor(sample[key], dtype=dtype)

        return {
            'ids': as_tensor('ids', torch.long),
            'mask': as_tensor('mask', torch.long),
            'token_type_ids': as_tensor('token_type_ids', torch.long),
            'start_labels': as_tensor('start_labels', torch.float),
            'end_labels': as_tensor('end_labels', torch.float),
            'orig_tweet': sample['orig_tweet'],
            'orig_selected': sample['orig_selected'],
            'sentiment': sample['sentiment'],
            'offsets': as_tensor('offsets', torch.long),
        }


# Models

In [None]:
import torch
import transformers

class TweetModel(transformers.BertPreTrainedModel):
    """BERT backbone with a span head producing start and end logits per token.

    The head pools the last ``config.N_LAST_HIDDEN`` hidden states (mean and
    max over layers), concatenates both, and applies a linear classifier
    under multi-sample dropout.

    Args:
        conf: Model configuration; it must enable ``output_hidden_states``
            because ``forward`` reads the per-layer hidden states.
    """

    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        # The attribute is called `roberta` although the backbone is BERT;
        # the name is kept because it is part of the saved state_dict keys.
        self.roberta = transformers.BertModel.from_pretrained(
            config.PRETRAINED_MODEL_PATH,
            config=conf)
        self.high_dropout = torch.nn.Dropout(config.HIGH_DROPOUT)
        # Input is the concatenation of mean- and max-pooled features.
        self.classifier = torch.nn.Linear(config.HIDDEN_SIZE * 2, 2)

        torch.nn.init.normal_(self.classifier.weight, std=0.02)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(start_logits, end_logits)``, each (batch_size, num_tokens)."""
        outputs = self.roberta(ids, attention_mask=mask,
                               token_type_ids=token_type_ids)
        # Third output = all hidden states (embeddings + every layer), each
        # (batch_size, num_tokens, hidden_size). Indexing instead of 3-way
        # unpacking keeps this working whether the backbone returns a plain
        # tuple or an output object, and regardless of extra trailing outputs.
        hidden_states = outputs[2]

        # (N_LAST_HIDDEN, batch_size, num_tokens, hidden_size)
        out = torch.stack(
            tuple(hidden_states[-i - 1] for i in range(config.N_LAST_HIDDEN)),
            dim=0)
        out_mean = torch.mean(out, dim=0)
        out_max, _ = torch.max(out, dim=0)
        out = torch.cat((out_mean, out_max), dim=-1)

        # Multisample Dropout: https://arxiv.org/abs/1905.09788
        logits = torch.mean(torch.stack([
            self.classifier(self.high_dropout(out))
            for _ in range(5)
        ], dim=0), dim=0)

        start_logits, end_logits = logits.split(1, dim=-1)

        # (batch_size, num_tokens)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits


# Loss Function

In [None]:
# Takes raw logits; labels must be probability distributions over tokens.
def loss_fn(start_logits, end_logits, start_labels, end_labels):
  """KL-divergence loss between predicted and soft target distributions.

  The logits are turned into log-probabilities with a log-softmax over
  dim 1, compared with the label distributions using a batch-mean KL
  divergence, and the start and end terms are summed.

  Args:
    start_logits, end_logits: Raw scores, shape (batch_size, num_tokens).
    start_labels, end_labels: Target probabilities with the same shape.

  Returns:
    Scalar tensor: start loss + end loss.
  """
  # Use torch.nn directly so this cell does not depend on the `nn` alias
  # that is only imported further down the notebook.
  logsoftmax = torch.nn.LogSoftmax(dim=1)
  loss_fct = torch.nn.KLDivLoss(reduction='batchmean')
  start_loss = loss_fct(logsoftmax(start_logits), start_labels)
  end_loss = loss_fct(logsoftmax(end_logits), end_labels)
  return start_loss + end_loss

# Training Function

In [None]:
from tqdm.autonotebook import tqdm

def train_fn(data_loader, model, optimizer, device, scheduler=None):
    """Run one training epoch and show running loss / Jaccard on the progress bar.

    Args:
        data_loader: Iterable of batches shaped like ``TweetDataset`` items.
        model: Module returning ``(start_logits, end_logits)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the tensors are moved to.
        scheduler: Optional LR scheduler, stepped once per batch when given.
    """
    # First thing we want to do is put the model in train mode
    model.train()

    # Running averages of loss and Jaccard score (see utils.AverageMeter)
    losses = utils.AverageMeter()
    jaccards = utils.AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))
    for d in tk0:

        # Load data into variables
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]
        start_labels = d["start_labels"]
        end_labels = d["end_labels"]
        orig_tweet = d["orig_tweet"]
        orig_selected = d["orig_selected"]
        sentiment = d["sentiment"]
        offsets = d["offsets"]

        # Push the variables to GPU device
        # https://pytorch.org/docs/stable/tensor_attributes.html
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        start_labels = start_labels.to(device, dtype=torch.float)
        end_labels = end_labels.to(device, dtype=torch.float)

        # Clear gradients w.r.t. parameters
        model.zero_grad()

        # Forward pass to get outputs
        outputs_start, outputs_end = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )

        # Calculate loss: KL divergence against the soft start/end labels
        loss = loss_fn(outputs_start, outputs_end, start_labels, end_labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()
        # The scheduler is optional (defaults to None), so only step it
        # when one was actually supplied.
        if scheduler is not None:
            scheduler.step()

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        # Training-time Jaccard from the arg-max start/end token of each sample
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            jaccard_score, _ = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            jaccard_scores.append(jaccard_score)

        # Update and print
        losses.update(loss.item(), ids.size(0))
        jaccards.update(np.mean(jaccard_scores), ids.size(0))
        tk0.set_postfix(loss=losses.avg, jaccard=jaccards.avg)

# Evaluation Functions

In [None]:
def calculate_jaccard_score(
    original_tweet,
    target_string,
    sentiment_val,
    idx_start,
    idx_end,
    offsets,
    verbose=False):
    """Decode a predicted token span to text and score it against the target.

    Args:
        original_tweet: Full tweet text.
        target_string: Ground-truth selected text.
        sentiment_val: Sentiment label; ``"neutral"`` returns the whole tweet.
        idx_start, idx_end: Predicted start / end token indices (inclusive).
        offsets: Per-token (start, end) character offsets into the tweet.
        verbose: Unused.

    Returns:
        Tuple ``(jaccard, predicted_text)``.
    """
    # An end before the start collapses the span to the single start token.
    last_idx = idx_start if idx_end < idx_start else idx_end

    n_offsets = len(offsets)
    pieces = []
    for ix in range(idx_start, last_idx + 1):
        pieces.append(original_tweet[offsets[ix][0]: offsets[ix][1]])
        # Keep a space wherever the next token does not start right away.
        if (ix + 1) < n_offsets and offsets[ix][1] < offsets[ix + 1][0]:
            pieces.append(" ")
    filtered_output = "".join(pieces)

    # Neutral tweets and tweets shorter than two words are returned whole.
    use_whole_tweet = (sentiment_val == "neutral"
                       or len(original_tweet.split()) < 2)
    if use_whole_tweet:
        filtered_output = original_tweet

    score = utils.jaccard(target_string.strip(), filtered_output.strip())
    return score, filtered_output


def eval_fn(data_loader, model, device):
  """Score ``model`` on ``data_loader`` without gradient tracking.

  Prints and returns the batch-size-weighted mean Jaccard score; the
  running loss is only shown on the progress bar.
  """
  model.eval()
  loss_meter = utils.AverageMeter()
  jaccard_meter = utils.AverageMeter()

  with torch.no_grad():
    progress = tqdm(data_loader, total=len(data_loader))
    for batch in progress:
      # Text fields stay on the host; tensors move to the target device.
      tweets = batch["orig_tweet"]
      targets = batch["orig_selected"]
      sentiments = batch["sentiment"]
      offsets = batch["offsets"]

      ids = batch["ids"].to(device, dtype=torch.long)
      token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
      mask = batch["mask"].to(device, dtype=torch.long)
      start_labels = batch["start_labels"].to(device, dtype=torch.float)
      end_labels = batch["end_labels"].to(device, dtype=torch.float)

      start_logits, end_logits = model(
          ids=ids,
          mask=mask,
          token_type_ids=token_type_ids
      )

      loss = loss_fn(start_logits, end_logits, start_labels, end_labels)
      start_probs = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
      end_probs = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

      # One score per sample, using the arg-max start and end tokens.
      batch_scores = []
      for px, tweet in enumerate(tweets):
        score, _ = calculate_jaccard_score(
            original_tweet=tweet,
            target_string=targets[px],
            sentiment_val=sentiments[px],
            idx_start=np.argmax(start_probs[px, :]),
            idx_end=np.argmax(end_probs[px, :]),
            offsets=offsets[px]
        )
        batch_scores.append(score)

      batch_size = ids.size(0)
      jaccard_meter.update(np.mean(batch_scores), batch_size)
      loss_meter.update(loss.item(), batch_size)
      progress.set_postfix(loss=loss_meter.avg, jaccard=jaccard_meter.avg)

  print(f"Jaccard = {jaccard_meter.avg}")
  return jaccard_meter.avg

# Training

In [None]:
import pandas as pd
import torch.nn as nn

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

def run(fold):
    """Train and validate one cross-validation fold.

    Rows whose ``kfold`` column equals ``fold`` form the validation set and
    all other rows the training set. After every epoch the validation
    Jaccard is passed to early stopping, which saves the best weights to
    ``<config.SAVE_WEIGHTS_PATH>/model_<fold>.bin``.

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.
    """
    # Seed every RNG so that shuffling, dropout and head initialisation are
    # repeatable from run to run.
    utils.seed_everything(config.SEED)

    dfx = pd.read_csv(config.TRAIN_FILE)

    df_train = dfx[dfx.kfold != fold].reset_index(drop=True)
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)

    ####################### Create Dataset and make it iterable #######################
    train_dataset = TweetDataset(
        tweets=df_train.text.values,
        sentiments=df_train.sentiment.values,
        selected_texts=df_train.selected_text.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,  # reshuffle every epoch instead of replaying file order
        num_workers=0 # Changed from 4 to 0 - Need further investigation
    )

    valid_dataset = TweetDataset(
        tweets=df_valid.text.values,
        sentiments=df_valid.sentiment.values,
        selected_texts=df_valid.selected_text.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=0 # Changed from 2 to 0 - Need further investigation
    )
    ##################################################################################

    # Load BertConfig
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_config = transformers.BertConfig.from_pretrained(config.MODEL_CONFIG)
    # TweetModel pools over the per-layer hidden states, so they must be returned.
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    # One scheduler step per batch: count the real number of batches (the
    # last, partial batch included) rather than len(df) / batch_size.
    num_train_steps = len(train_data_loader) * config.EPOCHS
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    # Make sure the checkpoint directory exists before the first save, so a
    # finished epoch is not lost to a missing folder.
    os.makedirs(config.SAVE_WEIGHTS_PATH, exist_ok=True)

    # Patience can be set to 4
    es = utils.EarlyStopping(patience=2, mode="max")
    print(f"Training is Starting for fold={fold}")

    # Runs for up to config.EPOCHS epochs; early stopping (patience=2) may
    # end the fold sooner.
    for epoch in range(config.EPOCHS):
      train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
      jaccard = eval_fn(valid_data_loader, model, device)
      print(f"Jaccard Score = {jaccard}")
      es(jaccard, model, model_path=f"{config.SAVE_WEIGHTS_PATH}/model_{fold}.bin")
      if es.early_stop:
          print("Early stopping")
          break

In [None]:
run(fold=0)

Training is Starting for fold=0


HBox(children=(FloatProgress(value=0.0, max=688.0), HTML(value='')))

In [None]:
run(fold=1)

In [None]:
run(fold=2)

In [None]:
run(fold=3)

In [None]:
run(fold=4)

# Run inference on the test data



In [None]:
# Load the test tweets.
df_test = pd.read_csv(config.TEST_FILE)
# Use the full text as a stand-in selected_text so the frame can be fed to
# TweetDataset, which requires one — presumably because the test file has no
# such column; confirm against the data.
df_test.loc[:, "selected_text"] = df_test.text.values

In [None]:
device = torch.device("cuda")
# TweetModel is a BertPreTrainedModel around a BertModel, so load the same
# BertConfig class that run() trains with.
model_config = transformers.BertConfig.from_pretrained(config.MODEL_CONFIG)
# TweetModel.forward reads the per-layer hidden states.
model_config.output_hidden_states = True

In [None]:
# Rebuild one model per fold and restore its trained weights for ensembling.
fold_models = []
for i in range(config.N_FOLDS):
  model = TweetModel(conf=model_config)
  model.to(device)
  # map_location places the checkpoint tensors on the inference device
  # regardless of the device they were saved from.
  model.load_state_dict(
      torch.load(f'{config.SAVE_WEIGHTS_PATH}/model_{i}.bin',
                 map_location=device))
  model.eval()
  fold_models.append(model)

In [None]:
# Predict the selected text of every test tweet with the fold ensemble.
final_output = []

test_dataset = TweetDataset(
        tweets=df_test.text.values,
        sentiments=df_test.sentiment.values,
        selected_texts=df_test.selected_text.values)

# Order must be preserved so predictions line up with the submission rows.
data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=config.VALID_BATCH_SIZE,
    num_workers=0)

with torch.no_grad():
  tk0 = tqdm(data_loader, total=len(data_loader))
  for d in tk0:

    orig_tweet = d["orig_tweet"]
    orig_selected = d["orig_selected"]
    sentiment = d["sentiment"]
    offsets = d["offsets"]

    # Only the model inputs go to the device; the soft labels in the batch
    # are placeholders at inference time and are not needed.
    ids = d["ids"].to(device, dtype=torch.long)
    token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
    mask = d["mask"].to(device, dtype=torch.long)

    # Collect the logits of every fold model for this batch.
    outputs_start_folds = []
    outputs_end_folds = []
    for fold_model in fold_models:
      outputs_start, outputs_end = fold_model(ids=ids,
                                              mask=mask,
                                              token_type_ids=token_type_ids)
      outputs_start_folds.append(outputs_start)
      outputs_end_folds.append(outputs_end)

    # Ensemble by averaging logits over the models that were actually loaded.
    outputs_start = sum(outputs_start_folds) / len(fold_models)
    outputs_end = sum(outputs_end_folds) / len(fold_models)

    outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
    outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

    for px, tweet in enumerate(orig_tweet):
      selected_tweet = orig_selected[px]
      tweet_sentiment = sentiment[px]
      # The score is meaningless here (the target is a placeholder); only
      # the decoded text is kept.
      _, output_sentence = calculate_jaccard_score(
          original_tweet=tweet,
          target_string=selected_tweet,
          sentiment_val=tweet_sentiment,
          idx_start=np.argmax(outputs_start[px, :]),
          idx_end=np.argmax(outputs_end[px, :]),
          offsets=offsets[px]
      )
      final_output.append(output_sentence)

In [None]:
# post-process trick:
# Note: This trick comes from: https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/140942
# When the LB resets, this trick won't help
def post_process(selected):
    """Lower-case the prediction and drop repeated words.

    The first occurrence of each word is kept in its original position, so
    the result is the same on every run (joining a ``set`` gives an
    arbitrary word order). The set of words, and therefore the word-level
    Jaccard score, is unchanged.
    """
    return " ".join(dict.fromkeys(selected.lower().split()))

In [None]:
# Fill the sample submission with the post-processed predictions and write it.
sample = pd.read_csv(config.SAMPLE_SUBMISSION_FILE)
sample.loc[:, 'selected_text'] = final_output
sample.selected_text = sample.selected_text.map(post_process)
# FINAL_SUBMISSION_FILE names a directory; create it if needed so the write
# cannot fail on a missing folder.
os.makedirs(config.FINAL_SUBMISSION_FILE, exist_ok=True)
sample.to_csv(config.FINAL_SUBMISSION_FILE + '/submission.csv', index=False)

In [None]:
sample.head()