In [1]:
import json
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn

from transformers.tokenization_utils_base import BatchEncoding
from transformers.data.data_collator import default_data_collator
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
seed = 0xC0FFEE

In [3]:
eat_with_folds = pd.read_pickle('./eat_with_folds.pkl')

In [4]:
eat_with_folds.iloc[-3]

story         [Tom took watermelon out of the fridge., Tom c...
label                                                         0
breakpoint                                                    4
id                                                   train_1041
fold                                                          2
Name: 1041, dtype: object

In [5]:
eat_with_folds.iloc[-3][0]

['Tom took watermelon out of the fridge.',
 'Tom cut the watermelon into cubes.',
 'Tom put a banana in a cup.',
 'Tom mashed the banana.',
 'Tom cut the banana into slices.']

In [6]:
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)

# Sentence encode

In [8]:
model = SentenceTransformer('roberta-large-nli-mean-tokens')
model.cuda()
all_embeddings = dict()

for ix, row in eat_with_folds.iterrows():
    all_embeddings[row['id']] = model.encode(row['story'])

# Tokenizer

In [9]:
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModel,
    AutoTokenizer,
    default_data_collator,
)

In [10]:
model_name_or_path = 'roberta-large'

In [11]:
# if BERT_MODEL.endswith('uncased'):
#     DO_LOWER_CASE = True
# elif BERT_MODEL.endswith('cased'):
#     DO_LOWER_CASE = False
# else:
#     raise ValueError("Improper bert model name!")

# model_name = BERT_MODEL + "_" + str(abs(layer))

# # tokenizer = BertTokenizer.from_pretrained(
# #     BERT_MODEL,
# #     do_lower_case=DO_LOWER_CASE,
# #     never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", "[A]", "[B]", "[P]")
# # )

In [12]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
)

# Dataset

In [40]:
def tokenize(text, tokenizer):
    """Returns a list of tokens and the positions of A, B, and the pronoun."""
    entries = {}
    final_tokens = []
    for token in tokenizer.tokenize(text):
        if token in ("[A]", "[B]", "[P]"):
            entries[token] = len(final_tokens)
            continue
        final_tokens.append(token)
    return final_tokens #, (entries["[A]"], entries["[B]"], entries["[P]"])

class EATDataset(Dataset):
    """Custom EAT Dataset class"""

    def __init__(self, df, tokenizer, all_embeddings, labeled=True, task=1):
        self.labeled = labeled
        if labeled:
            if task == 1:
                self.y = pd.get_dummies(df['label']).values.astype("bool")
            elif task == 2:
                # TODO one-hot encode me
                self.y = df['breakpoint'].values
                
        # Extracts the tokens and offsets(positions of A, B, and P)
        self.offsets, self.tokens, self.in_urls, self.other_feats = [], [], [], []
        
        for ix, row in df.iterrows():
            sentence_embeddings = all_embeddings[row['id']][:5, ]
#             text = ' '.join(row['story']) 
#             self.tokens.append(tokenizer.encode(text, 
#                                 padding='max_length',
#                                 max_length=100))
            self.tokens.append(sentence_embeddings)
#             tokens = tokenize(text, tokenizer)
#             if len(tokens) <= 512:
#                 self.tokens.append(tokenizer.convert_tokens_to_ids(
#                     tokens))
#             else:
#                 self.tokens.append(tokenizer.convert_tokens_to_ids(
#                     tokens[0:510]))
#                 print('Shortened seq')

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        if self.labeled:
            return self.tokens[idx], self.y[idx]
        return self.tokens[idx],  None
    
def collate_examples(batch, truncate_len=512):  # 512 as in paper
    """Batch preparation.
    1. Pad the sequences
    2. Transform the target.
    """
    transposed = list(zip(*batch))
    
    max_len = min(
        max((len(x) for x in transposed[0])),
        truncate_len
    )
#     tokens = np.zeros((len(batch), max_len), dtype=np.int64)
#     for i, row in enumerate(transposed[0]):
#         row = np.array(row[:truncate_len])
#         tokens[i, :len(row)] = row
    token_tensor = torch.Tensor(transposed[0])

    # Labels
    if len(transposed) == 1:
        return token_tensor, None
    
    one_hot_labels = torch.stack([
        torch.from_numpy(x.astype("uint8")) for x in transposed[-1]
    ], dim=0)
    
    _, labels = one_hot_labels.max(dim=1)
    
    return token_tensor, labels

# Define the model

In [41]:
# import torch
# import torch.nn as nn
# from allennlp.modules.span_extractors import SelfAttentiveSpanExtractor
# # from transformers.models.bert import BertModel
# # from pytorch_pretrained_bert.modeling import BertModel


# class Head(nn.Module):
#     """The MLP submodule"""

#     def __init__(self, bert_hidden_size: int, cnn_context: int, hidden_size: int):
#         super().__init__()
#         self.bert_hidden_size = bert_hidden_size
#         self.cnn_context = cnn_context
#         self.proj_dim = 64
#         self.k = 1 + 2 * self.cnn_context
#         self.hidden_size = hidden_size

# #         self.span_extractor = SelfAttentiveSpanExtractor(self.proj_dim)  # span extractor comes directly after BERT
#         # all the main parameters are coming from the conv layer
#         # works!
#         self.context_conv = nn.Conv1d(self.bert_hidden_size, self.proj_dim, kernel_size=self.k, stride=1,
#                                       padding=self.cnn_context, dilation=1, groups=1, bias=True)

#         self.fc = nn.Sequential(
#             nn.BatchNorm1d(32),
#             #             nn.Dropout(0.7),
#             nn.Linear(32, self.hidden_size),
#             nn.ReLU(),
#             nn.BatchNorm1d(self.hidden_size),
#             nn.Dropout(0.6),
#         )

#         self.new_last_layer = nn.Linear(self.hidden_size, 2)
#         self.lstm = nn.LSTM(self.bert_hidden_size, 32, 1, batch_first=True)
        
#         # 64 are from proj_dim, 2 are from url, 9 is for the other features, 1 is gender,
#         # 3 are synt distance, 2 are the distances to the root

#         # after fine-tuning BERT this is not required, throw away
#         for i, module in enumerate(self.fc):
#             if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)):
#                 nn.init.constant_(module.weight, 1)
#                 nn.init.constant_(module.bias, 0)
#                 print("Initing batchnorm")
#             elif isinstance(module, nn.Linear):
#                 if getattr(module, "weight_v", None) is not None:
#                     nn.init.uniform_(module.weight_g, 0, 1)
#                     nn.init.kaiming_normal_(module.weight_v)
#                     print("Initing linear with weight normalization")
#                     assert model[i].weight_g is not None
#                 else:
#                     nn.init.kaiming_normal_(module.weight)
#                     print("Initing linear")
#                 nn.init.constant_(module.bias, 0)

#     def forward(self, bert_outputs):
#         assert bert_outputs.size(2) == self.bert_hidden_size
#         # reduce the dimension
# #         conv_output = self.context_conv(bert_outputs.transpose(1, 2)).transpose(2, 1).contiguous()
#         # and extract the span
# #         maybe later?
# #         extracted_outputs = self.span_extractor(conv_output, offsets).view(bert_outputs.size(0), -1)
# #         extracted_outputs = bert_outputs.view(bert_outputs.size(0), -1)
# #         print(outputs.shape)
#         output, (hn, cn) = self.lstm(bert_outputs)
        
#         fc_output = self.fc(hn.squeeze(0))
#         concatenated_outputs = torch.cat([fc_output], dim=1)
#         return self.new_last_layer(concatenated_outputs)


# class GAPModel(nn.Module):
#     """The main model."""

#     def __init__(self, bert_model: str, cnn_context: int, layer: int, hidden_size: int, device: torch.device):
#         super().__init__()
#         self.device = device
#         if bert_model in ("bert-base-uncased", "bert-base-cased"):
#             self.bert_hidden_size = 768
#         elif bert_model in ("bert-large-uncased", "bert-large-cased"):
#             self.bert_hidden_size = 1024
#         else:
#             raise ValueError("Unsupported BERT model.")
            
#         config = AutoConfig.from_pretrained(
#             bert_model,
#             num_labels=2,
#         )
        
#         self.bert = AutoModel.from_pretrained(
#             bert_model,
#             from_tf=False,
#             config=config,
#         )
        
#         self.num_layers = self.bert.config.num_hidden_layers
#         self.head = Head(self.bert_hidden_size, cnn_context, hidden_size)

#     def forward(self, token_tensor):
# #         token_tensor = token_tensor.to(self.device)
#         bert_outputs, _ = self.bert(
#             token_tensor, attention_mask=(token_tensor > 0).long(),
#             token_type_ids=None)
    
#         # calling output_all_encoded_layers False and True with last index is different
#         # most likely because of the pooling layer. Without pooling layers slighly better results
#         #         h_enc = bert_outputs[-1]
#         #         h_lex = self.bert.embeddings.word_embeddings(token_tensor) # this option takes the first part of bert only
#         #         h_lex = self.bert.embeddings.LayerNorm(h_lex)
        
#         #         bert_outputs = bert_outputs # bert_outputs[self.layer]

#         head_outputs = self.head(bert_outputs)
#         return head_outputs

In [42]:
# model = GAPModel("bert-base-uncased", 0, 12345, 256, 'vlavla')

In [43]:
def children(m):
    return m if isinstance(m, (list, tuple)) else list(m.children())


def set_trainable_attr(m, b):
    m.trainable = b
    for p in m.parameters():
        p.requires_grad = b


def apply_leaf(m, f):
    c = children(m)
    if isinstance(m, nn.Module):
        f(m)
    if len(c) > 0:
        for l in c:
            apply_leaf(l, f)


def set_trainable(l, b):
    apply_leaf(l, lambda m: set_trainable_attr(m, b))

In [21]:
# model_name_or_path = 't5-small'

In [23]:
from transformers.modeling_roberta import RobertaPreTrainedModel, RobertaModel, RobertaClassificationHead

In [74]:
class RobertaForSequenceClassification(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.classifier = RobertaClassificationHead(config)

        self.init_weights()
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

#         outputs = self.roberta(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids,
#             position_ids=position_ids,
#             head_mask=head_mask,
#             inputs_embeds=inputs_embeds,
#             output_attentions=output_attentions,
#             output_hidden_states=output_hidden_states,
#             return_dict=return_dict,
#         )
        sequence_output = input_ids # outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + ()
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [75]:
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=2,
)

In [76]:
model = RobertaForSequenceClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.embeddings.position_ids', 

In [77]:
from transformers import AutoModel

In [78]:
model.cuda();

In [92]:
set_trainable(model.roberta, False)
set_trainable(model.classifier, True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [93]:
pytorch_total_params = sum(p.numel() for p in model.parameters())
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

In [94]:
pytorch_total_params

355361794

In [95]:
pytorch_trainable_params

1051650

# Training helper

In [83]:
from train_helpers import GAPBot, TriangularLR

# Train

In [96]:
for fold in range(10):
    print("fold n#{}".format(fold))
    train = eat_with_folds[eat_with_folds['fold'] != fold]
    val = eat_with_folds[eat_with_folds['fold'] == fold]
    
    train_ds = EATDataset(train, tokenizer, all_embeddings)
    valid_ds = EATDataset(val, tokenizer, all_embeddings)
    
    train_loader = DataLoader(
        train_ds,
        collate_fn=collate_examples,
        batch_size=20,
        num_workers=2,
        pin_memory=True,
        shuffle=False,
        drop_last=False
    )

    val_loader = DataLoader(
        valid_ds,
        collate_fn=collate_examples,
        batch_size=64,
        num_workers=2,
        pin_memory=True,
        shuffle=False
    )
    
    break

fold n#0


In [97]:
a = iter(train_loader)

In [98]:
b = next(a)

In [99]:
b[0].shape

torch.Size([20, 5, 1024])

In [100]:
model(b[0].cuda())[0].shape

torch.Size([20, 2])

In [101]:
criterion = torch.nn.CrossEntropyLoss()

In [None]:
next(a)[0]

In [None]:
model.roberta((next(a)[0]).cuda())

In [None]:
model.roberta((next(a)[0]).cuda())[0].shape

In [None]:
criterion(model((next(a)[0]).cuda())[0], next(a)[1].cuda())

In [None]:
tokenizer.convert_ids_to_tokens(15691)

In [None]:
# ####################### TRAIN ######################
# # Initialize our Trainer
# def compute_metrics(pred: EvalPrediction):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

# training_args = TrainingArguments(output_dir='./',
#                                 overwrite_output_dir=True, 
#                                 do_train=True, 
#                                 do_eval=True,
#                                 evaluation_strategy='epoch',
#                                 per_device_train_batch_size=10,
#                                 per_device_eval_batch_size=8,
#                                 gradient_accumulation_steps=1,
#                                 learning_rate=2e-06,
#                                 weight_decay=0.0, 
#                                 adam_beta1=0.9, 
#                                 adam_beta2=0.999, 
#                                 adam_epsilon=1e-08, 
#                                 max_grad_norm=1.0, 
#                                 num_train_epochs=20.0,
#                                 max_steps=-1, 
#                                 warmup_steps=0,
#                                 logging_dir='runs/whataver', 
#                                 logging_first_step=False, 
#                                 logging_steps=100, 
#                                 save_steps=500,
#                                 seed=42, 
#                                 eval_steps=100,
#                                 dataloader_num_workers=0)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_ds,
#     eval_dataset=valid_ds,
#     compute_metrics=compute_metrics,
#     tokenizer=tokenizer,
#     # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
#     data_collator=default_data_collator,
# )

In [102]:
# TODO: no need to create each time separate folder, but different checkpoints
bot = GAPBot(
    model, train_loader, val_loader,
    optimizer=optimizer, echo=True,
    avg_window=25, checkpoint_dir="./models/gap/" + 'try' + '/' + str(fold) + '/'
)

steps_per_epoch = len(train_loader) * 2
n_steps = steps_per_epoch * 5

[[11/26/2020 02:24:08 PM]] SEED: 2711
[[11/26/2020 02:24:08 PM]] # of paramters: 355,361,794
[[11/26/2020 02:24:08 PM]] # of trainable paramters: 1,051,650


In [103]:
bot.train(
    3000,
    log_interval=20,
    snapshot_interval=200,  # check the performance every epoch
    scheduler=TriangularLR(
        optimizer, 20, ratio=2, steps_per_cycle=n_steps)
)

[[11/26/2020 02:24:09 PM]] Optimizer Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 1e-05
    lr: 5.000000000000001e-07
    weight_decay: 0
)
[[11/26/2020 02:24:09 PM]] Batches per epoch: 47
[[11/26/2020 02:24:09 PM]] Step 20: train 0.645113 lr: 1.657e-06
[[11/26/2020 02:24:10 PM]] Step 40: train 0.641647 lr: 2.875e-06
[[11/26/2020 02:24:10 PM]] Step 60: train 0.647229 lr: 4.093e-06
[[11/26/2020 02:24:10 PM]] Step 80: train 0.649033 lr: 5.311e-06
[[11/26/2020 02:24:11 PM]] Step 100: train 0.647806 lr: 6.529e-06
[[11/26/2020 02:24:11 PM]] Step 120: train 0.644060 lr: 7.747e-06
[[11/26/2020 02:24:11 PM]] Step 140: train 0.644504 lr: 8.965e-06
[[11/26/2020 02:24:12 PM]] Step 160: train 0.643391 lr: 9.909e-06
[[11/26/2020 02:24:12 PM]] Step 180: train 0.643309 lr: 9.304e-06
[[11/26/2020 02:24:13 PM]] Step 200: train 0.643833 lr: 8.699e-06
100%|██████████| 2/2 [00:00<00:00,  3.08it/s]
[[11/26/2020 02:24:14 PM]] Snapshot loss 1.016435
[[11/

In [104]:
# Load the best checkpoint
bot.load_model(bot.best_performers[0][1])

In [106]:
pred[1]

tensor([1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
        0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
        1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 1, 1, 1, 0, 1])

In [107]:
pred[0].argmax(axis=1)

tensor([0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
        0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
        1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
        1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
        0, 1, 0, 1, 0, 0, 1, 0])

In [105]:
pred = bot.predict(val_loader, return_y=True)

precision, recall, f1, _ = precision_recall_fscore_support(pred[1], pred[0].argmax(axis=1), average='macro')
acc = accuracy_score(pred[1], pred[0].argmax(axis=1))

metrics = {'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall}

print(metrics)

100%|██████████| 2/2 [00:00<00:00,  3.11it/s]

{'accuracy': 0.17307692307692307, 'f1': 0.17307692307692307, 'precision': 0.17314095449500555, 'recall': 0.17314095449500555}





# peek into swag

In [None]:
import csv
import glob
import json
import logging
import os
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional

import tqdm
from filelock import FileLock

from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available


logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class InputExample:
    """
    A single training/test example for multiple choice
    Args:
        example_id: Unique id for the example.
        question: string. The untokenized text of the second sequence (question).
        contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
        endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
        label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """

    example_id: str
    question: str
    contexts: List[str]
    endings: List[str]
    label: Optional[str]


@dataclass(frozen=True)
class InputFeatures:
    """
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a model.
    """

    example_id: str
    input_ids: List[List[int]]
    attention_mask: Optional[List[List[int]]]
    token_type_ids: Optional[List[List[int]]]
    label: Optional[int]


class Split(Enum):
    train = "train"
    dev = "dev"
    test = "test"


if is_torch_available():
    import torch
    from torch.utils.data.dataset import Dataset

    class MultipleChoiceDataset(Dataset):
        """
        This will be superseded by a framework-agnostic approach
        soon.
        """

        features: List[InputFeatures]

        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            processor = processors[task]()

            cached_features_file = os.path.join(
                data_dir,
                "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,),
            )

            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.
            lock_path = cached_features_file + ".lock"
            with FileLock(lock_path):

                if os.path.exists(cached_features_file) and not overwrite_cache:
                    logger.info(f"Loading features from cached file {cached_features_file}")
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(f"Creating features from dataset file at {data_dir}")
                    label_list = processor.get_labels()
                    if mode == Split.dev:
                        examples = processor.get_dev_examples(data_dir)
                    elif mode == Split.test:
                        examples = processor.get_test_examples(data_dir)
                    else:
                        examples = processor.get_train_examples(data_dir)
                    logger.info("Training examples: %s", len(examples))
                    # TODO clean up all this to leverage built-in features of tokenizers
                    self.features = convert_examples_to_features(
                        examples,
                        label_list,
                        max_seq_length,
                        tokenizer,
                        pad_on_left=bool(tokenizer.padding_side == "left"),
                        pad_token=tokenizer.pad_token_id,
                        pad_token_segment_id=tokenizer.pad_token_type_id,
                    )
                    logger.info("Saving features into cached file %s", cached_features_file)
                    torch.save(self.features, cached_features_file)

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]


if is_tf_available():
    import tensorflow as tf

    class TFMultipleChoiceDataset:
        """
        This will be superseded by a framework-agnostic approach
        soon.
        """

        features: List[InputFeatures]

        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            task: str,
            max_seq_length: Optional[int] = 128,
            overwrite_cache=False,
            mode: Split = Split.train,
        ):
            processor = processors[task]()

            logger.info(f"Creating features from dataset file at {data_dir}")
            label_list = processor.get_labels()
            if mode == Split.dev:
                examples = processor.get_dev_examples(data_dir)
            elif mode == Split.test:
                examples = processor.get_test_examples(data_dir)
            else:
                examples = processor.get_train_examples(data_dir)
            logger.info("Training examples: %s", len(examples))
            # TODO clean up all this to leverage built-in features of tokenizers
            self.features = convert_examples_to_features(
                examples,
                label_list,
                max_seq_length,
                tokenizer,
                pad_on_left=bool(tokenizer.padding_side == "left"),
                pad_token=tokenizer.pad_token_id,
                pad_token_segment_id=tokenizer.pad_token_type_id,
            )

            def gen():
                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
                    if ex_index % 10000 == 0:
                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))

                    yield (
                        {
                            "example_id": 0,
                            "input_ids": ex.input_ids,
                            "attention_mask": ex.attention_mask,
                            "token_type_ids": ex.token_type_ids,
                        },
                        ex.label,
                    )

            self.dataset = tf.data.Dataset.from_generator(
                gen,
                (
                    {
                        "example_id": tf.int32,
                        "input_ids": tf.int32,
                        "attention_mask": tf.int32,
                        "token_type_ids": tf.int32,
                    },
                    tf.int64,
                ),
                (
                    {
                        "example_id": tf.TensorShape([]),
                        "input_ids": tf.TensorShape([None, None]),
                        "attention_mask": tf.TensorShape([None, None]),
                        "token_type_ids": tf.TensorShape([None, None]),
                    },
                    tf.TensorShape([]),
                ),
            )

        def get_dataset(self):
            return self.dataset

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]


class DataProcessor:
    """Base class for data converters for multiple choice data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()


class RaceProcessor(DataProcessor):
    """Processor for the RACE data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        high = os.path.join(data_dir, "train/high")
        middle = os.path.join(data_dir, "train/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
        return self._create_examples(high + middle, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        high = os.path.join(data_dir, "dev/high")
        middle = os.path.join(data_dir, "dev/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
        return self._create_examples(high + middle, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
        high = os.path.join(data_dir, "test/high")
        middle = os.path.join(data_dir, "test/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
        return self._create_examples(high + middle, "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_txt(self, input_dir):
        lines = []
        files = glob.glob(input_dir + "/*txt")
        for file in tqdm.tqdm(files, desc="read files"):
            with open(file, "r", encoding="utf-8") as fin:
                data_raw = json.load(fin)
                data_raw["race_id"] = file
                lines.append(data_raw)
        return lines

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (_, data_raw) in enumerate(lines):
            race_id = "%s-%s" % (set_type, data_raw["race_id"])
            article = data_raw["article"]
            for i in range(len(data_raw["answers"])):
                truth = str(ord(data_raw["answers"][i]) - ord("A"))
                question = data_raw["questions"][i]
                options = data_raw["options"][i]

                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
                        contexts=[article, article, article, article],  # this is not efficient but convenient
                        endings=[options[0], options[1], options[2], options[3]],
                        label=truth,
                    )
                )
        return examples


class SynonymProcessor(DataProcessor):
    """Processor for the Synonym data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))

        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3", "4"]

    def _read_csv(self, input_file):
        with open(input_file, "r", encoding="utf-8") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets."""

        examples = [
            InputExample(
                example_id=line[0],
                question="",  # in the swag dataset, the
                # common beginning of each
                # choice is stored in "sent2".
                contexts=[line[1], line[1], line[1], line[1], line[1]],
                endings=[line[2], line[3], line[4], line[5], line[6]],
                label=line[7],
            )
            for line in lines  # we skip the line with the column names
        ]

        return examples


class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        raise ValueError(
            "For swag testing, the input file does not contain a label column. It can not be tested in current code"
            "setting!"
        )
        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_csv(self, input_file):
        with open(input_file, "r", encoding="utf-8") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets."""
        if type == "train" and lines[0][-1] != "label":
            raise ValueError("For training, the input file must contain a label column.")

        examples = [
            InputExample(
                example_id=line[2],
                question=line[5],  # in the swag dataset, the
                # common beginning of each
                # choice is stored in "sent2".
                contexts=[line[4], line[4], line[4], line[4]],
                endings=[line[7], line[8], line[9], line[10]],
                label=line[11],
            )
            for line in lines[1:]  # we skip the line with the column names
        ]

        return examples


class ArcProcessor(DataProcessor):
    """Processor for the ARC data set (request from allennlp)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")

    def get_test_examples(self, data_dir):
        logger.info("LOOKING AT {} test".format(data_dir))
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_json(self, input_file):
        with open(input_file, "r", encoding="utf-8") as fin:
            lines = fin.readlines()
            return lines

    def _create_examples(self, lines, type):
        """Creates examples for the training and dev sets."""

        # There are two types of labels. They should be normalized
        def normalize(truth):
            if truth in "ABCD":
                return ord(truth) - ord("A")
            elif truth in "1234":
                return int(truth) - 1
            else:
                logger.info("truth ERROR! %s", str(truth))
                return None

        examples = []
        three_choice = 0
        four_choice = 0
        five_choice = 0
        other_choices = 0
        # we deleted example which has more than or less than four choices
        for line in tqdm.tqdm(lines, desc="read arc data"):
            data_raw = json.loads(line.strip("\n"))
            if len(data_raw["question"]["choices"]) == 3:
                three_choice += 1
                continue
            elif len(data_raw["question"]["choices"]) == 5:
                five_choice += 1
                continue
            elif len(data_raw["question"]["choices"]) != 4:
                other_choices += 1
                continue
            four_choice += 1
            truth = str(normalize(data_raw["answerKey"]))
            assert truth != "None"
            question_choices = data_raw["question"]
            question = question_choices["stem"]
            id = data_raw["id"]
            options = question_choices["choices"]
            if len(options) == 4:
                examples.append(
                    InputExample(
                        example_id=id,
                        question=question,
                        contexts=[
                            options[0]["para"].replace("_", ""),
                            options[1]["para"].replace("_", ""),
                            options[2]["para"].replace("_", ""),
                            options[3]["para"].replace("_", ""),
                        ],
                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
                        label=truth,
                    )
                )

        if type == "train":
            assert len(examples) > 1
            assert examples[0].label is not None
        logger.info("len examples: %s}", str(len(examples)))
        logger.info("Three choices: %s", str(three_choice))
        logger.info("Five choices: %s", str(five_choice))
        logger.info("Other choices: %s", str(other_choices))
        logger.info("four choices: %s", str(four_choice))

        return examples


def convert_examples_to_features(
    examples: List[InputExample],
    label_list: List[str],
    max_length: int,
    tokenizer: PreTrainedTokenizer,
    pad_token_segment_id=0,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
) -> List[InputFeatures]:
    """
    Loads a data file into a list of `InputFeatures`
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        choices_inputs = []
        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
            text_a = context
            if example.question.find("_") != -1:
                # this is for cloze question
                text_b = example.question.replace("_", ending)
            else:
                text_b = example.question + " " + ending

            inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=max_length,
                pad_to_max_length=True,
                return_overflowing_tokens=True,
            )
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
                    "If you are training ARC and RACE and you are poping question + options,"
                    "you need to try to use a bigger max seq length!"
                )

            choices_inputs.append(inputs)

        label = label_map[example.label]

        input_ids = [x["input_ids"] for x in choices_inputs]
        attention_mask = (
            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
        )
        token_type_ids = (
            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
        )

        features.append(
            InputFeatures(
                example_id=example.example_id,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=label,
            )
        )

    for f in features[:2]:
        logger.info("*** Example ***")
        logger.info("feature: %s" % f)

    return features


processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4, "syn", 5}

In [None]:
import logging
import os
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np

from transformers import (
    AutoConfig,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
# from utils_multiple_choice import MultipleChoiceDataset, Split, processors


logger = logging.getLogger(__name__)


def simple_accuracy(preds, labels):
    return (preds == labels).mean()


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
    data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )

In [None]:
args=['--model_name_or_path', 'bert-base-uncased',
      '--task_name', 'swag',
      '--data_dir', './swag',
      '--do_train',
      '--do_eval',
      '--max_seq_length', '256',
       '--per_device_train_batch_size', '32',
      '--learning_rate', '2e-5',
       '--num_train_epochs', '5.0',
       '--output_dir', './',
       '--overwrite_output_dir']

In [None]:
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(args)

In [None]:
if (
    os.path.exists(training_args.output_dir)
    and os.listdir(training_args.output_dir)
    and training_args.do_train
    and not training_args.overwrite_output_dir
):
    raise ValueError(
        f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
    )

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

# Set seed
set_seed(training_args.seed)

try:
    processor = processors[data_args.task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)
except KeyError:
    raise ValueError("Task not found: %s" % (data_args.task_name))

# Load pretrained model and tokenizer
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

config = AutoConfig.from_pretrained(
    model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
    cache_dir=model_args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

model = AutoModelForMultipleChoice.from_pretrained(
    model_args.model_name_or_path,
    from_tf=bool(".ckpt" in model_args.model_name_or_path),
    config=config,
    cache_dir=model_args.cache_dir,
)

# Get datasets
train_dataset = (
    MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    )
    if training_args.do_train
    else None
)
eval_dataset = (
    MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    )
    if training_args.do_eval
    else None
)

def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": simple_accuracy(preds, p.label_ids)}

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
eval_dataset.features[0]

In [None]:
train_dl = DataLoader(
            train_dataset,
            collate_fn=default_data_collator,
            batch_size=8,
            drop_last=False,
            num_workers=2,
        )

In [None]:
a = iter(train_dl)

In [None]:
b = next(a)

In [None]:
pghjkk = model(**b)

In [None]:
model.cpu()

In [None]:
input_ids = b['input_ids']

In [None]:
input_ids.shape

In [None]:
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

In [None]:
num_choices

In [None]:
attention_mask.shape

In [None]:
b

In [None]:
b

In [None]:
input_ids.shape

In [None]:
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = b['attention_mask'].view(-1, b['attention_mask'].size(-1))
token_type_ids = b['token_type_ids'].view(-1, b['token_type_ids'].size(-1))
position_ids = None#b['position_ids'].view(-1, b['position_ids'].size(-1)) if b['position_ids'] is not None else None

inputs_embeds = None# (
#     b['inputs_embeds'].view(-1, b['inputs_embeds'].size(-2), b['inputs_embeds'].size(-1))
#     if b['inputs_embeds'] is not None
#     else None
# )

In [None]:
from_bert_only = model.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=None,
            inputs_embeds=inputs_embeds,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
        )

In [None]:
b['position_ids']

In [None]:
b['input_i'] is not None

In [None]:
from_bert_only[0].shape

In [None]:
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None

In [None]:
from_bert_only[0].shape

In [None]:
len(from_bert_only)

In [None]:
dropout = nn.Dropout(0.1)
classifier = nn.Linear(768, 1)

In [None]:
from_bert_only[2][0]

In [None]:
from_bert_only[0]

In [None]:
pooled_output = from_bert_only[1]

pooled_output = dropout(pooled_output)
logits = classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)

In [None]:
reshaped_logits

In [None]:
from_bert_only

In [None]:
b['input_ids'].shape

In [None]:
pghjkk

In [None]:
# Training
if training_args.do_train:
    trainer.train(
        model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
    )
    trainer.save_model()
    # For convenience, we also re-save the tokenizer to the same directory,
    # so that you can share your model easily on huggingface.co/models =)
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

# Evaluation
results = {}
if training_args.do_eval:
    logger.info("*** Evaluate ***")

    result = trainer.evaluate()

    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    if trainer.is_world_master():
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

            results.update(result)

return results

In [None]:
t = pd.read_csv('./swag/train.csv')

In [None]:
train_dataset.features[0]

In [None]:
train_dataset.features[0].input_ids

In [None]:
tokenizer.convert_ids_to_tokens(102)

In [None]:
tokenizer.convert_tokens_to_ids('someone')

In [None]:
t[t['fold-ind'] == 3416].iloc[0]['sent1']

In [None]:
t.iloc[[102]]

In [None]:
t.iloc[[102]]['startphrase'].item()

In [None]:
t.iloc[[100]]['ending0'].item()

In [None]:
t.iloc[[100]]['ending1'].item()

In [None]:
**b

In [None]:
t.iloc[[100]]['ending2'].item()

In [None]:
t.iloc[[100]]['ending3'].item()