In [1]:
import os
import sys
import json
import random
import time
import re
import datetime
import pickle

from tqdm import tqdm

import numpy as np
from localutils import *

import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertTokenizer, DistilBertTokenizer, RobertaTokenizer
from transformers import get_linear_schedule_with_warmup

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaForSequenceClassification, BertForSequenceClassification, AdamW, BertConfig, DistilBertForSequenceClassification, DistilBertConfig

### Dataset Configs

In [2]:
configs = {
    'news_headline' : {
        'dataset_path': 'data/NewsHeadline_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'kaggle-news',
        'epochs': 5,
        'test_size': 0.5
    },
    
    'semeval' : {
        'dataset_path': 'data/SemEval_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'semeval',
        'epochs': 10,
        'test_size': 0.2
    },
    
    'figlang' : {
        'dataset_path': 'data/FigLang_comet_autocomplete.jsonl',
        'model_name': 'distilbert',
        'model_save_point': 'figlang',
        'epochs': 10,
        'test_size': 0.2
    },
}

In [3]:
def load_dataset(filename):
    dataset = []
    with open(filename) as f:
        for line in f:
            entry = {}
            
            line = line.strip()
            d = json.loads(line)
            
            entry['sentence'] = d['sentence']
            entry['label'] = int(d['label'])
            entry['support'] = []

            for k in d['common_sense'].keys():
                if k == 'xWant' or k == 'xEffect':
                    if dataset_name == 'semeval' or dataset_name == 'figlang':
                        entry['support'].append(d['common_sense'][k])
                    elif dataset_name == 'news_headline':
                        entry['support'].append(d['common_sense'][k][0] if d['common_sense'][k][0] != 'none' else d['common_sense'][k][1])
            dataset.append(entry)
    return dataset

In [4]:
dataset_name = 'news_headline'
config = configs[dataset_name]

PATH = config['dataset_path']

dataset = load_dataset(PATH)

### BERT Pre-processing

In [5]:
model_name = config['model_name']

if model_name == "distilbert":
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
elif model_name == "bert":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
elif model_name == "roberta":
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

In [6]:
def encode_all(sentences):
    input_ids = []
    for data in sentences:
        input_ids.append(tokenizer.encode(data, add_special_tokens=True))
    return input_ids

In [7]:
MAX_LEN = 32

### Attention Mask

In [8]:
def get_attn(input_ids):
    attention_masks = []

    for sent in input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks.append(att_mask)
    return attention_masks

In [9]:
def create_dataset(dataset):
    all_data = []
    
    for data in tqdm(dataset):
        input_ids = []
        
        input_ids.append(tokenizer.encode(data['sentence'].lower()))
        
        for s in data['support']:
            input_ids.append(tokenizer.encode(s.lower()))
            
        input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
        attn_mask = torch.tensor(get_attn(input_ids))
        input_ids = torch.tensor(input_ids)
        
        entry = {}
        entry['raw_sentence'] = data['sentence'] + " [SUPPORT]: " + " [SEP] ".join(data['support'])
        entry['sentence'] = input_ids[0]
        entry['sentence_mask'] = attn_mask[0]
        
        entry['support'] = input_ids[1:]
        entry['support_mask'] = attn_mask[1:]
        
        all_data.append((entry, data['label']))
    return all_data

In [10]:
dataset_list = create_dataset(dataset)

100%|██████████| 55328/55328 [00:36<00:00, 1510.90it/s]


### Training & Validation Split

In [11]:
trainset, validationset = train_test_split(dataset_list, random_state=2018, test_size=config['test_size'])

In [12]:
def save_dataset():
    with open('model/trainset-' + config['model_save_point'] +'.data', 'wb') as f:
        pickle.dump(trainset, f)

    with open('model/validationset-' + config['model_save_point'] + '.data', 'wb') as f:
        pickle.dump(validationset, f)

In [13]:
save_dataset()

### Data Loaders

In [14]:
BATCH_SIZE = 16

train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(validationset, batch_size=BATCH_SIZE, shuffle=True)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Train Our Classification Model

In [16]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

def reset_model():
    if model_name == "distilbert":
        model = DistilBertForSequenceClassification.from_pretrained(
                                                "distilbert-base-uncased",
                                                num_labels = 2, 
                                                output_attentions = False, 
                                                output_hidden_states = True, 
                                            )
    elif model_name == "roberta":
        model = RobertaForSequenceClassification.from_pretrained(
                                                "roberta-base",
                                                num_labels = 2, 
                                                output_attentions = True, 
                                                output_hidden_states = False, 
                                            )
    elif model_name == "roberta":
        model = BertForSequenceClassification.from_pretrained(
                                                "bert-base-uncased",
                                                num_labels = 2, 
                                                output_attentions = False, 
                                                output_hidden_states = False, 
                                            )
    model.to(device)
    
    optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8 
                )

    total_steps = len(train_loader) * config['epochs']

    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)
    return model, optimizer, scheduler

### Utility

In [17]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

## Training

In [18]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [19]:
def train(model, optimizer, scheduler):
    total_loss = 0
    model.train()
    for step, (batch, labels) in enumerate(train_loader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

        b_input_ids = batch['sentence'].to(device)
        b_input_mask = batch['sentence_mask'].to(device)
        b_labels = labels.to(device)
        model.zero_grad()        
        
        outputs = model(b_input_ids, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
                
        loss = outputs[0]

        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)            
    
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

In [20]:
def test(model):
    print("")
    print("Running Validation...")
    t0 = time.time()
    
    model.eval()
    
    all_predictions = []
    all_labels = []
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch, b_labels in test_loader:
        
        with torch.no_grad():        
            outputs = model(batch['sentence'].to(device), 
                            attention_mask=batch['sentence_mask'].to(device))
        
        logits = outputs[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        prediction = list(np.argmax(logits, axis=1).flatten())
        all_predictions.extend(prediction)
        all_labels.extend(label_ids.flatten())
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    
    acc = eval_accuracy/nb_eval_steps
    print("  Accuracy: {0:.4f}".format(acc))
    
    
    f1 = f1_score(all_predictions, all_labels, average = 'macro')
    precision = precision_score(all_predictions, all_labels, average = 'macro')
    recall = recall_score(all_predictions, all_labels, average = 'macro')
        
    print("  F1-score: {0:.4f}".format(f1))
    print("  Precision: {0:.4f}".format(precision))
    print("  Recall: {0:.4f}".format(recall))
    print()

    matrix = confusion_matrix(all_predictions, all_labels)

    tp = matrix[0][0]
    fp = matrix[0][1]
    fn = matrix[1][0]
    tn = matrix[1][1]
    
    sarcastic_precision = tp/ (tp + fp)
    sarcastic_f1 = 2*tp / (2*tp + fn + fp)
    sarcastic_recall = tp / (tp + fn)
    
    print("  Sarcastic Precision: {0:.4f}".format(sarcastic_precision))
    print("  Sarcastic F1-score: {0:.4f}".format(sarcastic_f1))
    print("  Sarcastic Recall: {0:.4f}".format(sarcastic_recall)) 
    
    print()
    
    
    nonsarcastic_precision = tn / (tn + fn)
    nonsarcastic_f1 = 2*tn / (2*tn + fn + fp)
    nonsarcastic_recall = tn / (tn + fp)
    
    print("  Non-sarcastic Precision: {0:.4f}".format(nonsarcastic_precision))
    print("  Non-Sarcastic F1-score: {0:.4f}".format(nonsarcastic_f1))
    print("  Non-sarcasm Recall: {0:.4f}".format(nonsarcastic_recall))
    
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    return [f1, precision, recall, sarcastic_f1, sarcastic_precision, sarcastic_recall, nonsarcastic_f1, nonsarcastic_precision, nonsarcastic_recall]

In [21]:
def save_model(filename = 'model/distilbert-'+ config['model_save_point'] +'.pb'):
    print('Saving model...')
    torch.save(model, filename)

In [22]:
def run(model, optimizer, scheduler):
    loss_values = []
    best_results = [0, 0, 0, 0, 0, 0, 0, 0, 0]
    
    for epoch_i in range(0, config['epochs']):
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, config['epochs']))
        print('Training...')

        t0 = time.time()
        train(model, optimizer, scheduler)
        results = test(model)
        if best_results[0] < results[0]:
            best_results = results
            save_model()


    print("")
    print("Training complete!")
    return best_results

In [23]:
all_results = []
t0 = time.time()

total_iterations = 5
for iteration in range(total_iterations):
    print("======== Iteration {:2d}".format(iteration))
    model, optimizer, scheduler = reset_model()
    result = run(model, optimizer, scheduler)
    all_results.append(result)

final_results = [sum(value)/3 for value in zip(all_results[0], all_results[1], all_results[2])]

print("  F1-score: {0:.4f}".format(final_results[0]))
print("  Precision: {0:.4f}".format(final_results[1]))
print("  Recall: {0:.4f}".format(final_results[2]))
print()

print("  Sarcastic Precision: {0:.4f}".format(final_results[3]))
print("  Sarcastic F1-score: {0:.4f}".format(final_results[4]))
print("  Sarcastic Recall: {0:.4f}".format(final_results[5])) 

print()

print("  Non-sarcastic Precision: {0:.4f}".format(final_results[6]))
print("  Non-Sarcastic F1-score: {0:.4f}".format(final_results[7]))
print("  Non-sarcasm Recall: {0:.4f}".format(final_results[8]))




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Training...
  Batch    40  of  1,729.    Elapsed: 0:00:06.
  Batch    80  of  1,729.    Elapsed: 0:00:07.
  Batch   120  of  1,729.    Elapsed: 0:00:08.
  Batch   160  of  1,729.    Elapsed: 0:00:09.
  Batch   200  of  1,729.    Elapsed: 0:00:11.
  Batch   240  of  1,729.    Elapsed: 0:00:12.
  Batch   280  of  1,729.    Elapsed: 0:00:13.
  Batch   320  of  1,729.    Elapsed: 0:00:14.
  Batch   360  of  1,729.    Elapsed: 0:00:16.
  Batch   400  of  1,729.    Elapsed: 0:00:17.
  Batch   440  of  1,729.    Elapsed: 0:00:18.
  Batch   480  of  1,729.    Elapsed: 0:00:19.
  Batch   520  of  1,729.    Elapsed: 0:00:21.
  Batch   560  of  1,729.    Elapsed: 0:00:22.
  Batch   600  of  1,729.    Elapsed: 0:00:23.
  Batch   640  of  1,729.    Elapsed: 0:00:25.
  Batch   680  of  1,729.    Elapsed: 0:00:26.
  Batch   720  of  1,729.    Elapsed: 0:00:27.
  Batch   760  of  1,729.    Elapsed: 0:00:28.
  Batch   800  of  1,729.    Elapsed: 0:00:30.
  Batch   840  of  1,729.    Elapsed: 0:00:31.
 

  Batch   760  of  1,729.    Elapsed: 0:03:47.
  Batch   800  of  1,729.    Elapsed: 0:03:49.
  Batch   840  of  1,729.    Elapsed: 0:03:50.
  Batch   880  of  1,729.    Elapsed: 0:03:51.
  Batch   920  of  1,729.    Elapsed: 0:03:52.
  Batch   960  of  1,729.    Elapsed: 0:03:54.
  Batch 1,000  of  1,729.    Elapsed: 0:03:55.
  Batch 1,040  of  1,729.    Elapsed: 0:03:56.
  Batch 1,080  of  1,729.    Elapsed: 0:03:57.
  Batch 1,120  of  1,729.    Elapsed: 0:03:58.
  Batch 1,160  of  1,729.    Elapsed: 0:04:00.
  Batch 1,200  of  1,729.    Elapsed: 0:04:01.
  Batch 1,240  of  1,729.    Elapsed: 0:04:02.
  Batch 1,280  of  1,729.    Elapsed: 0:04:03.
  Batch 1,320  of  1,729.    Elapsed: 0:04:05.
  Batch 1,360  of  1,729.    Elapsed: 0:04:06.
  Batch 1,400  of  1,729.    Elapsed: 0:04:07.
  Batch 1,440  of  1,729.    Elapsed: 0:04:08.
  Batch 1,480  of  1,729.    Elapsed: 0:04:10.
  Batch 1,520  of  1,729.    Elapsed: 0:04:11.
  Batch 1,560  of  1,729.    Elapsed: 0:04:12.
  Batch 1,600

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Training...
  Batch    40  of  1,729.    Elapsed: 0:05:37.
  Batch    80  of  1,729.    Elapsed: 0:05:38.
  Batch   120  of  1,729.    Elapsed: 0:05:40.
  Batch   160  of  1,729.    Elapsed: 0:05:41.
  Batch   200  of  1,729.    Elapsed: 0:05:42.
  Batch   240  of  1,729.    Elapsed: 0:05:44.
  Batch   280  of  1,729.    Elapsed: 0:05:45.
  Batch   320  of  1,729.    Elapsed: 0:05:46.
  Batch   360  of  1,729.    Elapsed: 0:05:47.
  Batch   400  of  1,729.    Elapsed: 0:05:49.
  Batch   440  of  1,729.    Elapsed: 0:05:50.
  Batch   480  of  1,729.    Elapsed: 0:05:51.
  Batch   520  of  1,729.    Elapsed: 0:05:52.
  Batch   560  of  1,729.    Elapsed: 0:05:54.
  Batch   600  of  1,729.    Elapsed: 0:05:55.
  Batch   640  of  1,729.    Elapsed: 0:05:56.
  Batch   680  of  1,729.    Elapsed: 0:05:58.
  Batch   720  of  1,729.    Elapsed: 0:05:59.
  Batch   760  of  1,729.    Elapsed: 0:06:00.
  Batch   800  of  1,729.    Elapsed: 0:06:01.
  Batch   840  of  1,729.    Elapsed: 0:06:03.
 

  Batch   760  of  1,729.    Elapsed: 0:09:19.
  Batch   800  of  1,729.    Elapsed: 0:09:20.
  Batch   840  of  1,729.    Elapsed: 0:09:22.
  Batch   880  of  1,729.    Elapsed: 0:09:23.
  Batch   920  of  1,729.    Elapsed: 0:09:24.
  Batch   960  of  1,729.    Elapsed: 0:09:25.
  Batch 1,000  of  1,729.    Elapsed: 0:09:27.
  Batch 1,040  of  1,729.    Elapsed: 0:09:28.
  Batch 1,080  of  1,729.    Elapsed: 0:09:29.
  Batch 1,120  of  1,729.    Elapsed: 0:09:30.
  Batch 1,160  of  1,729.    Elapsed: 0:09:32.
  Batch 1,200  of  1,729.    Elapsed: 0:09:33.
  Batch 1,240  of  1,729.    Elapsed: 0:09:34.
  Batch 1,280  of  1,729.    Elapsed: 0:09:35.
  Batch 1,320  of  1,729.    Elapsed: 0:09:36.
  Batch 1,360  of  1,729.    Elapsed: 0:09:38.
  Batch 1,400  of  1,729.    Elapsed: 0:09:39.
  Batch 1,440  of  1,729.    Elapsed: 0:09:40.
  Batch 1,480  of  1,729.    Elapsed: 0:09:41.
  Batch 1,520  of  1,729.    Elapsed: 0:09:43.
  Batch 1,560  of  1,729.    Elapsed: 0:09:44.
  Batch 1,600

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Training...
  Batch    40  of  1,729.    Elapsed: 0:11:10.
  Batch    80  of  1,729.    Elapsed: 0:11:11.
  Batch   120  of  1,729.    Elapsed: 0:11:12.
  Batch   160  of  1,729.    Elapsed: 0:11:13.
  Batch   200  of  1,729.    Elapsed: 0:11:15.
  Batch   240  of  1,729.    Elapsed: 0:11:16.
  Batch   280  of  1,729.    Elapsed: 0:11:17.
  Batch   320  of  1,729.    Elapsed: 0:11:18.
  Batch   360  of  1,729.    Elapsed: 0:11:20.
  Batch   400  of  1,729.    Elapsed: 0:11:21.
  Batch   440  of  1,729.    Elapsed: 0:11:22.
  Batch   480  of  1,729.    Elapsed: 0:11:24.
  Batch   520  of  1,729.    Elapsed: 0:11:25.
  Batch   560  of  1,729.    Elapsed: 0:11:26.
  Batch   600  of  1,729.    Elapsed: 0:11:27.
  Batch   640  of  1,729.    Elapsed: 0:11:29.
  Batch   680  of  1,729.    Elapsed: 0:11:30.
  Batch   720  of  1,729.    Elapsed: 0:11:31.
  Batch   760  of  1,729.    Elapsed: 0:11:33.
  Batch   800  of  1,729.    Elapsed: 0:11:34.
  Batch   840  of  1,729.    Elapsed: 0:11:35.
 

  Batch   760  of  1,729.    Elapsed: 0:14:51.
  Batch   800  of  1,729.    Elapsed: 0:14:53.
  Batch   840  of  1,729.    Elapsed: 0:14:54.
  Batch   880  of  1,729.    Elapsed: 0:14:55.
  Batch   920  of  1,729.    Elapsed: 0:14:56.
  Batch   960  of  1,729.    Elapsed: 0:14:58.
  Batch 1,000  of  1,729.    Elapsed: 0:14:59.
  Batch 1,040  of  1,729.    Elapsed: 0:15:00.
  Batch 1,080  of  1,729.    Elapsed: 0:15:01.
  Batch 1,120  of  1,729.    Elapsed: 0:15:03.
  Batch 1,160  of  1,729.    Elapsed: 0:15:04.
  Batch 1,200  of  1,729.    Elapsed: 0:15:05.
  Batch 1,240  of  1,729.    Elapsed: 0:15:06.
  Batch 1,280  of  1,729.    Elapsed: 0:15:08.
  Batch 1,320  of  1,729.    Elapsed: 0:15:09.
  Batch 1,360  of  1,729.    Elapsed: 0:15:10.
  Batch 1,400  of  1,729.    Elapsed: 0:15:11.
  Batch 1,440  of  1,729.    Elapsed: 0:15:13.
  Batch 1,480  of  1,729.    Elapsed: 0:15:14.
  Batch 1,520  of  1,729.    Elapsed: 0:15:15.
  Batch 1,560  of  1,729.    Elapsed: 0:15:16.
  Batch 1,600

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Training...
  Batch    40  of  1,729.    Elapsed: 0:16:40.
  Batch    80  of  1,729.    Elapsed: 0:16:41.
  Batch   120  of  1,729.    Elapsed: 0:16:42.
  Batch   160  of  1,729.    Elapsed: 0:16:44.
  Batch   200  of  1,729.    Elapsed: 0:16:45.
  Batch   240  of  1,729.    Elapsed: 0:16:46.
  Batch   280  of  1,729.    Elapsed: 0:16:47.
  Batch   320  of  1,729.    Elapsed: 0:16:49.
  Batch   360  of  1,729.    Elapsed: 0:16:50.
  Batch   400  of  1,729.    Elapsed: 0:16:51.
  Batch   440  of  1,729.    Elapsed: 0:16:52.
  Batch   480  of  1,729.    Elapsed: 0:16:54.
  Batch   520  of  1,729.    Elapsed: 0:16:55.
  Batch   560  of  1,729.    Elapsed: 0:16:56.
  Batch   600  of  1,729.    Elapsed: 0:16:58.
  Batch   640  of  1,729.    Elapsed: 0:16:59.
  Batch   680  of  1,729.    Elapsed: 0:17:00.
  Batch   720  of  1,729.    Elapsed: 0:17:01.
  Batch   760  of  1,729.    Elapsed: 0:17:03.
  Batch   800  of  1,729.    Elapsed: 0:17:04.
  Batch   840  of  1,729.    Elapsed: 0:17:05.
 

  Batch   760  of  1,729.    Elapsed: 0:20:21.
  Batch   800  of  1,729.    Elapsed: 0:20:23.
  Batch   840  of  1,729.    Elapsed: 0:20:24.
  Batch   880  of  1,729.    Elapsed: 0:20:25.
  Batch   920  of  1,729.    Elapsed: 0:20:26.
  Batch   960  of  1,729.    Elapsed: 0:20:27.
  Batch 1,000  of  1,729.    Elapsed: 0:20:29.
  Batch 1,040  of  1,729.    Elapsed: 0:20:30.
  Batch 1,080  of  1,729.    Elapsed: 0:20:31.
  Batch 1,120  of  1,729.    Elapsed: 0:20:32.
  Batch 1,160  of  1,729.    Elapsed: 0:20:34.
  Batch 1,200  of  1,729.    Elapsed: 0:20:35.
  Batch 1,240  of  1,729.    Elapsed: 0:20:36.
  Batch 1,280  of  1,729.    Elapsed: 0:20:37.
  Batch 1,320  of  1,729.    Elapsed: 0:20:39.
  Batch 1,360  of  1,729.    Elapsed: 0:20:40.
  Batch 1,400  of  1,729.    Elapsed: 0:20:41.
  Batch 1,440  of  1,729.    Elapsed: 0:20:42.
  Batch 1,480  of  1,729.    Elapsed: 0:20:43.
  Batch 1,520  of  1,729.    Elapsed: 0:20:45.
  Batch 1,560  of  1,729.    Elapsed: 0:20:46.
  Batch 1,600

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Training...
  Batch    40  of  1,729.    Elapsed: 0:22:11.
  Batch    80  of  1,729.    Elapsed: 0:22:12.
  Batch   120  of  1,729.    Elapsed: 0:22:13.
  Batch   160  of  1,729.    Elapsed: 0:22:15.
  Batch   200  of  1,729.    Elapsed: 0:22:16.
  Batch   240  of  1,729.    Elapsed: 0:22:17.
  Batch   280  of  1,729.    Elapsed: 0:22:18.
  Batch   320  of  1,729.    Elapsed: 0:22:20.
  Batch   360  of  1,729.    Elapsed: 0:22:21.
  Batch   400  of  1,729.    Elapsed: 0:22:22.
  Batch   440  of  1,729.    Elapsed: 0:22:23.
  Batch   480  of  1,729.    Elapsed: 0:22:25.
  Batch   520  of  1,729.    Elapsed: 0:22:26.
  Batch   560  of  1,729.    Elapsed: 0:22:27.
  Batch   600  of  1,729.    Elapsed: 0:22:29.
  Batch   640  of  1,729.    Elapsed: 0:22:30.
  Batch   680  of  1,729.    Elapsed: 0:22:31.
  Batch   720  of  1,729.    Elapsed: 0:22:32.
  Batch   760  of  1,729.    Elapsed: 0:22:34.
  Batch   800  of  1,729.    Elapsed: 0:22:35.
  Batch   840  of  1,729.    Elapsed: 0:22:36.
 

  Batch   760  of  1,729.    Elapsed: 0:25:51.
  Batch   800  of  1,729.    Elapsed: 0:25:52.
  Batch   840  of  1,729.    Elapsed: 0:25:54.
  Batch   880  of  1,729.    Elapsed: 0:25:55.
  Batch   920  of  1,729.    Elapsed: 0:25:56.
  Batch   960  of  1,729.    Elapsed: 0:25:57.
  Batch 1,000  of  1,729.    Elapsed: 0:25:59.
  Batch 1,040  of  1,729.    Elapsed: 0:26:00.
  Batch 1,080  of  1,729.    Elapsed: 0:26:01.
  Batch 1,120  of  1,729.    Elapsed: 0:26:02.
  Batch 1,160  of  1,729.    Elapsed: 0:26:04.
  Batch 1,200  of  1,729.    Elapsed: 0:26:05.
  Batch 1,240  of  1,729.    Elapsed: 0:26:06.
  Batch 1,280  of  1,729.    Elapsed: 0:26:07.
  Batch 1,320  of  1,729.    Elapsed: 0:26:08.
  Batch 1,360  of  1,729.    Elapsed: 0:26:10.
  Batch 1,400  of  1,729.    Elapsed: 0:26:11.
  Batch 1,440  of  1,729.    Elapsed: 0:26:12.
  Batch 1,480  of  1,729.    Elapsed: 0:26:13.
  Batch 1,520  of  1,729.    Elapsed: 0:26:15.
  Batch 1,560  of  1,729.    Elapsed: 0:26:16.
  Batch 1,600