<a href="https://colab.research.google.com/github/adesam146/nlpcw/blob/master/BERT_and_ELMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Summary of BERT and ELMO work. We were able to get BERT to load and train but it was slow and performed poorly for a large number of epochs. ELMO just caused Colab to crash (due to GPU memory issues).

## Check GPU memory

In [0]:
#Check GPU Memory allocation
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize



In [0]:
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm() 

Gen RAM Free: 12.8 GB  | Proc size: 555.5 MB
GPU RAM Free: 11441MB | Used: 0MB | Util   0% | Total 11441MB


In [0]:
#run this if GPU utilization is not 0%
!kill -9 -1

## Imports

In [0]:
!pip install spacy ftfy pytorch-pretrained-bert
!python -m spacy download en
!pip install torchvision torch allennlp


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import sampler
import spacy
import torchvision.datasets as dset
from torchtext import data
from torchtext import datasets as nlp_dset
import random
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from google.colab import files
import numpy as np
import pandas as pd

import torchvision.transforms as T

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

nlp_spaCy = spacy.load('en')

GPU = True
device_idx = 0
if GPU:
    device = torch.device("cuda:"+str(device_idx) if torch.cuda.is_available() else "cpu")
else:
    
    device = torch.device("cpu")
print(device)

#Fix all seeds
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.cuda.manual_seed(SEED)


from allennlp.modules.elmo import Elmo, batch_to_ids
#Use pretrained ELMO weights. 
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
cuda:0


In [2]:
# Load datafiles from own google drive

# EDIT AS NECESSARY:
#################

from google.colab import drive
drive.mount('/content/drive')
train_fp = """/content/drive/My Drive/colab_data/offenseval-training-v1.tsv"""
trial_data_fp = """/content/drive/My Drive/colab_data/offenseval-trial.txt"""

train_df = pd.read_csv(train_fp, delimiter="\t")
train_df, valid_df = train_test_split(train_df, train_size = 0.8)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




## Bert Pre-processing

In [0]:
# BERT summary
# The first token of every sequence is the classification embedding "[CLS]"
# There are two types of sentence in the representation: A & B. These are used for
# Question-answering systems. For our purposes, all sentences/tweets will be type A. 
# A and B are seperated with the special token "[SEP]". Again, we don't need this here.

# We should use the "bert-large-uncased" eventually which has 1024 latent features
# but for now use "bert-base-uncased" which has 768


#The model returns the embedded representations in the form:
# encoded_layers, pooled_output
# encoded_layers: The activations of each of the 12 layers (or 24 layers in BERT-large)
#                 list of length 12/24 where each element is a tensor of dimensions:
#                 (B, L, F) for Batch size B, sequence length L and number feautures F

#if you want the output embeddings per word, use encoded_layers[-1]
#if you want to use the BERT sentence embedding use pooled_output
# We will use pooled_output for now

#encoded_layers, pooled_output = model(tokens_tensor, segments_tensor)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')
def convert_tweets_to_features(tweet_list, seq_length, tokenizer):
    """returns the BERT features"""

    tokenized_tweets = []
    input_ids_list = []
    input_masks = []
    input_type_ids_list= []
                
    for (index, tweet) in enumerate(tweet_list):
       
        
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        
        tweet = tweet_preprocess(tweet)
        tokens_a = tokenizer.tokenize(tweet)
            
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        #print("current length input ids:", len(input_ids))
        
        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length, "{} should = {}".format(len(input_ids), seq_length)
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length
        
        input_ids_tensor = torch.tensor(input_ids)
        input_mask_tensor = torch.tensor(input_mask)
        input_type_ids_tensor = torch.tensor(input_type_ids)
    
        tokenized_tweets.append(tokens)
        input_ids_list.append(input_ids_tensor)
        input_masks.append(input_mask_tensor)
        input_type_ids_list.append(input_type_ids_tensor)
        
    results = (tokenized_tweets, input_ids_list, input_masks, input_type_ids_list)
    return results


In [0]:
def tweet_preprocess(tweet_text):
  """Add tweet specific preprocessing steps here"""
  
  #Remove 'USER' (but leave '@')
  tweet_text = tweet_text.replace("@USER", "@") 
  
  return tweet_text

def convert_labels_A(labels):
    """Preproceses and return labels"""

    final_labels = []
    for label in labels:
        assert label == "OFF" or label == "NOT", "Label should not be: {}".format(label)
    
        if label == "OFF":
            res = 1
        elif label == "NOT":
            res = 0        
        label = torch.tensor([res])
        final_labels.append(label)
    return final_labels
 

In [5]:
#Preprocessing BERT

tweet_train_list = train_df["tweet"].tolist()
tweet_valid_list = valid_df["tweet"].tolist()
#find max length of tokens 
max_len = 0
longest_tokens = None
for tweet in (tweet_train_list + tweet_valid_list):
    tweet = tweet_preprocess(tweet)
    tokens = tokenizer.tokenize(tweet)
    tokens_len = len(tokens)
    if tokens_len > max_len:
        max_len = tokens_len
        longest_tokens = tokens 
        
print("Max token length is", max_len)

#Add an extra few symbols in case the tweets in the test-set are longer
MAX_SEQ = max_len + 4 

#preprocess tweets and extract labels
LABELS_TRAIN = convert_labels_A(train_df["subtask_a"].tolist())
FEATURES_TRAIN = convert_tweets_to_features(tweet_train_list, MAX_SEQ, tokenizer)

LABELS_VALID = convert_labels_A(valid_df["subtask_a"].tolist())
FEATURES_VALID = convert_tweets_to_features(tweet_valid_list, MAX_SEQ, tokenizer)

Max token length is 119


In [0]:
#BERT Data Loaders 
BATCH_SIZE = 32

def train_loader(batch_size = BATCH_SIZE, labels = LABELS_TRAIN, features = FEATURES_TRAIN):
    """Training Loader"""
    return loader_gen(batch_size = BATCH_SIZE, labels = LABELS_TRAIN, features = FEATURES_TRAIN)
def valid_loader():
    """Validate set loader"""
    return loader_gen(batch_size = BATCH_SIZE, labels = LABELS_VALID, features = FEATURES_VALID)
    
def loader_gen(batch_size = BATCH_SIZE, labels = LABELS_TRAIN, features = FEATURES_TRAIN):
    """Generator - bespoke loader. 
    yields an output of (data, label).
        data is a torch tensor of shape (B, L, 2)
            where B is batch size, L is number of tokens per tweet and the final 
            dimension holds the BERT token indexes and the BERT token masks 
            in the first and second components respectively"""
    
    (tokenized_tweets, tweet_ids_list, input_masks, input_type_ids_list) = features 
    
    batch_id_tensor_list = []
    batch_mask_tensor_list = []
    batch_labels_tensor_list = []
    for (idx, tweet_ids) in enumerate(tweet_ids_list):

        batch_id_tensor_list.append(tweet_ids)
        batch_mask_tensor_list.append(input_masks[idx])
        batch_labels_tensor_list.append(labels[idx])
        
        if len(batch_id_tensor_list) == BATCH_SIZE:
            
            #Then produce and yield an output batch tensor and label
            batch_id_tensor = torch.stack(batch_id_tensor_list)
            batch_mask_tensor = torch.stack(batch_mask_tensor_list)
            input_tensor = torch.stack((batch_id_tensor, batch_mask_tensor), dim=2)
            batch_labels_tensor = torch.stack(batch_labels_tensor_list)
            
            assert batch_mask_tensor.shape == (batch_size, MAX_SEQ)
            assert batch_id_tensor.shape == (batch_size, MAX_SEQ)
            assert input_tensor.shape == (batch_size, MAX_SEQ, 2)
            assert batch_labels_tensor.shape == (batch_size, 1)
            
            
            yield (input_tensor, batch_labels_tensor)
            batch_id_tensor_list = []
            batch_mask_tensor_list = []
            batch_labels_tensor_list = []
    
    #check if there is a small batch left...
    if len(batch_id_tensor_list) > 0:
        batch_id_tensor = torch.stack(batch_id_tensor_list)
        batch_mask_tensor = torch.stack(batch_mask_tensor_list)
        input_tensor = torch.stack((batch_id_tensor, batch_mask_tensor), dim=2)
        batch_labels_tensor = torch.stack(batch_labels_tensor_list)
        
        yield (input_tensor, batch_labels_tensor)
    
    

In [0]:
PRINT_EVERY = 50

def check_accuracy(loader, model, conf=False): 
    num_correct = 0
    num_samples = 0
    TP, TN, FP, FN = 0, 0, 0, 0
    
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for idx, (x, y) in enumerate(loader()):
            x = x.to(device=device, dtype=torch.long)  # move to  GPU
            y = y.to(device=device, dtype=torch.long)
            pred_prob = model(x)
            pred_1 = (pred_prob > 0.5).type(torch.long)
            num_correct += (pred_1 == y).sum()
            num_samples += pred_prob.size(0)
            
            if conf:
                #find confusion matrix
                
                #find number correct class 1
                TP += ((pred_1 == 1) & (y == 1)).sum()
                FP += ((pred_1 == 1) & (y == 0)).sum()
                TN += ((pred_1 == 0) & (y == 0)).sum()
                FN += ((pred_1 == 0) & (y == 1)).sum()
            
            x = x.to(device="cpu", dtype=torch.long)  # move to CPU to prevent memory overflow
            y = y.to(device="cpu", dtype=torch.long)
            
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
        if conf:
            F1_0 = 2 * TP / (2.0 * TP + FN + FP)
            F1_1 = 2 * TN / (2 * TN + FN + FP)
            print("TP = {}, TN = {}, FP = {}, FN = {}".format(TP, TN, FP, FN))
            print("F1_0 = {:.4f}, F1_1 = {:.4f}, F1_macro = {:.4f}".format(F1_0, F1_1, 0.5 * (F1_0 + F1_0)))

def train_part(model, optimizer, epochs=1, loss_fn = F.binary_cross_entropy, print_every=PRINT_EVERY):
    """
    Train a model
    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: Nothing, but prints model accuracies during training.
    """
    model = model.to(device=device)  # move the model parameters to GPU
    try:
        for e in range(epochs):
            for batch_idx, (inputs, targets) in enumerate(train_loader()):

                model.train()  # put model to training mode

                x = inputs.to(device=device, dtype=torch.long)  # move to device, e.g. GPU
                y = targets.to(device=device, dtype=torch.float) #this should be a float cross entropy
                #x = inputs
                #y = targets
                prob = model(x)
                y = y.type(torch.float)
                loss = loss_fn(prob, y)
                # Zero out all of the gradients for the variables which the optimizer
                # will update.
                optimizer.zero_grad()

                # This is the backwards pass: compute the gradient of the loss with
                # respect to each  parameter of the model.
                loss.backward()

                # Actually update the parameters of the model using the gradients
                # computed by the backwards pass.
                optimizer.step()

                x = x.to(device="cpu", dtype=torch.long)  # move to CPU to prevent memory overflow
                y = y.to(device="cpu", dtype=torch.long)

                if batch_idx % print_every == 0:
                    print('Iteration %d, loss = %.4f' % (batch_idx, loss.item()))
                    check_accuracy(valid_loader, model, conf=True)
            print()
            print("Validation Accuracy:")
            check_accuracy(valid_loader, model, conf=True)
            print()

    except Exception as e:
        #Attempt to prevent GPU memory overflow by transferring model back to cpu
        #model = model.to(device="cpu")
        raise e

### Test BERT Loader

In [0]:
#Test loader
#get first batch from train loader
#Now compare with values obtained w/o loader
(_, tweet_ids_list, input_masks, _) = FEATURES_TRAIN

tweet_ids_tensor = tweet_ids_list[0].view((1, -1)) #use just first value
input_masks = input_masks[0].view((1, -1))
with torch.no_grad():
    bert.eval()
    encoded_2, pooled_output_2 = bert(tweet_ids_tensor, output_all_encoded_layers=False,
                                               attention_mask=input_masks, )
    
for idx, (inputs, targets) in enumerate(train_loader()):

    input_ids = inputs[:, :, 0] #token IDs
    attention_mask = inputs[:, :, 1]  #attention mask (to ignore padding)
    with torch.no_grad():
        bert.eval()
        encoded_layers, pooled_output = bert(input_ids, output_all_encoded_layers=False,
                                                       attention_mask=attention_mask, )
    break #to stop full loop

## BERT Models

In [0]:
class FcnnBertEmbeddingBinary(nn.Module):
    "Bert with fully connected NN"
    def __init__(self, embedding_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, max_len):
        
        super(FcnnBertEmbeddingBinary, self).__init__()
        
        #embedding (lookup layer) layer
        self.embedding = BertModel.from_pretrained('bert-base-uncased')
        
        #hidden layers
        self.fc1 = nn.Linear(embedding_dim, hidden_dim_1, bias=True)
        self.fc2 = nn.Linear(hidden_dim_1, hidden_dim_2, bias = True) 
        self.fc3 = nn.Linear(hidden_dim_2, hidden_dim_3, bias = True)
        
        #output layer
        self.fc4 = nn.Linear(hidden_dim_3, 1, bias = True)
        
        #Kaming normalization
        nn.init.kaiming_normal_(self.fc1.weight)
        nn.init.kaiming_normal_(self.fc2.weight)
        nn.init.kaiming_normal_(self.fc3.weight)
        nn.init.kaiming_normal_(self.fc4.weight)
        
    def forward(self, x):
        
        #Put into .eval mode to export exact weights
        self.eval()
        input_ids = x[:, :, 0] #token IDs
        attention_mask = x[:, :, 1]  #attention mask (to ignore padding)
        
        
        encoded_layers, pooled_output = self.embedding(input_ids, 
                                                     attention_mask=attention_mask)
        self.train()
        
        #Use 'pooled output' as the overall embedding of the sentence.
        #This is recommended in the BERT paper for classification tasks
        
        #A bit of background on what we are doing here:
        #BERT creates its vectors by taking context before and context 
        #after every token in the sequence. The pooled_output is the 
        #resultant vector for the first token and is (according to the paper)
        #the best representation of the sentence as a whole
        h = F.relu(pooled_output)
        h = F.relu(self.fc1(h))
        h = F.relu(self.fc2(h))
        h = F.relu(self.fc3(h))
        h = torch.sigmoid(self.fc4(h))
        
        return h

class SimpleClassifierWBert(nn.Module):
    """Bert w. 2d conv"""
    def __init__(self, out_channels, window_size, dropout):
        super(SimpleClassifierWBert, self).__init__()
        
        self.embedding = BertModel.from_pretrained('bert-base-uncased')
        embedding_dim = 768
        
        self.conv = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(window_size, embedding_dim))
        
        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(out_channels, 1)
        
        #Kaming normalization
        nn.init.kaiming_normal_(self.conv.weight)
        nn.init.kaiming_normal_(self.fc.weight)

        
        
    def forward(self, x):
        self.eval()
        input_ids = x[:, :, 0] #token IDs
        attention_mask = x[:, :, 1]  #attention mask (to ignore padding)
        
        encoded_layers, pooled_output = self.embedding(input_ids, output_all_encoded_layers=False,
                                                       attention_mask=attention_mask )
        self.train()
        
        #Use 'final encoded layer' which is of size:
            #[batch_size, sequence_length, embedding_dim]
        
        embedded = encoded_layers.unsqueeze(1)
                
        #(batch size, 1, max sent length, embedding dim)
        
        feature_maps =  F.relu(self.conv(embedded).squeeze(3))
        # (batch size, out_channels, max sent length - window size +1, 1)
        # -> (batch size, out_channels, max sent length - window size +1)
           
        #the max pooling layer
        pooled = F.max_pool1d(feature_maps, feature_maps.shape[2]).squeeze(2)
        # (batch size, out_channels)      
 
        return self.fc( self.dropout(pooled))


In [0]:
#CONV with bert
embedding_dim = 768
max_len = MAX_SEQ
hidden_dim_1 = 128
hidden_dim_2 = 16
hidden_dim_3 = 4
lr = 0.00025

model = SimpleClassifierWBert(out_channels=100, window_size=3, dropout=0)

optimizer = optim.Adam(model.parameters(), lr)

pos_weight = torch.tensor([2.], device = device) #deals with unbalanced classes

loss_fn = nn.BCEWithLogitsLoss(pos_weight = pos_weight)


train_part(model, optimizer, loss_fn = loss_fn, epochs = 5)

#Note: the calculations of F1 are incorrect here. 

#It takes half an hour to run a single epoch and in that time, the system is 
#only slightly better than the baseline. If we had more computing resource, 
#we would love to explore this more but it is not practical to wait this long
#everytime we want to run a model.


Iteration 0, loss = 3.5033
Got 842 / 2648 correct (31.80)
TP = 842, TN = 0, FP = 1806, FN = 0
F1_0 = 0.0000, F1_1 = 0.0000, F1_macro = 0.0000
Iteration 50, loss = 0.9679
Got 1806 / 2648 correct (68.20)
TP = 0, TN = 1806, FP = 0, FN = 842
F1_0 = 0.0000, F1_1 = 0.0000, F1_macro = 0.0000
Iteration 100, loss = 0.9420
Got 1806 / 2648 correct (68.20)
TP = 0, TN = 1806, FP = 0, FN = 842
F1_0 = 0.0000, F1_1 = 0.0000, F1_macro = 0.0000
Iteration 150, loss = 0.9111
Got 1806 / 2648 correct (68.20)
TP = 0, TN = 1806, FP = 0, FN = 842
F1_0 = 0.0000, F1_1 = 0.0000, F1_macro = 0.0000
Iteration 200, loss = 0.8768
Got 1806 / 2648 correct (68.20)
TP = 0, TN = 1806, FP = 0, FN = 842
F1_0 = 0.0000, F1_1 = 0.0000, F1_macro = 0.0000
Iteration 250, loss = 0.8443
Got 1806 / 2648 correct (68.20)
TP = 0, TN = 1806, FP = 0, FN = 842
F1_0 = 0.0000, F1_1 = 0.0000, F1_macro = 0.0000
Iteration 300, loss = 0.8631
Got 1856 / 2648 correct (70.09)
TP = 52, TN = 1804, FP = 2, FN = 790
F1_0 = 0.0000, F1_1 = 0.0000, F1_mac

In [0]:
#Fully connected w. Bert sentence embeddings
embedding_dim = 768
max_len = MAX_SEQ
hidden_dim_1 = 128
hidden_dim_2 = 16
hidden_dim_3 = 4
lr = 0.00025

model = FcnnBertEmbeddingBinary(embedding_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, max_len)

optimizer = optim.Adam(model.parameters(), lr)

train_part(model, optimizer, epochs = 5)

## Test ELMO

In [0]:
# ELMO takes a list of parsed sentences as an input
# It generates an embedding of length 1024 per word
# We then need to find a good method of combining the word vecs to create 
# a sentence embedding (this article is good: https://medium.com/huggingface/universal-word-sentence-embeddings-ce48ddc8fc3a). 


#Elmo test
sentences = [['First', 'sentence', '.'], ['Another', '.'], 
             ["Oh", "here", "we", "Go", "now", "you", "fool", "."], 
             ["meaninglesswordnotinvocab"]]
             
character_ids = batch_to_ids(sentences)

embeddings = elmo(character_ids)

print(character_ids.shape)
embed = embeddings["elmo_representations"]
print(len(embed))
print(embed[0].shape)
print(embed[1].shape)


torch.Size([4, 8, 50])
2
torch.Size([4, 8, 1024])
torch.Size([4, 8, 1024])


## Misc

In [0]:
#Save intermediate results to CSV so we can use the nice torchtext .TabularDataset loader
train_df.to_csv(path_or_buf=PREPROCESSED_FP, sep=',', na_rep='', float_format=None, 
                header=True, index=True, )

#train = pd.read_csv("offenseval-training-v1.tsv", delimiter="\t")
#test = pd.read_csv("offenseval-trial (2).txt", delimiter="\t")
train = None

#define our batch size
BATCH_SIZE = 4

text_field = data.Field(sequential = False, use_vocab = False, dtype = torch.long)
label_field = data.LabelField(sequential= False, dtype=torch.float, use_vocab = False)


#text_field.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
#label_field.build_vocab(train)

train = data.TabularDataset(PREPROCESSED_FP, 'CSV', fields = 
                            [('TWEET_IDS', text_field), ('LABEL_A', label_field)], 
                            skip_header=False)
#train = data.Dataset(examples=tweets_IDs, fields= [("tweet", text_field)])


train_iterator = data.Iterator(train, batch_size = BATCH_SIZE, )


In [0]:
#OLD BERT METHOD
#train = pd.read_csv("offenseval-training-v1.tsv", delimiter="\t")
#test = pd.read_csv("offenseval-trial (2).txt", delimiter="\t")
train = None
test = None

#define our batch size
BATCH_SIZE = 64

text_field = data.Field(tokenize=BERT_tokenize, preprocessing = BERT_retrieve_ID, use_vocab = True, dtype = torch.long)
label_field = data.LabelField(sequential= False, preprocessing = section_a_labels, dtype=torch.float, use_vocab = False)


#text_field.build_vocab(train, max_size=25000, vectors="glove.6B.100d")
#label_field.build_vocab(train)

train = data.TabularDataset("offenseval-training-v1.tsv", 'TSV', fields = 
                            { "tweet": ("tweet", text_field), "subtask_a": ("LabelA", label_field),}, skip_header=False)

text_field.build_vocab(train, vectors="glove.6B.50d") #USE "glove.840B.300d" or glove.twitter.27B.200d
label_field.build_vocab(train)

glove_dim = 50

#define our batch size
BATCH_SIZE = 64

#define types of data and their preprocessing

#get pre-defined split
#train = text_field.preprocess(train.iloc[0]["tweet"])
#print(train)

train_iterator = data.Iterator(train, batch_size = BATCH_SIZE, device="cuda")

  

NameError: ignored