In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import transformers
import torch
from torch.utils.data import Dataset, DataLoader

In [4]:
df = pd.read_csv("/content/drive/MyDrive/pro/8701/combined_14510_xlnet.csv")
title_only = df[(df['title_polyglot_detect'] == 'en') & (df['title_lang_detect'] == 'en') & (df['title_langid_detect'] == 'en') & (df['title_xl_detect'] == 'en')][['question_title', 'class_index']]

from sklearn.model_selection import train_test_split
X_tmp, X_test, y_tmp, y_test = train_test_split(title_only['question_title'], title_only['class_index'], stratify=title_only['class_index'], test_size=0.10, random_state=42)
X_tmp.shape, X_test.shape, y_tmp.shape, y_test.shape

((14737,), (1638,), (14737,), (1638,))

In [5]:
tmp = pd.DataFrame({
    "question_title": X_tmp,
    "class_index": y_tmp
}
)
# tmp.class_index.value_counts()
X_train, X_valid, y_train, y_valid = train_test_split(tmp['question_title'], tmp['class_index'], stratify=tmp['class_index'], test_size=0.10, random_state=42)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((13263,), (1474,), (13263,), (1474,))

In [6]:
(X_train.shape, X_test.shape, X_valid.shape), (y_train.shape, y_test.shape, y_valid.shape)

(((13263,), (1638,), (1474,)), ((13263,), (1638,), (1474,)))

In [6]:
class YahooDataset(Dataset):
 
  def __init__(self, x, y):
    self.x=x.values
    self.y=y.values
  
  def __len__(self):
    return len(self.y)
   
  def __getitem__(self,idx):
    return self.x[idx],self.y[idx]

In [7]:
from transformers import BertModel, BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.tokenize("This is superb!"))

# The senetence to be encoded
sent = "Let's learn deep learning!"

# Encode the sentence
encoded = bert_tokenizer.encode_plus(
    text=sent,  # the sentence to be encoded
    add_special_tokens=True,  # Add [CLS] and [SEP]
    max_length = 64,  # maximum length of a sentence
    pad_to_max_length=True,  # Add [PAD]s
    return_attention_mask = True,  # Generate the attention mask
    return_tensors = 'pt',  # ask the function to return PyTorch tensors
)

# Get the input IDs and attention mask in tensor format
input_ids = encoded['input_ids']
attn_mask = encoded['attention_mask']

input_ids, attn_mask

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(tensor([[ 101, 2292, 1005, 1055, 4553, 2784, 4083,  999,  102,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

In [14]:
bert_tokenizer.convert_ids_to_tokens(ids=input_ids[0], skip_special_tokens=True)

['let', "'", 's', 'learn', 'deep', 'learning', '!']

In [8]:
def encode(x, y, tokenizer):
    input_ids = []
    attention_mask = []

    labels = np.unique(y)
    id2label = {idx:label for idx, label in enumerate(labels)}
    label2id = {label:idx for idx, label in enumerate(labels)}
    
    for text in x:
        tokenized_text = tokenizer.encode_plus(text,
                                            max_length=512,
                                            add_special_tokens=True,
                                            pad_to_max_length=True,
                                            return_attention_mask=True)
        input_ids.append(tokenized_text['input_ids'])
        attention_mask.append(tokenized_text['attention_mask'])
    y = [ label2id[label] for label in y]
    return torch.tensor(input_ids, dtype=torch.long), torch.tensor(attention_mask, dtype=torch.long), torch.tensor(y, dtype=torch.long)

In [9]:
def get_batches(x, y, tokenizer, batch_size=8):
    input_ids, attention_mask, y = encode(x, y, tokenizer)
    tensor_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, y)
    # tensor_randomsampler = torch.utils.data.RandomSampler(tensor_dataset)
    tensor_dataloader = torch.utils.data.DataLoader(tensor_dataset, batch_size=batch_size) # sampler=tensor_randomsampler,
    return tensor_dataloader

In [19]:
# input_ids, attention_mask, y = encode(list(X_train), list(y_train), tokenizer=bert_tokenizer)

In [10]:
# yd_trn = YahooDataset(x=X_train, y=y_train)
# yd_trn_dl = DataLoader(yd_trn)

# type(yd_trn_dl), type(yd_trn)

train_dl = get_batches(list(X_train), list(y_train), tokenizer=bert_tokenizer)
valid_dl = get_batches(list(X_valid), list(y_valid), tokenizer=bert_tokenizer)
test_dl = get_batches(list(X_test), list(y_test), tokenizer=bert_tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [18]:
def train_model(batch, model, optimizer, scheduler, epochs, device):
    model.train()  # Set the mode to training
    for e in range(epochs):
        for i, batch_tuple in enumerate(batch):
            batch_tuple = (t.to(device) for t in batch_tuple)
            input_ids, attention_mask, labels = batch_tuple
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits, hidden_states_output, attention_mask_output = outputs
            if i % 100 == 0:
                print("loss - {0}, iteration - {1}/{2}".format(loss, e + 1, i))
            model.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), parameters['max_grad_norm'])
            optimizer.step()
            scheduler.step()

def evaluate(batch, model, device):
    input_ids, predictions, true_labels, attentions = [], [], [], []
    model.eval()
    for i, batch_cpu in enumerate(batch):
        batch_gpu = (t.to(device) for t in batch_cpu)
        input_ids_gpu, attention_mask, labels = batch_gpu
        with torch.no_grad():
            loss, logits, hidden_states_output, attention_mask_output = model(input_ids=input_ids_gpu, attention_mask=attention_mask, labels=labels)
            logits =  logits.cpu()
            prediction = torch.argmax(logits, dim=1).tolist()
            true_label = labels.cpu().tolist()
            input_ids_cpu = input_ids_gpu.cpu().tolist()
            attention_last_layer = attention_mask_output[-1].cpu() # selection the last attention layer
            attention_softmax = attention_last_layer[:,-1, 0].tolist()  # selection the last head attention of CLS token
            input_ids += input_ids_cpu
            predictions += prediction
            true_labels += true_label
            attentions += attention_softmax
    return input_ids, predictions, true_labels, attentions

In [19]:
epochs=2
parameters = {
    'learning_rate': 5e-3,
    'num_warmup_steps': 1000,
    'num_training_steps': len(train_dl) * epochs,
    'max_grad_norm': 1
}
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bert_model = BertModel.from_pretrained('bert-base-uncased', num_labels=4, output_hidden_states=True, output_attentions=True)
bert_model.to(device)
optimizer = transformers.AdamW(bert_model.parameters(), lr=parameters['learning_rate'], correct_bias=False)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                         num_warmup_steps=parameters['num_warmup_steps'],
                                                         num_training_steps=parameters['num_training_steps'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
train_model(train_dl, bert_model, optimizer, scheduler, epochs, device)

TypeError: ignored

In [None]:
t = [ i for i in train_dl]
t

In [37]:
len(t)
# t[0][0][0]
len(t[0][1])

8

In [12]:
op = dict()
for i, batch_tuple in enumerate(train_dl):
    batch_tuple = (t for t in batch_tuple)
    # print(len([i for i in batch_tuple][2]))
    input_ids, attention_mask, label = batch_tuple
    op = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    if i == 2:
      break

In [24]:
op = bert_model()

KeyError: ignored

In [28]:
len(op[0][0])

512

In [214]:
import gzip
import shutil
import time
import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

from transformers import BertTokenizer
from transformers import BertForSequenceClassification

In [30]:
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_EPOCHS = 3

In [138]:
df = pd.read_csv("/content/drive/MyDrive/pro/8701/combined_14510_xlnet.csv")
title_only = df[(df['title_polyglot_detect'] == 'en') & (df['title_lang_detect'] == 'en') & (df['title_langid_detect'] == 'en') & (df['title_xl_detect'] == 'en')][['question_title', 'class_index']]

from sklearn.model_selection import train_test_split
X_tmp, X_test, y_tmp, y_test = train_test_split(title_only['question_title'], title_only['class_index'], stratify=title_only['class_index'], test_size=0.10, random_state=42)
X_tmp.shape, X_test.shape, y_tmp.shape, y_test.shape, type(X_tmp)

((14737,), (1638,), (14737,), (1638,), pandas.core.series.Series)

In [139]:
tmp = pd.DataFrame({
    "question_title": X_tmp,
    "class_index": y_tmp
}
)
# tmp.class_index.value_counts()
X_train, X_valid, y_train, y_valid = train_test_split(tmp['question_title'], tmp['class_index'], stratify=tmp['class_index'], test_size=0.10, random_state=42)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((13263,), (1474,), (13263,), (1474,))

In [199]:
def tokenize(list_of_sentences, tokenizer):
  for i in list_of_sentences:
    # print(list_of_sentences[i])
    encoded = tokenizer.encode_plus(
        text=i,  # the sentence to be encoded
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length = 512,  # maximum length of a sentence
        pad_to_max_length=True,  # Add [PAD]s
        return_attention_mask = True,  # Generate the attention mask
        return_tensors = 'pt',  # ask the function to return PyTorch tensors
    )
  return encoded

tokenizer = DistilBertTokenizerFast.from_pretrained(
     'distilbert-base-uncased', num_labels = 4
)

train_encodings = tokenize(list(X_train), tokenizer)
valid_encodings = tokenize(list(X_valid), tokenizer)
test_encodings = tokenize(list(X_test), tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [200]:
train_encodings[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [201]:
# tokenizer.convert_ids_to_tokens(train_encodings[0].ids, skip_special_tokens=True)
# train_encodings.items()

In [202]:
class YahooDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    label_to_id = {
        1: 0,
        5: 1,
        6: 2,
        10: 3
    }
    self.labels = [ label_to_id[i] for i in labels]
  
  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) 
            for key, val in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

train_dataset = YahooDataset(train_encodings, list(y_train))
valid_dataset = YahooDataset(valid_encodings, list(y_valid))
test_dataset = YahooDataset(test_encodings, list(y_test))
train_loader = DataLoader(train_dataset, batch_size=8) 
valid_loader = DataLoader(valid_dataset, batch_size=8) 
test_loader = DataLoader(test_dataset, batch_size=8)

In [203]:
# train_encodings.keys()
# { k:v  for k, v in train_encodings.items()}
# train_dataset[0]
train_loader
# train_dataset

<torch.utils.data.dataloader.DataLoader at 0x7f91b9986890>

In [204]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', output_attentions=True)
model.to(DEVICE)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [205]:
def compute_accuracy(model, data_loader, device):
     with torch.no_grad():
         correct_pred, num_examples = 0, 0
         for batch_idx, batch in enumerate(data_loader):
             ### Prepare data
             input_ids = batch['input_ids'].to(device)
             attention_mask = batch['attention_mask'].to(device)
             labels = batch['labels'].to(device)
    
             outputs = model(input_ids, attention_mask=attention_mask)
             logits = outputs['logits']
             predicted_labels = torch.argmax(logits, 1)
             num_examples += labels.size(0)
             correct_pred += (predicted_labels == labels).sum()
     return correct_pred.float()/num_examples * 100

In [206]:
start_time = time.time()
for epoch in range(NUM_EPOCHS):
  model.train()
   
  for batch_idx, batch in enumerate(train_loader):
    ### Prepare data
    input_ids = batch['input_ids'].to(DEVICE)
    attention_mask = batch['attention_mask'].to(DEVICE)
    labels = batch['labels'].to(DEVICE)
    ### Forward pass
    outputs = model(input_ids, 
                    attention_mask=attention_mask,
                    labels=labels)
    loss, logits = outputs['loss'], outputs['logits']

    ### Backward pass
    optim.zero_grad()
    loss.backward()
    optim.step()

    ### Logging
    if not batch_idx % 250:
      print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d}' 
                f' | Batch' 
                f'{batch_idx:04d}/'
                f'{len(train_loader):04d} | '
                f'Loss: {loss:.4f}')
  
  model.eval()
  with torch.set_grad_enabled(False):
    print(f'Training accuracy: '
            f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
            f'\nValid accuracy: '
            f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')
    
  print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
 
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

  item = {key: torch.tensor(val[idx])


IndexError: ignored

In [207]:
for batch_idx, batch in enumerate(train_loader):
  print(batch_idx)
  print(batch)
  if batch_idx == 2:
    break

  item = {key: torch.tensor(val[idx])


IndexError: ignored

In [208]:
for i in train_loader:
  print(i)

  item = {key: torch.tensor(val[idx])


IndexError: ignored

In [209]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        label_to_id = {
        1: 0,
        5: 1,
        6: 2,
        10: 3
        }
        self.labels = [ label_to_id[i] for i in labels]
        # self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize text and truncate to max_len
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return inputs['input_ids'], inputs['attention_mask'], label

# Example usage:
# texts = ["This is a positive text", "This is a negative text"]
# labels = [1, 0] # 1 for positive, 0 for negative
texts = list(X_train)[:7]
labels = list(y_train)[:7]
tokenizer = tokenizer # Initialize your tokenizer
max_len = 32
dataset = TextClassificationDataset(texts, labels, tokenizer, max_len)
train_dataloader = DataLoader(dataset, batch_size=3)
valid_dataloader = DataLoader(dataset, batch_size=3)

# Iterate through batches
for batch in train_dataloader:
    input_ids, attention_mask, labels = batch
    print(input_ids)
    print(attention_mask)
    print(labels)

len(train_dataloader)

tensor([[[  101,  2040,  1005,  1055,  6069,  2663,  1996,  6156,  2452,  1029,
            102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]],

        [[  101, 18106, 14924, 26340,  1029,  2064, 10334,  2507,  2033,  2204,
           2609,  2005,  7760, 18471, 14924, 26340,  1029,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]],

        [[  101,  2054,  2064,  8135,  2175,  2000,  3109,  2005,  1029,   102,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]]])
tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 

3

In [None]:
from transformers.models.distilbert.modeling_distilbert import DistilBertModel
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

for i, b in enumerate(train_dataloader):
  input_ids, attention_mask, label = b
  print(input_ids)
  print(attention_mask)
  print(label)for i, b in enumerate(train_dataloader):
  input_ids, attention_mask, label = b
  print(input_ids)
  print(attention_mask)
  print(label)
  op = model(input_ids = input_ids, attention_mask=attention_mask)
  op = model(input_ids = input_ids, attention_mask=attention_mask)

In [None]:
model()

In [211]:
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=0.05)
loss_fn = torch.nn.CrossEntropyLoss()
# device = 'cpu'
model.train()
for epoch in range(epochs):
    for batch in train_dataloader:
        optimizer.zero_grad()
        # Get inputs and labels
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        labels = labels.to(DEVICE)
    
        # Forward pass
        outputs = model(input_ids= input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Compute loss and backpropagate
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

    # Print training loss for this epoch
    print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item()}")

ValueError: ignored

In [213]:
# Define hyperparameters
epochs = 1
batch_size = 8
learning_rate = 5e-2

start_time = time.time()
for epoch in range(NUM_EPOCHS):
  model.train()
  
  for batch_idx, batch in enumerate(train_dataloader):
    ### Prepare data
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(DEVICE)
    attention_mask = attention_mask.to(DEVICE)
    labels = labels.to(DEVICE)
    print(input_ids.shape)
    print(attention_mask.shape)
    print(labels.shape)
    ### Forward pass
    outputs = \
    model(input_ids, 
                    attention_mask=attention_mask)
    loss, logits = outputs['loss'], outputs['logits']

    ### Backward pass
    optim.zero_grad()
    loss.backward()
    optim.step()

    ### Logging
    if not batch_idx % 250:
      print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d}' 
                f' | Batch' 
                f'{batch_idx:04d}/'
                f'{len(train_loader):04d} | '
                f'Loss: {loss:.4f}')
  
  model.eval()
  with torch.set_grad_enabled(False):
    print(f'Training accuracy: '
            f'{compute_accuracy(model, train_dataloader, DEVICE):.2f}%'
            f'\nValid accuracy: '
            f'{compute_accuracy(model, valid_dataloader, DEVICE):.2f}%')
    
  print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

torch.Size([3, 1, 32])
torch.Size([3, 1, 32])
torch.Size([3])


ValueError: ignored

In [215]:
def tokenize(list_of_sentences, tokenizer):
  for i in list_of_sentences:
    # print(list_of_sentences[i])
    encoded = tokenizer.encode_plus(
        text=i,  # the sentence to be encoded
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length = 512,  # maximum length of a sentence
        pad_to_max_length=True,  # Add [PAD]s
        return_attention_mask = True,  # Generate the attention mask
        return_tensors = 'pt',  # ask the function to return PyTorch tensors
    )
  return encoded

tokenizer = BertTokenizer.from_pretrained(
     'bert-base-uncased', num_labels = 4
)

train_encodings = tokenize(list(X_train), tokenizer)
valid_encodings = tokenize(list(X_valid), tokenizer)
test_encodings = tokenize(list(X_test), tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [218]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        label_to_id = {
        1: 0,
        5: 1,
        6: 2,
        10: 3
        }
        self.labels = [ label_to_id[i] for i in labels]
        # self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize text and truncate to max_len
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return inputs['input_ids'], inputs['attention_mask'], label

# Example usage:
# texts = ["This is a positive text", "This is a negative text"]
# labels = [1, 0] # 1 for positive, 0 for negative
texts = list(X_train)[:7]
labels = list(y_train)[:7]
tokenizer = BertTokenizer.from_pretrained(
     'bert-base-uncased', num_labels = 4
) # Initialize your tokenizer
max_len = 32
dataset = TextClassificationDataset(texts, labels, tokenizer, max_len)
train_dataloader = DataLoader(dataset, batch_size=3)
valid_dataloader = DataLoader(dataset, batch_size=3)

# Iterate through batches
for batch in train_dataloader:
    input_ids, attention_mask, labels = batch
    print(input_ids)
    print(attention_mask)
    print(labels)

len(train_dataloader)

tensor([[[  101,  2040,  1005,  1055,  6069,  2663,  1996,  6156,  2452,  1029,
            102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]],

        [[  101, 18106, 14924, 26340,  1029,  2064, 10334,  2507,  2033,  2204,
           2609,  2005,  7760, 18471, 14924, 26340,  1029,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]],

        [[  101,  2054,  2064,  8135,  2175,  2000,  3109,  2005,  1029,   102,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]]])
tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 

3

In [225]:
from torch import nn
class BertClassifier(nn.Module):

    def __init__(self): #, dropout=0.5

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # self.dropout = nn.Dropout(dropout)
        # self.linear = nn.Linear(768, 4)
        # self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
    #     dropout_output = self.dropout(pooled_output)
    #     linear_output = self.linear(dropout_output)
    #     final_layer = self.relu(linear_output)

        return pooled_output

In [228]:
model = BertClassifier()

for i, b in enumerate(train_dataloader):
  b_input_ids, b_attention_mask, b_label = b
  print(input_ids)
  print(attention_mask)
  print(label)
  op = model(input_id = b_input_ids, mask=b_attention_mask)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[[ 101, 2339, 2024, 7588, 2200, 3928, 1029,  102,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0]]])
tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0]]])
tensor([1])


ValueError: ignored