In [1]:
import os

import torch
import torch.optim as optim
from utils import *
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup

# Seed PyTorch's global RNG so stochastic steps in this notebook (e.g. random weight
# initialisation and shuffled batching) repeat across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


# Load ELECTRA model and tokenizer

In [2]:
# Identifier of the pretrained ELECTRA-base discriminator checkpoint used for both objects below
ELECTRA_CHECKPOINT = "google/electra-base-discriminator"

# Tokenizer loaded from the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(ELECTRA_CHECKPOINT)

# Sequence-classification model with two output labels
model = AutoModelForSequenceClassification.from_pretrained(
    ELECTRA_CHECKPOINT,
    num_labels=2,
)

model.to(device)

Downloading pytorch_model.bin: 100%|██████████| 440M/440M [17:13<00:00, 426kB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassifica

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [3]:
# Freeze all parameters except the ones the helper leaves trainable.
# NOTE(review): the original comment said "last 2 block", but the helper called here is named
# `unfreeze_last_layer_last_9block` — presumably the final layer plus the last 9 encoder blocks
# stay trainable; confirm against utils.
unfreeze_last_layer_last_9block(model)

# Print one line per parameter name with a boolean (presumably its requires_grad flag — confirm in utils)
print_layer(model)

electra.embeddings.word_embeddings.weight: False
electra.embeddings.position_embeddings.weight: False
electra.embeddings.token_type_embeddings.weight: False
electra.embeddings.LayerNorm.weight: False
electra.embeddings.LayerNorm.bias: False
electra.encoder.layer.0.attention.self.query.weight: False
electra.encoder.layer.0.attention.self.query.bias: False
electra.encoder.layer.0.attention.self.key.weight: False
electra.encoder.layer.0.attention.self.key.bias: False
electra.encoder.layer.0.attention.self.value.weight: False
electra.encoder.layer.0.attention.self.value.bias: False
electra.encoder.layer.0.attention.output.dense.weight: False
electra.encoder.layer.0.attention.output.dense.bias: False
electra.encoder.layer.0.attention.output.LayerNorm.weight: False
electra.encoder.layer.0.attention.output.LayerNorm.bias: False
electra.encoder.layer.0.intermediate.dense.weight: False
electra.encoder.layer.0.intermediate.dense.bias: False
electra.encoder.layer.0.output.dense.weight: False
elec

# Load the SST-2 (English) dataset

In [4]:
def _read_sst2_split(tsv_path):
    """Read one tab-separated SST-2 file into a DataFrame with columns 'labels' and 'sentence'.

    Column names are supplied explicitly, so the first line of the file is read as data.
    """
    return pd.read_csv(tsv_path, delimiter='\t', names=['labels','sentence'])


# The dataset ships already split into train / dev / test files; load each one
train_data = _read_sst2_split("SST-2_datasets/train.tsv")
val_data = _read_sst2_split("SST-2_datasets/dev.tsv")
test_data = _read_sst2_split("SST-2_datasets/test.tsv")

In [5]:
# Run the project helper `encoded_data` with the shared tokenizer on each split, in train / val / test order
train_encoded_data, val_encoded_data, test_encoded_data = (
    encoded_data(tokenizer, split_frame)
    for split_frame in (train_data, val_data, test_data)
)

In [6]:
# Pair each split's encodings with its source DataFrame and build a dataset via `create_dataset`
train_dataset, val_dataset, test_dataset = (
    create_dataset(split_encodings, split_frame, device)
    for split_encodings, split_frame in (
        (train_encoded_data, train_data),
        (val_encoded_data, val_data),
        (test_encoded_data, test_data),
    )
)

In [7]:
# Same batch size for every split
BATCH_SIZE = 32

# Only the training split is shuffled; validation and test batches keep dataset order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

# Fine-tuning

In [8]:
# Training hyperparameters
num_epochs = 5
LEARNING_RATE = 6.68561343998775e-5
ADAM_EPSILON = 1e-8

# AdamW optimizer over the model's parameters
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=ADAM_EPSILON)

# Linear schedule with no warmup, sized to (batches per epoch) x (epochs)
# NOTE(review): assumes `trainer` steps the scheduler once per batch — confirm in utils
total_steps = num_epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [9]:
# Training Electra model
# Metric histories handed to `trainer` (presumably filled in place, one entry per epoch — confirm in utils)
train_loss = []
val_losses = []
val_accuracies = []

# perf_counter is a monotonic clock, so the measured duration cannot be skewed by
# system clock adjustments during the long training run.
start = time.perf_counter()
trainer(train_loss, val_losses, val_accuracies, num_epochs, train_loader, val_dataset, val_loader, model, optimizer, scheduler, device)
print("Time", time.perf_counter() - start)


--------------------------------------------
Epoch 1 / 5
Training loss:  0.044461920857429504
Validation loss:  0.2019630494926657
Validation accuracy:  0.926605504587156

--------------------------------------------
Epoch 2 / 5
Training loss:  0.007444287650287151
Validation loss:  0.1705740226233112
Validation accuracy:  0.9403669724770642

--------------------------------------------
Epoch 3 / 5
Training loss:  0.002492310479283333
Validation loss:  0.18429338436440698
Validation accuracy:  0.9541284403669725

--------------------------------------------
Epoch 4 / 5
Training loss:  0.0007966127595864236
Validation loss:  0.21045066969860013
Validation accuracy:  0.9529816513761468

--------------------------------------------
Epoch 5 / 5
Training loss:  0.0006084076012484729
Validation loss:  0.21496401263824996
Validation accuracy:  0.9495412844036697
Time 1007.3376495838165


In [10]:
# save the model
# torch.save does not create missing parent folders; make sure the checkpoint directory
# exists so a missing folder cannot throw away the training run above.
os.makedirs('electra_models', exist_ok=True)
torch.save(model.state_dict(), 'electra_models/transformerELECTRA-11.pt')

#### Test the model on the test set

In [11]:
# Rebuild the classifier from the pretrained checkpoint, then overwrite its weights
# with the fine-tuned state dict saved earlier in this notebook
pretrained_name = "google/electra-base-discriminator"
checkpoint_path = 'electra_models/transformerELECTRA-11.pt'

model = AutoModelForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)
model.load_state_dict(
    torch.load(checkpoint_path, map_location=device)
)

model.to(device)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.d

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [12]:
# Evaluate the reloaded model on the test split with the project helper `testing`;
# the recorded output shows it printing test loss, test accuracy and elapsed time.
testing(model, test_loader, test_data, device)

Test loss 0.22678174592836417
Test accuracy: 94.73%
Time 25.322784185409546
