In [1]:
import os

import torch
import torch.optim as optim
from utils import *
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup

# Seed PyTorch's random number generators before anything stochastic runs.
# Later cells randomly initialise the classification head and shuffle the
# training batches, so without a fixed seed the reported metrics change on
# every Restart & Run All.
# NOTE: some GPU kernels may still be non-deterministic; the seed removes the
# main sources of run-to-run variance but is not a bit-exact guarantee.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Load ELECTRA model and tokenizer

In [2]:
# Both the tokenizer and the classifier come from the same pretrained
# ELECTRA discriminator checkpoint, so name it once.
electra_checkpoint = "google/electra-base-discriminator"

# Tokenizer matching the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(electra_checkpoint)

# Sequence-classification model with a 2-label head on top of the encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    electra_checkpoint,
    num_labels=2,
)

# Move the model to the selected device (the returned module is displayed).
model.to(device)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [3]:
# Freeze the model except for the parts the utils helper re-enables.
# Judging by the helper's name this is the final (classifier) layer plus the
# last 3 encoder blocks -- TODO confirm against the implementation in utils.
unfreeze_last_layer_last_3block(model)

# Print every parameter name with a boolean flag (presumably requires_grad --
# confirm in utils) to check which weights remain frozen.
print_layer(model)

electra.embeddings.word_embeddings.weight: False
electra.embeddings.position_embeddings.weight: False
electra.embeddings.token_type_embeddings.weight: False
electra.embeddings.LayerNorm.weight: False
electra.embeddings.LayerNorm.bias: False
electra.encoder.layer.0.attention.self.query.weight: False
electra.encoder.layer.0.attention.self.query.bias: False
electra.encoder.layer.0.attention.self.key.weight: False
electra.encoder.layer.0.attention.self.key.bias: False
electra.encoder.layer.0.attention.self.value.weight: False
electra.encoder.layer.0.attention.self.value.bias: False
electra.encoder.layer.0.attention.output.dense.weight: False
electra.encoder.layer.0.attention.output.dense.bias: False
electra.encoder.layer.0.attention.output.LayerNorm.weight: False
electra.encoder.layer.0.attention.output.LayerNorm.bias: False
electra.encoder.layer.0.intermediate.dense.weight: False
electra.encoder.layer.0.intermediate.dense.bias: False
electra.encoder.layer.0.output.dense.weight: False
elec

# Load the SST-2 (English) dataset

In [4]:
# Read the already-split SST-2 files (train / dev / test). Each file is
# tab-separated; because column names are passed explicitly, pandas treats the
# first row of each file as data rather than as a header.
sst2_columns = ['labels', 'sentence']
train_data, val_data, test_data = (
    pd.read_csv(tsv_path, delimiter='\t', names=sst2_columns)
    for tsv_path in (
        "SST-2_datasets/train.tsv",
        "SST-2_datasets/dev.tsv",
        "SST-2_datasets/test.tsv",
    )
)

In [5]:
# Run the shared tokenizer over each split, in train / val / test order.
train_encoded_data, val_encoded_data, test_encoded_data = (
    encoded_data(tokenizer, frame)
    for frame in (train_data, val_data, test_data)
)

In [6]:
# Build one dataset per split from its encoded inputs and its source frame,
# passing the target device through to the helper.
train_dataset, val_dataset, test_dataset = (
    create_dataset(encodings, frame, device)
    for encodings, frame in (
        (train_encoded_data, train_data),
        (val_encoded_data, val_data),
        (test_encoded_data, test_data),
    )
)

In [7]:
# Batch every split in groups of 32. Only the training split is shuffled;
# validation and test keep their original order.
train_loader, val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=32, shuffle=reshuffle)
    for split_dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

# Fine-tuning

In [8]:
# Training budget: 5 passes over the training loader; the scheduler needs the
# total number of optimizer steps this amounts to.
num_epochs = 5
total_steps = num_epochs * len(train_loader)

# AdamW over the model's parameters.
# NOTE(review): the learning rate looks like the output of a hyperparameter
# search -- confirm its provenance.
optimizer = optim.AdamW(
    model.parameters(),
    lr=6.68561343998775e-5,
    eps=1e-8,
)

# Linear learning-rate schedule with no warm-up steps, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [9]:
# Fine-tune ELECTRA. The three lists are handed to the trainer so metrics can
# be collected (presumably appended in place per epoch -- confirm in utils).
train_loss, val_losses, val_accuracies = [], [], []

# Wall-clock timing of the whole training run.
start = time.time()
trainer(
    train_loss, val_losses, val_accuracies,
    num_epochs,
    train_loader, val_dataset, val_loader,
    model, optimizer, scheduler,
    device,
)
print("Time", time.time() - start)


--------------------------------------------
Epoch 1 / 5
Training loss:  0.05162408575415611
Validation loss:  0.18470357471544827
Validation accuracy:  0.926605504587156

--------------------------------------------
Epoch 2 / 5
Training loss:  0.08271484822034836
Validation loss:  0.1598846694853689
Validation accuracy:  0.9438073394495413

--------------------------------------------
Epoch 3 / 5
Training loss:  0.04600496590137482
Validation loss:  0.16242578805291227
Validation accuracy:  0.9380733944954128

--------------------------------------------
Epoch 4 / 5
Training loss:  0.029294222593307495
Validation loss:  0.2148058759048581
Validation accuracy:  0.9311926605504587

--------------------------------------------
Epoch 5 / 5
Training loss:  0.04863758385181427
Validation loss:  0.2071909078596426
Validation accuracy:  0.9392201834862385
Time 692.5910575389862


In [10]:
# Save the fine-tuned weights (state_dict only) so they can be reloaded later.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first -- failing here would discard the training run.
checkpoint_path = 'electra_models/transformerELECTRA-5.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

# Evaluate the model on the test set

In [11]:
# Rebuild the same 2-label ELECTRA classifier from the pretrained checkpoint,
# then overwrite its weights with the fine-tuned state_dict saved earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    "google/electra-base-discriminator",
    num_labels=2,
)
finetuned_state = torch.load(
    'electra_models/transformerELECTRA-5.pt',
    map_location=device,  # remap stored tensors onto the current device
)
model.load_state_dict(finetuned_state)

# Move the model to the selected device (the returned module is displayed).
model.to(device)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [12]:
# Evaluate the reloaded model on the test split using the utils helper; as the
# cell output shows, it reports test loss, test accuracy and elapsed time.
testing(model, test_loader, test_data, device)

Test loss 0.1790653596092996
Test accuracy: 94.34%
Time 24.82580542564392
