In [1]:
import tensorflow as tf 

# check for GPU 
device_name = tf.test.gpu_device_name()

if device_name == "/device:GPU:0":
  print("Found GPU at {}".format(device_name))
else:
  raise SystemError("GPU device not found")

Found GPU at /device:GPU:0


In [2]:
import torch

if torch.cuda.is_available():
  device = torch.device("cuda")
  print("There are {} GPU(s) available".format(torch.cuda.device_count()))
  print("We will use the GPU", torch.cuda.get_device_name(0))
else: 
  device = torch.device("cpu")
  print("No GPU available, we use the CPU instead")

There are 1 GPU(s) available
We will use the GPU Tesla K80


In [3]:
!pip install transformers 
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from transformers import *



In [0]:
url = "https://raw.githubusercontent.com/ant1code/tweet-sentiment/master/data/train.csv"
df = pd.read_csv(url)[:15000]

df.sentiment = pd.Categorical(df.sentiment)
df["label"]  = df.sentiment.cat.codes 
df.drop(["textID", "selected_text", "sentiment"], axis=1, inplace=True)
df = df[df.text.notna()]

df.text = df.text.str.lower()
max_length = max(df.text.apply(len))

x_train, x_test, y_train, y_test = train_test_split(df.text, df.label, test_size=0.1)

In [0]:
pretrained_weights = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights, do_lower_case=True)

In [0]:
token_ids = [] 
attention_masks = []

for text in x_train.values:
  encoded_dict = tokenizer.encode_plus(
      text, add_special_tokens=True, max_length=max_length,
      pad_to_max_length=True, return_attention_mask=True, return_tensors="pt"
  )

  token_ids.append(encoded_dict["input_ids"])
  attention_masks.append(encoded_dict["attention_mask"])

token_ids = torch.cat(token_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y_train.values) 

In [0]:
from torch.utils.data import TensorDataset, random_split 

dataset = TensorDataset(token_ids, attention_masks, labels)

train_size = int(0.9 * len(dataset))
valid_size = len(dataset) - train_size

train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

In [0]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
    train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size
)

valid_dataloader = DataLoader(
    valid_dataset, sampler=SequentialSampler(valid_dataset), batch_size=batch_size
)

In [9]:
model = DistilBertForSequenceClassification.from_pretrained(
    pretrained_weights, num_labels=3, output_attentions=False, output_hidden_states=False
)

model.cuda()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [0]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [0]:
epochs = 4
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [0]:
import time 
import datetime

def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
  elapsed_rounded = int(round(elapsed))
  return str(datetime.timedelta(seconds=elapsed_rounded))

In [0]:
import random 

seed_val = 10 
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [14]:
training_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs): 
  print("")
  print("======= Epoch {:} / {:} =======".format(epoch_i+1, epochs))
  print("Training...")

  t0 = time.time()

  total_train_loss = 0
  for step, batch in enumerate(train_dataloader):
    if step % 40 == 0 and not step == 0:
      elapsed = format_time(time.time() - t0)
      print("  Batch {:>5,} of {:>5,}.    Elapsed: {:}".format(step, len(train_dataloader), elapsed))

      b_input_ids  = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels     = batch[2].to(device)

      model.zero_grad()

      loss, logits = model(
          input_ids      = b_input_ids, 
          attention_mask = b_input_mask, 
          labels         = b_labels.long()
      ) #DistilBertForSequenceClassification does not take in token_type_ids

      total_train_loss += loss.item() 

      loss.backward() 

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      optimizer.step() 

      scheduler.step() 

  avg_train_loss = total_train_loss / len(train_dataloader)
  
  training_time = format_time(time.time() - t0)

  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epoch took: {:}".format(training_time))

  print("")
  print("Running validation...")

  t0 = time.time() 

  model.eval()

  total_eval_accuracy = 0
  total_eval_loss = 0 
  nb_eval_steps = 0 

  for batch in valid_dataloader: 
    b_input_ids  = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels     = batch[2].to(device)

    with torch.no_grad():
      loss, logits = model(
          b_input_ids, attention_mask=b_input_mask, labels=b_labels.long()
      ) 
    
    total_eval_loss += loss.item()

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to("cpu").numpy()

    total_eval_accuracy += flat_accuracy(logits, label_ids)

  avg_val_accuracy = total_eval_accuracy / len(valid_dataloader)
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

  avg_val_loss = total_eval_loss / len(valid_dataloader)
  print("  Validation loss: {0:.2f}".format(avg_val_loss))
  
  valid_time = format_time(time.time() - t0)

  training_stats.append(
      {
          "epoch": epoch_i+1, 
          "training loss": avg_train_loss, 
          "valid. loss": avg_val_loss, 
          "valid. accuracy": avg_val_accuracy, 
          "training time": training_time, 
          "valid. time": valid_time
      }
  )

print("")
print("Training is complete.")
print("Training time: {:}".format(format_time(time.time() - total_t0)))


Training...
  Batch    40 of   380.    Elapsed: 0:00:00
  Batch    80 of   380.    Elapsed: 0:00:01
  Batch   120 of   380.    Elapsed: 0:00:02
  Batch   160 of   380.    Elapsed: 0:00:02
  Batch   200 of   380.    Elapsed: 0:00:03
  Batch   240 of   380.    Elapsed: 0:00:04
  Batch   280 of   380.    Elapsed: 0:00:05
  Batch   320 of   380.    Elapsed: 0:00:06
  Batch   360 of   380.    Elapsed: 0:00:06

  Average training loss: 0.03
  Training epoch took: 0:00:07

Running validation...
  Accuracy: 0.39
  Validation loss: 1.08

Training...
  Batch    40 of   380.    Elapsed: 0:00:00
  Batch    80 of   380.    Elapsed: 0:00:01
  Batch   120 of   380.    Elapsed: 0:00:02
  Batch   160 of   380.    Elapsed: 0:00:02
  Batch   200 of   380.    Elapsed: 0:00:03
  Batch   240 of   380.    Elapsed: 0:00:04
  Batch   280 of   380.    Elapsed: 0:00:05
  Batch   320 of   380.    Elapsed: 0:00:06
  Batch   360 of   380.    Elapsed: 0:00:06

  Average training loss: 0.03
  Training epoch took: 0: