In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [3]:
# Maps the label token that starts each line of utterances.txt to its integer class id.
LABELS = {
    '__label__temperature': 0,
    '__label__ethereum': 1,
    '__label__help': 2
}

# Each line is "<label> <utterance text>". The line terminator is stripped first so a
# label-only line does not keep a trailing '\n' inside its label, the line is split on
# the first space only (the text itself may contain spaces), and blank lines are
# skipped because they would otherwise surface as a KeyError in the label lookup below.
with open('./utterances.txt', 'r', encoding='utf-8') as data_file:
    lines = [
        list(line.rstrip('\n').partition(' ')[::2])
        for line in data_file
        if line.strip()
    ]

data = pd.DataFrame(lines, columns=['label', 'text'])

# Integer targets; an unknown label raises KeyError instead of being silently dropped.
data['labels'] = data['label'].apply(lambda v: LABELS[v])

dataset = Dataset.from_pandas(data)

dataset

Dataset({
    features: ['label', 'text', 'labels'],
    num_rows: 33
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(records):
    """Tokenize the 'text' entries of a batch of records.

    Args:
        records: mapping with a 'text' key holding the strings to tokenize.

    Returns:
        The tokenizer output for those strings, produced with
        padding='max_length' and truncation enabled.
    """
    texts = records['text']
    return tokenizer(texts, truncation=True, padding='max_length')


# Tokenize in batches, then drop the raw string columns so only the tokenizer
# output and the integer 'labels' column remain.
# NOTE: `dataset` is rebound here and loses its 'text' column, so this cell cannot
# be re-run on its own; re-create `dataset` first.
dataset = dataset.map(tokenize, batched=True).remove_columns(['text', 'label'])
dataset.set_format('torch')

dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

dataloader

  0%|          | 0/1 [00:00<?, ?ba/s]

<torch.utils.data.dataloader.DataLoader at 0x7f17908208>

In [5]:
# Load the pretrained encoder onto the target device and use it as a fixed feature
# extractor: gradients are disabled on every parameter and the module is put in
# eval mode.
embedding_model = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
embedding_model.requires_grad_(False)
embedding_model.eval()


@torch.no_grad()
def embed(input_ids, attention_mask, labels=None):
    """Return one embedding vector per input sequence.

    Runs the frozen encoder and keeps the hidden state at sequence position 0
    (presumably the [CLS] token for this tokenizer -- confirm).

    Args:
        input_ids: token ids, forwarded to the encoder.
        attention_mask: attention mask, forwarded to the encoder.
        labels: ignored; accepted so that a batch dict which also carries
            'labels' can be passed as ``embed(**batch)``.

    Returns:
        ``last_hidden_state[:, 0, :]`` computed with gradient tracking disabled.
    """
    encoder_output = embedding_model(input_ids=input_ids, attention_mask=attention_mask)
    return encoder_output.last_hidden_state[:, 0, :]


embedding_model

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [6]:
# Small feed-forward head: 768-dim input -> 64 hidden units -> 3 output logits,
# created and moved to the target device in one expression.
classifier = nn.Sequential(
    nn.Linear(in_features=768, out_features=64),
    nn.LeakyReLU(),
    nn.Dropout(p=0.1),
    nn.Linear(in_features=64, out_features=3),
).to(device)

classifier

Sequential(
  (0): Linear(in_features=768, out_features=64, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Dropout(p=0.1, inplace=False)
  (3): Linear(in_features=64, out_features=3, bias=True)
)

In [7]:
def to_device(batch):
    """Return a new dict holding every value of `batch` moved to `device`.

    Args:
        batch: mapping whose values all expose a ``.to(device)`` method.

    Returns:
        A dict with the same keys and each value replaced by ``value.to(device)``.
    """
    moved = {}
    for name, tensor in batch.items():
        moved[name] = tensor.to(device)
    return moved

In [8]:
# Number of full passes over the dataloader. Kept in one place so the loop bound
# and the progress message cannot drift apart.
NUM_EPOCHS = 10

# Only the classifier head is optimised; the embedding model's parameters are
# not handed to the optimizer.
optimizer = AdamW(classifier.parameters(), lr=0.001)

loss = nn.CrossEntropyLoss()

classifier.train()
for epoch in range(NUM_EPOCHS):
    for i, batch in enumerate(dataloader):
        batch = to_device(batch)
        labels = batch['labels']
        # The whole batch (including 'labels') is splatted into embed(); embed()
        # accepts a `labels` argument and does not use it.
        embeddings = embed(**batch)

        # Clear gradients left over from the previous step. The optimizer holds
        # exactly classifier.parameters(), so this covers the same tensors as
        # classifier.zero_grad().
        optimizer.zero_grad()
        outputs = classifier(embeddings)
        error = loss(outputs, labels)

        error.backward()
        optimizer.step()

        print(f'Epoch: {epoch+1}/{NUM_EPOCHS}, Batch: {i}, Error: {error.item()}')

# Leave training mode once the loop is done; as the last expression of the cell
# this also displays the module.
classifier.eval()

Epoch: 1/10, Batch: 0, Error: 1.1225906610488892
Epoch: 1/10, Batch: 1, Error: 1.083631992340088
Epoch: 1/10, Batch: 2, Error: 1.0528229475021362
Epoch: 1/10, Batch: 3, Error: 1.0556509494781494
Epoch: 1/10, Batch: 4, Error: 1.1698664426803589
Epoch: 1/10, Batch: 5, Error: 1.0155810117721558
Epoch: 1/10, Batch: 6, Error: 1.127977728843689
Epoch: 1/10, Batch: 7, Error: 1.1153796911239624
Epoch: 1/10, Batch: 8, Error: 1.174458384513855
Epoch: 2/10, Batch: 0, Error: 1.0215530395507812
Epoch: 2/10, Batch: 1, Error: 0.9776033759117126
Epoch: 2/10, Batch: 2, Error: 1.0517340898513794
Epoch: 2/10, Batch: 3, Error: 0.8726167678833008
Epoch: 2/10, Batch: 4, Error: 1.093069076538086
Epoch: 2/10, Batch: 5, Error: 1.1211233139038086
Epoch: 2/10, Batch: 6, Error: 1.0025044679641724
Epoch: 2/10, Batch: 7, Error: 1.0986778736114502
Epoch: 2/10, Batch: 8, Error: 0.8576810359954834
Epoch: 3/10, Batch: 0, Error: 0.8332641124725342
Epoch: 3/10, Batch: 1, Error: 1.090179204940796
Epoch: 3/10, Batch: 2, Er

Sequential(
  (0): Linear(in_features=768, out_features=64, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Dropout(p=0.1, inplace=False)
  (3): Linear(in_features=64, out_features=3, bias=True)
)

In [27]:
# Classify a single utterance: tokenize -> frozen-encoder embedding -> classifier -> softmax.
with torch.no_grad():
    encoded = to_device(
        tokenizer('hello', padding='max_length', truncation=True, return_tensors='pt')
    )
    logits = classifier(embed(**encoded))
    classes = nn.functional.softmax(logits, dim=-1)

# Displayed tuple: (probability at class index 1 for the first row,
#                   argmax over the flattened probabilities).
probabilities = classes.cpu()
probabilities[0, 1].item(), probabilities.argmax().item()

(0.07090286165475845, 2)

In [16]:
# Persist the whole classifier module object (not just a state_dict) to disk.
torch.save(obj=classifier, f='classifier.bin')

In [None]:
# Restore the classifier module saved in 'classifier.bin'.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location remaps the stored tensors onto the current `device`, so a classifier
# saved from a CUDA session can still be restored on a CPU-only machine.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects a fully pickled module -- confirm against the installed version
# and pass weights_only=False there if needed.
classifier = torch.load('classifier.bin', map_location=device)

classifier