In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 30 files to the new cache system


  0%|          | 0/30 [00:00<?, ?it/s]

In [3]:
# Maps the label token that starts each line of the utterances file to the
# integer class id used as the training target.
LABELS = {
    '__label__temperature': 0,
    '__label__ethereum': 1,
    '__label__help': 2
}

# Each non-blank line is expected to look like "<label> <utterance text>".
utterances = []
with open('./Telegrambot/utterances.txt', 'r', encoding='utf-8') as data_file:
    for line in data_file:
        line = line.rstrip('\n')
        if not line.strip():
            # A blank line would otherwise become a bogus label and fail the
            # LABELS lookup below with a KeyError.
            continue
        # Split on the first space only: the label is everything before it,
        # the utterance is everything after it (empty if there is no space).
        label, _separator, text = line.partition(' ')
        utterances.append([label, text])

data = pd.DataFrame(utterances, columns=['label', 'text'])

# Translate the textual label into its integer class id. A label that is not
# a key of LABELS raises KeyError here instead of being silently dropped.
data['labels'] = data['label'].apply(lambda v: LABELS[v])

dataset = Dataset.from_pandas(data)

dataset

Dataset({
    features: ['label', 'text', 'labels'],
    num_rows: 33
})

In [4]:
# Tokenizer loaded from the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(records):
    """Tokenize the 'text' field of a batch of records.

    Sequences are padded with padding='max_length' and truncated, so every
    encoded row comes out with the same length.
    """
    texts = records['text']
    return tokenizer(texts, padding='max_length', truncation=True)


# Add the tokenizer outputs as columns, then drop the two string columns so
# that only the tokenizer outputs and the integer 'labels' column remain.
dataset = (
    dataset
    .map(tokenize, batched=True)
    .remove_columns(['text', 'label'])
)
# Have the dataset return torch tensors when indexed.
dataset.set_format('torch')

# Mini-batches of 4 examples, reshuffled on every pass over the data.
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

dataloader

  0%|          | 0/1 [00:00<?, ?ba/s]

<torch.utils.data.dataloader.DataLoader at 0x11a0eac70>

In [5]:
# Pretrained encoder used purely as a fixed feature extractor.
embedding_model = AutoModel.from_pretrained('distilbert-base-uncased')

# Freeze every encoder weight and switch to inference mode (disables dropout).
embedding_model.requires_grad_(False)
embedding_model.eval()


@torch.no_grad()
def embed(input_ids, attention_mask, labels=None):
    """Return one embedding vector per input sequence.

    Runs the frozen encoder without gradient tracking and keeps only the
    hidden state at sequence position 0 of the last layer.

    `labels` is accepted and ignored so that a batch dict which also carries
    a 'labels' entry can be unpacked directly with `embed(**batch)`.
    """
    encoder_output = embedding_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )
    hidden_states = encoder_output.last_hidden_state
    return hidden_states[:, 0, :]


embedding_model

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [6]:
# Sizes of the classification head.
EMBEDDING_DIM = 768   # width of the input embedding vectors
HIDDEN_DIM = 64       # width of the single hidden layer
NUM_CLASSES = 3       # one output score per intent class
DROPOUT_RATE = 0.1

# Small feed-forward head: linear -> LeakyReLU -> dropout -> linear.
# The final layer outputs unnormalised scores (no softmax inside the model).
classifier = nn.Sequential(
    nn.Linear(EMBEDDING_DIM, HIDDEN_DIM),
    nn.LeakyReLU(),
    nn.Dropout(DROPOUT_RATE),
    nn.Linear(HIDDEN_DIM, NUM_CLASSES),
)

classifier

Sequential(
  (0): Linear(in_features=768, out_features=64, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Dropout(p=0.1, inplace=False)
  (3): Linear(in_features=64, out_features=3, bias=True)
)

In [7]:
# Number of passes over the training data. Kept in one place so the loop
# bound and the progress message cannot drift apart.
NUM_EPOCHS = 10

# Only the classifier's parameters are handed to the optimizer.
optimizer = AdamW(classifier.parameters(), lr=0.001)

# Cross-entropy between the classifier's raw output scores and the integer
# class ids stored under 'labels'.
loss = nn.CrossEntropyLoss()

classifier.train()  # training mode: dropout is active
for epoch in range(NUM_EPOCHS):
    for i, batch in enumerate(dataloader):
        labels = batch['labels']
        # The whole batch dict is unpacked into embed(); its result is the
        # input of the classifier.
        embeddings = embed(**batch)

        # Clear gradients left over from the previous step before computing
        # new ones.
        optimizer.zero_grad()
        outputs = classifier(embeddings)
        error = loss(outputs, labels)

        error.backward()
        optimizer.step()

        print(f'Epoch: {epoch+1}/{NUM_EPOCHS}, Batch: {i}, Error: {error.item()}')

# Back to inference mode (dropout disabled) for the cells that follow.
classifier.eval()

Epoch: 1/10, Batch: 0, Error: 1.0798757076263428
Epoch: 1/10, Batch: 1, Error: 1.0646246671676636
Epoch: 1/10, Batch: 2, Error: 1.0509616136550903
Epoch: 1/10, Batch: 3, Error: 1.0036509037017822
Epoch: 1/10, Batch: 4, Error: 1.051424264907837
Epoch: 1/10, Batch: 5, Error: 1.3579702377319336
Epoch: 1/10, Batch: 6, Error: 1.3963444232940674
Epoch: 1/10, Batch: 7, Error: 1.2807953357696533
Epoch: 1/10, Batch: 8, Error: 0.7543627023696899
Epoch: 2/10, Batch: 0, Error: 1.0581711530685425
Epoch: 2/10, Batch: 1, Error: 1.1236261129379272
Epoch: 2/10, Batch: 2, Error: 1.1315765380859375
Epoch: 2/10, Batch: 3, Error: 1.097712755203247
Epoch: 2/10, Batch: 4, Error: 0.9792672991752625
Epoch: 2/10, Batch: 5, Error: 1.0207762718200684
Epoch: 2/10, Batch: 6, Error: 0.9534856677055359
Epoch: 2/10, Batch: 7, Error: 0.9038772583007812
Epoch: 2/10, Batch: 8, Error: 0.8015783429145813
Epoch: 3/10, Batch: 0, Error: 1.0357482433319092
Epoch: 3/10, Batch: 1, Error: 0.8562745451927185
Epoch: 3/10, Batch: 2,

Sequential(
  (0): Linear(in_features=768, out_features=64, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Dropout(p=0.1, inplace=False)
  (3): Linear(in_features=64, out_features=3, bias=True)
)

In [9]:
# Classify a single ad-hoc utterance; no gradients are needed for inference.
with torch.no_grad():
    tokens = tokenizer(
        'can i buy silver',
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    embeddings = embed(**tokens)
    logits = classifier(embeddings)
    # Normalise the scores along the last dimension into class probabilities.
    classes = nn.functional.softmax(logits, dim=-1)

# Display the probabilities together with the index of the largest one.
predicted_class = torch.argmax(classes)
(classes, predicted_class)

(tensor([[0.3021, 0.5700, 0.1280]]), tensor(1))

In [10]:
# Persist the trained classifier. The whole module object is written (not
# just its state_dict) to 'classifier.bin' in the current working directory.
torch.save(obj=classifier, f='classifier.bin')