In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, AdamW, get_scheduler
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [11]:
# Checkpoint identifier handed to both from_pretrained calls below, so the
# tokenizer and the classification model come from the same source.
checkpoint = "vineetsharma/customer-support-intent-albert"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [12]:
import torch

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU, and move
# the model there. The bare `device` at the end displays the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [13]:
# Fixed seed for the train/test split: without it every run draws a different
# held-out set, so the evaluation numbers further down cannot be reproduced.
SPLIT_SEED = 42

# Read the whole CSV as one split.
dataset = load_dataset("csv", data_files="dataset_csv.csv", split='train')

# Encode the "label" column as class ids, then hold out 20% of the rows for
# testing, stratified on that column.
full_dataset = dataset.class_encode_column("label").train_test_split(
    test_size=0.2,
    stratify_by_column="label",
    seed=SPLIT_SEED,
)

In [14]:
def tokenize_function(example):
    """Tokenize the "prompt" column of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    NOT padded and NOT converted to tensors here. Padding is left to the
    DataCollatorWithPadding used by the DataLoaders, which pads each training
    batch only to the longest sequence in that batch. Padding inside `map`
    would instead pad every example to the longest prompt in the whole map
    batch and make the collator's dynamic padding a no-op.

    Args:
        example: a batch from `Dataset.map(..., batched=True)`; only its
            "prompt" entry is read.

    Returns:
        The tokenizer output for the prompts (one entry per example).
    """
    return tokenizer(example["prompt"], truncation=True)


tokenized_datasets = full_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map: 100%|██████████| 200/200 [00:00<00:00, 2766.03 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 1664.79 examples/s]


In [15]:
# Collator built from the tokenizer; it is passed as `collate_fn` to both
# DataLoaders so that each batch is padded when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# Drop the raw text column and rename the target column to "labels" so that a
# collated batch can be unpacked straight into `model(**batch)`.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["prompt"])
    .rename_column("label", "labels")
)
# Return torch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")
# Display the remaining columns as a sanity check.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [17]:
# Training batches are shuffled; evaluation batches keep the dataset order.
# Both loaders use batches of 8 and the shared padding collator.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=8,
    collate_fn=data_collator,
)

In [18]:
# A single class id (1) with a leading batch dimension, i.e. shape (1, 1).
# NOTE(review): no other visible cell reads `labels` - possibly a leftover.
labels = torch.tensor([[1]])

In [19]:
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# and no longer available in recent releases. eps and weight_decay are spelled
# out to match the defaults of the transformers implementation (1e-6 and 0.0),
# because torch.optim.AdamW defaults to 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0
)

# Make sure the model sits on the same device the batches are moved to below.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

num_epochs = 5
# One scheduler step per optimizer step, so the linear decay reaches zero on
# the final batch of the final epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # The collated batch includes "labels", so the model returns a loss.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# Close the bar so its final state is rendered with this cell instead of being
# left one step short of 100%.
progress_bar.close()

  0%|          | 0/125 [00:00<?, ?it/s]You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


 99%|█████████▉| 124/125 [00:10<00:00, 12.59it/s]

In [20]:
import evaluate

# Accumulate accuracy over the held-out split, one batch at a time.
metric = evaluate.load("accuracy")
model.eval()
with torch.no_grad():  # inference only - no gradients needed
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 1.0}

In [34]:
# Class id -> intent name.
# NOTE(review): assumes these ids match the encoding produced by
# class_encode_column("label") - confirm against the dataset's label feature.
intent_map = {0: "compose_mail", 1: "delete_mail", 2: "read_next", 3: "star_mail", 4: "view_inbox"}
# Sample utterances kept for ad-hoc checks; this cell only classifies `input_one`.
input_text = ["I want to write an email. Can you help me?", "Please get rid of this mail.", "Read the following mail.", "Can you please star this email?", "I'd like to take a look at my inbox."]
input_one = " this"
# Send the inputs to whatever device the model is on instead of hard-coding
# "cuda", which raises on machines without a GPU.
encoded_input = tokenizer(input_one, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model(**encoded_input)
    logits = output.logits
    # Single input, so argmax yields one class id.
    predicted_intent = logits.argmax(-1).item()
    print(f"{input_one} -> {intent_map[predicted_intent]}")

remove this -> delete_mail


In [22]:
# Class id -> intent name.
# NOTE(review): assumes these ids match the encoding produced by
# class_encode_column("label") - confirm against the dataset's label feature.
intent_map = {0: "compose_mail", 1: "delete_mail", 2: "read_next", 3: "star_mail", 4: "view_inbox"}
# Classify every raw prompt of the held-out split, one prompt at a time.
input_text = [item['prompt'] for item in full_dataset['test']]
for item in input_text:
    # Follow the model's device rather than hard-coding "cuda", which raises
    # on machines without a GPU.
    encoded_input = tokenizer(item, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model(**encoded_input)
        logits = output.logits
        predicted_intent = logits.argmax(-1).item()
    print(f"{item} -> {intent_map[predicted_intent]}")

Tag up this message with a star. -> star_mail
Can you get to the next message? -> read_next
I have to write an email. -> compose_mail
Can we bookmark the email? -> star_mail
Can you access the next message? -> read_next
Let's get to the next message. -> read_next
I need to flag up the message with a star. -> star_mail
Remove this email. -> delete_mail
Go ahead and star the email. -> star_mail
I’d like to compose an email. -> compose_mail
I need to discard the message. -> delete_mail
Can you advance forward towards reading out loud, my following mail? -> read_next
Go through my mail -> view_inbox
Can you help me send a mail? -> compose_mail
Can you annihilate the message from my inbox. -> delete_mail
Can you show the email messages? -> view_inbox
I'd like to wipe out the message. -> delete_mail
Move forward towards reading out loud, my following mail. -> read_next
Can you show me what’s in my mailbox? -> view_inbox
Label this message as starred. -> star_mail
I need to check my inbox -> 

In [23]:
# Write the fine-tuned model to the local "trained_model" directory; a later
# cell reloads it from there with from_pretrained.
model.save_pretrained("trained_model")

In [24]:
# Write the tokenizer files to the local "trained_tokenizer" directory; the
# returned tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained("trained_tokenizer")

('trained_tokenizer\\tokenizer_config.json',
 'trained_tokenizer\\special_tokens_map.json',
 'trained_tokenizer\\tokenizer.json')

In [25]:
# Reload the fine-tuned weights from disk to verify that the saved artefacts work.
new_model = AutoModelForSequenceClassification.from_pretrained("trained_model")
# `is_available` has to be *called*: the bare function object is always truthy,
# which selected "cuda" even on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
new_model.to(device=device)

AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [26]:
# Reload the tokenizer from the directory it was saved to, for use with the
# reloaded model.
new_tokenizer = AutoTokenizer.from_pretrained("trained_tokenizer")

In [27]:
# Class id -> intent name.
# NOTE(review): assumes these ids match the encoding produced by
# class_encode_column("label") - confirm against the dataset's label feature.
intent_map = {0: "compose_mail", 1: "delete_mail", 2: "read_next", 3: "star_mail", 4: "view_inbox"}
# Re-run the held-out prompts through the model and tokenizer reloaded from
# disk, to check that the saved artefacts predict as expected.
input_text = [item['prompt'] for item in full_dataset['test']]
for item in input_text:
    # Follow the reloaded model's device rather than hard-coding "cuda", which
    # raises on machines without a GPU.
    encoded_input = new_tokenizer(item, return_tensors="pt").to(new_model.device)
    with torch.no_grad():
        output = new_model(**encoded_input)
        logits = output.logits
        predicted_intent = logits.argmax(-1).item()
    print(f"{item} -> {intent_map[predicted_intent]}")

Tag up this message with a star. -> star_mail
Can you get to the next message? -> read_next
I have to write an email. -> compose_mail
Can we bookmark the email? -> star_mail
Can you access the next message? -> read_next
Let's get to the next message. -> read_next
I need to flag up the message with a star. -> star_mail
Remove this email. -> delete_mail
Go ahead and star the email. -> star_mail
I’d like to compose an email. -> compose_mail
I need to discard the message. -> delete_mail
Can you advance forward towards reading out loud, my following mail? -> read_next
Go through my mail -> view_inbox
Can you help me send a mail? -> compose_mail
Can you annihilate the message from my inbox. -> delete_mail
Can you show the email messages? -> view_inbox
I'd like to wipe out the message. -> delete_mail
Move forward towards reading out loud, my following mail. -> read_next
Can you show me what’s in my mailbox? -> view_inbox
Label this message as starred. -> star_mail
I need to check my inbox -> 

100%|██████████| 125/125 [00:29<00:00, 12.59it/s]