In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import Adam
import torch.nn.functional as F

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of failing at `model.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and classifier are loaded from the same multilingual cased checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased", do_lower_case=False)
# num_labels=20 sizes the classification head for 20 target classes.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=20)
model.to(device)

In [None]:
from google.colab import drive

# Mount at an absolute path. A later cell changes the working directory to a
# folder under /content/gdrive, so a relative './gdrive' would resolve to a
# different location if this cell were run again afterwards.
drive.mount('/content/gdrive')

In [None]:
cd /content/gdrive/My\ Drive/DeepLearning

In [None]:
# Read the train and test CSV files from the petcharts folder (paths are
# relative to the current working directory).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./petcharts/train.csv', './petcharts/test.csv')
)

# Display the first ten training rows as the cell output.
train_df.head(10)

In [None]:
class PetDataset(Dataset):
  """Dataset yielding ``(cleaned_text, label)`` pairs read from a DataFrame.

  The text and the label are looked up positionally (``iloc``) in the columns
  ``text_col`` and ``label_col``. Every substring matched by ``self.pattern``
  is removed from the text before it is returned.

  Args:
    df: Source DataFrame; one row per sample.
    text_col: Positional index of the text column (default 8).
    label_col: Positional index of the label column (default 10).
  """

  def __init__(self, df, text_col=8, label_col=10):
    self.df = df
    self.text_col = text_col
    self.label_col = label_col
    # Strips ASCII digits, ASCII letters and the listed punctuation.
    # NOTE(review): inside the last character class `$-_` is a *range*
    # (0x24-0x5F), so characters such as `%`, `,`, `=` and `^` are removed as
    # well. Left unchanged to keep the cleaning behaviour identical — confirm
    # whether a literal hyphen was intended.
    self.pattern = r'[0-9]|[a-zA-Z]|[$-_@.&+!*?;:~\(\)\[\]<>/\'"\\]+'

  def __len__(self):
    """Return the number of rows in the underlying DataFrame."""
    return len(self.df)

  def __getitem__(self, idx):
    """Return ``(cleaned_text, label)`` for the row at position ``idx``."""
    text = self.df.iloc[idx, self.text_col]
    label = self.df.iloc[idx, self.label_col]
    # A missing cell (NaN/None) is not a string and would make re.sub raise
    # TypeError inside a DataLoader worker; treat it as empty text instead.
    text = '' if pd.isna(text) else str(text)
    return re.sub(self.pattern, '', text), label

In [None]:
# Wrap the training frame in the dataset and serve it in shuffled batches of
# two samples, loaded by two worker processes.
train_dataset = PetDataset(df=train_df)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=2,
)

In [None]:
optimizer = Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 512
epochs = 64
total_loss = 0
total_len = 0
total_correct = 0

model.train()

for epoch in range(epochs):
  for text, label in train_loader:
    optimizer.zero_grad()
    
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    padded_list = [e[:512] + [0] * (512-len(e[:512])) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    loss, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)
    total_loss += loss.item()
    loss.backward()
    optimizer.step()

    if itr % p_itr == 0:
      print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
      total_loss = 0
      total_len = 0
      total_correct = 0

    itr += 1

In [None]:
model.eval()

test_dataset = PetDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, num_workers=2)

total_loss = 0
total_len = 0
total_correct = 0

for text, label in test_loader:
  encoded_list = [tokenizer.encode(t, add_special_token=True) for t in text]
  padded_list = [e[:512] + [0] * (512-len(e[:512])) for e in encoded_list]
  sample = torch.tensor(padded_list)
  sample, label = sample.to(device), label.to(device)
  labels = torch.tensor(label)
  outputs = model(sample, labels=labels)
  _, logits = outputs
  
  pred = torch.argmax(F.softmax(logits), dim=1)
  print(labels, pred)
  correct = pred.eq(labels)
  total_correct += correct.sum().item()
  total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)