In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from dataset_loader import *
from utils import *
from datasets import Dataset, DatasetDict
import datasets
import logging
import warnings
logging.disable(logging.WARNING) 
datasets.disable_progress_bar()
warnings.simplefilter("ignore")

SEED = 84
torch.manual_seed(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
dataset_name = 'corona'
ds = get_ds(dataset_name)

In [3]:
model_name =  'huawei-noah/TinyBERT_General_4L_312D'
folder_name = 'tinybert'

In [8]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

14350874

In [6]:
set_seed()
model = load_model(model_name)
tokenized_data = tokenize_dataset(ds, tokenizer_name=model_name, max_length = 64)
_ = train(model, tokenized_data, path = f'{folder_name}/{dataset_name}', num_train_epochs=10)
#model.save_pretrained(f'{folder_name}/{dataset_name}/model')

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3952,0.262703,0.892344
2,0.2318,0.246501,0.899043


### save best model's folder as 'model'

In [4]:
model = load_model(f'{folder_name}/{dataset_name}/model').to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [119]:
traced_model = torch.jit.load(f"{folder_name}/{dataset_name}/traced.pt")

In [None]:
tokenized_data = tokenize_dataset(ds, tokenizer_name=model_name, max_length = 64)
train(model, tokenized_data, path=f'{folder_name}/{dataset_name}', evaluate = True)['eval_accuracy']

In [7]:
encoded = tokenizer.encode("i love movies", add_special_tokens=True, return_tensors="pt").to(device)
traced_model = torch.jit.trace(model, encoded)

In [8]:
torch.jit.save(traced_model, f"{folder_name}/{dataset_name}/traced.pt")
traced_model = torch.jit.load(f"{folder_name}/{dataset_name}/traced.pt")

In [5]:
with torch.no_grad():
    for i in range(torch.cuda.device_count()):
        device = torch.device(f'cuda:{i}' if torch.cuda.is_available() else 'cpu')
        model = model.to(device).eval()
        encoded = tokenizer.encode("i love movies", add_special_tokens=True, return_tensors="pt").to(device)
        traced_model = torch.jit.trace(model, encoded)
        torch.jit.save(traced_model, f"{folder_name}/{dataset_name}/traced_{i}.pt")

In [26]:
import transformers
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=False)
model = transformers.DistilBertForMaskedLM.from_pretrained('distilbert-base-cased', torchscript=True)
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    for i in range(torch.cuda.device_count()):
        device = torch.device(f'cuda:{i}' if torch.cuda.is_available() else 'cpu')
        model = model.to(device).eval()
        encoded = inputs.to(device)['input_ids']
        traced_model = torch.jit.trace(model, encoded)
        torch.jit.save(traced_model, f"mlm_models/distil_mlm_{i}.pt")

In [10]:
predicting_sentences(model, tokenizer, device, [tokenizer.tokenize("i hate movies")])

tensor([[ 2.6802, -2.5975]], device='cuda:7', grad_fn=<AddmmBackward0>)


array([0])

In [16]:
predicting_sentences(model, tokenizer, device, [tokenizer.tokenize("i love movies")])

tensor([[-1.4474,  1.4066]], device='cuda:0', grad_fn=<AddmmBackward0>)


array([1])

In [13]:
per_class_accuracy(folder_name, model_name, 'corona')

positive accuracy 0.8960138648180243


negative_accuracy 0.9149191132414619


In [18]:
per_class_accuracy(folder_name, model_name, 'sentiment')

positive accuracy 0.9130434782608695
negative_accuracy 0.9261418853255587


In [19]:
per_class_accuracy(folder_name, model_name, 'dilemma')

positive accuracy 0.9247706422018349
negative_accuracy 0.9392789373814042


In [17]:
per_class_accuracy(folder_name, model_name, 'spam')

positive accuracy 0.8930635838150289


negative_accuracy 0.9045771916214119
