In [1]:
from IPython.display import clear_output

!pip install pytorch_lightning transformers

clear_output()

In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, DistilBertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torchmetrics
import pickle
import os

from tqdm import tqdm
tqdm.pandas()

In [3]:
from typing import Any
import pytorch_lightning as pl
from transformers import get_scheduler
from torch.optim import AdamW, Optimizer

class BERTLightningModule(pl.LightningModule):
    """Lightning wrapper around a DistilBERT sequence classifier.

    The wrapped network is ``distilbert-base-multilingual-cased`` with a
    6-label classification head, optimised with cross-entropy loss.

    Args:
        config: Object exposing ``config.optim.optimizer`` and
            ``config.optim.scheduler`` mappings. They are unpacked as keyword
            arguments into ``AdamW`` and ``transformers.get_scheduler``
            respectively (see ``configure_optimizers``).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.save_hyperparameters()
        self.l1 = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-multilingual-cased', num_labels=6
        )
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask):
        """Return the classification logits produced by the wrapped model."""
        return self.l1(input_ids=input_ids, attention_mask=attention_mask).logits

    def _shared_step(self, batch, stage):
        """Forward a batch, log ``{stage}_loss`` and ``{stage}_acc``, return the loss.

        Args:
            batch: Mapping with the keys ``'ids'``, ``'mask'`` and ``'targets'``.
            stage: Prefix for the logged metric names (``'train'``, ``'val'``
                or ``'test'``).
        """
        ids = batch['ids']
        mask = batch['mask']
        targets = batch['targets']

        logits = self(ids, mask)
        loss = self.loss_function(logits, targets)

        self.log(f'{stage}_loss', loss, prog_bar=True, logger=True)
        self.log(f'{stage}_acc', self.calculate_accuracy(logits, targets), prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Log training loss/accuracy and return the loss for backpropagation."""
        return self._shared_step(batch, 'train')

    def test_step(self, batch, batch_idx):
        """Log test loss/accuracy; nothing is returned."""
        self._shared_step(batch, 'test')

    def validation_step(self, batch, batch_idx):
        """Log validation loss/accuracy; nothing is returned."""
        self._shared_step(batch, 'val')

    def configure_optimizers(self) -> tuple[list[Optimizer], list[dict[str, Any]]]:
        """Build the AdamW optimizer and a per-step learning-rate scheduler.

        Trainable parameters with at least two dimensions use the optimizer's
        configured weight decay; trainable parameters with fewer dimensions
        (e.g. biases) are placed in a group with ``weight_decay`` set to 0.0.
        """
        do_decay, no_decay = [], []
        for param in self.parameters():
            if not param.requires_grad:
                continue
            (do_decay if param.ndim >= 2 else no_decay).append(param)
        param_groups = [{"params": do_decay}, {"params": no_decay, "weight_decay": 0.0}]

        optimizer = AdamW(param_groups, **self.config.optim.optimizer)
        scheduler = get_scheduler(optimizer=optimizer, **self.config.optim.scheduler)
        # "interval": "step" makes Lightning advance the scheduler every
        # optimizer step instead of once per epoch.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

    @staticmethod
    def calculate_accuracy(preds, targets):
        """Return the fraction of rows whose arg-max over dim 1 equals the target."""
        predicted = preds.argmax(dim=1)
        return (predicted == targets).sum().float() / targets.size(0)

    # NOTE(review): the three loader hooks below return module-level globals
    # (`train_loader`, `test_loader`, `val_loader`). They must exist in the
    # notebook namespace before the corresponding Trainer stage is run.
    def train_dataloader(self):
        """Return the notebook-level ``train_loader``."""
        return train_loader

    def test_dataloader(self):
        """Return the notebook-level ``test_loader``."""
        return test_loader

    def val_dataloader(self):
        """Return the notebook-level ``val_loader``."""
        return val_loader

In [4]:
from easydict import EasyDict

# Optimisation settings. `config.optim.optimizer` and `config.optim.scheduler`
# are unpacked as keyword arguments by BERTLightningModule.configure_optimizers.
config = EasyDict({
    "optim": {
        "optimizer": {
            "lr": 1e-4,
            "eps": 1e-6,
            "weight_decay": 1e-6,
        },
        "scheduler": {
            "name": "linear",
            "num_warmup_steps": 1500,
            "num_training_steps": 15000,
        },
    },
})

In [5]:
from google.colab import drive
drive.mount('/content/drive')

checkpoint_path = "/content/drive/MyDrive/study_session/week3/classification_best.ckpt"
try:
    # map_location="cpu" lets a checkpoint that was saved from a GPU load on a
    # CPU-only runtime as well.
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model = BERTLightningModule(config)
    model.load_state_dict(checkpoint['state_dict'])
finally:
    # Unmount even if loading fails so Drive is not left mounted.
    drive.flush_and_unmount()

Mounted at /content/drive


Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Maps the predicted class index to the emotion name printed below.
enc = {0 : 'sadness', 1 : 'joy', 2 : 'love',
       3 : 'anger', 4 : 'fear', 5 : 'surprise'}

example = '''
I recall no one that loved me whole before
And I've not been in love with anybody before
Now, the lies I'm making up
I'm hoping that a day comes when they all become true
And I keep wishing they do
'''

# Tokenize and encode. Calling the tokenizer directly replaces the deprecated
# `encode_plus` and takes the same arguments.
tokens = tokenizer(
    example,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

input_ids = tokens['input_ids'].to(device)
attention_mask = tokens['attention_mask'].to(device)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(input_ids, attention_mask)

predictions = torch.argmax(outputs, dim=1)

print('\n##############')
print('Original Text : ')
print(example)
print('')
print('결과 :', end = ' ')
# A single example was encoded, so take the first (only) prediction as an int.
print(enc[predictions[0].item()])

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]


##############
Original Text : 

I recall no one that loved me whole before
And I've not been in love with anybody before
Now, the lies I'm making up
I'm hoping that a day comes when they all become true
And I keep wishing they do


결과 : love


In [8]:
# Create a plain Hugging Face model with the same architecture, then copy the
# weights of the Lightning module's inner network (`model.l1`) into it.
huggingface_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=6,
)
finetuned_weights = model.l1.state_dict()
huggingface_model.load_state_dict(finetuned_weights)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [9]:
# Local directory that receives both the model files and the tokenizer files.
model_name = "emotion_classification"

huggingface_model.save_pretrained(save_directory=model_name)
tokenizer.save_pretrained(save_directory=model_name)

('emotion_classification/tokenizer_config.json',
 'emotion_classification/special_tokens_map.json',
 'emotion_classification/vocab.txt',
 'emotion_classification/added_tokens.json',
 'emotion_classification/tokenizer.json')

In [10]:
!pip install huggingface

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Installing collected packages: huggingface
Successfully installed huggingface-0.0.1


In [11]:
import os
from getpass import getpass

from huggingface_hub import hf_hub_download, HfApi, HfFolder

# Never paste the access token into the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing it to the output.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face token: ')
HfFolder.save_token(hf_token)

In [13]:
# Upload the model and its tokenizer to the same Hub repository, authenticating
# with the locally stored token.
repo_id = "a2ran/emotion_classification"

huggingface_model.push_to_hub(repo_id, use_auth_token=True)
tokenizer.push_to_hub(repo_id, use_auth_token=True)

pytorch_model.bin:   0%|          | 0.00/541M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/a2ran/emotion_classification/commit/71ad804669f94f9e8c9865acbe8a916d2d77ca2e', commit_message='Upload tokenizer', commit_description='', oid='71ad804669f94f9e8c9865acbe8a916d2d77ca2e', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification

# Load the tokenizer and the classifier back from the Hub repository.
# Note that `model` is rebound here to the plain Hugging Face model.
hub_repo = "a2ran/emotion_classification"

tokenizer = AutoTokenizer.from_pretrained(hub_repo)
model = DistilBertForSequenceClassification.from_pretrained(hub_repo, num_labels=6)

Downloading (…)okenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/869 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/541M [00:00<?, ?B/s]