In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import os
os.chdir("drive/")
os.chdir('My Drive')
os.chdir('Experiment')
os.chdir('TransformerBased')

In [3]:
!pip install wandb



In [4]:
!pip install torchtext portalocker



In [5]:
!pip install transformers tokenizers datasets



In [6]:
!pip install accelerate



In [7]:
from torchtext.datasets import IMDB

train_iter, test_iter = IMDB(split='train'), IMDB(split='test')

In [8]:
RANDOM_STATE = 42
OUTPUT_DIR = 'results'
LOG_DIR = './logs'
model_name = 'distilbert-base-uncased'

In [9]:
import os

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)

In [10]:
import random

random.seed(6)

train_lists = list(train_iter)
test_lists = list(test_iter)

# random sampling
train_lists_small = random.sample(train_lists, 1000)
test_lists_small = random.sample(test_lists, 1000)

print(train_lists_small[0])
print(test_lists_small[0])

(2, "I LOVED this movie! I am biased seeing as I am a huge Disney fan, but I really enjoyed myself. The action takes off running in the beginning of the film and just keeps going! This is a bit of a departure for Disney, they don't spend quite as much time on character development (my husband pointed this out)and there are no musical numbers. It is strictly action adventure. I thoroughly enjoyed it and recommend it to anyone who loves Disney, be they young or old.")
(1, 'This was an abysmal show. In short it was about this kid called Doug who guilt-tripped a lot. Seriously he could feel guilty over killing a fly then feeling guilty over feeling guilty for killing the fly and so forth. The animation was grating and unpleasant and the jokes cheap. <br /><br />It aired here in Sweden as a part of the "Disney time" show and i remember liking it some what but then i turned 13.<br /><br />I never got why some of the characters were green and purple too. What was up with that? <br /><br />Tru

In [11]:
train_labels = []
train_texts = []

test_labels = []
test_texts = []


# train data로부터 label과 text를 꺼냄
for label, text in train_lists_small:
    train_labels.append(1 if label == 2 else 0)
    train_texts.append(text)


# train data로부터 label과 text를 꺼냄
for label, text in test_lists_small:
    test_labels.append(1 if label == 2 else 0)
    test_texts.append(text)

In [12]:
from sklearn.model_selection import train_test_split


train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=RANDOM_STATE
)

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def encode_texts(tokenizer_, texts):
    return tokenizer_(
        texts,
        truncation=True,
        padding=True
    )

train_encodings = encode_texts(tokenizer, train_texts)
val_encodings = encode_texts(tokenizer, val_texts)
test_encodings = encode_texts(tokenizer, test_texts)

In [14]:
import torch


class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx]) for key, val in self.encodings.items()
        }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = IMDBDataset(train_encodings, train_labels)
val_dataset = IMDBDataset(val_encodings, val_labels)
test_dataset = IMDBDataset(test_encodings, test_labels)

In [15]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=LOG_DIR,
    logging_steps=10,
)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [18]:
label_dict = {
    0: 'positive',
    1: 'negative'
}

In [19]:
try:
    import wandb
    from wandb import init, log, join  # test that these are available
except ImportError:
    print("msg")

In [20]:
from transformers import Trainer


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
10,0.6928
20,0.7077
30,0.6923
40,0.6878
50,0.6829
60,0.6781
70,0.675
80,0.6578
90,0.6538
100,0.5866


TrainOutput(global_step=400, training_loss=0.2731656039506197, metrics={'train_runtime': 321.6393, 'train_samples_per_second': 19.898, 'train_steps_per_second': 1.244, 'total_flos': 847791351398400.0, 'train_loss': 0.2731656039506197, 'epoch': 8.0})