# INSTALL REQUIREMENTS

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 53.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [2]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 29.5 MB/s eta 0:00:01[K     |▌                               | 20 kB 9.2 MB/s eta 0:00:01[K     |▉                               | 30 kB 8.1 MB/s eta 0:00:01[K     |█                               | 40 kB 7.6 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.5 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.3 MB/s eta 0:00:01[K     |██                              | 71 kB 5.4 MB/s eta 0:00:01[K     |██▏                             | 81 kB 4.1 MB/s eta 0:00:01[K     |██▍                             | 92 kB 4.6 MB/s eta 0:00:01[K     |██▊                             | 102 kB 5.1 MB/s eta 0:00:01[K     |███                             | 112 kB 5.1 MB/s eta 0:00:01[K     |███▎                            | 122 kB 5.1 MB/s eta 0:00:01[K     |███▌         

In [3]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.12.11-py2.py3-none-any.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 32.7 MB/s eta 0:00:01[K     |▍                               | 20 kB 9.7 MB/s eta 0:00:01[K     |▋                               | 30 kB 8.5 MB/s eta 0:00:01[K     |▊                               | 40 kB 7.8 MB/s eta 0:00:01[K     |█                               | 51 kB 4.2 MB/s eta 0:00:01[K     |█▏                              | 61 kB 5.0 MB/s eta 0:00:01[K     |█▎                              | 71 kB 5.3 MB/s eta 0:00:01[K     |█▌                              | 81 kB 5.5 MB/s eta 0:00:01[K     |█▊                              | 92 kB 6.1 MB/s eta 0:00:01[K     |█▉                              | 102 kB 5.1 MB/s eta 0:00:01[K     |██                              | 112 kB 5.1 MB/s eta 0:00:01[K     |██▎                             | 122 kB 5.1 MB/s eta 0:00:01[K     |██▍                             | 133 kB 5.1 MB/s eta 0:00:01[K

# GOOGLE MOUNT

In [4]:
from google.colab import drive
drive.mount('/content/drive')
PATH = './drive/MyDrive/datasets/'

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/groom_project1/

/content/drive/MyDrive/groom_project1


In [6]:
%pwd

'/content/drive/MyDrive/groom_project1'

# IMPORT REQUIREMENTS

In [7]:
import os
import sys
import random
import pickle

import numpy as np
from tqdm import tqdm

import torch
from torch.nn.utils.rnn import pad_sequence

from transformers import (
    AdamW
)

import wandb

from compute import compute_acc
from visualize_score import plot_graph
from dump_datasets import mk_dataset, mk_dataset_xlnet
from dump_models import load_model, load_model_xlnet
from evaluate import test_model

#CREATE FOLDER

In [8]:
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
 
createFolder('./best_models')
createFolder('./dump_datasets')
createFolder('./dump_models_tokenizer')
createFolder('./scores')
createFolder('./submissions')

# FIX SEED

In [9]:
def seed_everything(seed:int = 1004):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore

seed_everything(42)

# MODEL

In [10]:
MODEL_NAME = 'bert-base-uncased'
# MODEL_NAME = 'bert-large-uncased'
# MODEL_NAME = 'xlnet-base-cased'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
assert str(device) == 'cuda'

try:
    with open('./dump_models_tokenizer/' + MODEL_NAME + '.p', 'rb') as f:
        model = pickle.load(f)
        tokenizer = pickle.load(f)
        print('./dump_models_tokenizer/' + MODEL_NAME + '.p')
    print('model exists => just load model')
except:
    print('exeption occur => download model')
    if MODEL_NAME == 'bert-base-uncased':
        model, tokenizer = load_model(MODEL_NAME)
    elif MODEL_NAME == 'xlnet-base-cased':
        model, tokenizer = load_model_xlnet(MODEL_NAME)

model.to(device)

exeption occur => download model


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# HYPER PARAMETERS

In [11]:
TRAIN_BATCH_SIZE=256
EVAL_BATCH_SIZE=256

LEARNING_RATE = 5e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
TRAIN_EPOCH = 3



In [12]:
# wandb.init(project="my-test-project", entity="chohs1221")

# wandb.config = {
#   "learning_rate": LEARNING_RATE,
#   "epochs": TRAIN_EPOCH,
#   "batch_size": TRAIN_BATCH_SIZE
# }
# ########################################
# sweep_config = {'method': 'random'}
# metric = {'name': 'loss',
#     'goal': 'minimize'}
# sweep_config['metric'] = metric
# #########################################
# parameters_dict = {'optimizer': {'values': ['adam', 'sgd']},
#                    'fc_layer_size': {'values': [128, 256, 512]},
#                    'dropout': {'values': [0.3, 0.4, 0.5]},}
# sweep_config['parameters'] = parameters_dict
# #####################################
# parameters_dict.update({
#     'epochs': {
#         'value': 1}
#     })
# #####################################
# import math
# parameters_dict.update({'learning_rate': {'distribution': 'uniform', 'min': 0, 'max': 0.1}, # a flat distribution between 0 and 0.1
#                         'batch_size': {'distribution': 'q_log_uniform', 'q': 1, 'min': math.log(32), 'max': math.log(256),}})   # integers between 32 and 256 with evenly-distributed logarithms
# #####################################
# sweep_id = wandb.sweep(sweep_config, project="pytorch-sweeps-demo")

# LOAD DATASETS

In [14]:
try:
    if  MODEL_NAME == 'bert-base-uncased':
        with open('./dump_datasets/train_dev_dumps.p', 'rb') as f:
            train_pos = pickle.load(f)
            train_neg = pickle.load(f)
            dev_pos = pickle.load(f)
            dev_neg = pickle.load(f)
        print('dataset exists => just load datasets')
    elif  MODEL_NAME == 'xlnet-base-cased':
        with open('./dump_datasets/train_dev_dumps_xlnet.p', 'rb') as f:
            train_pos = pickle.load(f)
            train_neg = pickle.load(f)
            dev_pos = pickle.load(f)
            dev_neg = pickle.load(f)
        print('dataset exists => just load datasets')
except:
    print('exeption occur => make datasets')
    train_pos, train_neg, dev_pos, dev_neg = mk_dataset()
    if MODEL_NAME == 'bert-base-uncased':
        train_pos, train_neg, dev_pos, dev_neg = mk_dataset()
    elif MODEL_NAME == 'xlnet-base-cased':
        train_pos, train_neg, dev_pos, dev_neg = mk_dataset_xlnet()

exeption occur => make datasets


# DATA PREPROCESSING

In [None]:
'''
전처리
'''

# TOKENIZE

In [15]:
train_pos = [tokenizer.encode(line) for line in train_pos]
train_neg = [tokenizer.encode(line) for line in train_neg]
dev_pos = [tokenizer.encode(line) for line in dev_pos]
dev_neg = [tokenizer.encode(line) for line in dev_neg]

# MAKE DATASETS

In [16]:
class SentimentDataset(object):
    def __init__(self, pos, neg):
        self.data = [pos_sent for pos_sent in pos] + [neg_sent for neg_sent in neg]
        self.label = [[1] for _ in range(len(pos))] + [[0] for _ in range(len(neg))]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        return np.array(sample), np.array(self.label[index])

train_dataset = SentimentDataset(train_pos, train_neg)
dev_dataset = SentimentDataset(dev_pos, dev_neg)

# DATA LOADER

In [17]:
def collate_fn_style(samples):
    input_ids, labels = zip(*samples)
    max_len = max(len(input_id) for input_id in input_ids)

    attention_mask = torch.tensor([[1] * len(input_id) + [0] * (max_len - len(input_id)) for input_id in input_ids])
    input_ids = pad_sequence([torch.tensor(input_id) for input_id in input_ids], batch_first=True)
    token_type_ids = torch.tensor([[0] * len(input_id) for input_id in input_ids])
    position_ids = torch.tensor([list(range(len(input_id))) for input_id in input_ids])
    labels = torch.tensor(np.stack(labels, axis=0))

    return input_ids, attention_mask, token_type_ids, position_ids, labels

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=TRAIN_BATCH_SIZE,
                                           shuffle=True, 
                                           collate_fn=collate_fn_style,
                                           pin_memory=True, num_workers=2)

dev_loader = torch.utils.data.DataLoader(dev_dataset, 
                                         batch_size=EVAL_BATCH_SIZE,
                                         shuffle=False, 
                                         collate_fn=collate_fn_style,
                                         num_workers=2)

# TRAIN

In [18]:
lowest_valid_loss = 9999.
highest_valid_acc = 0.
train_acc = []
train_loss = []
valid_acc = []
valid_loss = []

model.train()
for epoch in range(TRAIN_EPOCH):
    with tqdm(train_loader, unit="batch") as tepoch:
        for iteration, (input_ids, attention_mask, token_type_ids, position_ids, labels) in enumerate(tepoch):

            tepoch.set_description(f"Epoch {epoch}")

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            position_ids = position_ids.to(device)
            labels = labels.to(device, dtype=torch.long)

            output = model(input_ids=input_ids,
                           attention_mask=attention_mask,
                           token_type_ids=token_type_ids,
                           position_ids=position_ids,
                           labels=labels)

            loss = output.loss
            
            logits = output.logits
            batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
            batch_labels = [int(example) for example in labels]
            
            acc = compute_acc(batch_predictions, batch_labels)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()


            tepoch.set_postfix(acc=acc, loss=loss.item())
            
            if iteration != 0 and iteration % int(len(train_loader) / 100) == 0:
                train_acc.append(acc)
                train_loss.append(loss.item())

                model.eval()
                with torch.no_grad():
                    val_acc = []
                    val_loss = []
                    for input_ids, attention_mask, token_type_ids, position_ids, labels in dev_loader:
                        input_ids = input_ids.to(device)
                        attention_mask = attention_mask.to(device)
                        token_type_ids = token_type_ids.to(device)
                        position_ids = position_ids.to(device)
                        labels = labels.to(device, dtype=torch.long)

                        output = model(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    token_type_ids=token_type_ids,
                                    position_ids=position_ids,
                                    labels=labels)

                        logits = output.logits
                        batch_predictions = [0 if example[0] > example[1] else 1 for example in logits]
                        batch_labels = [int(example) for example in labels]

                        val_acc.append(compute_acc(batch_predictions, batch_labels))
                        val_loss.append(output.loss)

                mean_val_acc = sum(val_acc) / len(val_acc)
                mean_val_loss = sum(val_loss) / len(val_loss)

                valid_acc.append(mean_val_acc)
                valid_loss.append(mean_val_loss)
                
                # WANDB
                # wandb.log({"train_loss": loss.item(),
                #            'train_acc': acc,
                #            'valid_loss': mean_val_loss,
                #            'valid_acc': mean_val_acc})

                if highest_valid_acc < mean_val_acc:
                    highest_valid_acc = mean_val_acc
                    print('ACCURACY for lowest valid acc: ', mean_val_acc)
                    print('LOSS for lowest valid acc: ', mean_val_loss)
                    model.save_pretrained('./best_models/model' + str(int(mean_val_acc*100)) + str(int(mean_val_loss*1000)))

                elif lowest_valid_loss > mean_val_loss:
                    lowest_valid_loss = mean_val_loss
                    print('ACCURACY for lowest valid loss: ', mean_val_acc)
                    print('LOSS for lowest valid loss: ', mean_val_loss)
                    model.save_pretrained('./best_models/model' + str(int(mean_val_acc*100)) + str(int(mean_val_loss*1000)))
                                        
                model.train()

Epoch 0:   1%|          | 17/1732 [00:09<15:09,  1.89batch/s, acc=0.926, loss=0.177] 

ACCURACY for lowest valid acc:  0.94462890625
LOSS for lowest valid acc:  tensor(0.1564, device='cuda:0')


Epoch 0:   2%|▏         | 34/1732 [00:23<15:40,  1.81batch/s, acc=0.965, loss=0.104] 

ACCURACY for lowest valid acc:  0.96220703125
LOSS for lowest valid acc:  tensor(0.1072, device='cuda:0')


Epoch 0:   3%|▎         | 51/1732 [00:37<14:56,  1.88batch/s, acc=0.941, loss=0.156]

ACCURACY for lowest valid acc:  0.96396484375
LOSS for lowest valid acc:  tensor(0.1071, device='cuda:0')


Epoch 0:   4%|▍         | 68/1732 [00:50<15:01,  1.85batch/s, acc=0.977, loss=0.0929]

ACCURACY for lowest valid loss:  0.959130859375
LOSS for lowest valid loss:  tensor(0.1098, device='cuda:0')


Epoch 0:   5%|▍         | 85/1732 [01:04<14:59,  1.83batch/s, acc=0.965, loss=0.0904]

ACCURACY for lowest valid acc:  0.96962890625
LOSS for lowest valid acc:  tensor(0.0884, device='cuda:0')


Epoch 0:   6%|▌         | 102/1732 [01:17<14:57,  1.82batch/s, acc=0.941, loss=0.163] 

ACCURACY for lowest valid acc:  0.9708984375
LOSS for lowest valid acc:  tensor(0.0855, device='cuda:0')


Epoch 0:   7%|▋         | 119/1732 [01:30<14:30,  1.85batch/s, acc=0.977, loss=0.0503]

ACCURACY for lowest valid loss:  0.966552734375
LOSS for lowest valid loss:  tensor(0.1005, device='cuda:0')


Epoch 0:   8%|▊         | 136/1732 [01:44<14:17,  1.86batch/s, acc=0.961, loss=0.14] 

ACCURACY for lowest valid acc:  0.973193359375
LOSS for lowest valid acc:  tensor(0.0855, device='cuda:0')


Epoch 0:   9%|▉         | 153/1732 [01:58<13:49,  1.90batch/s, acc=0.949, loss=0.0996]

ACCURACY for lowest valid loss:  0.967919921875
LOSS for lowest valid loss:  tensor(0.0871, device='cuda:0')


Epoch 0:  11%|█         | 187/1732 [02:23<14:15,  1.81batch/s, acc=0.969, loss=0.114] 

ACCURACY for lowest valid loss:  0.969775390625
LOSS for lowest valid loss:  tensor(0.0824, device='cuda:0')


Epoch 0:  14%|█▎        | 238/1732 [03:01<13:27,  1.85batch/s, acc=0.98, loss=0.0791] 

ACCURACY for lowest valid loss:  0.972998046875
LOSS for lowest valid loss:  tensor(0.0749, device='cuda:0')


Epoch 0:  15%|█▍        | 255/1732 [03:14<13:03,  1.89batch/s, acc=0.984, loss=0.0454]

ACCURACY for lowest valid acc:  0.973583984375
LOSS for lowest valid acc:  tensor(0.0732, device='cuda:0')


Epoch 0:  16%|█▌        | 272/1732 [03:28<13:07,  1.85batch/s, acc=0.988, loss=0.0304]

ACCURACY for lowest valid acc:  0.97470703125
LOSS for lowest valid acc:  tensor(0.0712, device='cuda:0')


Epoch 0:  17%|█▋        | 289/1732 [03:41<12:56,  1.86batch/s, acc=0.984, loss=0.0644]

ACCURACY for lowest valid loss:  0.97099609375
LOSS for lowest valid loss:  tensor(0.0733, device='cuda:0')


Epoch 0:  18%|█▊        | 306/1732 [03:55<12:46,  1.86batch/s, acc=0.984, loss=0.0818]

ACCURACY for lowest valid loss:  0.974560546875
LOSS for lowest valid loss:  tensor(0.0723, device='cuda:0')


Epoch 0:  20%|█▉        | 340/1732 [04:20<12:13,  1.90batch/s, acc=0.977, loss=0.0467]

ACCURACY for lowest valid loss:  0.97333984375
LOSS for lowest valid loss:  tensor(0.0683, device='cuda:0')


Epoch 0:  23%|██▎       | 391/1732 [04:58<12:21,  1.81batch/s, acc=0.973, loss=0.121]

ACCURACY for lowest valid acc:  0.9748046875
LOSS for lowest valid acc:  tensor(0.0714, device='cuda:0')


Epoch 0:  26%|██▌       | 442/1732 [05:37<11:44,  1.83batch/s, acc=0.98, loss=0.0525]

ACCURACY for lowest valid acc:  0.97578125
LOSS for lowest valid acc:  tensor(0.0651, device='cuda:0')


Epoch 0:  27%|██▋       | 476/1732 [06:03<11:06,  1.88batch/s, acc=0.969, loss=0.0776]

ACCURACY for lowest valid acc:  0.97626953125
LOSS for lowest valid acc:  tensor(0.0650, device='cuda:0')


Epoch 0:  28%|██▊       | 486/1732 [06:12<15:56,  1.30batch/s, acc=0.969, loss=0.0981]


KeyboardInterrupt: ignored

# SAVE SCORES

In [None]:
accloss_filename = 'accloss' + str(int(mean_val_acc*100)) + str(int(mean_val_loss*1000)) + '.p'
with open('./scores/' + accloss_filename,'wb') as f:
    pickle.dump(train_acc, f)
    pickle.dump(train_loss, f)
    pickle.dump(valid_acc, f)
    pickle.dump(valid_loss, f)

# TEST

In [None]:
test_model(model, tokenizer, mean_val_acc, mean_val_loss, file_name = 'test_no_label', device='cuda')

# SCORE VISUALIZE

In [None]:
plot_graph(accloss_filename)