In [49]:
import ast
import random
from random import randint
from typing import Dict, Iterable, List, Any, Tuple

import pandas as pd
import torch
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(device)

cuda


In [50]:
# Checkpoint id handed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = 'kykim/bert-kor-base'
# Learning rate for the Adam optimizer. Every encoder weight is fine-tuned, so
# this must stay small: the BERT authors recommend 5e-5 / 3e-5 / 2e-5, and the
# previous value of 1e-3 is large enough to wipe out the pretrained weights.
L_RATE = 2e-5
# Number of samples per mini-batch used by the DataLoaders.
BATCH_SIZE = 100

In [51]:
# Task-specific sizes used to adapt the pretrained checkpoint.
NUM_LABELS = 30       # width of the classification head
MAX_POSITIONS = 128   # sequences are tokenized with max_length=128
NUM_SEGMENTS = 3      # segment ids 0/1/2 are fed as token_type_ids


def _resized_embedding(pretrained: torch.nn.Embedding, num_rows: int) -> torch.nn.Embedding:
    """Return an embedding table with `num_rows` rows that keeps pretrained rows.

    Rows that exist in `pretrained` are copied over. Any additional rows start
    from the last pretrained row, so they begin in the same region of the
    embedding space instead of at a random point.
    """
    resized = torch.nn.Embedding(num_rows, pretrained.embedding_dim)
    kept = min(num_rows, pretrained.num_embeddings)
    with torch.no_grad():
        resized.weight[:kept].copy_(pretrained.weight[:kept])
        if 0 < kept < num_rows:
            resized.weight[kept:].copy_(pretrained.weight[kept - 1])
    return resized


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# `num_labels` builds the classification head with the right width and records
# it in the config, so a later save_pretrained/from_pretrained round trip works.
model :torch.nn.Module= AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)

# Resize the position / segment tables while preserving the pretrained weights.
# Replacing them with freshly initialised `nn.Embedding`s would discard what
# the checkpoint learned about token order and segments.
_bert_embeddings = model.bert.embeddings
_bert_embeddings.position_embeddings = _resized_embedding(
    _bert_embeddings.position_embeddings, MAX_POSITIONS
)
_bert_embeddings.token_type_embeddings = _resized_embedding(
    _bert_embeddings.token_type_embeddings, NUM_SEGMENTS
)
# NOTE(review): some transformers versions keep a `position_ids` buffer sized to
# the original maximum length; trim it so it agrees with the resized table.
if getattr(_bert_embeddings, 'position_ids', None) is not None:
    _bert_embeddings.position_ids = _bert_embeddings.position_ids[:, :MAX_POSITIONS].clone()

# Keep the config in sync with the resized tables so the saved model reloads.
model.config.max_position_embeddings = MAX_POSITIONS
model.config.type_vocab_size = NUM_SEGMENTS

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [52]:
def _entity_word(serialized: str) -> str:
    """Extract the 'word' field from an entity cell stored as a dict literal.

    `ast.literal_eval` only accepts Python literals, so a malformed or
    malicious cell raises an error instead of executing code as `eval` would.
    """
    return ast.literal_eval(serialized)['word']


# Load the training table and reduce both entity columns to their surface word.
df = pd.read_csv('../dataset/train/train.csv')
df['subject_entity'] = df['subject_entity'].map(_entity_word)
df['object_entity'] = df['object_entity'].map(_entity_word)
df.sample(5)

Unnamed: 0,id,sentence,subject_entity,object_entity,label,source
30689,30689,소크라치스 브라질레이루 삼파이우 지 소자 비에이라 지 올리베이라(1954년 2월 1...,소크라치스,2011년 12월 4일,per:date_of_death,wikipedia
19189,19189,"해당 웹툰을 접한 커뮤니티 이용자들은 ""웹툰 그리라고 했더니 혼자 '스튜디오 지브리...",미야자키 하야오,모노노케 히메,per:product,wikitree
1478,1478,한편 대한민국 정부는 문화체육관광부 와 한국관광공사 가 대한민국 내 관광산업의 지원...,한국관광공사,문화체육관광부,no_relation,wikipedia
14880,14880,둘 다 프랑스 자동차경주 연맹의 지원을 받았고 세계 랠리 선수권대회의 1600cc카...,시트로앵,자동차,org:product,wikipedia
29563,29563,그녀의 대모(代母)는 당시 스페인 왕비 바텐베르크의 빅토리아 에우헤니아였다.,바텐베르크의 빅토리아 에우헤니아,스페인 왕비,per:title,wikipedia


In [53]:
# Sanity check: show the concrete type of `df` (MyDataset requires a pandas DataFrame).
type(df)

pandas.core.frame.DataFrame

In [54]:
class MyDataset(torch.utils.data.Dataset ):
    """Tokenized relation-extraction samples backed by a pandas DataFrame.

    Every row is rendered as ``subject[SEP]object[SEP]sentence`` and tokenized
    once, up front, with the module-level ``tokenizer``. Items are returned as
    ``(input_ids, segment_ids, attention_mask, label)``.
    """

    # Highest segment id produced; matches a 3-row token-type embedding table.
    MAX_SEGMENT_ID = 2

    def __init__(
        self, 
        df : pd.DataFrame,
        train_mode :bool = False,  # whether this is a labelled (training) set or a test set
        class2idx :Dict[str, int] = None,  # optional shared label -> index mapping
    )->None :
        """Tokenize all rows of ``df`` and, in train mode, encode the labels.

        Args:
            df: frame with 'subject_entity', 'object_entity' and 'sentence'
                columns (plus 'label' when ``train_mode`` is true).
            train_mode: when true, labels are encoded and returned by
                ``__getitem__``; otherwise the label slot is ``None``.
            class2idx: label -> index mapping to use. Pass the same mapping to
                every dataset that must agree on label ids. When omitted, the
                mapping is built from the sorted unique labels of ``df`` so it
                does not depend on row order.

        Raises:
            TypeError: if ``df`` is not a pandas DataFrame.
            KeyError: if a label in ``df`` is missing from ``class2idx``.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"df must be a pandas DataFrame, got {type(df).__name__}")

        self.train_mode = train_mode
        # Read the separator id from the tokenizer instead of hard-coding it.
        self.sep_token_id :int = tokenizer.sep_token_id

        if train_mode :
            if class2idx is None:
                class2idx = {
                    classss : idx for idx, classss in enumerate(
                        sorted(df['label'].unique())
                    )
                }
            self.class2idx :Dict[str, int] = dict(class2idx)
            self.idx2class :Dict[int, str] = {
                idx : classss for classss, idx in self.class2idx.items()
            }
            self.labels :Iterable[int] = df['label'].map(lambda x: self.class2idx[x]).values

        # Column-wise concatenation works for any DataFrame index, unlike a
        # positional loop over `df.loc[0..n-1]`.
        sentence_list :List[str] = (
            df['subject_entity'] + "[SEP]"
            + df['object_entity'] + "[SEP]"
            + df['sentence']
        ).tolist()

        self.sentence_tensor_list :Dict[str , torch.Tensor] = tokenizer(
            sentence_list, 
            padding=True, 
            truncation=True, 
            return_tensors='pt',
            max_length=128
        )

    def __getitem__(self, key :int) -> Tuple[Any, ...]:
        """Return ``(input_ids, segment_ids, attention_mask, label)`` for one sample.

        A token's segment id is the number of [SEP] tokens strictly before it,
        capped at ``MAX_SEGMENT_ID``; a [SEP] token therefore still belongs to
        the segment it closes.
        """
        input_ids = self.sentence_tensor_list['input_ids'][key]

        # Count separators seen *before* each position (exclusive running sum).
        is_sep = (input_ids == self.sep_token_id).long()
        segment_ids = (is_sep.cumsum(dim=-1) - is_sep).clamp(max=self.MAX_SEGMENT_ID)

        # NOTE(review): outside train mode the label slot is None, which the
        # default DataLoader collate function is not expected to batch —
        # confirm how unlabeled datasets are consumed.
        return (
            input_ids,
            segment_ids,
            self.sentence_tensor_list['attention_mask'][key],
            self.labels[key] if self.train_mode else None,
        )

    def __len__(self) :
        """Number of tokenized samples."""
        return len(self.sentence_tensor_list['input_ids'])


In [55]:
# exam_dataset = MyDataset(df=df, train_mode=True)

In [56]:
# exam_dataset[10][0]

In [57]:
# exam_dataset[10][1]


In [58]:

def split_data(seed :int = 42):
    """Bootstrap-split the global ``df`` into train / validation DataLoaders.

    ``len(df)`` row positions are drawn with replacement. The distinct rows
    that were drawn form the training set, and the rows never drawn (the
    out-of-bag rows) form the validation set.

    Args:
        seed: seed for the private random generator. With the default, every
            call yields the same split, so repeated calls (e.g. once per
            epoch) never move validation rows into the training set. Pass
            ``None`` for a different split on each call.

    Returns:
        ``(train_batch, valid_batch)`` DataLoaders over ``MyDataset`` objects
        that share one label -> index mapping.
    """
    total_rows :int = len(df)
    rng = random.Random(seed)

    # `randrange` excludes the upper bound, so every draw is a valid position.
    # `dict.fromkeys` de-duplicates in O(n) while keeping first-draw order.
    train_idx = list(dict.fromkeys(rng.randrange(total_rows) for _ in range(total_rows)))
    drawn = set(train_idx)
    # Membership test against the set keeps row 0 eligible for validation
    # (a truthiness filter would silently discard position 0).
    oob = [position for position in range(total_rows) if position not in drawn]

    # One label encoding derived from the full table, shared by both splits, so
    # a class index means the same thing during training and validation.
    class2idx = {
        name : idx for idx, name in enumerate(sorted(df['label'].unique()))
    }
    idx2class = {idx : name for name, idx in class2idx.items()}

    def _build_dataset(positions):
        """Build a labelled MyDataset from the given row positions of ``df``."""
        frame = df.iloc[positions].reset_index(drop=True)
        dataset = MyDataset(df=frame, train_mode=True)
        # Overwrite the per-dataset label encoding with the shared one.
        dataset.class2idx = class2idx
        dataset.idx2class = idx2class
        dataset.labels = frame['label'].map(lambda name: class2idx[name]).values
        return dataset

    valid_dataset = _build_dataset(oob)
    train_dataset = _build_dataset(train_idx)

    train_batch = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = BATCH_SIZE,
        shuffle=True,
        drop_last=True,
        num_workers=4
    )

    # Validation is evaluated in a fixed order on every sample: no shuffling
    # and no dropped tail batch.
    valid_batch = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size = BATCH_SIZE,
        shuffle=False,
        drop_last=False,
        num_workers=4
    )

    return train_batch, valid_batch


In [59]:
# Move the model to the target device *before* building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is constructed over
# the parameters in their final location.
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=L_RATE )
criterion = torch.nn.CrossEntropyLoss().to(device)

# Last expression of the cell: display the model architecture.
model


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42000, 768, padding_idx=0)
      (position_embeddings): Embedding(128, 768)
      (token_type_embeddings): Embedding(3, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [60]:
# train, valid = split_data()
# for x in train:
#     print(x[1][0:2].dtype)
#     print(x[0][0:2].dtype)

#     break

In [61]:
def _batch_logits(input_ids, token_type_ids, attention_mask):
    """Run the global model on one batch (moved to `device`) and return its logits."""
    return model(
        input_ids=input_ids.to(device),
        token_type_ids=token_type_ids.to(device),
        attention_mask=attention_mask.to(device),
    ).logits


def _macro_f1_and_accuracy(label_list, pred_list):
    """Return ``(macro F1, accuracy)`` for the accumulated labels / predictions."""
    return (
        f1_score(y_true=label_list, y_pred=pred_list, average='macro'),
        accuracy_score(y_true=label_list, y_pred=pred_list),
    )


def lets_train(train_batch=None, valid_batch=None):
    """Train the global model for one epoch, then evaluate it.

    Args:
        train_batch: DataLoader yielding
            ``(input_ids, token_type_ids, attention_mask, label)`` batches.
        valid_batch: DataLoader of the same shape used for evaluation.

        When either loader is omitted, ``split_data()`` is called once and its
        result is cached on this function. Later calls reuse that split, so
        rows evaluated in one epoch are never trained on in another, and the
        data is not re-tokenized every epoch.

    Returns:
        dict with ``train_f1``, ``train_acc``, ``valid_f1`` and ``valid_acc``
        (a value is ``None`` when the corresponding loader yielded no batch).
    """
    if train_batch is None or valid_batch is None:
        if not hasattr(lets_train, '_cached_split'):
            lets_train._cached_split = split_data()
        default_train, default_valid = lets_train._cached_split
        train_batch = default_train if train_batch is None else train_batch
        valid_batch = default_valid if valid_batch is None else valid_batch

    metrics = {'train_f1': None, 'train_acc': None, 'valid_f1': None, 'valid_acc': None}

    # ---- training pass ----
    model.train()
    pred_list = []
    label_list = []
    # Wrapping the loader (not the enumerate) lets tqdm show the batch total.
    for idx, (input_ids, token_type_ids, attention_mask, label) in enumerate(tqdm(train_batch)):
        optimizer.zero_grad()
        pred = _batch_logits(input_ids, token_type_ids, attention_mask)
        loss = criterion(pred, label.to(device))
        loss.backward()
        optimizer.step()

        # Running (epoch-to-date) predictions and labels for the metrics.
        pred_list += torch.argmax(pred.detach(), dim=-1).tolist()
        label_list += label.tolist()
        if idx % 100 ==0 :
            running_f1, running_acc = _macro_f1_and_accuracy(label_list, pred_list)
            print(
                "loss : ", 
                loss.item(), 
                f"\nf1score : {running_f1}",
                f"\nacc : {running_acc}"
            )
    if label_list:
        metrics['train_f1'], metrics['train_acc'] = _macro_f1_and_accuracy(label_list, pred_list)

    # ---- validation pass ----
    model.eval()
    pred_list = []
    label_list = []
    with torch.no_grad():
        for input_ids, token_type_ids, attention_mask, label in tqdm(valid_batch):
            pred = _batch_logits(input_ids, token_type_ids, attention_mask)
            pred_list += torch.argmax(pred, dim=-1).tolist()
            label_list += label.tolist()

    # Report once, after every validation batch has been seen, so the numbers
    # describe the whole validation set rather than its first batch.
    if label_list:
        metrics['valid_f1'], metrics['valid_acc'] = _macro_f1_and_accuracy(label_list, pred_list)
        print(
            f"\nf1score : {metrics['valid_f1']}",
            f"\nacc : {metrics['valid_acc']}"
        )

    return metrics



In [62]:
# Number of passes over the training data, and the mini-batch size.
EPOCH, BATCH_SIZE = 100, 100

if __name__ == "__main__":
    # One training + evaluation round per epoch, each followed by a checkpoint
    # of the whole model object.
    for ep in range(EPOCH):
        print(ep+1, '번째 에폭')
        lets_train()
        checkpoint_path = f'./2021-10-02_BERT_Test({ep+1}_{EPOCH}).pt'
        torch.save(model, checkpoint_path)

1 번째 에폭


0it [00:00, ?it/s]

loss :  3.4718167781829834 
f1score : 0.006238185255198488 
acc : 0.03


64it [00:44,  1.43it/s]


KeyboardInterrupt: 

In [None]:
# Inspect which special tokens the tokenizer defines ([SEP] is the one used to
# join subject, object and sentence in MyDataset).
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [None]:
# Inspect the ids of the special tokens; MyDataset.__getitem__ compares token
# ids against 3 to detect [SEP] — presumably this cell was used to confirm that id.
tokenizer.all_special_ids

[1, 3, 0, 2, 4]

In [None]:
# Write the model's weights and config into the current working directory.
_output_dir = './'
model.save_pretrained(_output_dir)