In [1]:
import ast
import sys

import numpy as np
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make the project-local helper modules under ./code/ importable.
# Guarded so that re-running this cell does not keep appending duplicate entries to sys.path.
if './code/' not in sys.path:
    sys.path.append('./code/')

In [3]:
# Pick the compute device: the GPU when CUDA is usable, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(device)

cuda


In [4]:
# Checkpoint identifier; the tokenizer cell further down reuses this same name.
MODEL_NAME = "klue/roberta-small"

# Load the bare encoder (no task head) first, then move its weights onto the selected device.
model_base = AutoModel.from_pretrained(MODEL_NAME)
model_base = model_base.to(device)

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [5]:
def _make_head(hidden_size, num_labels, dropout=0.1):
    """Build one classification head: Linear -> Tanh -> Dropout -> Linear.

    The Tanh between the two linear layers is deliberate: without a
    non-linearity, two stacked Linear layers are mathematically a single
    affine map, so the first layer would add parameters but no capacity.

    Args:
        hidden_size: Width of the input features and of the hidden layer.
        num_labels: Number of output logits.
        dropout: Dropout probability applied before the output layer.

    Returns:
        A torch.nn.Sequential mapping (..., hidden_size) -> (..., num_labels).
    """
    return torch.nn.Sequential(
        torch.nn.Linear(hidden_size, hidden_size),
        torch.nn.Tanh(),
        torch.nn.Dropout(p=dropout, inplace=False),
        torch.nn.Linear(hidden_size, num_labels),
    )


# Read the feature width from the loaded encoder instead of hard-coding 768,
# so the heads keep matching if MODEL_NAME is switched to a different size.
HIDDEN_SIZE = model_base.config.hidden_size

# Output sizes match the label maps printed later in this notebook:
# 30 relation labels, 2 subject entity types, 6 object entity types.
REClassifier = _make_head(HIDDEN_SIZE, 30).to(device)
subwordClassifier = _make_head(HIDDEN_SIZE, 2).to(device)
objwordClassifier = _make_head(HIDDEN_SIZE, 6).to(device)

In [6]:
# Read the training split, then fetch the tokenizer published under MODEL_NAME.
train_df = pd.read_csv('dataset/train/train.csv')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [7]:
# Import the project-local data helpers (resolved through the './code/' entry added to sys.path earlier).
import importlib
import load_data_only_marker
# Reload so that edits made to load_data_only_marker.py are picked up without restarting the kernel.
importlib.reload(load_data_only_marker)

<module 'load_data_only_marker' from './code/load_data_only_marker.py'>

In [8]:
# add new type columns
# Each entity cell is a string holding a Python dict literal. Parse it with
# ast.literal_eval instead of eval so that CSV contents can only ever be read
# as data and can never execute code.
train_df['subwordType'] = train_df["subject_entity"].apply(lambda raw: ast.literal_eval(raw)['type'])
train_df['objwordType'] = train_df["object_entity"].apply(lambda raw: ast.literal_eval(raw)['type'])


In [12]:
# Peek at the entity type of the first row's subject; literal_eval parses the
# dict literal safely, without executing anything stored in the CSV.
ast.literal_eval(train_df["subject_entity"][0])['type']

'ORG'

In [16]:
# Give every distinct value an integer id, numbered in order of first appearance in train_df.
main_labels = {name: idx for idx, name in enumerate(pd.unique(train_df["label"]))}
sub_labels = {name: idx for idx, name in enumerate(pd.unique(train_df["subwordType"]))}
obj_labels = {name: idx for idx, name in enumerate(pd.unique(train_df["objwordType"]))}

In [17]:
# Show the three value -> id lookup tables on one line as a quick sanity check.
label_maps = (main_labels, sub_labels, obj_labels)
print(*label_maps)

{'no_relation': 0, 'org:member_of': 1, 'org:top_members/employees': 2, 'org:alternate_names': 3, 'per:date_of_birth': 4, 'org:place_of_headquarters': 5, 'per:employee_of': 6, 'per:origin': 7, 'per:title': 8, 'org:members': 9, 'per:schools_attended': 10, 'per:colleagues': 11, 'per:alternate_names': 12, 'per:spouse': 13, 'org:founded_by': 14, 'org:political/religious_affiliation': 15, 'per:children': 16, 'org:founded': 17, 'org:number_of_employees/members': 18, 'per:place_of_birth': 19, 'org:dissolved': 20, 'per:parents': 21, 'per:religion': 22, 'per:date_of_death': 23, 'per:place_of_residence': 24, 'per:other_family': 25, 'org:product': 26, 'per:siblings': 27, 'per:product': 28, 'per:place_of_death': 29} {'ORG': 0, 'PER': 1} {'PER': 0, 'ORG': 1, 'DAT': 2, 'LOC': 3, 'POH': 4, 'NOH': 5}


In [18]:
# Encode the three string columns as integer ids via the lookup tables.
# Using the dict's own __getitem__ keeps the strict behaviour: an unknown value raises KeyError.
train_df["main_labels"] = train_df["label"].map(main_labels.__getitem__)
train_df["sub_labels"] = train_df["subwordType"].map(sub_labels.__getitem__)
train_df["obj_labels"] = train_df["objwordType"].map(obj_labels.__getitem__)

In [18]:
# Tokenize only the first 32 rows: a small batch for trying out the forward pass.
inp = load_data_only_marker.tokenized_dataset(
    train_df.head(32),
    tokenizer,
)

AttributeError: 'DataFrame' object has no attribute 'to'

In [21]:
# Run the encoder on the tokenized batch and feed its pooled output to the three heads.
input_ids = inp.input_ids.to(device)

# Pass the padding mask when the tokenized batch provides one; without it the
# encoder attends to padding tokens as if they were real input.
# NOTE(review): assumes `inp` exposes `attention_mask` the way a tokenizer
# output does — confirm against load_data_only_marker.tokenized_dataset.
# Falls back to None (the model's default) if it is absent.
attention_mask = getattr(inp, "attention_mask", None)
if attention_mask is not None:
    attention_mask = attention_mask.to(device)

# The model and its inputs already live on `device`, so the outputs do too;
# no further .to(device) calls are needed below.
pooler_output = model_base(input_ids, attention_mask=attention_mask).pooler_output
main_pred = REClassifier(pooler_output)
subword_pred = subwordClassifier(pooler_output)
objword_pred = objwordClassifier(pooler_output)

In [22]:
# Cross-entropy criterion with the library defaults, held in a named local before being bound.
loss_fn = torch.nn.CrossEntropyLoss()
celoss = loss_fn

In [25]:
# Display the integer relation ids of the first 32 rows as a NumPy array.
train_df["main_labels"].to_numpy()[:32]

array([ 0,  0,  1,  2,  0,  3,  0,  4,  5,  6,  7,  8,  2,  9,  0,  6,  0,
        1, 10,  0, 11, 12, 13,  1,  8, 13,  0,  0,  6,  3,  3,  3])

In [28]:
# Cross-entropy between the relation logits and the integer relation ids of the same first 32 rows.
main_targets = torch.from_numpy(train_df["main_labels"].head(32).to_numpy())
main_loss = celoss(main_pred, main_targets.to(device))

array(['〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey Road》에 담은 노래다.',
       '호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으로 재탄생한다.',
       'K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터 관중 유치 성과와 마케팅 성과를 인정받아 ‘풀 스타디움상’과 ‘플러스 스타디움상’을 수상했다.'],
      dtype=object)

In [21]:
# Build a tiny RE_Dataset from the first three sentences and their subject-type ids.
first_sentences = train_df["sentence"].head(3).to_numpy()
first_sub_labels = train_df["sub_labels"].head(3).to_numpy()
a = load_data_only_marker.RE_Dataset(first_sentences, first_sub_labels)

In [27]:
# Tokenize the whole training frame (every row, not just a preview slice).
b = load_data_only_marker.tokenized_dataset(
    train_df,
    tokenizer,
)

In [32]:
# Inspect the labels attribute of the small RE_Dataset `a` built from the first three rows.
a.labels

array([0, 0, 0])