In [2]:
import torch
import pandas as pd
import sklearn
import random
import numpy as np
import wandb

from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from transformers import DebertaV2ForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification

from dataset.datasets import SentimentalDataset
from metrics.metrics import compute_metrics

from sklearn.datasets import load_iris # 샘플 데이터 로딩
from sklearn.model_selection import train_test_split

from utils.utils import config_seed

import json
import re

  from .autonotebook import tqdm as notebook_tqdm


### 설정

In [4]:
SEED = 42
config_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

#### 모델 및 토크나이저

In [5]:
MODEL_NAME = 'klue/roberta-large'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

special_tokens_dict = {'additional_special_tokens': ['[COMPANY]','[/COMPANY]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
Y

Embedding(32002, 1024)

In [5]:
# MODEL_NAME = "team-lucid/deberta-v3-base-korean"

# model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

### 데이터셋 구성

#### 1) 기사 전체 학습

In [6]:
# data = pd.read_csv('/opt/ml/finance_sentiment_corpus/merged_samsung_filtered.csv')

# def extract_label(json_str) :
#     data_dict = eval(json_str)  # JSON 문자열을 파이썬 딕셔너리로 변환
#     return data_dict["label"]

# # "label" 값을 추출하여 새로운 Series 생성
# data['labels'] = data["labels"].apply(extract_label)
# data['labels'] = data['labels'].map({'부정':0, '긍정':1})

#### 2) 기사 앞뒤 학습

In [122]:
data = pd.read_csv("/opt/ml/finance_sentiment_corpus/merged/merged_all.csv")
data.head()

Unnamed: 0,company,title,date,content_corpus,labels
0,삼성전자,"데이터센터 전력 40% 차지하는 D램… 삼성·SK하이닉스, ‘전성비’ ...",2023.07.10 15:29,"챗GPT 시대 화두로 떠오른 전력효율성 문제 ”전력 먹는 하마, D램 전력효율성 개...","{""label"": ""긍정"", ""reason"": ""전력효율성 개선을 위한 솔루션 개발..."
1,삼성전자,“삼성전자가 식품도 팔았어?”…신규 가입 일단 종료한 사연,2023.07.10 15:07,삼성 가전제품 구매고객에 삼성닷컴 내 e-식품관에서 할인혜택 주며 ‘락인’ 기대 ...,"{""label"": ""부정"", ""reason"": ""삼성전자 멤버십 플랜 신규 고객 모..."
2,삼성전자,SGC솔루션 '도어글라스'…삼성·LG 세탁기·건조기에 공급,2023.07.10 15:05,해외 가전 브랜드 공략…B2B 사업 확장 SGC솔루션 논산 공장. . 생활유리...,"{""label"": ""긍정"", ""reason"": ""해외 가전 브랜드들을 공략하며 전 ..."
3,삼성전자,‘페이커’ 내세운 삼성 OLED 게이밍 모니터 글로벌 3천대 돌파,2023.07.10 14:58,북미·유럽 등 예약 판매 3000대 돌파 10일 오후 6시 삼성닷컴 ‘페이커’ 출연...,"{""label"": ""긍정"", ""reason"": ""오디세이 OLED G9의 글로벌 예..."
4,삼성전자,"네이처 게재 등 성과…삼성휴먼테크논문대상, 30년 맞았다",2023.07.10 14:48,29년간 3만6558편 논문 접수…수상자 5312명 9월1일부터 올해 대상 접수…상...,"{""label"": ""긍정"", ""reason"": ""삼성휴먼테크논문대상이 올해로 시행 ..."


In [3]:
data = pd.read_csv("/opt/ml/finance_sentiment_corpus/merged/merged_all.csv")
# data = pd.read_csv("/opt/ml/finance_sentiment_corpus/merged/merged_NAVER.csv")

def remove_idx_row(data) : 
    patterns = [r'idx\s*:?\s*.+?', r'라벨링\s*:?\s*.+?']
    
    for pattern in patterns :
        mask = data['labels'].str.match(pattern)
        data = data.drop(data[mask].index)

    return data
    
# title과 content_corpus에서 원하는 문장 추출
def extract_sentences(text):
    sentences = text.split('. ')
    if len(sentences) >= 5 :
        return '. '.join([sentences[0], sentences[1], sentences[-2], sentences[-1]])
    else :
        return '. '+text
    
def extract_label(json_str) :
    json_str = json_str.replace("'", "\"")
    try:
        json_data = json.loads(json_str)

    except json.JSONDecodeError as e:
        if json_str[-2:] == '.}' :
            json_str = json_str[:-2] + ".\"}"
        elif json_str[-1] == "\"" :
            json_str = json_str + "}"
        else:
            json_str += "\"}"
    
    try:
        data_dict = json.loads(json_str)
    except json.JSONDecodeError as e:
        return None

    return data_dict["label"]
            
def preprocessing_label(json_str) :
    json_str = re.sub(r"^.*### 출력\s?:?\s?\n?\s?", "", str(json_str))
    # json_str= json_str.replace("\"", "'")
    json_str = json_str.replace("'", "\\'") # python interpreter에서 한번, json에서 한번
    
    return json_str

data = remove_idx_row(data)

# "label" column만 있다면 바꾸자.
if "label" in data.columns and "labels" not in data.columns  :
    data["labels"] = data["label"]

data["labels"] = data["labels"].apply(preprocessing_label)
data['labels'] = data["labels"].apply(extract_label)
data['content_corpus_company'] = data.apply(lambda row: '이 기사는 [COMPANY]'+ str(row["company"]) +'[/COMPANY]에 대한 기사. [SEP]'+ extract_sentences(row['title']) + ' ' + extract_sentences(row['content_corpus']), axis=1)
data['labels'] = data['labels'].map({'부정':0, '긍정':1})
data = data[["title", "date", "content_corpus", "labels", "content_corpus_company"]]
print(data.shape)
data = data[data['labels'].notna()]
print(data.shape)
data.head()

(3778, 5)
(3777, 5)


Unnamed: 0,title,date,content_corpus,labels,content_corpus_company
0,"데이터센터 전력 40% 차지하는 D램… 삼성·SK하이닉스, ‘전성비’ ...",2023.07.10 15:29,"챗GPT 시대 화두로 떠오른 전력효율성 문제 ”전력 먹는 하마, D램 전력효율성 개...",1.0,이 기사는 [COMPANY]삼성전자[/COMPANY]에 대한 기사. [SEP]. 데...
1,“삼성전자가 식품도 팔았어?”…신규 가입 일단 종료한 사연,2023.07.10 15:07,삼성 가전제품 구매고객에 삼성닷컴 내 e-식품관에서 할인혜택 주며 ‘락인’ 기대 ...,0.0,이 기사는 [COMPANY]삼성전자[/COMPANY]에 대한 기사. [SEP]. “...
2,SGC솔루션 '도어글라스'…삼성·LG 세탁기·건조기에 공급,2023.07.10 15:05,해외 가전 브랜드 공략…B2B 사업 확장 SGC솔루션 논산 공장. . 생활유리...,1.0,이 기사는 [COMPANY]삼성전자[/COMPANY]에 대한 기사. [SEP]. S...
3,‘페이커’ 내세운 삼성 OLED 게이밍 모니터 글로벌 3천대 돌파,2023.07.10 14:58,북미·유럽 등 예약 판매 3000대 돌파 10일 오후 6시 삼성닷컴 ‘페이커’ 출연...,1.0,이 기사는 [COMPANY]삼성전자[/COMPANY]에 대한 기사. [SEP]. ‘...
4,"네이처 게재 등 성과…삼성휴먼테크논문대상, 30년 맞았다",2023.07.10 14:48,29년간 3만6558편 논문 접수…수상자 5312명 9월1일부터 올해 대상 접수…상...,1.0,이 기사는 [COMPANY]삼성전자[/COMPANY]에 대한 기사. [SEP]. 네...


### Cross validation

In [8]:
# # dataset = train_test_split(data['content_corpus'], data['labels'],

# # train_dataset, test_dataset = train_test_split(data['new_column'], data['labels'],
# #                             test_size=0.2, shuffle=True, stratify=data['labels'], # label에 비율을 맞춰서 분리
# #                             random_state=SEED)

# train_dataset, test_dataset = train_test_split(data,
#                             test_size=0.3, shuffle=True, stratify=data['labels'], # label에 비율을 맞춰서 분리
#                             random_state=SEED)

# train_dataset, val_dataset = train_test_split(train_dataset,
#                             test_size=0.2, shuffle=True, stratify=train_dataset['labels'], # label에 비율을 맞춰서 분리
#                             random_state=SEED)

# corpus_train, label_train = train_dataset["new_column"], train_dataset["labels"]
# corpus_val, label_val = val_dataset["new_column"], val_dataset["labels"]
# corpus_test, label_test = test_dataset["new_column"], test_dataset["labels"]

# # sentence_train, sentence_val, label_train, label_val = dataset


# max_length=40
# stride=10
# ## TODO 임의의 값으로 차후 수정
# train_encoding = tokenizer(corpus_train.tolist(), ## pandas.Series -> list
#                             return_tensors='pt',
#                             padding=True,
#                             truncation=True,
#                             ##
#                             max_length=max_length,
#                             stride=stride,
#                             return_overflowing_tokens=True,
#                             return_offsets_mapping=False
#                             )

# val_encoding = tokenizer(corpus_val.tolist(),
#                         return_tensors='pt',
#                         padding=True,
#                         truncation=True,
#                         ##
#                         max_length=max_length,
#                         stride=stride,
#                         return_overflowing_tokens=True,
#                         return_offsets_mapping=False
#                         )

# train_set = SentimentalDataset(train_encoding, label_train.reset_index(drop=True))
# val_set = SentimentalDataset(val_encoding, label_val.reset_index(drop=True))
# test_set = SentimentalDataset(val_encoding, label_test.reset_index(drop=True))

In [7]:
dataset = train_test_split(data['content_corpus_company'], data['labels'],
                            test_size=0.2, shuffle=True, stratify=data['labels'], # label에 비율을 맞춰서 분리
                            random_state=SEED)


sentence_train, sentence_val, label_train, label_val = dataset


max_length=500
# max_length = 2000
stride=10
## TODO 임의의 값으로 차후 수정
train_encoding = tokenizer(sentence_train.tolist(), ## pandas.Series -> list
                            return_tensors='pt',
                            padding=True,
                            truncation=True,
                            ##
                            max_length=max_length,
                            stride=stride,
                            # return_overflowing_tokens=True,
                            return_offsets_mapping=False
                            )

val_encoding = tokenizer(sentence_val.tolist(),
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        ##
                        max_length=max_length,
                        stride=stride,
                        # return_overflowing_tokens=True,
                        return_offsets_mapping=False
                        )

train_set = SentimentalDataset(train_encoding, label_train.reset_index(drop=True))
val_set = SentimentalDataset(val_encoding, label_val.reset_index(drop=True))

### 학습 (huggingface)
#### hyperparameter
- max_length
- stride
- num_train_epoch
- learning_rate
- per_device_train_batch_size
- per_device_eval_batch_size

In [8]:
logging_steps = 200
num_train_epochs = 3
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
learning_rate = 5e-6

In [57]:
# !wandb online

In [131]:
# run = wandb.init(project="final_sentimental", entity="nlp-10")

# run.name = f"model: {MODEL_NAME} / batch_size: {per_device_train_batch_size} / lr: {learning_rate}"

training_args = TrainingArguments(
    output_dir = './outputs',
    logging_steps = logging_steps,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    learning_rate = learning_rate,
    evaluation_strategy="epoch", 
    fp16=True,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics
)

print('---train start---')
trainer.train()
# wandb.finish()

Epoch,Training Loss,Validation Loss,Accuracy,Micro F1,Macro F1
1,0.3565,0.34227,0.912814,0.912814,0.893988
2,0.2326,0.395361,0.918098,0.918098,0.895991
3,0.0999,0.461685,0.919419,0.919419,0.897826


TrainOutput(global_step=2271, training_loss=0.2475055377948447, metrics={'train_runtime': 534.9292, 'train_samples_per_second': 16.982, 'train_steps_per_second': 4.245, 'total_flos': 8267250492648000.0, 'train_loss': 0.2475055377948447, 'epoch': 3.0})

In [22]:
# torch.save(model, "/opt/ml/input/model-roberta_large-sota")
trainer.save_model("/opt/ml/input/model-roberta_large-sota_trainer_company-name") 

In [17]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sat Jul 22 12:44:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:00:05.0 Off |                  Off |
| N/A   41C    P0    44W / 250W |  32455MiB / 32510MiB |      0%      Default |
|                               |            

In [10]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

### 평가

In [None]:
# run.finish()

In [1]:
print('---val evaulate start---')
# wandb.init()
# trainer.evaluate(eval_dataset=val_set, metric_key_prefix='val1')
# wandb.finish()

---val evaulate start---


NameError: name 'model' is not defined

### 평가

In [10]:
trainer.evaluate(eval_dataset=val_set, metric_key_prefix='val1')

  acc = load_metric('accuracy').compute(predictions=preds, references=labels)['accuracy']


{'val1_loss': 0.5156943798065186,
 'val1_accuracy': 0.8489475856376393,
 'val1_f1': 0.8489475856376393,
 'val1_runtime': 12.6519,
 'val1_samples_per_second': 191.513,
 'val1_steps_per_second': 23.949,
 'epoch': 2.0}

### inference

In [43]:
print('---inference start---')
text = '이 기사는 [COMPANY]'+ '삼성전자' +'[/COMPANY]에 대한 기사. [SEP] "
corpus = "삼성전자가 테슬라와 4000억 규모의 협약이 미뤄졌다."
# MODEL_PATH = "/opt/ml/input/model-roberta_large-sota_trainer"
MODEL_PATH = "/opt/ml/input/model-roberta_large-sota_trainer_company-name"
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)


# # model = torch.load(PATH)
model.eval()
with torch.no_grad() :
    temp = tokenizer(
        my_text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        ##
        max_length=100,
        # stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=False
        )

    
    temp = {
        'input_ids':temp['input_ids'],
        'token_type_ids':temp['token_type_ids'],
        'attention_mask':temp['attention_mask'],
    }
    # print(temp)
    
    print("######################################")
    predicted_label = model(temp['input_ids'])
    print(predicted_label.logits)
    print(torch.nn.Softmax(dim=-1)(predicted_label.logits).mean(dim=0))
    [20 80] => [50]
    [[20, 80], [30, 70]]
    

torch.nn.Softmax(dim=-1)(predicted_label.logits).mean(dim=0)
    

---inference start---
######################################
tensor([[ 3.0807, -3.1782]])
tensor([0.9981, 0.0019])


tensor([0.9981, 0.0019])

In [None]:
## 위에 결과에서 앞의 것이 부정 뒤에것이 긍정

In [37]:
result = torch.nn.Softmax(dim=-1)(predicted_label.logits).mean(dim=0)

if result[0] > result[1] :
    print("부정")
else :
    print("긍정")

부정
