In [30]:
import torch
import pandas as pd
import sklearn
import random
import numpy as np
import wandb

from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from transformers import DebertaV2ForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification

from dataset.datasets import SentimentalDataset
from metrics.metrics import compute_metrics

from sklearn.datasets import load_iris # 샘플 데이터 로딩
from sklearn.model_selection import train_test_split

from utils.utils import config_seed

### 설정

In [2]:
SEED = 42
config_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

#### 모델 및 토크나이저

In [12]:
MODEL_NAME = 'klue/roberta-large'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
Y

In [4]:
# MODEL_NAME = "team-lucid/deberta-v3-base-korean"

# model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

### 데이터셋 구성

#### 1) 기사 전체 학습

In [5]:
# data = pd.read_csv('/opt/ml/finance_sentiment_corpus/merged_samsung_filtered.csv')

# def extract_label(json_str) :
#     data_dict = eval(json_str)  # JSON 문자열을 파이썬 딕셔너리로 변환
#     return data_dict["label"]

# # "label" 값을 추출하여 새로운 Series 생성
# data['labels'] = data["labels"].apply(extract_label)
# data['labels'] = data['labels'].map({'부정':0, '긍정':1})

#### 2) 기사 앞뒤 학습

In [6]:
data = pd.read_csv('/opt/ml/finance_sentiment_corpus/merged_samsung_filtered.csv')
data = pd.DataFrame(data)

# title과 content_corpus에서 원하는 문장 추출
def extract_sentences(text):
    sentences = text.split('. ')
    if len(sentences) >= 5 :
        return '. '.join([sentences[0], sentences[1], sentences[-2], sentences[-1]])
    else :
        return '. '+text
    
def extract_label(json_str) :
    data_dict = eval(json_str)  # JSON 문자열을 파이썬 딕셔너리로 변환
    return data_dict["label"]

# "label" 값을 추출하여 새로운 Series 생성
data['labels'] = data["labels"].apply(extract_label)
data['new_column'] = data.apply(lambda row: extract_sentences(row['title']) + ' ' + extract_sentences(row['content_corpus']), axis=1)
data['labels'] = data['labels'].map({'부정':0, '긍정':1})
data.head()

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,date,content,content_corpus,content_len,content_corpus_len,labels,new_column
0,0,0,0,0,"데이터센터 전력 40% 차지하는 D램… 삼성·SK하이닉스, ‘전성비’ ...",2023.07.10 15:29,"챗GPT 시대 화두로 떠오른 전력효율성 문제 ”전력 먹는 하마, D램 전력효율성 개...","챗GPT 시대 화두로 떠오른 전력효율성 문제 ”전력 먹는 하마, D램 전력효율성 개...",1813,1651,1,". 데이터센터 전력 40% 차지하는 D램… 삼성·SK하이닉스, ‘전성비’ ... 챗..."
1,2,2,2,2,“삼성전자가 식품도 팔았어?”…신규 가입 일단 종료한 사연,2023.07.10 15:07,삼성 가전제품 구매고객에 삼성닷컴 내 e-식품관에서 할인혜택 주며 ‘락인’ 기대 ...,삼성 가전제품 구매고객에 삼성닷컴 내 e-식품관에서 할인혜택 주며 ‘락인’ 기대 ...,1749,1698,0,. “삼성전자가 식품도 팔았어?”…신규 가입 일단 종료한 사연 삼성 가전제품 구매고...
2,3,3,3,3,SGC솔루션 '도어글라스'…삼성·LG 세탁기·건조기에 공급,2023.07.10 15:05,해외 가전 브랜드 공략…B2B 사업 확장[서울=뉴시스] SGC솔루션 논산 공장. (...,해외 가전 브랜드 공략…B2B 사업 확장 SGC솔루션 논산 공장. . 생활유리...,547,476,1,. SGC솔루션 '도어글라스'…삼성·LG 세탁기·건조기에 공급 해외 가전 브랜드 공...
3,4,4,4,4,‘페이커’ 내세운 삼성 OLED 게이밍 모니터 글로벌 3천대 돌파,2023.07.10 14:58,북미·유럽 등 예약 판매 3000대 돌파 10일 오후 6시 삼성닷컴 ‘페이커’ 출연...,북미·유럽 등 예약 판매 3000대 돌파 10일 오후 6시 삼성닷컴 ‘페이커’ 출연...,1096,1029,1,. ‘페이커’ 내세운 삼성 OLED 게이밍 모니터 글로벌 3천대 돌파 북미·유럽 등...
4,5,5,5,5,"네이처 게재 등 성과…삼성휴먼테크논문대상, 30년 맞았다",2023.07.10 14:48,29년간 3만6558편 논문 접수…수상자 5312명 9월1일부터 올해 대상 접수…상...,29년간 3만6558편 논문 접수…수상자 5312명 9월1일부터 올해 대상 접수…상...,1759,1659,1,". 네이처 게재 등 성과…삼성휴먼테크논문대상, 30년 맞았다 29년간 3만6558편..."


### Cross validation

In [7]:
# # dataset = train_test_split(data['content_corpus'], data['labels'],

# # train_dataset, test_dataset = train_test_split(data['new_column'], data['labels'],
# #                             test_size=0.2, shuffle=True, stratify=data['labels'], # label에 비율을 맞춰서 분리
# #                             random_state=SEED)

# train_dataset, test_dataset = train_test_split(data,
#                             test_size=0.3, shuffle=True, stratify=data['labels'], # label에 비율을 맞춰서 분리
#                             random_state=SEED)

# train_dataset, val_dataset = train_test_split(train_dataset,
#                             test_size=0.2, shuffle=True, stratify=train_dataset['labels'], # label에 비율을 맞춰서 분리
#                             random_state=SEED)

# corpus_train, label_train = train_dataset["new_column"], train_dataset["labels"]
# corpus_val, label_val = val_dataset["new_column"], val_dataset["labels"]
# corpus_test, label_test = test_dataset["new_column"], test_dataset["labels"]

# # sentence_train, sentence_val, label_train, label_val = dataset


# max_length=40
# stride=10
# ## TODO 임의의 값으로 차후 수정
# train_encoding = tokenizer(corpus_train.tolist(), ## pandas.Series -> list
#                             return_tensors='pt',
#                             padding=True,
#                             truncation=True,
#                             ##
#                             max_length=max_length,
#                             stride=stride,
#                             return_overflowing_tokens=True,
#                             return_offsets_mapping=False
#                             )

# val_encoding = tokenizer(corpus_val.tolist(),
#                         return_tensors='pt',
#                         padding=True,
#                         truncation=True,
#                         ##
#                         max_length=max_length,
#                         stride=stride,
#                         return_overflowing_tokens=True,
#                         return_offsets_mapping=False
#                         )

# train_set = SentimentalDataset(train_encoding, label_train.reset_index(drop=True))
# val_set = SentimentalDataset(val_encoding, label_val.reset_index(drop=True))
# test_set = SentimentalDataset(val_encoding, label_test.reset_index(drop=True))

In [8]:
# dataset = train_test_split(data['content_corpus'], data['labels'],

dataset = train_test_split(data['new_column'], data['labels'],
                            test_size=0.2, shuffle=True, stratify=data['labels'], # label에 비율을 맞춰서 분리
                            random_state=SEED)


sentence_train, sentence_val, label_train, label_val = dataset


max_length=500
# max_length = 2000
stride=10
## TODO 임의의 값으로 차후 수정
train_encoding = tokenizer(sentence_train.tolist(), ## pandas.Series -> list
                            return_tensors='pt',
                            padding=True,
                            truncation=True,
                            ##
                            max_length=max_length,
                            stride=stride,
                            return_overflowing_tokens=True,
                            return_offsets_mapping=False
                            )

val_encoding = tokenizer(sentence_val.tolist(),
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        ##
                        max_length=max_length,
                        stride=stride,
                        return_overflowing_tokens=True,
                        return_offsets_mapping=False
                        )

train_set = SentimentalDataset(train_encoding, label_train.reset_index(drop=True))
val_set = SentimentalDataset(val_encoding, label_val.reset_index(drop=True))

### 학습 (huggingface)
#### hyperparameter
- max_length
- stride
- num_train_epoch
- learning_rate
- per_device_train_batch_size
- per_device_eval_batch_size

In [9]:
logging_steps = 200
num_train_epochs = 3
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
learning_rate = 5e-6

In [10]:
# !wandb online

In [13]:
# run = wandb.init(project="final_sentimental", entity="nlp-10")

# run.name = f"model: {MODEL_NAME} / batch_size: {per_device_train_batch_size} / lr: {learning_rate}"

training_args = TrainingArguments(
    output_dir = './outputs',
    logging_steps = logging_steps,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    learning_rate = learning_rate,
    evaluation_strategy="epoch", 
    fp16=True,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics
)

print('---train start---')
trainer.train()
# wandb.finish()

Epoch,Training Loss,Validation Loss,Accuracy,Micro F1,Macro F1
1,0.4765,0.297409,0.920548,0.920548,0.899829
2,0.2876,0.370972,0.915068,0.915068,0.890825
3,0.1752,0.375522,0.934247,0.934247,0.918363


TrainOutput(global_step=1095, training_loss=0.3064597926727713, metrics={'train_runtime': 309.5533, 'train_samples_per_second': 14.149, 'train_steps_per_second': 3.537, 'total_flos': 3986190792360000.0, 'train_loss': 0.3064597926727713, 'epoch': 3.0})

In [29]:
# torch.save(model, "/opt/ml/input/model-roberta_large-sota")
trainer.save_model("/opt/ml/input/model-roberta_large-sota_trainer") 

In [17]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sat Jul 22 12:44:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:00:05.0 Off |                  Off |
| N/A   41C    P0    44W / 250W |  32455MiB / 32510MiB |      0%      Default |
|                               |            

In [10]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

### 평가

In [None]:
# run.finish()

In [1]:
print('---val evaulate start---')
# wandb.init()
# trainer.evaluate(eval_dataset=val_set, metric_key_prefix='val1')
# wandb.finish()

---val evaulate start---


NameError: name 'model' is not defined

### 학습(wandb_sweep)

In [39]:
sweep_config = {'name': f"{MODEL_NAME}_1",  # name : sweep_name
                    'method': 'grid',  # 'grid', 'uniform', 'bayesian'
                    'parameters': {
                        'lr': {  # parameter  작성방식 여러개 있으니까, 노션 문서 참고
                            'values': [4e-6, 5e-6, 6e-6]
                        },
                        'warmup_step': {
                            "values": [200]
                        },
                        'logging_steps': {
                            "values": [200]
                        },
                        "batch_size": {
                            "values": [8]
                        },
                        "max_epoch": {
                            "values": [2]
                        },
                    },
                    # goal : maximize, minimize
                    'metric': {'name': 'val1_accuracy', 'goal': 'maximize'}
                    }

In [40]:
# import json

# print("Training with sweep mode")

# sweep_id = wandb.sweep(json.load(open("/opt/ml/level3_nlp_finalproject-nlp-04/sentence-sentimental/sweep.json", "r"), object_hook=_decode), project="final_sentimental", entity="nlp-10")
# wandb.agent(sweep_id, trainWithSweep, count=int(5))
# wandb.finish()
from train import sweep_train

print("Train mode")
sweep_id = wandb.sweep(sweep_config, project="final_sentimental", entity="nlp-10")
wandb.agent(sweep_id, sweep_train, count=3)
wandb.finish()

Train mode
Create sweep with ID: tdlke131
Sweep URL: https://wandb.ai/nlp-10/final_sentimental/sweeps/tdlke131


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


### 평가

  acc = load_metric('accuracy').compute(predictions=preds, references=labels)['accuracy']


{'val1_loss': 0.5156943798065186,
 'val1_accuracy': 0.8489475856376393,
 'val1_f1': 0.8489475856376393,
 'val1_runtime': 12.6519,
 'val1_samples_per_second': 191.513,
 'val1_steps_per_second': 23.949,
 'epoch': 2.0}

### inference

In [46]:
print('---inference start---')
my_text = '삼성전자 진짜 무조건 오를 듯'*20

MODEL_PATH = "/opt/ml/input/model-roberta_large-sota_trainer"
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)


# # model = torch.load(PATH)
model.eval()
with torch.no_grad() :
    temp = tokenizer(
        my_text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        ##
        max_length=100,
        # stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=False
        )

    
    temp = {
        'input_ids':temp['input_ids'],
        'token_type_ids':temp['token_type_ids'],
        'attention_mask':temp['attention_mask'],
    }
    # print(temp)
    
    print("######################################")
    predicted_label = model(temp['input_ids'])
    print(predicted_label.logits)
    print(torch.nn.Softmax(dim=-1)(predicted_label.logits).mean(dim=0))

torch.nn.Softmax(dim=-1)(predicted_label.logits).mean(dim=0)
    

---inference start---


OSError: Can't load tokenizer for '/opt/ml/input/model-roberta_large-sota_trainer'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/opt/ml/input/model-roberta_large-sota_trainer' is the correct path to a directory containing all relevant files for a BertTokenizerFast tokenizer.

In [None]:
## 위에 결과에서 앞의 것이 부정 뒤에것이 긍정

In [49]:
result = torch.nn.Softmax(dim=-1)(predicted_label.logits).mean(dim=0)

if result[0] > result[1] :
    print("부정")

부정
