In [None]:
import torch.cuda.amp as amp
import os
from transformers import XLMPreTrainedModel, XLMRobertaModel, XLMRobertaConfig, XLMRobertaTokenizer
from transformers import XLMRobertaForSequenceClassification, BertForSequenceClassification
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, XLNetForSequenceClassification,\
XLMRobertaForSequenceClassification, XLMForSequenceClassification, RobertaForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
import numpy as np
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm

In [None]:
# Mount Google Drive so the checkpoint and CSV paths below resolve under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Data file paths
model_path = '/content/drive/MyDrive/klue_base_fold3_s.pth'  # checkpoint later handed to run_predict
file_name1 = '/content/drive/MyDrive/News_data.csv'  # input CSV with the texts to classify
df = pd.read_csv(file_name1)  # TestDataset reads the 'Article' column of this frame


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Quick sanity check of the loaded frame: (n_rows, n_columns).
df.shape

(254319, 18)

In [None]:
# Seed the TensorFlow and NumPy global RNGs.
# NOTE(review): args.seed (2024) is not used here and torch is not seeded — confirm this is intended.
tf.random.set_seed(1234)
np.random.seed(1234)

# Inference configuration
class args:
    """Namespace of settings; attributes are read directly (e.g. ``args.gpu``) and the class is never instantiated."""

    # ----- factor ----- #
    debug = False
    amp = True  # run the forward pass under autocast (mixed precision)
    # Written verbatim to CUDA_VISIBLE_DEVICES, so it must be a device index.
    # The previous value was the capital letter 'O' instead of the digit zero;
    # an invalid index hides every GPU and inference silently falls back to CPU.
    gpu = '0'

    batch_size = 8
    max_len = 512  # every text is padded/truncated to this many tokens
    start_lr = 1e-5  # alternatives noted by the author: 1e-3, 5e-5
    # ----- Dataset ----- #

    # ----- Else -----
    num_workers = 8
    seed = 2024
    scheduler = None  # 'get_linear_schedule_with_warmup'

In [None]:
# Restrict CUDA to the configured GPU, then pick the compute device.
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer matching the klue/roberta-base checkpoint; files are cached under ./bert_ckpt.
tokenizer = AutoTokenizer.from_pretrained(
    "klue/roberta-base",
    cache_dir='bert_ckpt',
    do_lower_case=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [None]:
class TestDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for inference.

    Args:
        df: DataFrame holding the texts to classify.
        text_column: Name of the column to read (default ``'Article'``).
        tokenizer: Callable tokenizer. When ``None`` the module-level
            ``tokenizer`` is looked up each time an item is fetched.
        max_len: Pad/truncate length. When ``None`` ``args.max_len`` is used.
    """

    def __init__(self, df, text_column='Article', tokenizer=None, max_len=None):
        self.df_data = df
        self.text_column = text_column
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return ``(input_ids, token_type_ids, attention_mask)`` for row ``index``.

        Each element is a 1-D tensor (the batch dimension added by
        ``return_tensors="pt"`` is stripped).
        """
        # Positional lookup: a sampler yields positions 0..len-1, which only
        # coincide with index labels when the frame has a default RangeIndex.
        sentence = self.df_data[self.text_column].iloc[index]

        # Non-string cells (e.g. NaN) and blank strings are tokenized as "".
        if not isinstance(sentence, str) or sentence.strip() == "":
            sentence = ""

        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        length = self.max_len if self.max_len is not None else args.max_len

        # Tokenize to a fixed length.
        encoded_dict = tok(
            text=sentence,
            add_special_tokens=True,
            max_length=length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        # Some tokenizers emit no token_type_ids. Fall back to zeros with the
        # same shape as input_ids; the previous fallback indexed a 1-D tensor
        # with [0] and produced a 0-d scalar instead of a sequence.
        segment_ids = encoded_dict.get('token_type_ids')
        if segment_ids is None:
            token_type_id = torch.zeros_like(padded_token_list)
        else:
            token_type_id = segment_ids[0]

        return padded_token_list, token_type_id, att_mask

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df_data)


# Wrap the loaded frame and iterate it in file order (no shuffling).
test_data = TestDataset(df)

test_dataloader = DataLoader(
    test_data,
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=2,
)

In [None]:
def do_predict(net, valid_loader):
    """Run inference over ``valid_loader`` and collect predictions.

    Reads the module-level ``device`` (where batches are moved) and
    ``args.amp`` (whether to use mixed precision).

    Args:
        net: Model called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; element ``[0]`` of its output is used as logits.
        valid_loader: Iterable yielding
            ``(input_ids, token_type_ids, attention_mask)`` batches.

    Returns:
        ``(pred_lst, logit)``: the argmax class index per sample, and the
        per-class logits per sample as nested Python lists.
    """
    pred_lst = []
    logit = []

    # Mixed precision is only enabled on CUDA. The deprecated
    # torch.cuda.amp.autocast used before is CUDA-only as well, so CPU runs
    # keep computing in full precision.
    use_amp = bool(args.amp) and device.type == "cuda"

    net.eval()
    with torch.no_grad():
        for input_id, token_type_id, attention_mask in tqdm(valid_loader):
            input_id = input_id.to(device)
            token_type_id = token_type_id.to(device)
            attention_mask = attention_mask.to(device)

            # enabled=False makes the context a no-op, so one forward call
            # covers both the AMP and the full-precision path.
            with torch.autocast(device_type=device.type, enabled=use_amp):
                output = net(input_ids=input_id, token_type_ids=token_type_id, attention_mask=attention_mask)[0]

            pred_lst.extend(output.argmax(dim=1).tolist())
            logit.extend(output.tolist())

    return pred_lst, logit


def run_predict(model_path, test_dataloader, df, output_file='/content/drive/MyDrive/News_sent.csv'):
    """Load the fine-tuned classifier, predict every row and save the result as CSV.

    Args:
        model_path: Path to a saved ``state_dict`` for a 3-label
            ``RobertaForSequenceClassification`` built on ``klue/roberta-base``.
        test_dataloader: Loader passed to ``do_predict``; it must yield rows in
            the same order as ``df`` because predictions are assigned by position.
        df: DataFrame to annotate. It is modified in place: the columns
            ``'Predicted_Label'`` and ``'Logit'`` are added.
        output_file: Destination CSV path (defaults to the original Drive path).

    Returns:
        None. The annotated frame is written to ``output_file``.
    """
    print('Set testloader')

    # Build the model architecture.
    net = RobertaForSequenceClassification.from_pretrained('klue/roberta-base', num_labels=3)
    net.to(device)

    # Load the saved weights into the *bare* model, before any DataParallel
    # wrapping, so the strict key check does not depend on the GPU count.
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    prefix = 'module.'
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        # Checkpoint was saved from an nn.DataParallel wrapper; drop its prefix.
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    net.load_state_dict(state_dict, strict=True)
    print('Loaded saved model')

    # Multi-GPU setup (Colab usually exposes a single GPU).
    if torch.cuda.device_count() > 1:
        net = nn.DataParallel(net)

    # Run prediction.
    preds, logit = do_predict(net, test_dataloader)

    # Attach the predicted class and the largest logit of each row to the frame.
    df['Predicted_Label'] = preds
    df['Logit'] = [max(row) for row in logit]

    # Save the result as CSV (utf-8-sig writes a BOM at the start of the file).
    df.to_csv(output_file, encoding='utf-8-sig', index=False)
    print(f"감성 분석 결과가 '{output_file}' 파일로 저장되었습니다.")


In [None]:
# Run the full inference pass and write the annotated CSV.
run_predict(model_path=model_path, test_dataloader=test_dataloader, df=df)

Set testloader


config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  f = torch.load(model_path, map_location=device)


Loaded saved model


  with amp.autocast():
  0%|          | 20/31790 [00:49<21:57:35,  2.49s/it]


KeyboardInterrupt: 