In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install pandas tqdm transformers

Looking in indexes: https://download.pytorch.org/whl/cu118


In [None]:
import torch
import pandas as pd
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# 1. Load the KoBART summarization tokenizer and model
tokenizer = PreTrainedTokenizerFast.from_pretrained('digit82/kobart-summarization')
model = BartForConditionalGeneration.from_pretrained('digit82/kobart-summarization')

# Run on the GPU when CUDA is available; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# 2. Custom dataset definition
class TextDataset(Dataset):
    """Dataset that serves each stored text as a single-line string.

    Args:
        texts: Indexable, sized collection of raw texts (for example the
            list produced by ``df['Article'].tolist()``).
    """

    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text at ``idx`` with newlines replaced by spaces.

        Missing values (``None`` or float NaN) yield an empty string
        instead of the literal text ``'None'`` / ``'nan'``.
        """
        value = self.texts[idx]
        # NaN is the only float that is not equal to itself; calling str()
        # on a missing value would otherwise feed the word 'nan' downstream.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value).replace('\n', ' ')

# 3. Batch summarization helper
def summarize_batch(texts, model, tokenizer, *, max_input_length=512,
                    max_summary_length=128, num_beams=4):
    """Generate one summary per input text using beam search.

    Args:
        texts: A single string or a sequence of strings to summarize
            (e.g. one batch yielded by a ``DataLoader``).
        model: Model providing ``generate`` and a ``device`` attribute.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors, and providing ``batch_decode``.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_summary_length: ``max_length`` passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        list[str]: Decoded summaries in the same order as ``texts``.
        An empty input yields an empty list.
    """
    # A bare string is treated as a one-element batch rather than being
    # split into characters by list().
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to summarize; skip the tokenizer/model round-trip.
        return []

    inputs = tokenizer(
        texts,
        return_tensors='pt',
        # Pad only up to the longest text in this batch. The attention mask
        # marks the padded positions, so always padding to the maximum
        # length would only add computation on padding tokens.
        padding=True,
        truncation=True,
        max_length=max_input_length,
        add_special_tokens=True,
    )

    # Follow the device of the model that was passed in instead of relying
    # on a module-level global being in sync with it.
    target_device = model.device
    input_ids = inputs['input_ids'].to(target_device)
    attention_mask = inputs['attention_mask'].to(target_device)

    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        num_beams=num_beams,
        max_length=max_summary_length,
        early_stopping=True,
    )
    return tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# 4. Load the data and summarize it in batches
input_file, output_file = '1617.csv', '1617_summarize.csv'
df = pd.read_csv(input_file)

# Feed the 'Article' column through a DataLoader so it is processed batch by batch
batch_size = 16  # batch size (adjustable)
dataset = TextDataset(df['Article'].tolist())
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

# Collect the summaries in input order (shuffle is off), one batch at a time
summarized_articles = []
for batch in tqdm(dataloader):
    summaries = summarize_batch(batch, model, tokenizer)
    summarized_articles += summaries

# Attach the summaries to the dataframe as a new column
df['Summarized_Article'] = summarized_articles

# Write only the summary column to the output CSV
df[['Summarized_Article']].to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"요약된 데이터가 '{output_file}' 파일로 저장되었습니다.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/109 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

100%|██████████| 23/23 [05:11<00:00, 13.54s/it]

요약된 데이터가 '1617_summarize.csv' 파일로 저장되었습니다.



