<a href="https://colab.research.google.com/github/absolutelydawn/TestNeoX/blob/main/Fewdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 데이터 줄여서 다시 실행해보기

!pip uninstall transformers -y
!pip install transformers
!pip install accelerate
!pip install accelerate --upgrade

import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

import urllib.request
# Download the chatbot corpus into the working directory.
data_file_path = "ChatbotData.csv"
url = "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv"
urllib.request.urlretrieve(url, data_file_path)

# Load the downloaded CSV.
df = pd.read_csv(data_file_path)

# Build an unlabeled corpus: the first 1500 questions followed by the first
# 1500 answers (up to 3000 texts in total).
all_texts = [*df['Q'][:1500].tolist(), *df['A'][:1500].tolist()]

# Tokenize the whole corpus in one call. The end-of-sequence token doubles as
# the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
all_encodings = tokenizer(all_texts, padding=True, truncation=True)

class UnsupervisedDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of tokenizer outputs.

    ``encodings`` maps a field name (it must include ``'input_ids'``) to a
    sequence holding one entry per example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the 'input_ids' field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Pick example ``idx`` out of every field and wrap it in a tensor.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

unsupervised_dataset = UnsupervisedDataset(all_encodings)

# Identifier of the pretrained checkpoint to fine-tune.
pretrained_model_path = "gpt2"

# Load the pretrained configuration and weights.
config = GPT2Config.from_pretrained(pretrained_model_path)
model = GPT2LMHeadModel.from_pretrained(pretrained_model_path, config=config)

# Enable gradient checkpointing through the model API instead of setting an
# attribute on the config object, which is the deprecated way to request it.
model.gradient_checkpointing_enable()

# Batch collator for language modeling with masked-LM turned off (mlm=False).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training configuration: 5 epochs, batches of 8 per device, a log entry every
# 100 steps, a checkpoint every 500 steps with at most one checkpoint kept.
training_args = TrainingArguments(
    output_dir='./finetuned_model',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
)

# Wire the dataset, collator, training arguments and model into a Trainer.
trainer = Trainer(
    train_dataset=unsupervised_dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

# Run the additional (fine-tuning) training.
trainer.train()

# Save the fine-tuned model.
model.save_pretrained('./finetuned_model')


Found existing installation: transformers 4.31.0
Uninstalling transformers-4.31.0:
  Successfully uninstalled transformers-4.31.0
Collecting transformers
  Using cached transformers-4.31.0-py3-none-any.whl (7.4 MB)
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python



Step,Training Loss
100,2.1835
200,1.9593
300,1.8552
400,1.7851
500,1.6953
600,1.6554


In [None]:
def generate_from_trial_tokenized(tokenized):
    """Batch a tokenized sequence and place it and the model on one device.

    Args:
        tokenized: Tensor of token ids without a batch dimension.

    Returns:
        The input with a leading batch dimension added, moved to the
        selected device. The module-level ``model`` is moved to the same
        device as a side effect.
    """
    # No ``device`` is defined in the visible notebook cells, so resolve it
    # here: use CUDA when it is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Add the batch dimension, then move the tensor to the device.
    batched = tokenized.unsqueeze(0).to(device)

    # Keep the model on the same device as its input.
    model.to(device)

    # Hand the prepared tensor back so the caller can actually use it.
    return batched

# Chatbot test
def chat_with_model(input_text):
    """Generate a continuation of ``input_text`` with the module-level model.

    Args:
        input_text: Prompt typed by the user.

    Returns:
        The first generated sequence decoded with special tokens skipped, or
        an empty string when the prompt is empty or whitespace only.
    """
    # An empty prompt gives the model nothing to condition on; skip
    # generation instead of calling generate() with no tokens.
    if not input_text.strip():
        return ""

    # The model may still be in training mode after fine-tuning; switch to
    # eval mode so dropout does not perturb generation.
    model.eval()

    # Tokenize and move the tensors to the device the model lives on, so a
    # GPU-resident model does not receive CPU inputs.
    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_tokens = model.generate(
            encoded["input_ids"],
            # The pad token is the same as EOS in this notebook, so pass the
            # mask and pad id explicitly rather than letting them be guessed.
            attention_mask=encoded["attention_mask"],
            pad_token_id=tokenizer.eos_token_id,
            # NOTE(review): max_length appears to count the prompt tokens as
            # well; confirm 50 leaves room for long prompts.
            max_length=50,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
        )

    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Interactive loop: read a prompt, print the model's reply, stop on "quit".
while True:
    try:
        user_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: leave quietly
        # instead of ending with a traceback.
        break

    # Ignore surrounding whitespace when checking for the exit command.
    command = user_input.strip()
    if command.lower() == "quit":
        break
    if not command:
        # Nothing was typed; ask again rather than sending an empty prompt.
        continue

    response = chat_with_model(user_input)
    print("Model: ", response)