In [1]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, TrainingArguments
from trl import SFTTrainer
import pandas as pd

In [2]:
model_id = "google/gemma-2-2b"
data_dir = "./dataset/csv"
output_dir = './results2'
token = 'aaa'

In [3]:
# 모델 및 토크나이저 로드
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"":0},
    token=token
)

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [5]:
# 데이터셋 로드 및 전처리
def load_qna_files(data_dir):
    files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')]
    data = []
    for file in files:
        dataset = pd.read_csv(file)
        print(f'Sample data of {file}')
        print(dataset.head(5))
        for index, row in dataset.iterrows():
            data.append(f"Question: {row['Question']}\nAnswer: {row['Answer']}")
    return data

data = load_qna_files(data_dir)
print(f'Sample of refined data')
print(data[:30])

Sample data of ./dataset/csv/g_lecture10_qna.csv
                                            Question  \
0            Why is unsupervised learning important?   
1  How does self-supervised learning relate to un...   
2    What challenge does unsupervised learning face?   
3            Why is representation learning crucial?   
4  How does clustering benefit unsupervised learn...   

                                              Answer  
0  It helps models learn from unlabelled data and...  
1  It uses the structure of data to train without...  
2  It lacks clear evaluation criteria without lab...  
3  Good representations improve interpretability ...  
4  It groups similar data points, aiding in later...  
Sample data of ./dataset/csv/g_lecture11_qna.csv
                                            Question  \
0          What are latent variable models used for?   
1            Why is variational inference important?   
2  How are generative models different from other...   
3    What i

In [6]:
def preprocess_data(texts, tokenizer, max_length=512):
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    return encodings

# 전처리된 데이터셋
encodings = preprocess_data(data, tokenizer)

In [7]:
# Hugging Face Dataset 객체로 변환
dataset = Dataset.from_dict({"input_ids": encodings["input_ids"], "attention_mask": encodings["attention_mask"]})

# 데이터셋 샘플 출력
print(f"Dataset size: {len(dataset)}")
print("Sample dataset entry:", dataset[0])

Dataset size: 951
Sample dataset entry: {'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
# Sample text data to demonstrate tokenization before and after
sample_texts = ["Question: What is a logistic regression model?\nAnswer: It’s a basic machine learning model for classification."]

# Tokenization before and after comparison
for text in sample_texts:
    print(f"Original text: {text}")
    tokens = tokenizer.tokenize(text)
    print(f"Tokenized: {tokens}")

Original text: Question: What is a logistic regression model?
Answer: It’s a basic machine learning model for classification.
Tokenized: ['Question', ':', '▁What', '▁is', '▁a', '▁logistic', '▁regression', '▁model', '?', '\n', 'Answer', ':', '▁It', '’', 's', '▁a', '▁basic', '▁machine', '▁learning', '▁model', '▁for', '▁classification', '.']


In [9]:
# 데이터 콜레이터 설정
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
# Fine-tuning을 여러 번 가능하게 하는 함수
def fine_tune_model(model, dataset, tokenizer, output_dir="./results", epochs=3):
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=1,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
        learning_rate=5e-5,
        # fp16=True,  # GPU의 성능을 최대한 활용하기 위한 설정
    )
    
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator
    )
    
    trainer.train()
    
    # 모델 저장
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to {output_dir}")

# Fine-tuning 실행
# fine_tune_model(model, dataset, tokenizer, output_dir)
fine_tune_model(model, dataset, tokenizer, output_dir, epochs=3)



Step,Training Loss
100,3.5992
200,3.6659
300,3.5997
400,3.5299
500,3.3798
600,3.258
700,3.1263
800,3.1081
900,3.1118
1000,2.3887


Model saved to ./results2


In [11]:
def query_model(question, model, tokenizer, max_length=100):
    # Tokenize the input question
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    
    # Generate the answer with better control over generation parameters
    output_tokens = model.generate(
        **inputs, 
        max_length=max_length, 
        num_return_sequences=1, 
        no_repeat_ngram_size=2,  # Prevent token repetition
        do_sample=True,  # Enable sampling for varied answers
        top_p=0.95,  # Use nucleus sampling
        temperature=0.7  # Add randomness for more natural output
    )
    
    # Decode the generated tokens
    answer = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    
    return answer

In [12]:
question = "What is one benefit of using cloud data for model maintenance??"
# sample_prompt = f"Question: {question}\nAnswer: "
# response = query_model(sample_prompt, model, tokenizer)
response = query_model(question, model, tokenizer)
print(f"{response}")

The 'max_batch_size' argument of HybridCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


What is one benefit of using cloud data for model maintenance??
Answer: Using cloud datasets allows for easier identification of issues and gathering of training data to retrain and improve model performance.
200,080 chest X-rays and their corresponding intent, then using either the model or prevent overfitting.240 chests and AI models to maintain and mitigate overfit.48 chestrays. AI can detect diseases like pneumonia and tumors in X X.rays, leading to


In [13]:
question = "Why do we use CNN?"
response = query_model(question, model, tokenizer)
print(f"{response}")

Why do we use CNN?
Answer: To detect patterns in images using filters. To capture different relationships between images.
220 states in the image.230 States represent categories. Capture different patterns. and capture diverse accents. The image captures diverse patterns, capturing different accents, and environments.32.48 hours.80 hours, to capture essential features for classification. 3x3 convolutions throughout the feature map.00x4 convings throughout each
