# 1번

In [2]:
! pip install transformers datasets evaluate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from difflib import SequenceMatcher
torch.manual_seed(100)

# 임의의 한문장으로 학습
sample_sentence_1 = "Harry Potter and the Sorcerer's Stone "

# 문자 집합 및 매핑 생성
char_set = list(set(sample_sentence_1))
dic = {c: i for i, c in enumerate(char_set)}
rev_dic = {i: c for c, i in dic.items()}

# 파라미터 설정
dic_size = len(dic)
input_size = dic_size
hidden_size = dic_size * 2
output_size = dic_size
unit_sequence_length = 20
learning_rate = 0.005
training_epochs = 1000

# 데이터셋 준비
input_batch = []
target_batch = []

for i in range(0, len(sample_sentence_1) - unit_sequence_length):
    input_seq = sample_sentence_1[i:i + unit_sequence_length]
    target_seq = sample_sentence_1[i + 1:i + unit_sequence_length + 1]
    input_batch.append([dic[char] for char in input_seq])
    target_batch.append([dic[char] for char in target_seq])

input_batch = torch.tensor(input_batch, dtype=torch.long)
target_batch = torch.tensor(target_batch, dtype=torch.long)

X = torch.zeros(len(input_batch), unit_sequence_length, dic_size)
for i, seq in enumerate(input_batch):
    X[i] = torch.nn.functional.one_hot(seq, num_classes=dic_size).float()

Y = target_batch

# Model(LSTM)으로 사용
class Custom_RNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, layers):
        super(Custom_RNN, self).__init__()
        self.rnn = torch.nn.LSTM(input_dim, hidden_dim, num_layers=layers)
        self.fc = torch.nn.Linear(hidden_dim, output_dim, bias=True)

    def forward(self, x):
        x, _status = self.rnn(x)
        x = self.fc(x)
        return x

model = Custom_RNN(input_size, hidden_size, output_size, 2)



# 손실 계산 및 최적화
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 학습 루프
for epoch in range(training_epochs):
    optimizer.zero_grad()

    # 모델 예측
    outputs = model(X)

    loss = criterion(outputs.reshape(-1, dic_size), Y.reshape(-1))

    loss.backward()
    optimizer.step()

    # 에포크별 손실 출력
    if epoch % 200 == 199:
        print(f"Epoch: {epoch + 1}, Loss: {loss.item():.4f}")


# 학습 후 예측 결과 출력
model.eval()
with torch.no_grad():
    outputs = model(X)
    results = outputs.data.numpy().argmax(axis=2)

    first_result = results[:, 0]
    final_result = results[-1, :]
    result_str = np.concatenate((first_result, final_result), axis=0)

    # 결합된 결과를 문자로 변환
    predicted_text = ''.join([rev_dic[char] for char in result_str])

    # Original 문장 생성 (merged_sentence의 대응 부분)
    original_str = sample_sentence_1[:len(first_result)] + sample_sentence_1[-len(final_result):]

    # 문자열 유사도 계산
    similarity = SequenceMatcher(None, original_str, predicted_text).ratio()

    # 결과 출력
    print("\nOriginal Extracted Text:")
    print(original_str)
    print("\nPredicted Extracted Text:")
    print(predicted_text)

    # 유사도 기반 정확도 출력
    print(f"\nSimilarity-based Accuracy (Structural): {similarity * 100:.2f}%")


Epoch: 200, Loss: 0.0586
Epoch: 400, Loss: 0.0397
Epoch: 600, Loss: 0.0373
Epoch: 800, Loss: 0.0364
Epoch: 1000, Loss: 0.0360

Original Extracted Text:
Harry Potter and the Sorcerer's Stone 

Predicted Extracted Text:
arry Potter and thhe Sorcerer's Stone 

Similarity-based Accuracy (Structural): 97.37%


#2번.

2. L20 실습 그대로 수행
허깅페이스에서 적당한 한글문장 감정분류를 학습한 모델 2개 이상 선택

허깅페이스에서 dataset을 선택, 이걸 test data로 사용

평가는 랜덤한 100개의 sample골라 accuracy 측정
동일 회차의 평가에는 동일한 sample로 모든 모델 평가

총 3회 평가, 평균적으로 어떤 모델이 우수한지 출력 사용  

In [4]:
# HugginhFace 연결
#토큰: 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
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# dataset 가져오기
from datasets import load_dataset

ds = load_dataset("sepidmnorozy/Korean_sentiment")

train.csv:   0%|          | 0.00/3.29M [00:00<?, ?B/s]

dev.csv:   0%|          | 0.00/127k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/239k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/36000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1333 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2667 [00:00<?, ? examples/s]

In [21]:
# 모델.1
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("WhitePeak/bert-base-cased-Korean-sentiment")
model1 = AutoModelForSequenceClassification.from_pretrained("WhitePeak/bert-base-cased-Korean-sentiment")

# 'sentiment-analysis'를 첫 번째 인수로 사용
classifier1 = pipeline('sentiment-analysis', model=model1, tokenizer=tokenizer, device=0)


#모델 2.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer2 = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True)
model2 = AutoModelForSequenceClassification.from_pretrained("rkdaldus/ko-sent5-classification")

# 'sentiment-analysis'를 첫 번째 인수로 사용
classifier2 = pipeline('sentiment-analysis', model=model2, tokenizer=tokenizer2, device=0)


# 모델 평가
import evaluate
accuracy = evaluate.load('accuracy')

import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

test_list = ds['test']
test_sample_size = 100
rng = np.random.default_rng()
random_indices = rng.choice(len(test_list), size=test_sample_size, replace=False)
sampled_data = test_list.select(random_indices)

# classifier1을 사용하도록 변경
test_predictions = classifier1(sampled_data['text'], max_length=512, truncation=True)
predicted_labels = [label['label'] for label in test_predictions]

# model1을 사용하도록 변경
label2id = model1.config.label2id

mapped_labels = [label2id[label] for label in predicted_labels]
reference_labels = sampled_data['label']
print(accuracy.compute(predictions=mapped_labels, references=reference_labels))
print(reference_labels)
print(mapped_labels)


# classifier2
test_predictions = classifier2(sampled_data['text'], max_length=512, truncation=True)
predicted_labels = [label['label'] for label in test_predictions]

# model2
label2id = model2.config.label2id

mapped_labels = [label2id[label] for label in predicted_labels]
reference_labels = sampled_data['label']
print(accuracy.compute(predictions=mapped_labels, references=reference_labels))
print(reference_labels)
print(mapped_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rkdaldus/ko-sent5-classification and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'accuracy': 0.79}
[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0]
[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0]
{'accuracy': 0.34}
[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0]
[4, 2, 0, 0, 0, 4, 3, 3, 4, 0, 0, 0, 3, 1, 0, 4, 4, 4, 4, 0

첫번째 실행   
model 1. {'accuracy': 0.71}  
[0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1]  
model 2. {'accuracy': 0.22}  
[0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]
[4, 4, 4, 4, 1, 4, 0, 1, 4, 4, 4, 4, 4, 1, 0, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 4, 1, 0, 4, 4, 1, 4, 0, 4, 4, 1, 4, 1, 0, 4, 4, 4, 4, 0, 4, 4, 4, 1, 0, 1, 1, 4, 3, 4, 4, 4, 4, 0, 4, 4, 0, 4, 1, 4, 1, 0, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 1, 1, 4, 4, 0, 1, 1, 1, 4, 4, 4, 1, 1, 1]  

두번째 실행    
model 1. {'accuracy': 0.75}  
[1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]  
model 2. {'accuracy': 0.46}  
[1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]  
[1, 3, 0, 0, 0, 1, 2, 1, 0, 4, 1, 4, 4, 4, 3, 0, 1, 0, 0, 2, 1, 0, 2, 4, 1, 1, 4, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2, 1, 2, 2, 1, 4, 0, 4, 1, 0, 1, 0, 1, 2, 0, 1, 1, 1, 1, 2, 2, 1, 0, 2, 0, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 1, 2, 2, 4, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 4, 0, 1, 0, 2, 2]

세번째 실행    
model1. {'accuracy': 0.79}  
[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0]
[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0]  
model2. {'accuracy': 0.34}  
[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0]
[4, 2, 0, 0, 0, 4, 3, 3, 4, 0, 0, 0, 3, 1, 0, 4, 4, 4, 4, 0, 0, 4, 4, 4, 3, 3, 4, 3, 0, 0, 4, 0, 4, 0, 0, 3, 4, 3, 3, 0, 0, 4, 4, 0, 0, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 0, 3, 3, 0, 0, 0, 4, 0, 0, 3, 0, 2, 4, 0, 0, 0, 4, 4, 0, 2, 4, 3, 0, 0, 0, 0, 3, 4, 0, 4, 0, 4, 0, 0, 0, 2, 0, 1, 0, 0, 0]


평균적으로 모델 1이 우수한 것을 알 수 있다.  
