In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import accuracy_score, f1_score

In [3]:
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self): # length of the dataset
      return len(self.embeddings)

    def __getitem__(self, idx):
      embedding = self.embeddings[idx]
      label = self.labels[idx]
      return torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)

In [7]:
class CNN(nn.Module):
  def __init__(self, input_dim):
    super(CNN, self).__init__()
    self.conv1 = nn.Conv1d(1, 128, kernel_size=3, padding=1) # Convolutional layer: 1 input channel, 128 output channels, 3x3 kernel size
    self.pool = nn.AdaptiveMaxPool1d(1) # Adaptive max pooling layer: 1 output channel
    self.fc1 = nn.Linear(128, 64) # first fully connected layer: 128 input features, 64 output features
    self.fc2 = nn.Linear(64, 1) # second fully connected layer: 64 input features, 1 output feature
    self.dropout = nn.Dropout(0.5) # dropout layer with 0.5 dropout rate
    # what is the dropout layer? 드롭아웃은 학습 중에 입력 단위의 일부를 무작위로 0으로 설정하여 과적합을 방지하는 데 도움이 됩니다.

  def forward(self, x): # x: input tensor
    x = self.conv1(x) # apply convolutional layer
    x = torch.relu(x)
    x = self.pool(x) # pool: apply pooling layer # pooling layer: reduce the size of the input tensor
    x = x.view(-1, 128) # flatten the output tensor
    x = self.fc1(x)
    x = torch.relu(x)
    x = self.dropout(x)
    x = self.fc2(x)
    return torch.sigmoid(x) # # 이진 분류를 위한 sigmoid 활성화 함수

In [5]:
embeddings = np.loadtxt('/content/drive/MyDrive/grad/ai-system/data/embeddings.csv', delimiter=',')

data = pd.read_excel('/content/drive/MyDrive/grad/ai-system/data/에타 1차 라벨링.xlsx')
labels = data['label'].tolist()

In [8]:
# Dataset 및 DataLoader 생성
dataset = EmbeddingDataset(embeddings, labels)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# DataLeader를 사용해 학습 및 테스트 데이터셋을 로드
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 모델, 손실 함수, 옵티마이저 정의
input_dim = embeddings.shape[1]
model = CNN(input_dim) # CNN 모델 초기화
criterion = nn.BCELoss() # binary cross-entropy loss function 이진 분류를 위한 손실 함수
optimizer = optim.Adam(model.parameters(), lr=0.001) # Adam optimizer # lr = learning rate

In [9]:
# 모델 학습
num_epochs = 10
model.train() # set the model to training mode
for epoch in range(num_epochs):
  for embeddings_batch, labels_batch in train_loader:
    embeddings_batch = embeddings_batch.unsqueeze(1) # (batch_size, 1, input_dim)으로 차원 확장
    outputs = model(embeddings_batch) # 모델을 사용하여 예측 생성
    loss = criterion(outputs.squeeze(), labels_batch) # 손실 계산

    optimizer.zero_grad() # 옵티마이저의 그래디언트 초기화
    loss.backward()       # 역전파 수행
    optimizer.step()      # 가중치 갱신
  print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 0.7442
Epoch [2/10], Loss: 0.7228
Epoch [3/10], Loss: 0.6786
Epoch [4/10], Loss: 0.6394
Epoch [5/10], Loss: 0.6662
Epoch [6/10], Loss: 0.8038
Epoch [7/10], Loss: 0.5645
Epoch [8/10], Loss: 0.6289
Epoch [9/10], Loss: 0.5952
Epoch [10/10], Loss: 0.5693


In [10]:
# 모델 평가
model.eval() # set the model to evaluation mode
y_true = []
y_pred = []
with torch.no_grad(): # 평가 중에는 그래디언트를 계산하지 않음
  for embeddings_batch, labels_batch in test_loader:
    embeddings_batch = embeddings_batch.unsqueeze(1)
    outputs = model(embeddings_batch)
    predicted = (outputs.squeeze() > 0.5).float() # 0.5보다 크면 1, 아니면 0으로 예측
    y_true.extend(labels_batch.numpy())
    y_pred.extend(predicted.numpy())

# 평가 지표 계산
accuracy = accuracy_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_micro = f1_score(y_true, y_pred, average='micro')

print(f'Accuracy: {accuracy}')
print(f'F1 Score (Macro): {f1_macro}')
print(f'F1 Score (Micro): {f1_micro}')


Accuracy: 0.5908175594039469
F1 Score (Macro): 0.37139240506329113
F1 Score (Micro): 0.5908175594039469
