In [2]:
import numpy as np
from collections import Counter
import gensim

In [3]:
sentences = ['nice great best amazing', 'stop lies', 'pitiful nerd', 'excellent great work', 'supreme nice quality', 'bad', 'nice highly respectable']
y_train = [1, 0, 0, 1, 1, 0, 1]

tokenized_sentences = [sent.split() for sent in sentences]
print(tokenized_sentences)

[['nice', 'great', 'best', 'amazing'], ['stop', 'lies'], ['pitiful', 'nerd'], ['excellent', 'great', 'work'], ['supreme', 'nice', 'quality'], ['bad'], ['nice', 'highly', 'respectable']]


In [4]:
word_list = [w for sent in tokenized_sentences for w in sent]
word_list
word_counts = Counter(word_list)
word_counts

Counter({'nice': 3,
         'great': 2,
         'best': 1,
         'amazing': 1,
         'stop': 1,
         'lies': 1,
         'pitiful': 1,
         'nerd': 1,
         'excellent': 1,
         'work': 1,
         'supreme': 1,
         'quality': 1,
         'bad': 1,
         'highly': 1,
         'respectable': 1})

In [5]:
word_to_index = {}
word_to_index['<PAD>'] = 0
word_to_index['<UNK>'] = 1
for word in word_counts:
    if word not in word_to_index:
        word_to_index[word] = len(word_to_index)
word_to_index

{'<PAD>': 0,
 '<UNK>': 1,
 'nice': 2,
 'great': 3,
 'best': 4,
 'amazing': 5,
 'stop': 6,
 'lies': 7,
 'pitiful': 8,
 'nerd': 9,
 'excellent': 10,
 'work': 11,
 'supreme': 12,
 'quality': 13,
 'bad': 14,
 'highly': 15,
 'respectable': 16}

In [6]:
def texts_to_sequences(tokenized_X_data, word_to_index):
    seq = []
    for sent in tokenized_sentences:
        seq.append([word_to_index.get(w, 1) for w in sent])
    return seq
X_encoded = texts_to_sequences(tokenized_sentences, word_to_index)
print(X_encoded)

[[2, 3, 4, 5], [6, 7], [8, 9], [10, 3, 11], [12, 2, 13], [14], [2, 15, 16]]


In [7]:
def pad_sequences(X_encoded, maxlen):
    padded_X = []
    for seq in X_encoded:
        if len(seq) < maxlen:
            seq = seq + [0] * (maxlen - len(seq))
        else:
            seq = seq[:maxlen]
        padded_X.append(seq)
    return padded_X
X_train = pad_sequences(X_encoded, 5)
print(X_train)

[[2, 3, 4, 5, 0], [6, 7, 0, 0, 0], [8, 9, 0, 0, 0], [10, 3, 11, 0, 0], [12, 2, 13, 0, 0], [14, 0, 0, 0, 0], [2, 15, 16, 0, 0]]


In [8]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

In [9]:
class SimpleModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SimpleModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(embedding_dim * 5, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        embedded = self.embedding(x)
        flattend = self.flatten(embedded)
        output = self.fc(flattend)
        return self.sigmoid(output)

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_dim = 100
simple_model = SimpleModel(len(word_to_index), embedding_dim).to(device)

In [11]:
criterion = nn.BCELoss()
optimizer = Adam(simple_model.parameters())
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float32))
train_dataloader = DataLoader(train_dataset, batch_size=2)
print(len(train_dataloader))

4


In [12]:
for epoch in range(10):
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = simple_model(inputs).view(-1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 0.624923586845398
Epoch 2, Loss: 0.4315648078918457
Epoch 3, Loss: 0.28908026218414307
Epoch 4, Loss: 0.20004642009735107
Epoch 5, Loss: 0.14668801426887512
Epoch 6, Loss: 0.11396041512489319
Epoch 7, Loss: 0.09262114018201828
Epoch 8, Loss: 0.07758186757564545
Epoch 9, Loss: 0.06618209928274155
Epoch 10, Loss: 0.05708176642656326


In [13]:
simple_model.embedding.weight

Parameter containing:
tensor([[ 0.5895,  0.1298, -0.6350,  ..., -2.0524,  2.4095, -0.1532],
        [-1.0700, -1.1878, -1.0865,  ..., -1.5663, -0.8975, -0.1144],
        [-0.3585, -0.4442,  0.1917,  ..., -0.4807,  0.2418, -0.4552],
        ...,
        [-0.7217,  0.9467,  2.2711,  ..., -1.0715,  0.9850, -0.3917],
        [ 0.7121, -0.7582,  1.1838,  ..., -0.6445, -1.8391,  0.0964],
        [ 1.0371,  1.1644,  0.3995,  ..., -1.1793, -0.7625,  0.9683]],
       requires_grad=True)

In [14]:
# 감성 분류를 위한 샘플 데이터셋 (50개 문장: 25개 긍정, 25개 부정)
sample_texts = [
    # 긍정적인 문장 (25개)
    "This movie was absolutely amazing, I loved every second of it!",
    "The restaurant's food was delicious and the service was outstanding.",
    "I had the best experience with this product, highly recommend it to everyone.",
    "The concert was incredible, the band performed beyond my expectations.",
    "This book changed my perspective on life, truly inspiring.",
    "The customer service team was very helpful and resolved my issue quickly.",
    "The new software update has significantly improved performance.",
    "I'm extremely satisfied with my purchase, worth every penny.",
    "The hotel room was clean, comfortable, and had a beautiful view.",
    "My vacation was perfect from start to finish, couldn't have asked for more.",
    "The team's collaboration made the project a huge success.",
    "This phone exceeds all my expectations, best device I've ever owned.",
    "The online course was well-structured and provided valuable knowledge.",
    "The scenery at the national park was breathtaking and unforgettable.",
    "This coffee shop serves the most delicious pastries in town.",
    "The instructor was knowledgeable and made learning enjoyable.",
    "I received my package earlier than expected, fantastic service!",
    "The mobile app is user-friendly and makes tasks much easier.",
    "The theater performance moved me to tears, absolutely brilliant.",
    "My new car drives smoothly and has amazing fuel efficiency.",
    "The gym facilities are modern and the trainers are professional.",
    "This streaming service offers an incredible selection of movies.",
    "The wedding venue was perfect and created wonderful memories.",
    "The tech support solved my complex problem in minutes.",
    "The conference was informative and provided great networking opportunities.",
    
    # 부정적인 문장 (25개)
    "This movie was terrible, I wasted two hours of my life.",
    "The food was cold and the waiter was extremely rude.",
    "The product broke after just two days, terrible quality.",
    "The concert was disappointing, the sound quality was awful.",
    "This book was boring and predictable, wouldn't recommend it.",
    "Customer service was unhelpful and ignored my complaints.",
    "The software update crashed my computer and lost my files.",
    "I regret buying this overpriced and underperforming product.",
    "The hotel room was dirty, noisy, and nothing like the pictures.",
    "My vacation was ruined by bad weather and poor planning.",
    "The team failed to meet deadlines and the project was a disaster.",
    "This phone constantly freezes and the battery life is terrible.",
    "The online course was disorganized and contained outdated information.",
    "The park was overcrowded and filled with litter everywhere.",
    "This coffee shop serves the worst coffee I've ever tasted.",
    "The instructor was unprepared and couldn't answer basic questions.",
    "My package arrived damaged and items were missing.",
    "The app is frustrating to use and crashes constantly.",
    "The performance was amateur and not worth the ticket price.",
    "My new car has been in the repair shop more than on the road.",
    "The gym equipment is outdated and often out of order.",
    "This streaming service buffers constantly despite my fast internet.",
    "The venue was too small and uncomfortable for the event.",
    "Tech support kept me on hold for hours without resolving my issue.",
    "The conference was a waste of time and money, learned nothing new."
]

# 레이블 (1: 긍정, 0: 부정)
sample_labels = [1] * 25 + [0] * 25

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import random

In [16]:
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7a6495567ad0>

In [17]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^%\w\s]', '', text)
    return text
processed_texts = [preprocess_text(text) for text in sample_texts]
tokenized_texts = [text.split() for text in processed_texts]

In [18]:
all_words = [word for text in tokenized_texts for word in text]
word_counts = Counter(all_words)
print(f"총 고유 단어 수: {len(word_counts)}")
word_counts


총 고유 단어 수: 261


Counter({'the': 46,
         'and': 29,
         'was': 28,
         'my': 18,
         'this': 12,
         'service': 6,
         'i': 5,
         'of': 5,
         'to': 5,
         'new': 4,
         'a': 4,
         'is': 4,
         'it': 3,
         'with': 3,
         'product': 3,
         'on': 3,
         'life': 3,
         'has': 3,
         'performance': 3,
         'for': 3,
         'coffee': 3,
         'shop': 3,
         'in': 3,
         'terrible': 3,
         'constantly': 3,
         'movie': 2,
         'absolutely': 2,
         'amazing': 2,
         'every': 2,
         'food': 2,
         'delicious': 2,
         'had': 2,
         'best': 2,
         'recommend': 2,
         'concert': 2,
         'incredible': 2,
         'expectations': 2,
         'book': 2,
         'customer': 2,
         'team': 2,
         'issue': 2,
         'software': 2,
         'update': 2,
         'extremely': 2,
         'worth': 2,
         'hotel': 2,
         'room': 2,
 

In [19]:
word_list = list(word_counts)
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
for word in word_list:
    if word not in word_to_idx:
        word_to_idx[word] = len(word_to_index)
word_to_idx['this']

17

In [20]:
# 텍스트를 인덱스 시퀀스로 변환
def text_to_sequence(text, word_to_idx, max_len=30):
    words = preprocess_text(text).split()
    sequence = [word_to_idx.get(word, word_to_idx["<UNK>"]) for word in words]
    
    # 시퀀스 길이 조정 (패딩 또는 자르기)
    if len(sequence) < max_len:
        sequence = sequence + [word_to_idx["<PAD>"]] * (max_len - len(sequence))
    else:
        sequence = sequence[:max_len]
    
    return sequence

# 모든 샘플을 인덱스 시퀀스로 변환
max_seq_length = 30  # 최대 시퀀스 길이 설정
X = np.array([text_to_sequence(text, word_to_idx, max_seq_length) for text in sample_texts])
y = np.array(sample_labels)


In [21]:
# 데이터 분할: 학습 80%, 검증 20%
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"학습 데이터 크기: {X_train.shape}, 레이블: {y_train.shape}")
print(f"검증 데이터 크기: {X_val.shape}, 레이블: {y_val.shape}")

# PyTorch 텐서로 변환
X_train_tensor = torch.LongTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)
X_val_tensor = torch.LongTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val)

# 데이터로더 생성
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

batch_size = 8  # 작은 데이터셋이므로 작은 배치 크기 사용
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)


학습 데이터 크기: (40, 30), 레이블: (40,)
검증 데이터 크기: (10, 30), 레이블: (10,)


In [None]:
# PyTorch 텐서로 변환
X_train_tensor = torch.LongTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)
X_val_tensor = torch.LongTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

batch_size =8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [22]:
# 하이퍼파라미터 설정
vocab_size = len(word_to_idx)
embedding_dim = 50  # 작은 데이터셋에 적합한 작은 차원
hidden_dim = 64
output_dim = 1  # 이진 분류
learning_rate = 0.01
num_epochs = 20


In [None]:
class SentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SentimentClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        hidden = hidden.squeeze(0)
        output = self.fc(hidden)
        return output