In [None]:
import os

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

%matplotlib inline

In [None]:
def text_preprocessing(text):
    """Clean a single raw document and return it as a string.

    Args:
        text: One cell of the ``document`` column. May be a string, a missing
            value (``None`` / ``NaN``), or another scalar such as a number.

    Returns:
        str: ``""`` for missing values; otherwise the input with every
        character other than Hangul syllables, ASCII letters, digits and
        whitespace removed, and leading/trailing whitespace stripped.
    """
    # 1. Missing values become an empty document.
    if pd.isnull(text):
        return ""

    # Non-string scalars (e.g. a review made only of digits that was parsed
    # as a number) must be converted first, otherwise re.sub raises TypeError.
    if not isinstance(text, str):
        text = str(text)

    # 2. Remove special characters (keep Hangul, English letters, digits, whitespace).
    text = re.sub(r"[^가-힣a-zA-Z0-9\s]", "", text)

    # 3. Strip surrounding whitespace.
    return text.strip()

In [None]:
# sentencepiece 토크나이저 함수

def sp_tokenize(s, corpus, padding='pre', vocab_path="spm.vocab"):
    """Encode a corpus with a SentencePiece processor and pad it to a matrix.

    Args:
        s: Object exposing ``encode_as_ids(sentence)`` (a loaded
            ``SentencePieceProcessor`` in this notebook).
        corpus: Iterable of sentences to encode.
        padding: Passed through to ``pad_sequences`` (``'pre'`` or ``'post'``).
        vocab_path: Path of the tab-separated vocab file whose first column is
            the piece. Defaults to ``"spm.vocab"``, the path that was
            previously hard-coded.

    Returns:
        tuple: ``(tensor, word_index, index_word)`` where ``tensor`` is the
        padded id matrix, ``word_index`` maps piece -> line number in the vocab
        file and ``index_word`` is the inverse mapping.

    NOTE(review): ``pad_sequences`` is called without ``value``, so it pads
    with its default. The trainer log in this notebook shows ``unk_id: 0`` and
    ``pad_id: -1``, so padded positions presumably share id 0 with ``<unk>``
    -- confirm whether that is acceptable.
    """
    encoded = [s.encode_as_ids(sen) for sen in corpus]

    # The vocab file contains Korean pieces: read it as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(vocab_path, 'r', encoding='utf-8') as f:
        # First tab-separated column is the piece; drop the line terminator so
        # a line without a tab does not keep a trailing "\n" in its key.
        pieces = [line.rstrip("\n").split("\t")[0] for line in f]

    word_index = {piece: idx for idx, piece in enumerate(pieces)}
    index_word = dict(enumerate(pieces))

    tensor = tf.keras.preprocessing.sequence.pad_sequences(encoded, padding=padding)

    return tensor, word_index, index_word

In [4]:
import pandas as pd
import numpy as np
import re

In [5]:
# Read the tab-separated train and test splits (train first, then test).
train_data, test_data = (
    pd.read_csv(path, sep='\t')
    for path in ('ratings_train.txt', 'ratings_test.txt')
)

In [6]:
# Clean the `document` column of both splits with text_preprocessing.
train_data['document'], test_data['document'] = (
    frame['document'].apply(text_preprocessing)
    for frame in (train_data, test_data)
)

In [7]:
# Sanity check that the expected number of rows was read.
# Expected: train nrows 150000, test nrows 50000 (one count per line).
print(*(len(frame['document']) for frame in (train_data, test_data)), sep='\n')

150000
50000


In [8]:
# Keep the first row of every distinct `document`, then drop rows that still
# contain a missing value in any column.
# NOTE(review): text_preprocessing already turned missing documents into "",
# so dropna() cannot remove them here and one empty-string document
# presumably survives per split -- confirm whether that row should be filtered.
train_data = (
    train_data
    .drop_duplicates(subset=['document'], keep='first')
    .dropna(axis=0, how='any')
)
test_data = (
    test_data
    .drop_duplicates(subset=['document'], keep='first')
    .dropna(axis=0, how='any')
)

In [9]:
# Row counts after de-duplication.
# Expected: train nrows 143475, test nrows 48437 (one count per line).
print(*(len(frame['document']) for frame in (train_data, test_data)), sep='\n')

143475
48437


In [10]:
import sentencepiece as spm
import os

## unigram 사용

In [11]:
# sentencepiece 학습
import sentencepiece as spm
vocab_size = 8000

# Train a unigram SentencePiece model; output files use the 'spm' prefix.
# The corpus path is built from $HOME (os.getenv returns None if it is unset,
# in which case the concatenation below raises TypeError).
spm.SentencePieceTrainer.Train(**{
    'input': os.getenv('HOME') + '/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp',
    'model_prefix': 'spm',
    'vocab_size': vocab_size,
    'model_type': 'unigram',
})

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /aiffel/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp
  input_format: 
  model_prefix: spm
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_p

In [12]:
# Create a processor and load "spm.model" (presumably the file produced by the
# trainer call above, which uses model_prefix='spm').
# The return value of load() is the cell's displayed output.
sp = spm.SentencePieceProcessor()
sp.load("spm.model")

True

In [13]:
# Encode both splits with the loaded processor. Each result is the
# (tensor, word_index, index_word) tuple returned by sp_tokenize.
train_tokens, test_tokens = (
    sp_tokenize(sp, frame['document'].tolist(), padding='pre')
    for frame in (train_data, test_data)
)

In [14]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [15]:
# Stop once val_loss has not improved for 3 epochs and roll back to the best
# weights; this callback object is reused by every model.fit call below.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=0,
    restore_best_weights=True,
)

In [16]:
# Embedding -> LSTM -> Dense -> sigmoid binary classifier.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=256))
model.add(LSTM(128))
# NOTE(review): no activation argument is passed to this Dense layer.
model.add(Dense(256))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
# Element 0 of each sp_tokenize result is the padded id matrix.
# NOTE(review): the test split is passed as validation_data, so early stopping
# and the reported validation accuracy are both based on the test set.
history = model.fit(
    x=train_tokens[0],
    y=train_data['label'].to_numpy(),
    validation_data=(test_tokens[0], test_data['label'].to_numpy()),
    batch_size=64,
    epochs=10,
    callbacks=[early_stop],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


val_acc = 84.91% (unigram)

## bpe 사용

In [18]:
# sentencepiece 학습
import sentencepiece as spm
vocab_size = 8000

# Train a BPE SentencePiece model. It uses the same 'spm' prefix as the
# unigram run, so the earlier output files are presumably overwritten.
# The corpus path is built from $HOME (os.getenv returns None if it is unset,
# in which case the concatenation below raises TypeError).
spm.SentencePieceTrainer.Train(**{
    'input': os.getenv('HOME') + '/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp',
    'model_prefix': 'spm',
    'vocab_size': vocab_size,
    'model_type': 'bpe',
})

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /aiffel/aiffel/sp_tokenizer/data/korean-english-park.train.ko.temp
  input_format: 
  model_prefix: spm
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefi

In [19]:
# Rebind `sp` to a fresh processor and load "spm.model" again (presumably now
# the BPE model, since the BPE trainer call above also uses model_prefix='spm').
# The return value of load() is the cell's displayed output.
sp = spm.SentencePieceProcessor()
sp.load("spm.model")

True

In [20]:
# Re-encode both splits with the currently loaded processor, replacing the
# previous train_tokens / test_tokens tuples.
train_tokens, test_tokens = (
    sp_tokenize(sp, frame['document'].tolist(), padding='pre')
    for frame in (train_data, test_data)
)

In [21]:
# Drop the name bound to the previously trained model; a new model is built
# in the next cell.
del model

In [22]:
# Same Embedding -> LSTM -> Dense -> sigmoid architecture, freshly initialised.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=256))
model.add(LSTM(128))
# NOTE(review): no activation argument is passed to this Dense layer.
model.add(Dense(256))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
# Element 0 of each sp_tokenize result is the padded id matrix.
# NOTE(review): the test split is passed as validation_data, so early stopping
# and the reported validation accuracy are both based on the test set.
history = model.fit(
    x=train_tokens[0],
    y=train_data['label'].to_numpy(),
    validation_data=(test_tokens[0], test_data['label'].to_numpy()),
    batch_size=64,
    epochs=10,
    callbacks=[early_stop],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


val_acc = 85.05% (bpe)

## konlpy 사용

In [None]:
from konlpy.tag import Mecab
from collections import Counter

# Shared Mecab tagger instance; tokenize() below calls mecab.pos() on it.
mecab = Mecab()

In [None]:
def tokenize(doc):
    """Return the items of ``mecab.pos(doc)``, each joined with '/' into one string."""
    joined = []
    for tagged in mecab.pos(doc):
        joined.append('/'.join(tagged))
    return joined

# Tokenise every document of both splits.
train_docs = list(map(tokenize, train_data['document']))
test_docs = list(map(tokenize, test_data['document']))

# Flatten the training documents into one token list. `tokens` and
# `all_tokens` hold the same items as two separate lists.
tokens = [tok for doc in train_docs for tok in doc]
all_tokens = list(tokens)

# Token frequencies over the training split only.
counter = Counter(all_tokens)

In [None]:
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

vocab_size = 8000

# Keep the `vocab_size` most frequent tokens, most frequent first; the set is
# used for fast membership tests.
most_common_tokens = [pair[0] for pair in counter.most_common(vocab_size)]
vocab_set = {*most_common_tokens}

print("✅ vocab 생성 완료:", len(vocab_set))

# 3. <UNK> 처리 함수
def filter_tokens(tokens, vocab_set):
    """Return a copy of ``tokens`` with every token not in ``vocab_set`` replaced by "<UNK>"."""
    filtered = []
    for token in tokens:
        filtered.append(token if token in vocab_set else "<UNK>")
    return filtered

# 4. Apply the vocabulary to the train/test documents (OOV tokens -> "<UNK>").
filtered_train_docs = [filter_tokens(doc, vocab_set) for doc in train_docs]
filtered_test_docs = [filter_tokens(doc, vocab_set) for doc in test_docs]

# 5. Define the Tokenizer and install a hand-built word_index.
# filters='' and lower=False are essential here: the tokens look like
# "word/POS" and the OOV marker is "<UNK>". The Tokenizer defaults strip
# '/', '<' and '>' and lower-case the text, so no token would match the
# word_index keys below and every token would be mapped to the OOV id.
tokenizer = Tokenizer(
    num_words=vocab_size + 2,
    oov_token="<UNK>",
    filters='',
    lower=False,
)
# Ids 1..len(most_common_tokens) by frequency rank; id 0 is left for padding.
tokenizer.word_index = {word: idx + 1 for idx, word in enumerate(most_common_tokens)}
# The OOV marker takes the id right after the largest possible vocabulary id.
tokenizer.word_index[tokenizer.oov_token] = vocab_size + 1

# 6. Convert the token lists to id sequences (tokens are re-joined with a
# single space because the Tokenizer splits on spaces).
train_texts = [" ".join(doc) for doc in filtered_train_docs]
test_texts = [" ".join(doc) for doc in filtered_test_docs]

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# 7. Pad / truncate every sequence to max_len, padding at the end.
max_len = 50
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# 8. Extract the labels.
train_labels = train_data['label'].tolist()
test_labels = test_data['label'].tolist()

# 9. Build the TensorFlow datasets.
batch_size = 64

# Training data is shuffled with a buffer covering the whole split.
train_dataset = tf.data.Dataset.from_tensor_slices((train_padded, train_labels))
train_dataset = train_dataset.shuffle(len(train_padded)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Evaluation data keeps its original order.
test_dataset = tf.data.Dataset.from_tensor_slices((test_padded, test_labels))
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# 10. Report the result.
print("✅ 데이터셋 준비 완료")
print("Train dataset shape:", train_padded.shape)
print("Test dataset shape:", test_padded.shape)

In [None]:
# Drop the name bound to the previously trained model; a new model is built
# in the next cell.
del model

In [None]:
# Embedding -> LSTM -> Dense -> sigmoid binary classifier for the Mecab pipeline.
# The id space produced above is 0 (padding), 1..vocab_size (vocabulary) and
# vocab_size + 1 ("<UNK>"), so the embedding table needs vocab_size + 2 rows;
# input_dim=vocab_size would leave the largest ids out of range.
model = Sequential([
    Embedding(input_dim=vocab_size + 2, output_dim=256),
    LSTM(128),
    # NOTE(review): no activation argument is passed to this Dense layer.
    Dense(256),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Both datasets are already batched, so no batch_size is passed here.
# NOTE(review): the test dataset is used as validation_data, so early stopping
# and the reported validation accuracy are both based on the test set.
history = model.fit(
    x=train_dataset,
    validation_data=test_dataset,
    callbacks=[early_stop],
    epochs=10,
)


val_acc = 