In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np

# Load the review dataset; the raw review text lives in the 'text_' column.
data = pd.read_csv("fake_reviews_dataset.csv")
texts = data['text_'].tolist()
# Vocabulary cap: the tokenizer only emits word indices below num_words
# (index 0 is reserved for padding).
num_words = 10000

# Fit the tokenizer on the corpus and turn each review into a list of word indices.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Build next-word training samples: every prefix of length >= 2 of every review.
# The last token of a prefix is the prediction target, the rest is the context.
max_seq_len = 50
input_sequences = []
for sequence in sequences:
    for end in range(2, len(sequence) + 1):
        # Keep only the last max_seq_len tokens of the prefix. pad_sequences
        # below would drop the older tokens anyway (truncating='pre'), so the
        # resulting array is unchanged, but long reviews no longer store
        # every full-length prefix in memory.
        input_sequences.append(sequence[max(0, end - max_seq_len):end])

# Left-pad with zeros so every sample has exactly max_seq_len tokens and the
# target word always sits in the last column.
input_sequences = pad_sequences(
    input_sequences, maxlen=max_seq_len, padding='pre', truncating='pre'
)

# Split into context (all but the last token) and target (the last token).
# The targets stay as integer class ids for sparse categorical cross-entropy.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Build the CNN + LSTM next-word model.
# Input: max_seq_len - 1 context word indices per sample.
input_layer = Input(shape=(max_seq_len - 1,))
# input_dim is tied to num_words so the embedding table and the softmax output
# always cover the same vocabulary. The deprecated `input_length` argument is
# omitted; the sequence length is already fixed by the Input layer.
embedding = Embedding(input_dim=num_words, output_dim=128)(input_layer)
# The convolution extracts local 3-token features; pooling then shortens the
# sequence that the LSTM has to process.
conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding)
pool = MaxPooling1D(pool_size=2)(conv)
# return_sequences=False: only the final LSTM state feeds the classifier head.
lstm = LSTM(128, return_sequences=False)(pool)
dense = Dense(256, activation='relu')(lstm)
dropout = Dropout(0.5)(dense)
# One softmax score per vocabulary index.
output_layer = Dense(num_words, activation='softmax')(dropout)

model = Model(inputs=input_layer, outputs=output_layer)

# Sparse categorical cross-entropy accepts the integer targets in y directly,
# so no one-hot encoding of the labels is needed.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Train the model.
model.fit(X, y, epochs=10, batch_size=64)

# 예측 함수
def restore_truncated_review(seed_text, model, tokenizer, max_seq_len, max_words=50):
    """Extend a truncated review by greedily predicting one word at a time.

    Args:
        seed_text: The truncated review text to continue.
        model: Next-word model exposing ``predict``. It is fed one left-padded
            row of ``max_seq_len - 1`` word indices and must return one score
            per vocabulary index for that row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_seq_len: Sequence length used to build the training samples; the
            model sees at most the last ``max_seq_len - 1`` tokens.
        max_words: Upper bound on the number of words appended.

    Returns:
        ``seed_text`` followed by the predicted words, separated by single
        spaces. Generation stops early when the model predicts an index with
        no word (e.g. the padding index 0) or a sentence-ending token.
    """
    # NOTE: a Tokenizer built with the default `filters` strips '.', '!' and
    # '?', so such a tokenizer never yields these tokens and generation then
    # ends only via max_words or an unknown index.
    sentence_endings = ('.', '!', '?')
    context_len = max_seq_len - 1

    # Tokenize the seed once and append each predicted index afterwards,
    # instead of re-tokenizing the whole growing string on every step.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    generated_words = []

    for _ in range(max_words):
        # pad_sequences keeps the most recent context_len tokens and left-pads.
        model_input = pad_sequences([token_ids], maxlen=context_len, padding='pre')

        # verbose=0 suppresses the progress bar printed for every single word.
        scores = model.predict(model_input, verbose=0)
        predicted_index = int(np.argmax(scores, axis=-1)[0])
        predicted_word = tokenizer.index_word.get(predicted_index)

        if predicted_word is None:
            break

        generated_words.append(predicted_word)
        token_ids.append(predicted_index)

        if predicted_word in sentence_endings:
            break

    return " ".join([seed_text, *generated_words])

# Test: restore truncated reviews.
# A review counts as truncated when, ignoring surrounding whitespace, it does
# not end with '.', '!' or '?'.
truncated_mask = data['text_'].apply(
    lambda text: not text.strip().endswith(('.', '!', '?'))
)
truncated_reviews = data[truncated_mask]
truncated_texts = truncated_reviews['text_'].tolist()

# Run the generator over every truncated review, preserving order.
restored_reviews = [
    restore_truncated_review(review, model, tokenizer, max_seq_len)
    for review in truncated_texts
]

# Print each original next to its restored version.
for original, restored in zip(truncated_texts, restored_reviews):
    print(f"Original: {original}")
    print(f"Restored: {restored}")
    print("-" * 50)




Epoch 1/10
41423/41423 ━━━━━━━━━━━━━━━━━━━━ 4404s 106ms/step - accuracy: 0.0833 - loss: 5.9873
Epoch 2/10
41423/41423 ━━━━━━━━━━━━━━━━━━━━ 3403s 82ms/step - accuracy: 0.1234 - loss: 5.4902
Epoch 3/10
41423/41423 ━━━━━━━━━━━━━━━━━━━━ 3415s 82ms/step - accuracy: 0.1310 - loss: 5.4043
Epoch 4/10
41423/41423 ━━━━━━━━━━━━━━━━━━━━ 7894s 191ms/step - accuracy: 0.1355 - loss: 5.3606
Epoch 5/10
 9657/41423 ━━━━━━━━━━━━━━━━━━━━ 59:26 112ms/step - accuracy: 0.1376 - loss: 5.3274