In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# 1. Load the data
file_path = "fake_reviews_dataset.csv"  # path to the dataset file
data = pd.read_csv(file_path)

# 2. Inspect and clean the data
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the report.
data.info()
# Drop rows that contain any null value (rebinding instead of mutating in place).
data = data.dropna()

# Split the frame into review text and label
texts = data['text_']
labels = data['label']

# Encode 'CG' as 1 and 'OR' as 0
labels = labels.map({'CG': 1, 'OR': 0})
# Series.map leaves NaN for values missing from the mapping; stop here instead
# of silently training on NaN targets.
if labels.isna().any():
    unexpected = data.loc[labels.isna(), 'label'].unique().tolist()
    raise ValueError(f"Unexpected label values (expected 'CG' or 'OR'): {unexpected}")
labels = labels.astype(np.float32)

# 3. Text preprocessing
# Tokenizer settings
max_words = 10000  # maximum number of words to use
max_len = 100  # maximum review length, in tokens
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
# NOTE(review): the vocabulary is fitted on all texts, i.e. before the
# train/test split below, so test-set words influence the word index.
tokenizer.fit_on_texts(texts)

# Convert texts to integer sequences, then pad/truncate them at the end
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# 4. Train/test split
# stratify keeps the CG/OR ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42, stratify=labels
)

# 5. Build the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single probability: 1 = 'CG', 0 = 'OR'
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 6. Train the model
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1
)

# 7. Evaluate on the held-out test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

# 8. Save the model
model.save("ai_human_review_classifier.h5")



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40432 non-null  object 
 1   rating    40432 non-null  float64
 2   label     40432 non-null  object 
 3   text_     40432 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.2+ MB
None


### 잘린 텍스트 생성하기 (LSTM)

In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [4]:
# 1. Load the data
file_path = "fake_reviews_dataset.csv"  # path to the dataset file
data = pd.read_csv(file_path)

# Keep only the rows labelled 'CG'.
# Rows with missing text are dropped first: `.str.endswith` returns NaN for
# them, and a mask containing NaN cannot be used for boolean indexing or `~`.
cg_data = data[data['label'] == 'CG'].dropna(subset=['text_'])

# A review counts as complete when it ends with '.', '?' or '!'.
# The mask is computed once and reused for both subsets.
ends_with_punctuation = cg_data['text_'].str.endswith(('.', '?', '!'))

# Complete reviews (ending with sentence-final punctuation)
cg_data_complete = cg_data[ends_with_punctuation]
# Incomplete reviews (not ending with sentence-final punctuation)
cg_data_incomplete = cg_data[~ends_with_punctuation]

# Text preprocessing helper
def preprocess_text(texts):
    """Return a new list containing the lower-cased form of each string in ``texts``."""
    lowered = []
    for raw in texts:
        lowered.append(raw.lower())
    return lowered

# Training corpus: the complete reviews, lower-cased
texts = preprocess_text(cg_data_complete['text_'].tolist())

# Tokenisation
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
vocab_size = 1 + len(tokenizer.word_index)

# Sequence generation: for every review emit each prefix of its token list
# that has at least two tokens (the last token of a prefix becomes the target).
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(texts)
    for end in range(2, len(token_ids) + 1)
]

# Left-pad every prefix to the length of the longest one
max_seq_length = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre')

# Inputs (X) are all columns but the last; the target (y) is the last column
X, y = input_sequences[:, :-1], input_sequences[:, -1]

print(f"X shape: {X.shape}, y shape: {y.shape}")


X shape: (369639, 296), y shape: (369639,)


In [3]:
embedding_dim = 50

# Next-word model: embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length - 1))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(units=64))
model.add(Dense(units=vocab_size, activation='softmax'))

# Sparse loss: the targets are integer word indices, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 296, 50)           369700    
                                                                 
 lstm (LSTM)                 (None, 296, 64)           29440     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 7394)              480610    
                                                                 
Total params: 912,774
Trainable params: 912,774
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Training hyper-parameters
epochs, batch_size = 15, 256

# Fit on the full (X, y) set and keep the per-epoch training history.
history = model.fit(
    x=X,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [6]:
def generate_text(model, tokenizer, seed_text, max_length=20, input_length=None):
    """Extend ``seed_text`` one word at a time with greedy next-word prediction.

    Args:
        model: Trained next-word model. ``model.predict`` must return, for a
            batch of one padded sequence, one row of scores over the vocabulary.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        seed_text: Text to continue.
        max_length: Maximum number of words to append.
        input_length: Length the token sequence is padded to before prediction.
            Defaults to ``model.input_shape[1]``, so the function does not rely
            on a notebook-level variable and also works with a reloaded model.

    Returns:
        ``seed_text`` followed by the generated words, each preceded by a
        single space.

    Generation stops early when the predicted index has no vocabulary entry
    (for example the padding index 0) or when the generated word ends with
    '.', '?' or '!'. A Keras ``Tokenizer`` created with default ``filters``
    strips punctuation, so with such a tokenizer no word ends in punctuation
    and only ``max_length`` bounds the output.
    """
    if input_length is None:
        input_length = model.input_shape[1]

    for _ in range(max_length):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=input_length, padding='pre')

        # Model prediction
        predicted = model.predict(token_list, verbose=0)

        # Greedy choice: index of the highest-scoring word
        predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

        # Map the index back to a word ('' when the index is not in the vocabulary)
        output_word = tokenizer.index_word.get(predicted_word_index, '')

        # An empty word would only add trailing spaces, and the unchanged
        # token sequence would produce the same prediction again; stop instead.
        if not output_word:
            break

        # Append the word to the running text
        seed_text += " " + output_word

        # Stop at sentence-final punctuation
        if output_word.endswith(('.', '?', '!')):
            break

    return seed_text

# Pick one of the reviews that does not end with punctuation and generate its continuation
test_text = cg_data_incomplete['text_'].iloc[10]
generated_text = generate_text(model=model, tokenizer=tokenizer, seed_text=test_text)

print(f"Original Text: {test_text}")
print(f"Generated Text: {generated_text}")


Original Text: Makes may tea with out stirring. The only problem is that it's kind of hard to put
Generated Text: Makes may tea with out stirring. The only problem is that it's kind of hard to put the bottom on top the plastic part is also very light and easy to wash i have used it twice


In [7]:
# Persist the current `model` to disk under the given file name.
model.save("recover_model_lstm_x2.5.keras")

In [1]:
# Load a saved model from disk and print its architecture.
# NOTE(review): this loads 'recover_model_lstm_x2.keras', whereas the save cell in
# this notebook writes "recover_model_lstm_x2.5.keras" — confirm which file is intended.
# NOTE(review): the other cells import from `tensorflow.keras`; this cell imports
# from the `keras` package — confirm both resolve to the same Keras installation.
from keras.models import load_model

model = load_model('recover_model_lstm_x2.keras')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 296, 50)           369700    
                                                                 
 lstm (LSTM)                 (None, 296, 64)           29440     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 7394)              480610    
                                                                 
Total params: 912,774
Trainable params: 912,774
Non-trainable params: 0
_________________________________________________________________
