In [4]:
model.summary()

In [3]:
import time  # 시간 측정을 위한 모듈
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# 데이터 불러오기
data = pd.read_csv('fake_reviews_dataset.csv')  # 데이터 파일 경로 수정 필요

# 레이블 인코딩
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

# TF-IDF 벡터화
tfidf = TfidfVectorizer(max_features=5000)  # 최대 5000개의 단어 사용
X = tfidf.fit_transform(data['text_']).toarray()  # 텍스트 데이터를 TF-IDF 벡터로 변환
y = data['label_encoded']

# 데이터 분할 (훈련 세트, 테스트 세트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape[1])
# MLP 모델 구축
# model = Sequential()
# model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # 첫 번째 은닉층
# model.add(Dense(32, activation='relu'))  # 두 번째 은닉층
# model.add(Dense(1, activation='sigmoid'))  # 출력층 (이진 분류)

from tensorflow.keras.layers import Dropout

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))  # 첫 번째 Dropout
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))  # 두 번째 Dropout
model.add(Dense(1, activation='sigmoid'))

# 모델 컴파일
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 모델 학습 시간 측정
start_time = time.time()
history = model.fit(X_train, y_train, epochs=5, batch_size=128, validation_data=(X_test, y_test))
end_time = time.time()

# 모델 평가
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

# 정확도 출력
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# 성능 보고서 출력
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# 학습 시간 출력
print(f"\nTotal Training Time: {end_time - start_time:.2f} seconds")


5000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 75ms/step - accuracy: 0.7213 - loss: 0.5600 - val_accuracy: 0.9047 - val_loss: 0.2345
Epoch 2/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 38ms/step - accuracy: 0.9108 - loss: 0.2447 - val_accuracy: 0.9106 - val_loss: 0.2155
Epoch 3/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 62ms/step - accuracy: 0.9268 - loss: 0.2049 - val_accuracy: 0.9082 - val_loss: 0.2180
Epoch 4/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9392 - loss: 0.1760 - val_accuracy: 0.9064 - val_loss: 0.2245
Epoch 5/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9489 - loss: 0.1505 - val_accuracy: 0.9076 - val_loss: 0.2266
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
Accuracy: 90.76%

Classification Report:
              precision    recall  f1-score   support

          CG    

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from scikeras.wrappers import KerasClassifier  # scikeras 사용
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# 데이터 불러오기
data = pd.read_csv('fake_reviews_dataset.csv')

# 레이블 인코딩
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

# TF-IDF 벡터화
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['text_']).toarray()
y = data['label_encoded']

# 데이터 분할 (훈련 세트, 테스트 세트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLP 모델 함수 정의
def create_model(activation='relu', optimizer='adam', hidden_units=128):
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=X_train.shape[1], activation=activation))  # 첫 번째 은닉층
    model.add(Dense(hidden_units // 2, activation=activation))  # 두 번째 은닉층
    model.add(Dense(1, activation='sigmoid'))  # 출력층
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])  # 모델 컴파일
    return model

# KerasClassifier 생성 시 'build_fn' 대신 'model' 파라미터 사용
model = KerasClassifier(model=create_model, activation='relu', optimizer='adam', hidden_units=128, verbose=0)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'hidden_units': [64, 128, 256],  # 은닉층 뉴런 수
    'optimizer': ['adam', 'sgd'],    # 최적화 알고리즘
    'batch_size': [32, 64],          # 배치 크기
    'epochs': [5, 10]                # 에포크 수
}

# GridSearchCV로 하이퍼파라미터 튜닝
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=2)
grid_result = grid.fit(X_train, y_train)

# 최적의 파라미터 출력
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")

# 최적의 모델로 평가
best_model = grid_result.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f'Optimized Model Accuracy: {accuracy * 100:.2f}%')


Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END batch_size=32, epochs=5, hidden_units=64, optimizer=adam; total time=   7.1s
[CV] END batch_size=32, epochs=5, hidden_units=64, optimizer=adam; total time=   3.0s
[CV] END batch_size=32, epochs=5, hidden_units=64, optimizer=adam; total time=   1.7s
[CV] END batch_size=32, epochs=5, hidden_units=64, optimizer=sgd; total time=   1.7s
[CV] END batch_size=32, epochs=5, hidden_units=64, optimizer=sgd; total time=   1.7s
[CV] END batch_size=32, epochs=5, hidden_units=64, optimizer=sgd; total time=   1.6s
[CV] END batch_size=32, epochs=5, hidden_units=128, optimizer=adam; total time=   1.6s
[CV] END batch_size=32, epochs=5, hidden_units=128, optimizer=adam; total time=   1.5s
[CV] END batch_size=32, epochs=5, hidden_units=128, optimizer=adam; total time=   1.5s
[CV] END batch_size=32, epochs=5, hidden_units=128, optimizer=sgd; total time=   2.0s
[CV] END batch_size=32, epochs=5, hidden_units=128, optimizer=sgd; total time= 

ValueError: 
All the 72 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\82102\PycharmProjects\iM_ML-DL\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\82102\PycharmProjects\iM_ML-DL\.venv\lib\site-packages\scikeras\wrappers.py", line 1501, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "C:\Users\82102\PycharmProjects\iM_ML-DL\.venv\lib\site-packages\scikeras\wrappers.py", line 770, in fit
    self._fit(
  File "C:\Users\82102\PycharmProjects\iM_ML-DL\.venv\lib\site-packages\scikeras\wrappers.py", line 928, in _fit
    self._ensure_compiled_model()
  File "C:\Users\82102\PycharmProjects\iM_ML-DL\.venv\lib\site-packages\scikeras\wrappers.py", line 439, in _ensure_compiled_model
    if not self.model_.compiled:
AttributeError: 'Sequential' object has no attribute 'compiled'


In [6]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# 데이터 불러오기
data = pd.read_csv('fake_reviews_dataset.csv')  # 데이터 파일 경로를 수정해주세요

# 레이블 인코딩
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

# Word2Vec 모델 훈련 (단어 벡터화)
tokenized_reviews = [review.split() for review in data['text_']]  # 리뷰 텍스트를 단어 단위로 분할
word2vec_model = Word2Vec(tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)

# 리뷰 텍스트를 벡터화 (단어 벡터 평균)
def vectorize_text(text):
    words = text.split()
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if len(word_vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word_vectors, axis=0)

X = np.array([vectorize_text(review) for review in data['text_']])
y = data['label_encoded']

# 데이터 분할 (훈련 세트, 테스트 세트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLP 모델 구축
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # 첫 번째 은닉층
model.add(Dense(64, activation='relu'))  # 두 번째 은닉층
model.add(Dense(1, activation='sigmoid'))  # 출력층 (이진 분류)

# 모델 컴파일
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 모델 학습
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# 모델 평가
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

# 정확도 출력
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.8296 - loss: 0.3836 - val_accuracy: 0.8912 - val_loss: 0.2645
Epoch 2/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.8962 - loss: 0.2546 - val_accuracy: 0.8992 - val_loss: 0.2486
Epoch 3/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.9037 - loss: 0.2351 - val_accuracy: 0.8998 - val_loss: 0.2427
Epoch 4/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.9111 - loss: 0.2185 - val_accuracy: 0.9065 - val_loss: 0.2275
Epoch 5/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.9174 - loss: 0.2041 - val_accuracy: 0.9027 - val_loss: 0.2354
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Accuracy: 90.27%


In [5]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp39-cp39-win_amd64.whl.metadata (8.2 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp39-cp39-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   --- ------------------------------------ 2.4/24.0 MB 16.8 MB/s eta 0:00:02
   ---------- ----------------------------- 6.6/24.0 MB 16.1 MB/s eta 0:00:02
   ------------------ --------------------- 11.0/24.0 MB 17.6 MB/s eta 0:00:01
   ---------------------------------- ----- 20.7/24.0 MB 24.7 MB/s eta 0:00:01
   ---------------------------------------- 24.0/24.0 MB 23.0 MB/s eta 0:00:00
Downloading smart_open-7.1.0-py3-none-any.whl (61 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.3 smart-open-7.1.0
