In [4]:
import pandas as pd
# 형태소 분석기
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer

# 필요한것 import 
from tensorflow import keras
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import nltk

# 불용어 리스트 제거하기

In [6]:
with open('./data/nsmc/stopwords.txt', 'r') as f:
    list_file = f.readlines()
stopwords = list_file[0].split(",")
len(stopwords)

861

In [13]:
# 데이터 불러오기
train_df = pd.read_csv("./data/nsmc/ratings_train.txt", sep="\t")
test_df = pd.read_csv("./data/nsmc/ratings_test.txt", sep ="\t")


# 결측치 제거
train_df = train_df.dropna()
test_df = test_df.dropna()


# 전처리 숫자는 살려
train_df["document"] = train_df["document"].str.replace("[^ㄱ-ㅎ가-힣ㅏ-ㅣ ]", " ", regex = True) 
test_df["document"] = test_df["document"].str.replace("[^ㄱ-ㅎ가-힣ㅏ-ㅣ ]", " ", regex = True)

# 불용어 리스트 제거
# train_df["document"] = train_df["document"].map(remove_stopwords)
# test_df["document"] = test_df["document"].map(remove_stopwords)

# 문장 5개 이상인것만 살려
train_df = train_df[train_df["document"].map(lambda x : len(x.strip()) >= 5)]
test_df = test_df[test_df["document"].map(lambda x : len(x.strip()) >= 5)]


# 중복 데이터 제거
train_df = train_df.drop_duplicates(subset=["document"])
test_df = test_df.drop_duplicates(subset=["document"])

In [15]:
# 토큰화
okt = Okt() 

In [16]:
%%time
train_df["token"] = train_df["document"].map(lambda x : okt.morphs(x, stem = True))
test_df["token"] = test_df["document"].map(lambda x : okt.morphs(x, stem = True))

CPU times: total: 10min 3s
Wall time: 9min 51s


In [17]:
# train_df.to_csv("./data/nsmc_ratings_train_pre2.csv", index = False)
# test_df.to_csv("./data/nsmc_ratings_test_pre2.csv", index = False)

In [9]:
train_df = pd.read_csv("./data/nsmc_ratings_train_pre.csv", index_col=False)
test_df = pd.read_csv("./data/nsmc_ratings_test_pre.csv", index_col=False)

# 토큰에서도 불용어 제거

In [None]:
# 불용어 리스트 제거
train_df["token"] = train_df["token"].map(lambda x : [i for i in x.strip() if x.strip() not in stopwords])

In [None]:
test_df["token"] = test_df["token"].map(remove_token_stopwords)


In [None]:
train_df.head()

In [None]:
# 토큰이 5개 이상인 애들만
train_df = train_df[train_df["token"].map(lambda x :len (x) >= 1)]
test_df = test_df[test_df["token"].map(lambda x :len (x) >= 1)]


# 정수 인코딩 아 10000개는 있어야지요~~
tokenizer = Tokenizer(num_words = 10000)


# 토큰화
x_train = tokenizer.texts_to_sequences(train_df["token"])
x_test = tokenizer.texts_to_sequences(test_df["token"])

# 독립변수 설정하기
x_train = np.array(x_train, dtype=object)
x_test = np.array(x_test, dtype=object)
# 종속변수 설정하기
y_train = train_df["label"].to_numpy()
y_test = test_df["label"].to_numpy()

In [27]:

# 데이터 나누기
x_sub, x_val, y_sub, y_val = train_test_split(
    x_train,
    y_train, 
    test_size = 0.2,
    stratify= y_train,
    random_state = 34
)

In [28]:
# 패딩적용하기
sub_seq = pad_sequences(x_sub, maxlen = 15, padding="post",truncating='post')
val_seq = pad_sequences(x_val, maxlen = 15, padding="post",truncating='post')
test_seq = pad_sequences(x_test, maxlen = 15, padding="post",truncating='post')

# 모델 학습

In [29]:
model = keras.Sequential()
model.add(keras.Input(shape = (15,)))

# 10000개의 단어를 256개로 늘릴것이다
model.add(keras.layers.Embedding(10000, 128)) # 128개의 특성으로 표현해줄께
model.add(keras.layers.Dropout(0.2)) # 모델 복잡해지니까 ~

# conv1d 로 좍좍~ 3번 레이어링 해주셈
model.add(keras.layers.Conv1D(256, 5, activation="swish"))
model.add(keras.layers.Dense(64))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.2))

model.add(keras.layers.Conv1D(128, 3, activation="swish"))
model.add(keras.layers.Dense(32))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.2))


model.add(keras.layers.MaxPooling1D(2))


# model.add(keras.layers.GRU(32, dropout = 0.3, return_sequences=True ))
model.add(keras.layers.GRU(16, dropout = 0.3))

model.add(keras.layers.Dense(100, activation="swish"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(1, activation="sigmoid"))

# 멕여~

In [30]:
model.compile(
    optimizer= "adam", 
    loss = "binary_crossentropy",
    metrics = ["accuracy"]
)

es_cb = keras.callbacks.EarlyStopping(patience=4, restore_best_weights=True, monitor='val_loss')

history = model.fit(
    sub_seq, 
    y_sub, 
    epochs = 100,
    batch_size = 32, 
    validation_data = (val_seq, y_val), 
    callbacks = [es_cb]
)

Epoch 1/100
[1m3080/3080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 18ms/step - accuracy: 0.5020 - loss: 0.6947 - val_accuracy: 0.4962 - val_loss: 0.6933
Epoch 2/100
[1m3080/3080[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 18ms/step - accuracy: 0.5005 - loss: 0.6937 - val_accuracy: 0.4962 - val_loss: 0.6932
Epoch 3/100
[1m1268/3080[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m32s[0m 18ms/step - accuracy: 0.4996 - loss: 0.6934

KeyboardInterrupt: 

# 모델 평가

In [None]:
model.evaluate(test_seq, y_test)

In [None]:
plt.figure()

plt.plot(history.history["loss"], label = "train_loss")
plt.plot(history.history["val_loss"], label = "val_loss")

plt.plot(history.history["accuracy"], label = "train_accuracy")
plt.plot(history.history["val_accuracy"], label = "val_accuracy")

plt.legend()
plt.xlabel("epoch")
plt.show()