In [6]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# 데이터 확인

In [16]:
DATA_PATH = 'D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/'

In [17]:
print('파일 크기: ')
for file in os.listdir(DATA_PATH):
  if 'txt' in file:
    print(file.ljust(30)+str(round(os.path.getsize(DATA_PATH+ file) / 100000,2))+'MB')


파일 크기: 
ratings.txt                   195.15MB
ratings_test.txt              48.93MB
ratings_train.txt             146.29MB


In [18]:
train_data = pd.read_csv(DATA_PATH + 'ratings_train.txt',header = 0, delimiter='\t', quoting = 3)
train_data.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [20]:
print('학습데이터 전체 개수 : {}'.format(len(train_data)))

학습데이터 전체 개수 : 150000


In [22]:
train_length = train_data['document'].astype(str).apply(len)
train_length.head()

0    19
1    33
2    17
3    29
4    61
Name: document, dtype: int64

In [23]:
train_review = [review for review in train_data['document'] if type(review) is str]

In [26]:
print('긍정 리뷰 개수: {}'.format(train_data['label'].value_counts()[1]))
print('부정 리뷰 개수: {}'.format(train_data['label'].value_counts()[0]))

긍정 리뷰 개수: 74827
부정 리뷰 개수: 75173


In [24]:
len(train_review)x

149995

# 데이터 전처리

In [27]:
import numpy as np
import pandas as pd
import re
import json
from konlpy.tag import Okt
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

In [28]:
DATA_PATH = 'D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/'
train_data = pd.read_csv(DATA_PATH + 'ratings_train.txt',header = 0,delimiter = '\t',quoting=3)
train_data['document'][:5]

0                                  아 더빙.. 진짜 짜증나네요 목소리
1                    흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
2                                    너무재밓었다그래서보는것을추천한다
3                        교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
4    사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...
Name: document, dtype: object

In [31]:
#전처리 함수

def preprocessing(review,okt,remove_stopwords = False,stop_words = []):
    #함수인자 설명
    # review : 전처리할 텍스트
    # okt : okt객체를 반복적으로 생성하지 않고 미리 생성 후 인자로 받음
    # remove_stopword : 불용어 제거여부
    # stop_words : 불용어 사전은 사용자가 입력
    
    # 1. 한글 및 공백 제외한 문자 모두 제거
    review_text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]','',review)
    
    # 2. okt 객체를 활용해 형태소 나누기
    word_review = okt.morphs(review_text,stem = True)
    
    if remove_stopwords:
        # 3. 불용어 제거
        word_review = [token for token in word_review if not token in stop_words]
        return word_review
    

In [32]:
# 전체 텍스트 전처리
stop_words = ['은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한']
okt = Okt()
clean_train_review = []

for review in train_data['document']:
    # 리뷰가 문자열인 경우만 전처리
    if type(review) == str :
        clean_train_review.append(preprocessing(review,okt,remove_stopwords = True,stop_words = stop_words))
    else:
        clean_train_review.append([]) # str이 아닌 행은 빈칸

clean_train_review[:4]

[['더빙', '진짜', '짜증나다', '목소리'],
 ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다'],
 ['너', '무재', '밓었', '다그', '래서', '보다', '추천', '다'],
 ['교도소', '이야기', '구먼', '솔직하다', '재미', '없다', '평점', '조정']]

In [33]:
# 테스트 리뷰도 동일하게 전처리
test_data = pd.read_csv(DATA_PATH + 'ratings_test.txt',header = 0 , delimiter = '\t',quoting = 3)

clean_test_review = []

for review in test_data['document']:
    # 리뷰가 문자열인 경우만 전처리
    if type(review) == str :
        clean_test_review.append(preprocessing(review,okt,remove_stopwords = True,stop_words = stop_words))
    else:
        clean_test_review.append([])
        
clean_test_review[:4]

[['굳다', 'ㅋ'],
 [],
 ['뭐', '야', '평점', '나쁘다', '않다', '점', '짜다', '리', '더', '더욱', '아니다'],
 ['지루하다', '않다', '완전', '막장', '임', '돈', '주다', '보기', '에는']]

In [34]:
# 인덱스 벡터 변환 후 일정길이 넘어가거나 모자라는 리뷰는 패딩처리
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_review)
train_sequences = tokenizer.texts_to_sequences(clean_train_review)
test_sequences = tokenizer.texts_to_sequences(clean_test_review)

word_vocab = tokenizer.word_index
MAX_SEQUENCE_LENGTH = 8

# train데이터
train_inputs = pad_sequences(train_sequences,maxlen = MAX_SEQUENCE_LENGTH,padding = 'post')

# 학습 데이터 라벨 벡터화
train_labels = np.array(train_data['label'])

# test데이터
test_inputs = pad_sequences(test_sequences,maxlen = MAX_SEQUENCE_LENGTH,padding = 'post')

# 평가 데이터 라벨 벡터화
test_labels = np.array(test_data['label'])

In [38]:
DEFAULT_PATH = 'D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/'
DATA_PATH = 'CLEAN_DATA/'
TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_LABEL_DATA = 'nsmc_train_label.npy'
TEST_INPUT_DATA = 'nsmc_test_input.npy'
TEST_LABEL_DATA = 'nsmc_test_label.npy'
DATA_CONFIGS = 'data_configs.json'

data_configs = {}
data_configs['vocab'] = word_vocab
data_configs['vocab_size'] = len(word_vocab) + 1

#전처리한 데이터 파일 저장
import os

if not os.path.exists(DEFAULT_PATH + DATA_PATH):
    os.makedirs(DEFAULT_PATH + DATA_PATH)
    
#전처리 학습데이터 넘파이로 저장
np.save(open(DEFAULT_PATH+DATA_PATH+TRAIN_INPUT_DATA,'wb'),train_inputs)
np.save(open(DEFAULT_PATH+DATA_PATH+TRAIN_LABEL_DATA,'wb'),train_labels)
#전처리 테스트데이터 넘파이로 저장
np.save(open(DEFAULT_PATH+DATA_PATH+TEST_INPUT_DATA,'wb'),test_inputs)
np.save(open(DEFAULT_PATH+DATA_PATH+TEST_LABEL_DATA,'wb'),test_labels)
#데이터 사전 json으로 저장
json.dump(data_configs,open(DEFAULT_PATH + DATA_PATH + DATA_CONFIGS,'w'),ensure_ascii=False)

# 데이터 학습

In [39]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
from tqdm import tqdm

In [42]:
DATA_PATH = 'D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/CLEAN_DATA/'
DATA_OUT = 'D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/DATA_OUT/'
INPUT_TRAIN_DATA = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

train_input = np.load(open(DATA_PATH + INPUT_TRAIN_DATA, 'rb'))
train_input = pad_sequences(train_input,maxlen = train_input.shape[1])
train_label = np.load(open(DATA_PATH + LABEL_TRAIN_DATA,'rb'))
prepro_configs = json.load(open(DATA_PATH+DATA_CONFIGS,'r'))

In [45]:
# 파라미터 세팅
model_name = 'cnn_classifier_kr'
BATCH_SIZE = 512
NUM_EPOCHS = 10
VALID_SPLIT = 0.1
MAX_LEN = train_input.shape[1]

kargs = {'model_name': model_name,'vocab_size':prepro_configs['vocab_size'],
        'embbeding_size':128,'num_filters':100,'dropout_rate':0.5,'hidden_dimension':250,
        'output_dimension':1}

In [57]:
# 학습 진행

class CNNClassifier(tf.keras.Model):

  def __init__(self, **kargs):
    super(CNNClassifier, self).__init__(name=kargs['model_name'])
    self.embedding = layers.Embedding(input_dim=kargs['vocab_size'], output_dim=kargs['embbeding_size'])
    self.conv_list = [layers.Conv1D(filters=kargs['num_filters'], kernel_size=kernel_size, padding='valid',activation = tf.keras.activations.relu,
                                    kernel_constraint = tf.keras.constraints.MaxNorm(max_value=3)) for kernel_size in [3,4,5]]
    self.pooling = layers.GlobalMaxPooling1D()
    self.dropout = layers.Dropout(kargs['dropout_rate'])
    self.fc1 = layers.Dense(units=kargs['hidden_dimension'],
                            activation = tf.keras.activations.relu,
                            kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
    self.fc2 = layers.Dense(units=kargs['output_dimension'],
                            activation=tf.keras.activations.sigmoid,
                            kernel_constraint= tf.keras.constraints.MaxNorm(max_value=3.))
    

  def call(self,x):
    x = self.embedding(x)
    x = self.dropout(x)
    x = tf.concat([self.pooling(conv(x)) for conv in self.conv_list], axis = 1)
    x = self.fc1(x)
    x = self.fc2(x)
    return x


In [62]:
model = CNNClassifier(**kargs)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy')])

#검증 정확도를 통한 EarlyStopping 기능 및 모델 저장 방식 지정
earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)
checkpoint_path = DATA_OUT + model_name +'\weogjts.h5'
checkpoint_dir = os.path.dirname(checkpoint_path)

if os.path.exists(checkpoint_dir):
  print("{} -- Folder already exists \n".format(checkpoint_dir))
else:
  os.makedirs(checkpoint_dir, exist_ok=True)
  print("{} -- Folder create complete \n".format(checkpoint_dir))

cp_callback = ModelCheckpoint(
    checkpoint_path, monitor = 'val_accuracy', verbose=1, save_best_only = True,
    save_weights_only=True
)

history = model.fit(train_input, train_label, batch_size=BATCH_SIZE, epochs = NUM_EPOCHS,
                    validation_split=VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/DATA_OUT/cnn_classifier_kr -- Folder already exists 

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.82420, saving model to D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/DATA_OUT/cnn_classifier_kr\weogjts.h5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.82420 to 0.82667, saving model to D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/DATA_OUT/cnn_classifier_kr\weogjts.h5
Epoch 3/10
Epoch 00003: val_accuracy improved from 0.82667 to 0.82853, saving model to D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/DATA_OUT/cnn_classifier_kr\weogjts.h5
Epoch 4/10
Epoch 00004: val_accuracy did not improve from 0.82853
Epoch 5/10
Epoch 00005: val_accuracy did not improve from 0.82853


In [64]:
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'
SAVE_FILE_NM = 'weights.h5'

test_input = np.load(open(DATA_PATH + INPUT_TEST_DATA,'rb'))
test_input = pad_sequences(test_input,maxlen=test_input.shape[1])
test_label_data = np.load(open(DATA_PATH + LABEL_TEST_DATA,'rb'))

model.load_weights('D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/DATA_OUT/cnn_classifier_kr/weogjts.h5')
model.evaluate(test_input,test_label_data)



[0.393337607383728, 0.8272799849510193]

# 새로운 문장 예측

In [66]:
import numpy as np
import pandas as pd
import re
import json
from konlpy.tag import Okt
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer


In [79]:
okt = Okt()
tokenizer = Tokenizer()

DATA_CONFIGS = 'data_configs.json'
prepro_configs = json.load(open('D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/CLEAN_DATA/'+DATA_CONFIGS,'r'))
prepro_configs['vocab'] = word_vocab

tokenizer.fit_on_texts(word_vocab)

MAX_LENGTH = 8

sentence = input('감정분석할 문장을 입력해주세요 :')
sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\\s]','',sentence)
stopwords = ['은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한']
sentence = okt.morphs(sentence,stem=True) #토큰화
sentence = [word for word in sentence if not word in stopwords] #불용어 제거
vector = tokenizer.texts_to_sequences(sentence)
pad_new = pad_sequences(vector,maxlen = MAX_LENGTH) #패딩

model.load_weights('D:/Analysis/BigData-PythonAnalysis-main/nsmc-master/DATA_OUT/cnn_classifier_kr/weogjts.h5') #모델 불러오기
predictions = model.predict(pad_new)
predictions = float(predictions.squeeze(-1)[1])

if(predictions > 0.5):
    print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(predictions * 100))
else:
    print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1-predictions) * 100))

감정분석할 문장을 입력해주세요 :명연기가 빛이난 영화
85.28% 확률로 긍정 리뷰입니다.

