In [1]:
import os # 운영체제 내 경로 지정 및 연산에 쓰기 위함
import re # regular expression: 주어진 규칙에 맞는 언어 연산 수행 위함
import pandas as pd # 데이터 처리 및 데이터 사이언스 위한 라이브러리
import tensorflow as tf # 데이터 다운받기 위함
from tensorflow.keras import utils # 데이터 다운받기 위함
from bs4 import BeautifulSoup # 글에 존재할지 모르는 html 태그 없애는 전처리 위함

In [2]:
# IMDB 데이터 다운로드
data_set = utils.get_file(
    fname = 'imdb.tar.gz', # 다운받은 파일의 이름 변경
    origin = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz', # 다운받아올 파일 위치한 경로
    extract = True # 압축 풀 건지
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
def directory_data(directory):
  data = {}
  data["review"] = []
  for file_path in os.listdir(directory):
    with open(os.path.join(directory, file_path), "r", encoding='utf-8') as file:
      data["review"].append(file.read())
  return pd.DataFrame.from_dict(data)

def data(directory):
  pos_df = directory_data(os.path.join(directory, "pos"))
  neg_df = directory_data(os.path.join(directory, "neg"))
  pos_df["sentiment"] = 1
  neg_df["sentiment"] = 0
  return pd.concat([pos_df, neg_df]) # default: axis = 0

In [4]:
train_df = data(os.path.join(os.path.dirname(data_set), "aclImdb", "train"))
test_df = data(os.path.join(os.path.dirname(data_set), "aclImdb", "test"))

imdb_pd = pd.concat([train_df, test_df])
imdb_pd.head()

Unnamed: 0,review,sentiment
0,"... for Paris is a moveable feast."" Ernest Hem...",1
1,"I love the movie, it was a very interesting fa...",1
2,Full Moon High (1981) 3 of 5 Dir: Larry Cohen ...,1
3,"Superb cast, more please!<br /><br />If you ca...",1
4,David Duchovny plays the lead role in this fil...,1


In [5]:
# corpus=말뭉치: NL 연구를 위해 특정한 목적을 가지고 언어의 표본을 추출한 집합
from nltk.corpus import stopwords # 불용어(stopwords) 사전 가져옴
import nltk

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
set_stopwords = set(stopwords.words('english')) # 영어로된 불용어 가져와 집합으로 구성

In [20]:
# 리뷰글 받아서 쓸데없는 부분 없애는 함수
def preprocessing(review, remove_stopwords = True):
    review_text = BeautifulSoup(review, 'html5lib').get_text(separator=' ') # html 태그 제거
    review_text = re.sub('[^a-zA-Z]', ' ', review_text) # a-z, A-Z 제외하고(^) 전부 다 ' '(공백)으로 대체(substitute)

    # 불용어 제거
    if remove_stopwords:
        words = review_text.split() # 단어들로 쪼개 words 리스트에 넣어줌
        words = [w for w in words if not w in set_stopwords]

        review_text = ' '.join(words) # join: 리스트의 원소들 전부 다 붙여줌 (' '을 사이에 두고)

    review_text = review_text.lower() # 대소문자 비교는 감정판단과 관련 없으므로 대문자를 모두 소문자로 변경

    return review_text

In [22]:
list_reviews = list(imdb_pd['review'])
print(list_reviews[0])
print(preprocessing(list_reviews[0], False))
print(preprocessing(list_reviews[0]))

... for Paris is a moveable feast." Ernest Hemingway<br /><br />It is impossible to count how many great talents have immortalized Paris in paintings, novels, songs, poems, short but unforgettable quotes, and yes - movies. The celebrated film director Max Ophüls said about Paris, <br /><br />"It offered the shining wet boulevards under the street lights, breakfast in Montmartre with cognac in your glass, coffee and lukewarm brioche, gigolos and prostitutes at night. Everyone in the world has two fatherlands: his own and Paris." <br /><br />Paris is always associated with love and romance, and "Paris, Je T'Aime" which is subtitled "Petite romances," is a collection of short films, often sketches from 18 talented directors from all over the world. In each, we become familiar with one of the City of Light 20 arrondissements and with the Parisians of all ages, genders, colors, and backgrounds who all deal in love in its many variations and stages. In some of the "petite romances" we are th

In [23]:
list_preprocessing_reviews = [] # 전처리한 모든 리뷰글들 저장

for review in list_reviews:
    list_preprocessing_reviews.append(preprocessing(review))

# 리뷰들의 전처리가 완성되어 리스트로 만들었으니, 다시 sentiment와 결합해 데이터프레임으로 만듦
list_preprocessing_df = pd.DataFrame({'review': list_preprocessing_reviews, 'sentiment': imdb_pd['sentiment']})

list_preprocessing_df

  review_text = BeautifulSoup(review, 'html5lib').get_text(separator=' ') # html 태그 제거


Unnamed: 0,review,sentiment
0,paris moveable feast ernest hemingway it impos...,1
1,i love movie interesting fantasy movie b c rea...,1
2,full moon high dir larry cohen stars adam arki...,1
3,superb cast please if catch anything else writ...,1
4,david duchovny plays lead role film now lot pe...,1
...,...,...
12495,i looking tv guide last night saw movie starri...,0
12496,well i know expect matter fact i never even he...,0
12497,terrible below par not bad good brilliant warn...,0
12498,maybe former hippie fully appreciate aside dat...,0


In [24]:
# 각각 갱신
list_reviews = list(list_preprocessing_df['review'])
list_sentiments = list(list_preprocessing_df['sentiment'])

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer # 텍스트에서 단어 뽑아내 벡터 만들기 위함
import numpy as np
from sklearn.model_selection import train_test_split

In [26]:
vector = TfidfVectorizer(max_features=5000) # 문장을 크기 5000의 벡터로 바꾸는 객체 생성
X = vector.fit_transform(list_reviews).toarray() # 벡터 생성
y = np.array(list_sentiments)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [27]:
print(X.shape)
print(y.shape) # (50000, ) 1은 생략되어 나오지 않는 것
print(len(X_train), len(X_test))

(50000, 5000)
(50000,)
40000 10000


In [28]:
from keras.models import Sequential
from keras.layers import Dense

In [29]:
model = Sequential()
model.add(Dense(1, activation = 'sigmoid')) # model
model.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])

In [30]:
model.fit(X_train, y_train, epochs = 50, verbose = 1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x78ab45d72200>

In [31]:
_, accuracy = model.evaluate(X_test, y_test) # loss와 accuracy값이 나옴
print('Accuracy: %6.2f%%' %(accuracy*100))
model.summary()

Accuracy:  84.22%
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 1)                 5001      
                                                                 
Total params: 5001 (19.54 KB)
Trainable params: 5001 (19.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# 실습 과제 ↓

In [69]:
# input() 함수로 문장을 입력받고, 그 문장의 감정의 긍부를 맞춰보자.

query = input('문장 입력: ')
preprocessed_sentence = preprocessing(query)

문장 입력: The worst moive ever


In [70]:
preprocessed_sentence

'the worst moive ever'

In [71]:
x = vector.transform([preprocessed_sentence]).toarray()

In [72]:
y_pred = model.predict(x)
if y_pred >= 0.5:
    print('긍정 리뷰일 확률: %5.2f%%' %(y_pred.item()*100))
else:
    print('부정 리뷰일 확률: %5.2f%%' %((1-y_pred.item())*100))

부정 리뷰일 확률: 80.09%
