In [1]:
import pandas as pd
import numpy as np
import os

'''
참고 URL
- https://programmers.co.kr/learn/courses/21/lessons/1693
- http://suanlab.com/assets/lectures/dpp/10.pdf

'''

'''
1. Data preprocessing
2. word2vec
3. modeling
4. 평가

'''


'\n1. Data preprocessing\n2. word2vec\n3. modeling\n4. 평가\n\n'

In [2]:
os.listdir('data')


['labeledTrainData.tsv',
 'sampleSubmission.csv',
 'testData.tsv',
 'unlabeledTrainData.tsv']

In [3]:
# 데이터 읽기
# -- quoting : 특수 문자가 포함된 필드를 감쌀 때 처리하는 방법, 문자를 따옴표로 묶는 방법
import csv
df = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter='\t',quoting=3)
# QUOTE_MINIMAL (0), QUOTE_ALL (1), 
# QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
'''
-QUOTE_ALL(1) : Quote everything, regardless of type.(문자열처리, 모든데이터 묶음)
-QUOTE_MINIMAL(0) :Quote fields with special characters (특수 문자가 포함된 따옴표 필드)
(anything that would confuse a parser configured with the same dialect and options). This is the default
-QUOTE_NONNUMERIC(2) :Quote all fields that are not integers or floats.(숫자가 아닌 경우 묶음) 
  When used with the reader, input fields that are not quoted are converted to floats.
-QUOTE_NONE(3) : Do not quote anything on output. 데이터를 묶지 않음
 When used with the reader, quote characters are included in the field values (normally, they are treated as delimiters and stripped).
 reader와 사용하면 쌍따옴표는 필드값으로 포함된다.
'''

df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [4]:
# 데이터 전처리 -- HTML 태그, \ 특수 문자 제거
from bs4 import BeautifulSoup
import re

def preprocessing(x):
    #HTML 태그 제거
    x= BeautifulSoup(x,'html.parser').get_text()
    # 특수문자 제거 # 영문자,숫자를 제외한 문자를 모드 변환 띄어쓰기로
    x = re.sub("\W"," ",x)    
    return x

df['review']=df['review'].map(lambda x: preprocessing(x))


In [5]:
# 토크나이징 + Stopwords 제거
# 장점 : 노이즈를 줄일 수 있음, 단점 : 문장 구조 모델링시 정보 유실 발생

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def tokenizing(words):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(words.lower())
    words = [x for x in words if x not in stop_words]
    return words

df['words'] = df['review'].map(lambda x : tokenizing(x))
df.head()
    

Unnamed: 0,id,sentiment,review,words
0,"""5814_8""",1,With all this stuff going down at the moment ...,"[stuff, going, moment, mj, started, listening,..."
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...,"[classic, war, worlds, timothy, hines, enterta..."
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...,"[film, starts, manager, nicholas, bell, giving..."
3,"""3630_4""",0,It must be assumed that those who praised thi...,"[must, assumed, praised, film, greatest, filme..."
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,"[superbly, trashy, wondrously, unpretentious, ..."


In [6]:
'''

Tokenizing 방법 리뷰
1.  Split 함수
2.  NLTK 활용 
   - Tokenizing → Index로 벡터화 해야하는데 NLTK는 Tokenizing 까지만
3.  keras.preprocessing 활용
   - Keras는 Vector화 까지 가능, 

'''
# https://inuplace.tistory.com/536

from tensorflow.python.keras.preprocessing.text import Tokenizer

token = Tokenizer()
token.fit_on_texts(df['words']) # Fit
df['vector'] = token.texts_to_sequences(df['words']) # vector화 


In [7]:
df.head()

Unnamed: 0,id,sentiment,review,words,vector
0,"""5814_8""",1,With all this stuff going down at the moment ...,"[stuff, going, moment, mj, started, listening,...","[410, 71, 425, 8956, 511, 2484, 116, 54, 881, ..."
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...,"[classic, war, worlds, timothy, hines, enterta...","[236, 207, 3086, 3611, 7239, 321, 2, 411, 155,..."
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...,"[film, starts, manager, nicholas, bell, giving...","[2, 388, 2854, 4457, 3780, 604, 2210, 18035, 5..."
3,"""3630_4""",0,It must be assumed that those who praised thi...,"[must, assumed, praised, film, greatest, filme...","[101, 4896, 5399, 2, 688, 670, 1272, 42, 215, ..."
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,"[superbly, trashy, wondrously, unpretentious, ...","[3409, 4193, 37747, 11135, 859, 2062, 13202, 1..."


In [8]:
# 단어 사전 확인
vocab = token.word_index
vocab["<PAD>"] = 0
vocab
len(vocab)

75789

In [9]:
# 최대 문장길이 :: 3사분위에 해당하는 214개를 고정길이로 설정
print("최소 단어수", df['vector'].map(lambda x : len(x)).min())
print("1사분위", df['vector'].map(lambda x : len(x)).quantile(0.25))
print("2사분위",df['vector'].map(lambda x : len(x)).quantile(0.50))
print("3사분위",df['vector'].map(lambda x : len(x)).quantile(0.75))
print("최대 단어수",df['vector'].map(lambda x : len(x)).max())



최소 단어수 4
1사분위 64.0
2사분위 90.0
3사분위 148.0
최대 단어수 1429


In [10]:
# Padding: 가변 길이 → 고정 길이

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

max_padding = 214

X_train = pad_sequences(df['vector'],maxlen = max_padding, padding = 'post' )
Y_train = df['sentiment']
print(X_train)

[[  116    54   881 ... 18947   320  1372]
 [  236   207  3086 ...     0     0     0]
 [ 4657 32515  3589 ...   707  1187  5398]
 ...
 [  118  3144    14 ...     0     0     0]
 [  831   644   521 ...     0     0     0]
 [  110     1   354 ...     0     0     0]]


In [11]:
# Vector 화 
'''
1. BOW를 이용해서 Vector Sequence로 변환(위에 완료)
2. TF-IDF (완료)
3. Countvectorizer -- TF를 의미하는 것
4. Word2vec(완료)

'''

'\n1. BOW를 이용해서 Vector Sequence로 변환(위에 완료)\n2. TF-IDF (완료)\n3. Countvectorizer -- TF를 의미하는 것\n4. Word2vec(완료)\n\n'

In [12]:
# TF-IDF
# 입력이 텍스트여야함.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=0.0, analyzer = "word", sublinear_tf=True,
                           ngram_range=(1,3), max_features=1000,stop_words = 'english')

# min_df : 설정값보다 특정 토큰의 df(document Frequency)가 적으면 벡터화에서 제거
# analyzer : word/char 2가지 : word는 단위 : 단어 / char : 단위 : char 
# sublinear_tf : term frequency에 대한 smoothing 여부
# ngram_range = n-gram 의 범위 : 분석기에 의해 설정값을 사용하여 ngram자동 생성
# max_features = 벡터의 최대 길이, 

tfidf_train = vectorizer.fit_transform(list(df['review']))
tfidf_train
print(type(tfidf_train))
print(tfidf_train.shape)
print(tfidf_train[0]) 
# 5000개의 단어 각각에 대한 tf-idf Weight를 의미함
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
tfidf_train.toarray()

<class 'scipy.sparse.csr.csr_matrix'>
(25000, 1000)
  (0, 229)	0.0942788152451277
  (0, 407)	0.08803182726193616
  (0, 837)	0.08721044748871645
  (0, 280)	0.09496753792186204
  (0, 284)	0.06969585794589742
  (0, 212)	0.07906035864322487
  (0, 228)	0.05231788132798058
  (0, 839)	0.10465861308995464
  (0, 345)	0.09157841992227735
  (0, 47)	0.09787767445694262
  (0, 636)	0.11924450276453806
  (0, 858)	0.10762165447123535
  (0, 897)	0.08444378828088646
  (0, 355)	0.077168582057444
  (0, 899)	0.08261680650536574
  (0, 224)	0.06018549026941358
  (0, 52)	0.07430033101736666
  (0, 822)	0.1010110104655137
  (0, 873)	0.056218845322282135
  (0, 488)	0.09699208939260177
  (0, 495)	0.08318639349213663
  (0, 737)	0.06406989962574486
  (0, 186)	0.10911092450265579
  (0, 100)	0.1008889996138849
  (0, 487)	0.07837482258819749
  :	:
  (0, 702)	0.09631309046208494
  (0, 133)	0.08993827764399456
  (0, 704)	0.08568406507165809
  (0, 316)	0.059719045764548194
  (0, 303)	0.17239761070304283
  (0, 555)	0.0807

array([[0.        , 0.        , 0.10381025, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0796076 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.14477071, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [13]:
tfidf_train.toarray().shape

(25000, 1000)

In [14]:
df.shape

(25000, 5)

In [15]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer ## TF 

vectorizer = CountVectorizer(analyzer = "word", max_features = 5000,ngram_range=(1,3))

count_train = vectorizer.fit_transform(list(df['review']))
count_train.toarray()#.shape

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# Word2Vec
import logging
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s :  %(message)s', level=logging.INFO)


# Word2vec 입력은 단어로 표현된 리스트를 입력값으로 받음
# n-gram으로 만들어서 넣을 수도 있지만 여기에서는 단순히 split만해서 넣는 것으로 함

sentences = []
for review in list(df['review']) :
    sentences.append(review.split())

# print(sentences[0])

# 하이퍼파라미터
num_features = 1000 # word2vec 특징 수
min_word_count =20 
num_workers = 6
context =10 # Word2vec 수행을 위한 컨텍스트 윈도 크기
# https://medium.com/@omicro03/%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC-nlp-13%EC%9D%BC%EC%B0%A8-word2vec-3c82ec870426
downsampling = 1e-3 #Word2vec 빠른 학습을 위해 정답 단어 라벨에 대한 다운 샘플링, 보통 0.001이 좋은 성능
#Downsampling of frequent words # 자주 나오는 단어에 대해서는 0.001 만큼 다운 샘플링하여 시간을 아낌



In [17]:
print("Training")
# https://wikidocs.net/50739
model = word2vec.Word2Vec(sentences,
                         workers = num_workers,
                          size = num_features,
                          min_count = min_word_count,
                          window =  context,
                          sample = downsampling,
                          iter = 10,
                          sg =0 # sg =0 CBOW, 1 : skip-gram
                         )

2020-10-25 09:19:35,606 : INFO :  collecting all words and their counts
2020-10-25 09:19:35,607 : INFO :  PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training


2020-10-25 09:19:35,928 : INFO :  PROGRESS: at sentence #10000, processed 2398843 words, keeping 64085 word types
2020-10-25 09:19:36,247 : INFO :  PROGRESS: at sentence #20000, processed 4773958 words, keeping 86259 word types
2020-10-25 09:19:36,404 : INFO :  collected 94969 word types from a corpus of 5953723 raw words and 25000 sentences
2020-10-25 09:19:36,405 : INFO :  Loading a fresh vocabulary
2020-10-25 09:19:36,450 : INFO :  effective_min_count=20 retains 14661 unique words (15% of original 94969, drops 80308)
2020-10-25 09:19:36,451 : INFO :  effective_min_count=20 leaves 5672944 word corpus (95% of original 5953723, drops 280779)
2020-10-25 09:19:36,488 : INFO :  deleting the raw counts dictionary of 94969 items
2020-10-25 09:19:36,490 : INFO :  sample=0.001 downsamples 47 most-common words
2020-10-25 09:19:36,491 : INFO :  downsampling leaves estimated 4294841 word corpus (75.7% of prior 5672944)
2020-10-25 09:19:36,529 : INFO :  estimated required memory for 14661 words a

2020-10-25 09:20:19,223 : INFO :  EPOCH 5 - PROGRESS: at 99.54% examples, 530257 words/s, in_qsize 3, out_qsize 1
2020-10-25 09:20:19,224 : INFO :  worker thread finished; awaiting finish of 3 more threads
2020-10-25 09:20:19,227 : INFO :  worker thread finished; awaiting finish of 2 more threads
2020-10-25 09:20:19,235 : INFO :  worker thread finished; awaiting finish of 1 more threads
2020-10-25 09:20:19,237 : INFO :  worker thread finished; awaiting finish of 0 more threads
2020-10-25 09:20:19,238 : INFO :  EPOCH - 5 : training on 5953723 raw words (4295970 effective words) took 8.1s, 531614 effective words/s
2020-10-25 09:20:20,244 : INFO :  EPOCH 6 - PROGRESS: at 11.99% examples, 515306 words/s, in_qsize 12, out_qsize 0
2020-10-25 09:20:21,244 : INFO :  EPOCH 6 - PROGRESS: at 24.01% examples, 519395 words/s, in_qsize 11, out_qsize 0
2020-10-25 09:20:22,258 : INFO :  EPOCH 6 - PROGRESS: at 36.58% examples, 527287 words/s, in_qsize 11, out_qsize 0
2020-10-25 09:20:23,275 : INFO :  E

2020-10-25 09:20:58,665 : INFO :  training on a 59537230 raw words (42948748 effective words) took 79.6s, 539708 effective words/s


In [18]:
# man과 가장 유사한 단어 골라내기 
model.wv.most_similar("man")


2020-10-25 09:20:58,668 : INFO :  precomputing L2-norms of word weight vectors


[('woman', 0.6091312170028687),
 ('lady', 0.5978107452392578),
 ('soldier', 0.530846118927002),
 ('doctor', 0.5244083404541016),
 ('priest', 0.5158758163452148),
 ('boy', 0.5143296718597412),
 ('businessman', 0.5103845596313477),
 ('farmer', 0.5048384666442871),
 ('journalist', 0.49860942363739014),
 ('guy', 0.4938508868217468)]

In [19]:
# word2vec 은 단어 하나하나가 벡터로 표현되어 있다.
# Review 데이터는 단어들의 조합이기에 Review를 벡터로 표현하기 위해
# Review에 포함된 단어 벡터들의 평균값을 만든다.
# 다른 방법으로는 Doc2vec, average of word2vec vectors with TF-IDF
# Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector.
 # 단어 벡터에 TF-IDF를 곱해서 평균 내는 방법
    
# https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence

def get_features(words, model, num_features):
    feature_vector = np.zeros((num_features), dtype = np.float32)
    
    num_words = 0
    # 어휘 사전
    index2word_set = set(model.wv.index2word)
    
    for w in words:
        if w in index2word_set:
            num_words +=1
            #사전에 해당하는 단어에 대해 단어 벡터를 더함
            
            feature_vector = np.add(feature_vector, model[w])
            # model은 단어들에 대한 vector를 다 가지고 있음
            # num_features 만큼 이미 학습할때 정의해서 만들어놓음 
            
    feature_vector = np.divide(feature_vector,num_words)
    
    return feature_vector

def get_dataset(reviews, model, num_features):
    dataset = list()

    
    for s in reviews :
        dataset.append(get_features(s,model,num_features))
    
    reviewFeaturevecs = np.stack(dataset)
    
    return reviewFeaturevecs

word2vec_train = get_dataset(sentences,model,num_features)



In [20]:
model['sky'].shape

  """Entry point for launching an IPython kernel.


(1000,)

In [21]:
word2vec_train.shape

(25000, 1000)

### RandomForest Train

In [22]:
Y_train = df['sentiment']

In [23]:
# vectorizer.get_feature_names()
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Count_train

x_train,x_val,y_train,y_val = train_test_split(count_train.toarray(),Y_train,test_size =0.2, random_state =99)

RF = RandomForestClassifier(n_estimators=100)
RF.fit(x_train,y_train)

print("Accuracy : %f" % RF.score(x_val,y_val))


Accuracy : 0.840400


In [25]:
# tfidf_train

x_train,x_val,y_train,y_val = train_test_split(tfidf_train.toarray(),Y_train,test_size =0.2, random_state =99)

RF = RandomForestClassifier(n_estimators=100)
RF.fit(x_train,y_train)

print("Accuracy : %f" % RF.score(x_val,y_val))


Accuracy : 0.829000


In [26]:
# word2vec_train

x_train,x_val,y_train,y_val = train_test_split(word2vec_train,Y_train,test_size =0.2, random_state =99)

RF = RandomForestClassifier(n_estimators=100)
RF.fit(x_train,y_train)

print("Accuracy : %f" % RF.score(x_val,y_val))




Accuracy : 0.832400


In [27]:
word2vec_train.shape

(25000, 1000)

### RNN

In [28]:
import tensorflow as tf
import re
tf.random.set_seed(99)
BATCH_SIZE = 128
epochs = 10
valid = 0.2


In [29]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [30]:
train_data = pd.read_csv('data/labeledTrainData.tsv',header =0, delimiter = '\t', quoting =3)

def preprocessing(review, remove_stopwords = False):
    review_text = BeautifulSoup(review,'html.parser').get_text()
    
    # 특수문자 제거 # 영문자,숫자를 제외한 문자를 모드 변환 띄어쓰기로
    review_text = re.sub("\W"," ",review_text)    
    
    words = review_text.lower().split()
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        clean_review = ' '.join(words)
        
    else :
        clean_review = ' '.join(words)
    
    return clean_review

clean_train_reviews = []

for review in train_data['review']:
    clean_train_reviews.append(preprocessing(review, remove_stopwords=True))

clean_train_reviews[0]


'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts 20 minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate wor

In [31]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_reviews)
text_sequences = tokenizer.texts_to_sequences(clean_train_reviews)
print(text_sequences[0])

[410, 71, 425, 8957, 512, 2483, 116, 54, 882, 522, 179, 18946, 179, 11409, 167, 79, 14, 669, 2484, 118, 93, 10, 505, 4131, 167, 22, 213, 589, 2361, 1205, 11409, 72, 4896, 72, 641, 2, 257, 71, 11, 306, 1683, 492, 1157, 3309, 8957, 417, 800, 3387, 17, 447, 607, 1516, 15, 4528, 1875, 1010, 148, 348, 1455, 750, 2452, 4, 8957, 424, 71, 643, 70, 241, 95, 547, 8957, 26374, 26375, 121, 1, 8957, 327, 8, 47, 20, 327, 169, 10, 210, 638, 641, 2, 117, 295, 388, 733, 124, 15761, 3361, 1517, 582, 741, 10167, 934, 11746, 829, 1251, 1423, 366, 8957, 225, 15, 584, 8957, 22505, 2300, 13630, 741, 10167, 27, 28950, 346, 16, 41, 18947, 1516, 394, 11410, 167, 4018, 8957, 116, 633, 505, 80, 4, 8957, 1444, 386, 2193, 115, 1943, 2529, 582, 17, 60, 101, 4947, 5239, 264, 1280, 26376, 15, 582, 498, 751, 643, 637, 3, 400, 166, 452, 115, 622, 3310, 1172, 690, 48, 1190, 228, 1, 16, 4, 8957, 3, 513, 62, 25, 16, 646, 135, 235, 96, 7553, 607, 3486, 8957, 37737, 1888, 1, 130, 348, 1455, 251, 3, 874, 16, 42, 1502, 1009, 2

In [32]:
word_vocab = tokenizer.word_index
word_vocab["<PAD>"]=0 
# print(word_vocab)

In [33]:
data_configs = {}
data_configs['vocab']= word_vocab
data_configs['vocab_size'] = len(word_vocab)+1
data_configs['vocab_size']

75684

In [34]:
MAX_SEQUENCE_LENGTH = 174

train_inputs =  pad_sequences(text_sequences,maxlen = MAX_SEQUENCE_LENGTH, padding = 'post')
print(train_inputs.shape)
train_labels = np.array(train_data['sentiment'])
print(train_labels.shape)

(25000, 174)
(25000,)


In [35]:
np.array(text_sequences[2]).shape
# text_sequences[0].shape

(244,)

In [36]:
data_configs['vocab_size']

75684

In [37]:
train_inputs

array([[  800,  3387,    17, ..., 18949,   320,  1372],
       [  236,   206,  3085, ...,     0,     0,     0],
       [37741,  3511,  1410, ...,   708,  1190,  5398],
       ...,
       [  118,  3143,    14, ...,     0,     0,     0],
       [  832,   645,   522, ...,     0,     0,     0],
       [  110,     1,   354, ...,     0,     0,     0]])

In [38]:
tfidf_train.toarray().shape
data_configs['vocab_size']
train_inputs.shape

(25000, 174)

In [54]:
'''
Tensorflow 2.0 모델 구축 방법
1. Sequential API : tf.keras.Sequential, model.add()
2. Functional API : Input - layers 
3. Custom layer : layers. layer 상속 : 여러 레이어를 하나로 묶은 레이어 구현 용이
4. Subclassing : tf.keras.Model  : 자유도가 가장 높아서 자주 사용. 

'''

# tf.keras.Model을 학습받아 클래스로 구현 
# Input은 
# Word2 vec!!!!
'''

'''


class RNN(tf.keras.Model):
    def __init__(self):
        super(RNN,self).__init__() #  부모 클래스에 있는__init__ 함수 호출 
        self.embedding = tf.keras.layers.Embedding(input_dim= data_configs['vocab_size'], # 이전에 임베딩된 5000개 Feature 를 넣겠다
                                          output_dim= 300 # Dense 임베딩 결과shape                                         
                                         )
        self.lstm_1 = tf.keras.layers.LSTM(150, #  lstm_shape : units = dimensionality of the output shape 의미
                                          return_sequences = True)
        self.lstm_2 = tf.keras.layers.LSTM(lstm_shape)
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.fc1 = tf.keras.layers.Dense(units = 150, activation = tf.keras.activations.tanh)
        self.fc2 = tf.keras.layers.Dense(units = 1, activation = tf.keras.activations.sigmoid)
        
    def call(self,x):
        x = self.embedding(x)
#         print(x.shape)
        x = self.dropout(x)
        x = self.lstm_1(x)
        x = self.lstm_2(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x
        

In [55]:
batch_size = 100
# input_shape = (batch_size,tfidf_train.toarray().shape[1])
input_shape = (batch_size, train_inputs.shape[1])
emb_shape =500
lstm_shape = 150

model = RNN()
model.compile(optimizer = tf.keras.optimizers.Adam(1e-4),
             loss = tf.keras.losses.BinaryCrossentropy(),
              metrics= [tf.keras.metrics.BinaryAccuracy(name = "accuracy")]
             )

In [56]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# earlystopping 
earlystop = EarlyStopping(monitor = 'val_accuracy', min_delta=0.0001, patience=2)
model_name = 'rnn_classifier'
checkpoint_path = './data_out/{}/weight.h5'.format(model_name)
checkpoint_dir =  os.path.dirname(checkpoint_path)

if os.path.exists(checkpoint_dir):
    print("{} - exists".format(checkpoint_dir))
else :
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} - complete".format(checkpoint_dir))

cp_callback= ModelCheckpoint(
    checkpoint_path, monitor ='val_accuracy', verbose =1, save_best_only =True, save_weights_only = True
)

./data_out/rnn_classifier - exists


In [57]:
history = model.fit(train_inputs, train_labels, batch_size = batch_size, epochs =  epochs,
                    validation_split = 0.2 , callbacks = [earlystop, cp_callback])

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.50560, saving model to ./data_out/rnn_classifier/weight.h5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.50560 to 0.67900, saving model to ./data_out/rnn_classifier/weight.h5
Epoch 3/10
Epoch 00003: val_accuracy improved from 0.67900 to 0.84260, saving model to ./data_out/rnn_classifier/weight.h5
Epoch 4/10
Epoch 00004: val_accuracy improved from 0.84260 to 0.85760, saving model to ./data_out/rnn_classifier/weight.h5
Epoch 5/10
Epoch 00005: val_accuracy improved from 0.85760 to 0.87060, saving model to ./data_out/rnn_classifier/weight.h5
Epoch 6/10
Epoch 00006: val_accuracy improved from 0.87060 to 0.88320, saving model to ./data_out/rnn_classifier/weight.h5
Epoch 7/10
Epoch 00007: val_accuracy did not improve from 0.88320
Epoch 8/10
Epoch 00008: val_accuracy did not improve from 0.88320


In [43]:
from tensorflow.keras.utils import plot_graphs
plot_graphs(history,'loss')

ImportError: cannot import name 'plot_graphs' from 'tensorflow.keras.utils' (C:\Users\yseon\Anaconda3\envs\popcorn\lib\site-packages\tensorflow_core\python\keras\api\_v2\keras\utils\__init__.py)

In [49]:
model.summary()

Model: "rnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  22705200  
_________________________________________________________________
lstm (LSTM)                  multiple                  270600    
_________________________________________________________________
lstm_1 (LSTM)                multiple                  180600    
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  22650     
_________________________________________________________________
dense_1 (Dense)              multiple                  151       
Total params: 23,179,201
Trainable params: 23,179,201
Non-trainable params: 0
___________________________________________________

In [47]:
25000*174

4350000