In [3]:
import pandas as pd
import numpy as np
import os

'''
참고 URL
- https://programmers.co.kr/learn/courses/21/lessons/1693
- http://suanlab.com/assets/lectures/dpp/10.pdf

'''

'''
1. Data preprocessing
2. word2vec
3. modeling
4. 평가

'''


'\n1. Data preprocessing\n2. word2vec\n3. modeling\n4. 평가\n\n'

In [4]:
os.listdir('data')


['labeledTrainData.tsv',
 'sampleSubmission.csv',
 'testData.tsv',
 'unlabeledTrainData.tsv']

In [5]:
# 데이터 읽기
# -- quoting : 특수 문자가 포함된 필드를 감쌀 때 처리하는 방법, 문자를 따옴표로 묶는 방법
import csv
df = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter='\t',quoting=3)
# QUOTE_MINIMAL (0), QUOTE_ALL (1), 
# QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
'''
-QUOTE_ALL(1) : Quote everything, regardless of type.(문자열처리, 모든데이터 묶음)
-QUOTE_MINIMAL(0) :Quote fields with special characters (특수 문자가 포함된 따옴표 필드)
(anything that would confuse a parser configured with the same dialect and options). This is the default
-QUOTE_NONNUMERIC(2) :Quote all fields that are not integers or floats.(숫자가 아닌 경우 묶음) 
  When used with the reader, input fields that are not quoted are converted to floats.
-QUOTE_NONE(3) : Do not quote anything on output. 데이터를 묶지 않음
 When used with the reader, quote characters are included in the field values (normally, they are treated as delimiters and stripped).
 reader와 사용하면 쌍따옴표는 필드값으로 포함된다.
'''

df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [6]:
# 데이터 전처리 -- HTML 태그, \ 특수 문자 제거
from bs4 import BeautifulSoup
import re

def preprocessing(x):
    #HTML 태그 제거
    x= BeautifulSoup(x,'html.parser').get_text()
    # 특수문자 제거 # 영문자,숫자를 제외한 문자를 모드 변환 띄어쓰기로
    x = re.sub("\W"," ",x)    
    return x

df['review']=df['review'].map(lambda x: preprocessing(x))


In [7]:
# 토크나이징 + Stopwords 제거
# 장점 : 노이즈를 줄일 수 있음, 단점 : 문장 구조 모델링시 정보 유실 발생

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def tokenizing(words):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(words.lower())
    words = [x for x in words if x not in stop_words]
    return words

df['words'] = df['review'].map(lambda x : tokenizing(x))
df.head()
    

Unnamed: 0,id,sentiment,review,words
0,"""5814_8""",1,With all this stuff going down at the moment ...,"[stuff, going, moment, mj, started, listening,..."
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...,"[classic, war, worlds, timothy, hines, enterta..."
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...,"[film, starts, manager, nicholas, bell, giving..."
3,"""3630_4""",0,It must be assumed that those who praised thi...,"[must, assumed, praised, film, greatest, filme..."
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,"[superbly, trashy, wondrously, unpretentious, ..."


In [8]:
'''

Tokenizing 방법 리뷰
1.  Split 함수
2.  NLTK 활용 
   - Tokenizing → Index로 벡터화 해야하는데 NLTK는 Tokenizing 까지만
3.  keras.preprocessing 활용
   - Keras는 Vector화 까지 가능, 

'''
# https://inuplace.tistory.com/536

from tensorflow.python.keras.preprocessing.text import Tokenizer

token = Tokenizer()
token.fit_on_texts(df['words']) # Fit
df['vector'] = token.texts_to_sequences(df['words']) # vector화 


In [9]:
df.head()

Unnamed: 0,id,sentiment,review,words,vector
0,"""5814_8""",1,With all this stuff going down at the moment ...,"[stuff, going, moment, mj, started, listening,...","[410, 71, 425, 8956, 511, 2484, 116, 54, 881, ..."
1,"""2381_9""",1,The Classic War of the Worlds by Timothy ...,"[classic, war, worlds, timothy, hines, enterta...","[236, 207, 3086, 3611, 7239, 321, 2, 411, 155,..."
2,"""7759_3""",0,The film starts with a manager Nicholas Bell...,"[film, starts, manager, nicholas, bell, giving...","[2, 388, 2854, 4457, 3780, 604, 2210, 18035, 5..."
3,"""3630_4""",0,It must be assumed that those who praised thi...,"[must, assumed, praised, film, greatest, filme...","[101, 4896, 5399, 2, 688, 670, 1272, 42, 215, ..."
4,"""9495_8""",1,Superbly trashy and wondrously unpretentious ...,"[superbly, trashy, wondrously, unpretentious, ...","[3409, 4193, 37747, 11135, 859, 2062, 13202, 1..."


In [10]:
# 단어 사전 확인
vocab = token.word_index
vocab["<PAD>"] = 0
vocab
len(vocab)

75789

In [11]:
# 최대 문장길이 :: 3사분위에 해당하는 214개를 고정길이로 설정
print("최소 단어수", df['vector'].map(lambda x : len(x)).min())
print("1사분위", df['vector'].map(lambda x : len(x)).quantile(0.25))
print("2사분위",df['vector'].map(lambda x : len(x)).quantile(0.50))
print("3사분위",df['vector'].map(lambda x : len(x)).quantile(0.75))
print("최대 단어수",df['vector'].map(lambda x : len(x)).max())



최소 단어수 4
1사분위 64.0
2사분위 90.0
3사분위 148.0
최대 단어수 1429


In [12]:
# Padding: 가변 길이 → 고정 길이

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

max_padding = 214

X_train = pad_sequences(df['vector'],maxlen = max_padding, padding = 'post' )
Y_train = df['sentiment']
print(X_train)

[[  116    54   881 ... 18947   320  1372]
 [  236   207  3086 ...     0     0     0]
 [ 4657 32515  3589 ...   707  1187  5398]
 ...
 [  118  3144    14 ...     0     0     0]
 [  831   644   521 ...     0     0     0]
 [  110     1   354 ...     0     0     0]]


In [13]:
# Vector 화 
'''
1. BOW를 이용해서 Vector Sequence로 변환(위에 완료)
2. TF-IDF (완료)
3. Countvectorizer -- TF를 의미하는 것
4. Word2vec(완료)

'''

'\n1. BOW를 이용해서 Vector Sequence로 변환(위에 완료)\n2. TF-IDF (완료)\n3. Countvectorizer -- TF를 의미하는 것\n4. Word2vec(완료)\n\n'

In [14]:
# TF-IDF
# 입력이 텍스트여야함.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=0.0, analyzer = "word", sublinear_tf=True,
                           ngram_range=(1,3), max_features=5000,stop_words = 'english')

# min_df : 설정값보다 특정 토큰의 df(document Frequency)가 적으면 벡터화에서 제거
# analyzer : word/char 2가지 : word는 단위 : 단어 / char : 단위 : char 
# sublinear_tf : term frequency에 대한 smoothing 여부
# ngram_range = n-gram 의 범위 : 분석기에 의해 설정값을 사용하여 ngram자동 생성
# max_features = 벡터의 최대 길이, 

tfidf_train = vectorizer.fit_transform(list(df['review']))
tfidf_train
print(type(tfidf_train))
print(tfidf_train.shape)
print(tfidf_train[0]) 
# 5000개의 단어 각각에 대한 tf-idf Weight를 의미함
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
tfidf_train.toarray()

<class 'scipy.sparse.csr.csr_matrix'>
(25000, 5000)
  (0, 2502)	0.1028129880638737
  (0, 1285)	0.06783600784085628
  (0, 4212)	0.09798879691949967
  (0, 4484)	0.10513903894444403
  (0, 3287)	0.09207262080751873
  (0, 2998)	0.09979596337739731
  (0, 4479)	0.09655916967157495
  (0, 3286)	0.1049730103164135
  (0, 37)	0.09626873961802275
  (0, 2776)	0.09875448073816731
  (0, 1638)	0.17021192744889807
  (0, 2448)	0.10116830229528975
  (0, 2825)	0.171073173099871
  (0, 2174)	0.06334114094305214
  (0, 4298)	0.06275013728447221
  (0, 1559)	0.0683315613411591
  (0, 1571)	0.050147944199345866
  (0, 1305)	0.1025268801175609
  (0, 815)	0.10635206492339642
  (0, 1204)	0.056885940864640605
  (0, 1280)	0.03764404759175515
  (0, 4304)	0.07530453665252453
  (0, 1875)	0.06589300465590953
  (0, 312)	0.07042547866816848
  (0, 3348)	0.08579945561982709
  :	:
  (0, 2299)	0.08305401895465898
  (0, 2022)	0.160515236053926
  (0, 2890)	0.058087806097562915
  (0, 2758)	0.038533095865286734
  (0, 1399)	0.10653310

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.08774747, 0.04602346, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.08424142, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [15]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer ## TF 

vectorizer = CountVectorizer(analyzer = "word", max_features = 5000)

count_train = vectorizer.fit_transform(list(df['review']))

In [16]:
# Word2Vec
import logging
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s :  %(message)s', level=logging.INFO)


# Word2vec 입력은 단어로 표현된 리스트를 입력값으로 받음
# n-gram으로 만들어서 넣을 수도 있지만 여기에서는 단순히 split만해서 넣는 것으로 함

sentences = []
for review in list(df['review']) :
    sentences.append(review.split())

# print(sentences[0])

# 하이퍼파라미터
num_features = 1000 # word2vec 특징 수
min_word_count =20 
num_workers = 6
context =10 # Word2vec 수행을 위한 컨텍스트 윈도 크기
# https://medium.com/@omicro03/%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC-nlp-13%EC%9D%BC%EC%B0%A8-word2vec-3c82ec870426
downsampling = 1e-3 #Word2vec 빠른 학습을 위해 정답 단어 라벨에 대한 다운 샘플링, 보통 0.001이 좋은 성능
#Downsampling of frequent words # 자주 나오는 단어에 대해서는 0.001 만큼 다운 샘플링하여 시간을 아낌



In [17]:
print("Training")
# https://wikidocs.net/50739
model = word2vec.Word2Vec(sentences,
                         workers = num_workers,
                          size = num_features,
                          min_count = min_word_count,
                          window =  context,
                          sample = downsampling,
                          iter = 10,
                          sg =0 # sg =0 CBOW, 1 : skip-gram
                         )

2020-10-22 06:27:48,330 : INFO :  collecting all words and their counts
2020-10-22 06:27:48,330 : INFO :  PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training


2020-10-22 06:27:48,654 : INFO :  PROGRESS: at sentence #10000, processed 2398843 words, keeping 64085 word types
2020-10-22 06:27:48,997 : INFO :  PROGRESS: at sentence #20000, processed 4773958 words, keeping 86259 word types
2020-10-22 06:27:49,169 : INFO :  collected 94969 word types from a corpus of 5953723 raw words and 25000 sentences
2020-10-22 06:27:49,170 : INFO :  Loading a fresh vocabulary
2020-10-22 06:27:49,223 : INFO :  effective_min_count=20 retains 14661 unique words (15% of original 94969, drops 80308)
2020-10-22 06:27:49,224 : INFO :  effective_min_count=20 leaves 5672944 word corpus (95% of original 5953723, drops 280779)
2020-10-22 06:27:49,262 : INFO :  deleting the raw counts dictionary of 94969 items
2020-10-22 06:27:49,264 : INFO :  sample=0.001 downsamples 47 most-common words
2020-10-22 06:27:49,264 : INFO :  downsampling leaves estimated 4294841 word corpus (75.7% of prior 5672944)
2020-10-22 06:27:49,293 : INFO :  estimated required memory for 14661 words a

2020-10-22 06:28:32,022 : INFO :  EPOCH 5 - PROGRESS: at 70.24% examples, 501182 words/s, in_qsize 11, out_qsize 0
2020-10-22 06:28:33,028 : INFO :  EPOCH 5 - PROGRESS: at 82.11% examples, 501006 words/s, in_qsize 11, out_qsize 0
2020-10-22 06:28:34,031 : INFO :  EPOCH 5 - PROGRESS: at 94.15% examples, 502060 words/s, in_qsize 11, out_qsize 0
2020-10-22 06:28:34,437 : INFO :  worker thread finished; awaiting finish of 5 more threads
2020-10-22 06:28:34,458 : INFO :  worker thread finished; awaiting finish of 4 more threads
2020-10-22 06:28:34,472 : INFO :  worker thread finished; awaiting finish of 3 more threads
2020-10-22 06:28:34,474 : INFO :  worker thread finished; awaiting finish of 2 more threads
2020-10-22 06:28:34,478 : INFO :  worker thread finished; awaiting finish of 1 more threads
2020-10-22 06:28:34,479 : INFO :  worker thread finished; awaiting finish of 0 more threads
2020-10-22 06:28:34,480 : INFO :  EPOCH - 5 : training on 5953723 raw words (4295561 effective words) t

2020-10-22 06:29:16,711 : INFO :  EPOCH 10 - PROGRESS: at 94.30% examples, 500734 words/s, in_qsize 11, out_qsize 0
2020-10-22 06:29:17,127 : INFO :  worker thread finished; awaiting finish of 5 more threads
2020-10-22 06:29:17,128 : INFO :  worker thread finished; awaiting finish of 4 more threads
2020-10-22 06:29:17,138 : INFO :  worker thread finished; awaiting finish of 3 more threads
2020-10-22 06:29:17,139 : INFO :  worker thread finished; awaiting finish of 2 more threads
2020-10-22 06:29:17,143 : INFO :  worker thread finished; awaiting finish of 1 more threads
2020-10-22 06:29:17,146 : INFO :  worker thread finished; awaiting finish of 0 more threads
2020-10-22 06:29:17,147 : INFO :  EPOCH - 10 : training on 5953723 raw words (4294792 effective words) took 8.5s, 503079 effective words/s
2020-10-22 06:29:17,147 : INFO :  training on a 59537230 raw words (42947273 effective words) took 85.3s, 503609 effective words/s


In [18]:
# man과 가장 유사한 단어 골라내기 
model.wv.most_similar("man")


2020-10-22 06:29:17,155 : INFO :  precomputing L2-norms of word weight vectors


[('woman', 0.6223074197769165),
 ('lady', 0.5865253210067749),
 ('soldier', 0.5489656925201416),
 ('doctor', 0.5334686040878296),
 ('guy', 0.5075160264968872),
 ('businessman', 0.4959918260574341),
 ('priest', 0.4923453629016876),
 ('boy', 0.4903828501701355),
 ('farmer', 0.4762645959854126),
 ('scientist', 0.47407081723213196)]

In [19]:
# word2vec 은 단어 하나하나가 벡터로 표현되어 있다.
# Review 데이터는 단어들의 조합이기에 Review를 벡터로 표현하기 위해
# Review에 포함된 단어 벡터들의 평균값을 만든다.
# 다른 방법으로는 Doc2vec, average of word2vec vectors with TF-IDF
# Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector.
 # 단어 벡터에 TF-IDF를 곱해서 평균 내는 방법
    
# https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence

def get_features(words, model, num_features):
    feature_vector = np.zeros((num_features), dtype = np.float32)
    
    num_words = 0
    # 어휘 사전
    index2word_set = set(model.wv.index2word)
    
    for w in words:
        if w in index2word_set:
            num_words +=1
            #사전에 해당하는 단어에 대해 단어 벡터를 더함
            
            feature_vector = np.add(feature_vector, model[w])
            # model은 단어들에 대한 vector를 다 가지고 있음
            # num_features 만큼 이미 학습할때 정의해서 만들어놓음 
            
    feature_vector = np.divide(feature_vector,num_words)
    
    return feature_vector

def get_dataset(reviews, model, num_features):
    dataset = list()

    
    for s in reviews :
        dataset.append(get_features(s,model,num_features))
    
    reviewFeaturevecs = np.stack(dataset)
    
    return reviewFeaturevecs

word2vec_train = get_dataset(sentences,model,num_features)



In [20]:
model['sky'].shape

  """Entry point for launching an IPython kernel.


(1000,)

In [21]:
word2vec_train[0]

array([ 3.84667248e-01, -1.08994767e-02, -1.13588035e-01,  1.17495187e-01,
        2.56480053e-02,  8.04757476e-02, -1.05506063e-01,  1.20215714e-01,
        1.46283405e-02,  1.04477229e-02,  4.12839418e-03, -1.64638460e-02,
       -6.46725297e-02,  8.36449955e-03,  1.86226100e-01, -9.32694823e-02,
       -1.03054129e-01,  2.75235176e-02,  1.43799812e-01, -1.12206146e-01,
        1.62661135e-01,  9.93568376e-02, -2.88734213e-02,  5.48825972e-02,
        1.35131255e-01,  1.42677546e-01, -1.08255811e-01,  6.69090673e-02,
       -2.67270803e-01,  1.00531559e-02,  1.50428815e-02, -8.02390724e-02,
        6.29948005e-02,  1.52281508e-01, -1.95384264e-01,  3.57935578e-01,
        5.85520752e-02, -1.68395210e-02,  1.91368505e-01, -1.07441626e-01,
       -7.34639242e-02, -7.73130506e-02,  1.45138931e-02,  4.77659442e-02,
        1.24424910e-02,  6.51779473e-02,  2.80315448e-02, -1.30822554e-01,
       -6.50603920e-02, -7.20745772e-02, -1.88913777e-01, -2.81763941e-01,
       -3.56749929e-02,  

### RandomForest Train

In [272]:
Y_train = df['sentiment']

In [277]:
# vectorizer.get_feature_names()
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [279]:
# Count_train

x_train,x_val,y_train,y_val = train_test_split(count_train.toarray(),Y_train,test_size =0.2, random_state =99)

RF = RandomForestClassifier(n_estimators=100)
RF.fit(x_train,y_train)

print("Accuracy : %f" % RF.score(x_val,y_val))


Accuracy : 0.837200


In [280]:
# tfidf_train

x_train,x_val,y_train,y_val = train_test_split(tfidf_train.toarray(),Y_train,test_size =0.2, random_state =99)

RF = RandomForestClassifier(n_estimators=100)
RF.fit(x_train,y_train)

print("Accuracy : %f" % RF.score(x_val,y_val))


Accuracy : 0.844600


In [289]:
# word2vec_train

x_train,x_val,y_train,y_val = train_test_split(word2vec_train,Y_train,test_size =0.2, random_state =99)

RF = RandomForestClassifier(n_estimators=100)
RF.fit(x_train,y_train)

print("Accuracy : %f" % RF.score(x_val,y_val))




Accuracy : 0.816600


In [290]:
word2vec_train.shape

(25000, 1000)

### RNN

In [28]:
import tensorflow as tf
tf.random.set_seed(99)
BATCH_SIZE = 128
epochs = 10
valid = 0.2


In [22]:
word2vec_train

array([[ 0.38466725, -0.01089948, -0.11358804, ..., -0.03559477,
         0.08085393, -0.02221583],
       [ 0.30409044, -0.02905423, -0.01033402, ..., -0.00157314,
         0.15747353,  0.00254611],
       [ 0.04273553, -0.05674532,  0.13614783, ..., -0.00226927,
         0.12489501, -0.05610893],
       ...,
       [ 0.4288284 ,  0.12031446, -0.04713427, ...,  0.05309478,
         0.18899845,  0.10103627],
       [ 0.28443012, -0.04825247, -0.07205519, ..., -0.00527934,
         0.09853455,  0.0077378 ],
       [ 0.5616765 ,  0.00865968, -0.08046958, ..., -0.0504946 ,
         0.05616886,  0.09665068]], dtype=float32)

In [26]:
tfidf_train.toarray().shape[1]

5000

In [53]:
'''
Tensorflow 2.0 모델 구축 방법
1. Sequential API : tf.keras.Sequential, model.add()
2. Functional API : Input - layers 
3. Custom layer : layers. layer 상속 : 여러 레이어를 하나로 묶은 레이어 구현 용이
4. Subclassing : tf.keras.Model  : 자유도가 가장 높아서 자주 사용. 

'''

# tf.keras.Model을 학습받아 클래스로 구현 
# Input은 

class RNN(tf.keras.Model):
    def __init__(self, input_shape, emb_shape,lstm_shape  ):
        super(RNN,self).__init__() #  부모 클래스에 있는__init__ 함수 호출 
        self.embedding = tf.keras.layers.Embedding(input_dim= input_shape, # 이전에 임베딩된 5000개 Feature 를 넣겠다
                                          output_dim= emb_shape # Dense 임베딩 결과shape                                         
                                         )
        self.lstm_1 = tf.keras.layers.LSTM(lstm_shape, #  lstm_shape : units = dimensionality of the output shape 의미
                                          return_sequences = True)
        self.lstm_2 = tf.keras.layers.LSTM(lstm_shape)
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.fc1 = tf.keras.layers.Dense(units = 150, activation = tf.keras.activations.tanh)
        self.fc2 = tf.keras.layers.Dense(units = 1, activation = tf.keras.activations.sigmoid)
        
    def call(self,x):
        x = self.embedding(x)
        x = self.dropout(x)
        x = self.lstm_1(x)
        x = self.lstm_2(x)
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x
        

In [54]:
input_shape = tfidf_train.toarray().shape[1]
emb_shape =500
lstm_shape = 150

model = RNN(input_shape, emb_shape, lstm_shape)
model.compile(optimizer = tf.keras.optimizers.Adam(1e-4),
             loss = tf.keras.losses.BinaryCrossentropy(),
              metrics= [tf.keras.metrics.BinaryAccuracy(name = "accuracy")]
             )

In [55]:
model.fit(tfidf_train.toarray(),df['sentiment'].values)

Train on 25000 samples
   64/25000 [..............................] - ETA: 37:31 - loss: 0.6919 - accuracy: 0.5625  

InternalError:  [_Derived_]  Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 2, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 500, 150, 1, 5000, 32, 150] 
	 [[{{node gradients/CudnnRNN_grad/CudnnRNNBackprop}}]]
	 [[StatefulPartitionedCall_1]]
	 [[Reshape_14/_38]] [Op:__inference_distributed_function_8024]

Function call stack:
distributed_function -> distributed_function -> distributed_function
