In [6]:
import pandas as pd
import numpy as np
import os

'''
참고 URL
- https://programmers.co.kr/learn/courses/21/lessons/1693
- http://suanlab.com/assets/lectures/dpp/10.pdf

'''

'''
1. Data preprocessing
2. word2vec
3. modeling
4. 평가

'''


In [8]:
# List the files available in the local `data` directory.
os.listdir('data')


['labeledTrainData.tsv',
 'sampleSubmission.csv',
 'testData.tsv',
 'unlabeledTrainData.tsv']

In [63]:
# Load the labeled training reviews (tab-separated, header on the first row).
import csv

# quoting=csv.QUOTE_NONE (== 3): quote characters are kept as part of the field
# values instead of being treated as field delimiters and stripped.
# csv quoting modes, for reference:
#   csv.QUOTE_MINIMAL (0)    - quote only fields containing special characters,
#                              i.e. anything that would confuse a parser configured
#                              with the same dialect and options (the csv default)
#   csv.QUOTE_ALL (1)        - quote every field, regardless of type
#   csv.QUOTE_NONNUMERIC (2) - quote all fields that are not integers or floats;
#                              a reader converts unquoted fields to floats
#   csv.QUOTE_NONE (3)       - never quote on output; a reader keeps the quote
#                              characters inside the field values
df = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter='\t', quoting=csv.QUOTE_NONE)

df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [100]:
# 데이터 전처리 -- HTML 태그, \ 특수 문자 제거
from bs4 import BeautifulSoup
import re

def preprocessing(x):
    """Strip HTML markup and non-word characters from one raw review.

    Args:
        x: raw review text, possibly containing HTML tags.

    Returns:
        The text with tags removed and every non-word character replaced by a
        space (a run of such characters becomes a run of spaces).
    """
    # Remove HTML tags, keeping only the text content.
    x = BeautifulSoup(x, 'html.parser').get_text()
    # Raw string: "\W" in a plain literal is an invalid escape sequence that
    # newer Python versions warn about. \W matches any character that is not
    # a word character (letter, digit or underscore).
    x = re.sub(r"\W", " ", x)
    return x

# Clean every review in place; the function is passed directly, no lambda needed.
df['review'] = df['review'].map(preprocessing)


0         With all this stuff going down at the moment ...
1           The Classic War of the Worlds   by Timothy ...
2         The film starts with a manager  Nicholas Bell...
3         It must be assumed that those who praised thi...
4         Superbly trashy and wondrously unpretentious ...
                               ...                        
24995     It seems like more consideration has gone int...
24996     I don t believe they made this film  Complete...
24997     Guy is a loser  Can t get girls  needs to bui...
24998     This 30 minute documentary Buñuel made in the...
24999     I saw this movie as a child and it broke my h...
Name: review, Length: 25000, dtype: object

In [147]:
# 토크나이징 + Stopwords 제거
# 장점 : 노이즈를 줄일 수 있음, 단점 : 문장 구조 모델링시 정보 유실 발생

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def tokenizing(words):
    """Lower-case and tokenize one review, dropping English stop words.

    Args:
        words: review text as a single string.

    Returns:
        List of lower-cased tokens that are not NLTK English stop words.
    """
    # The stop-word set is identical for every review, so it is loaded once
    # and reused instead of being rebuilt on each call.
    stop_words = _english_stop_words()
    return [tok for tok in word_tokenize(words.lower()) if tok not in stop_words]

# Tokenize each cleaned review into a list of non-stop-word tokens.
df['words'] = df['review'].map(tokenizing)
df.head()
    

Unnamed: 0,id,sentiment,review,review2,words,vector
0,"""5814_8""",1,"""With all this stuff going down at the moment ...","""With all this stuff going down at the moment ...","[``, stuff, going, moment, mj, 've, started, l...","[16, 432, 94, 448, 8944, 70, 523, 2440, 142, 1..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...","""\""The Classic War of the Worlds\"" by Timothy ...","[``, \, '', classic, war, worlds\, '', timothy...","[16, 11, 7, 265, 248, 14492, 7, 3528, 7181, 34..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...","""The film starts with a manager (Nicholas Bell...","[``, film, starts, manager, (, nicholas, bell,...","[16, 10, 399, 2930, 13, 4346, 3834, 12, 610, 2..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi...","""It must be assumed that those who praised thi...","[``, must, assumed, praised, film, (, \, '', g...","[16, 131, 4773, 5646, 10, 13, 11, 7, 686, 703,..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...","""Superbly trashy and wondrously unpretentious ...","[``, superbly, trashy, wondrously, unpretentio...","[16, 3370, 4155, 44772, 11262, 986, 8, 2153, 1..."


In [133]:
'''

Tokenizing 방법 리뷰
1.  Split 함수
2.  NLTK 활용 
   - Tokenizing → Index로 벡터화 해야하는데 NLTK는 Tokenizing 까지만
3.  keras.preprocessing 활용
   - Keras는 Vector화 까지 가능, 

'''
# https://inuplace.tistory.com/536

# Import from the public Keras API; `tensorflow.python.*` is a private module
# path that is not guaranteed to exist across TensorFlow releases.
from tensorflow.keras.preprocessing.text import Tokenizer

token = Tokenizer()
token.fit_on_texts(df['words'])  # build the word -> index vocabulary
df['vector'] = token.texts_to_sequences(df['words'])  # map each token list to integer indices


In [134]:
# Show the first rows, now including the `words` and `vector` columns.
df.head()

Unnamed: 0,id,sentiment,review,review2,words,vector
0,"""5814_8""",1,"""With all this stuff going down at the moment ...","""With all this stuff going down at the moment ...","[``, stuff, going, moment, mj, 've, started, l...","[16, 432, 94, 448, 8944, 70, 523, 2440, 142, 1..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...","""\""The Classic War of the Worlds\"" by Timothy ...","[``, \, '', classic, war, worlds\, '', timothy...","[16, 11, 7, 265, 248, 14492, 7, 3528, 7181, 34..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...","""The film starts with a manager (Nicholas Bell...","[``, film, starts, manager, (, nicholas, bell,...","[16, 10, 399, 2930, 13, 4346, 3834, 12, 610, 2..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi...","""It must be assumed that those who praised thi...","[``, must, assumed, praised, film, (, \, '', g...","[16, 131, 4773, 5646, 10, 13, 11, 7, 686, 703,..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...","""Superbly trashy and wondrously unpretentious ...","[``, superbly, trashy, wondrously, unpretentio...","[16, 3370, 4155, 44772, 11262, 986, 8, 2153, 1..."


In [145]:
# Inspect the vocabulary built by the tokenizer.
# Copy the mapping first: `token.word_index` is the tokenizer's own dict, so
# adding the padding entry to it directly would mutate the fitted tokenizer.
vocab = dict(token.word_index)
vocab["<PAD>"] = 0  # index 0 is the value the padded positions are filled with
len(vocab)

120745

In [158]:
# Maximum sequence length: the third quartile (214 tokens) is used as the fixed length.
# The per-review token counts are computed once and reused for every statistic.
review_lengths = df['vector'].map(len)
print("최소 단어수", review_lengths.min())
print("1사분위", review_lengths.quantile(0.25))
print("2사분위", review_lengths.quantile(0.50))
print("3사분위", review_lengths.quantile(0.75))
print("최대 단어수", review_lengths.max())



최소 단어수 7
1사분위 92.0
2사분위 130.0
3사분위 214.0
최대 단어수 1790


In [164]:
# Padding: variable-length sequences -> fixed-length sequences

# Import from the public Keras API; `tensorflow.python.*` is a private module
# path that is not guaranteed to exist across TensorFlow releases.
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_padding = 214  # third quartile of the review lengths

# padding='post' appends zeros to short reviews. `truncating` is left at its
# default, so reviews longer than max_padding are cut according to that default.
X_train = pad_sequences(df['vector'], maxlen=max_padding, padding='post')
Y_train = df['sentiment']
print(X_train)

[[  756  2365    18 ...  1412     2     7]
 [   16    11     7 ...     0     0     0]
 [  826 24276 59374 ...  6154     2     7]
 ...
 [   16   146  3353 ...     0     0     0]
 [   16   974   792 ...     0     0     0]
 [   16   128     9 ...     0     0     0]]


In [163]:
# Vector 화 
'''
1. BOW를 이용해서 Vector Sequence로 변환(위에 완료)
2. TF-IDF (완료)
3. Countvectorizer -- TF를 의미하는 것
4. Word2vec(완료)

'''

'\n1. BOW를 이용해서 Vector Sequence로 변환(위에 완료)\n2. TF-IDF\n3. Countvectorizer\n4. Word2vec\n'

In [213]:
# TF-IDF
# 입력이 텍스트여야함.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer settings:
#   min_df        - tokens whose document frequency is below this value are dropped
#   analyzer      - "word" or "char": whether features are built from words or characters
#   sublinear_tf  - whether to apply sublinear scaling to the term frequency
#   ngram_range   - range of n-gram sizes the analyzer generates automatically
#   max_features  - maximum length of the resulting feature vector
vectorizer = TfidfVectorizer(
    min_df=0.0,
    analyzer="word",
    sublinear_tf=True,
    ngram_range=(1, 3),
    max_features=5000,
    stop_words='english',
)

tfidf_train = vectorizer.fit_transform(df['review'].tolist())
print(type(tfidf_train))
print(tfidf_train.shape)
# One row holds the TF-IDF weights of a single review over the 5000 features.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
print(tfidf_train[0])
tfidf_train.toarray()

<class 'scipy.sparse.csr.csr_matrix'>
(25000, 5000)
  (0, 3883)	0.09933834228705851
  (0, 2559)	0.10246383707774773
  (0, 1376)	0.06760563802592813
  (0, 4225)	0.09765602879634694
  (0, 3320)	0.09183369558685252
  (0, 3039)	0.10031374229998055
  (0, 3882)	0.09933834228705851
  (0, 4487)	0.09642717568231131
  (0, 3319)	0.10461652393508376
  (0, 34)	0.09594181277732189
  (0, 2825)	0.09864400636809899
  (0, 519)	0.058425705309277265
  (0, 1721)	0.16963389092415215
  (0, 2505)	0.1008247366312985
  (0, 2872)	0.17049221180098909
  (0, 2243)	0.06312603561211018
  (0, 4309)	0.06253703899091029
  (0, 1644)	0.06809950863580468
  (0, 1656)	0.04997226926678856
  (0, 1396)	0.10217870074867391
  (0, 1297)	0.05669275727253205
  (0, 1371)	0.037508823151526205
  (0, 4315)	0.07504880385332162
  (0, 1956)	0.06565488069410813
  (0, 299)	0.07018631505870204
  :	:
  (0, 2364)	0.0827719690583781
  (0, 2094)	0.1599701293119485
  (0, 2936)	0.05789054099358346
  (0, 2809)	0.038399635337451506
  (0, 1487)	0.1061

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0897721 , 0.04699259, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.0837525 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [214]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer ## TF 

# Bag-of-words term counts over word tokens, capped at 5000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    max_features=5000,
)

count_train = vectorizer.fit_transform(df['review'].tolist())

In [283]:
# Word2Vec
import logging
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s :  %(message)s', level=logging.INFO)

# Word2Vec takes a list of sentences, each given as a list of words.
# N-grams could be fed in as well, but here every review is simply split on whitespace.
sentences = [review.split() for review in df['review']]

# print(sentences[0])

# Hyperparameters
num_features = 1000    # dimensionality of the Word2Vec vectors
min_word_count = 20
num_workers = 6
context = 10           # context window size used by Word2Vec
# https://medium.com/@omicro03/%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC-nlp-13%EC%9D%BC%EC%B0%A8-word2vec-3c82ec870426
# Downsampling of frequent words: very common words are downsampled at this
# rate to speed up training; 0.001 usually performs well.
downsampling = 1e-3



In [284]:
print("Training")
# https://wikidocs.net/50739
# Train Word2Vec on the tokenized reviews with the hyperparameters defined above.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
    iter=10,
    sg=0,  # 0 = CBOW, 1 = skip-gram
)

2020-10-21 07:12:34,544 : INFO :  collecting all words and their counts
2020-10-21 07:12:34,544 : INFO :  PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training


2020-10-21 07:12:34,883 : INFO :  PROGRESS: at sentence #10000, processed 2354780 words, keeping 163178 word types
2020-10-21 07:12:35,244 : INFO :  PROGRESS: at sentence #20000, processed 4686268 words, keeping 251892 word types
2020-10-21 07:12:35,416 : INFO :  collected 289705 word types from a corpus of 5844706 raw words and 25000 sentences
2020-10-21 07:12:35,417 : INFO :  Loading a fresh vocabulary
2020-10-21 07:12:35,541 : INFO :  effective_min_count=20 retains 16736 unique words (5% of original 289705, drops 272969)
2020-10-21 07:12:35,541 : INFO :  effective_min_count=20 leaves 5206111 word corpus (89% of original 5844706, drops 638595)
2020-10-21 07:12:35,583 : INFO :  deleting the raw counts dictionary of 289705 items
2020-10-21 07:12:35,587 : INFO :  sample=0.001 downsamples 45 most-common words
2020-10-21 07:12:35,587 : INFO :  downsampling leaves estimated 3929600 word corpus (75.5% of prior 5206111)
2020-10-21 07:12:35,620 : INFO :  estimated required memory for 16736 wo

2020-10-21 07:13:15,259 : INFO :  worker thread finished; awaiting finish of 3 more threads
2020-10-21 07:13:15,273 : INFO :  worker thread finished; awaiting finish of 2 more threads
2020-10-21 07:13:15,282 : INFO :  worker thread finished; awaiting finish of 1 more threads
2020-10-21 07:13:15,291 : INFO :  worker thread finished; awaiting finish of 0 more threads
2020-10-21 07:13:15,291 : INFO :  EPOCH - 5 : training on 5844706 raw words (3929917 effective words) took 7.4s, 527657 effective words/s
2020-10-21 07:13:16,305 : INFO :  EPOCH 6 - PROGRESS: at 11.37% examples, 443637 words/s, in_qsize 11, out_qsize 0
2020-10-21 07:13:17,322 : INFO :  EPOCH 6 - PROGRESS: at 23.16% examples, 455128 words/s, in_qsize 11, out_qsize 0
2020-10-21 07:13:18,335 : INFO :  EPOCH 6 - PROGRESS: at 35.68% examples, 466662 words/s, in_qsize 11, out_qsize 0
2020-10-21 07:13:19,336 : INFO :  EPOCH 6 - PROGRESS: at 48.50% examples, 474931 words/s, in_qsize 11, out_qsize 0
2020-10-21 07:13:20,342 : INFO :  

In [285]:
# Look up the words most similar to "man" in the trained embedding.
model.wv.most_similar("man")


2020-10-21 07:13:53,883 : INFO :  precomputing L2-norms of word weight vectors


[('man,', 0.716978907585144),
 ('woman', 0.6902350187301636),
 ('doctor', 0.647634744644165),
 ('soldier', 0.6302933692932129),
 ('boy', 0.6176154017448425),
 ('lady', 0.6128588914871216),
 ('man.', 0.5945290327072144),
 ('patient', 0.5821107625961304),
 ('woman,', 0.5750539302825928),
 ("man's", 0.5717611312866211)]

In [286]:
# Word2Vec represents every individual word as a vector.
# A review is a combination of words, so to represent a whole review as one
# vector we take the average of the vectors of the words it contains.
# Alternatives: Doc2Vec, or a TF-IDF-weighted average of the Word2Vec vectors
# (multiply each word vector by its TF-IDF score, then take the average; the
# result represents the sentence vector).

# https://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence

def get_features(words, model, num_features):
    """Average the word vectors of the in-vocabulary tokens of one review.

    Args:
        words: iterable of tokens (strings) making up a single review.
        model: trained model exposing its keyed vectors as ``model.wv``; the
            keyed vectors must support ``token in model.wv`` and
            ``model.wv[token]``.
        num_features: dimensionality of the word vectors (length of the result).

    Returns:
        1-D array of length ``num_features`` holding the mean of the vectors of
        all tokens found in the vocabulary, or all zeros when no token is known.
    """
    # Membership and lookup go through the keyed vectors directly: this avoids
    # rebuilding a vocabulary set on every call and the deprecated `model[w]`.
    word_vectors = model.wv
    feature_vector = np.zeros(num_features, dtype=np.float32)
    num_words = 0

    for w in words:
        if w in word_vectors:
            num_words += 1
            # Accumulate the vector of every token present in the vocabulary.
            feature_vector = np.add(feature_vector, word_vectors[w])

    # Without this guard an all-unknown review would be divided by zero and
    # yield a NaN vector, which downstream estimators reject.
    if num_words == 0:
        return feature_vector

    return np.divide(feature_vector, num_words)

def get_dataset(reviews, model, num_features):
    """Build the feature matrix for a collection of tokenized reviews.

    Args:
        reviews: iterable of reviews, each an iterable of tokens.
        model: trained word-vector model passed through to ``get_features``.
        num_features: dimensionality of the word vectors.

    Returns:
        Array with one row per review, produced by stacking the per-review
        vectors returned by ``get_features``.
    """
    per_review = [get_features(tokens, model, num_features) for tokens in reviews]
    return np.stack(per_review)

# Build one averaged Word2Vec feature vector per review in `sentences`.
word2vec_train = get_dataset(sentences,model,num_features)



In [287]:
# Look the word up through `model.wv`; indexing the model object itself is deprecated.
model.wv['sky'].shape

  """Entry point for launching an IPython kernel.


(1000,)

In [288]:
# Inspect the averaged feature vector of the first review.
word2vec_train[0]

array([-1.18139751e-01,  6.28871098e-02,  1.49180561e-01, -1.26731306e-01,
        5.12885191e-02,  1.32029548e-01, -3.20467204e-02, -1.41882464e-01,
       -1.59057543e-01, -1.76634882e-02,  2.67821364e-02,  1.29582128e-02,
        1.50080062e-02,  5.48530258e-02, -3.05359736e-02, -7.36697763e-02,
       -2.04020426e-01, -3.95200215e-02, -1.54978139e-02,  5.88751631e-03,
        8.21877643e-02,  1.17347911e-01,  3.05978972e-02, -8.75424780e-03,
       -3.54981497e-02, -1.51894391e-01, -7.06436113e-02,  1.31319031e-01,
       -4.01055254e-02, -3.94649893e-01, -1.29856095e-02,  5.32002859e-02,
       -6.21244013e-02, -7.38168210e-02, -1.01046986e-03,  1.91170737e-01,
       -2.61195868e-01, -1.95050910e-01,  5.21635175e-01, -1.45756811e-01,
       -2.58524239e-01,  1.35558635e-01, -1.34425178e-01,  9.70060900e-02,
       -2.55309343e-01,  2.10041985e-01, -7.42806494e-02, -1.94924381e-02,
       -2.27575526e-02,  7.24768415e-02,  4.58092019e-02, -2.66585886e-01,
       -2.17871964e-01, -

### RandomForest Train

In [272]:
# Target labels: the `sentiment` column of the training frame.
Y_train = df['sentiment']

In [277]:
# vectorizer.get_feature_names()
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [279]:
# Random forest on the CountVectorizer features.
# The sparse matrix is passed as-is: train_test_split and RandomForestClassifier
# both accept scipy sparse input, so no dense copy of the matrix is needed.
x_train, x_val, y_train, y_val = train_test_split(count_train, Y_train, test_size=0.2, random_state=99)

# Seed the forest so the reported accuracy is reproducible across runs.
RF = RandomForestClassifier(n_estimators=100, random_state=99)
RF.fit(x_train, y_train)

print("Accuracy : %f" % RF.score(x_val, y_val))


Accuracy : 0.837200


In [280]:
# Random forest on the TF-IDF features.
# The sparse matrix is passed as-is: train_test_split and RandomForestClassifier
# both accept scipy sparse input, so no dense copy of the matrix is needed.
x_train, x_val, y_train, y_val = train_test_split(tfidf_train, Y_train, test_size=0.2, random_state=99)

# Seed the forest so the reported accuracy is reproducible across runs.
RF = RandomForestClassifier(n_estimators=100, random_state=99)
RF.fit(x_train, y_train)

print("Accuracy : %f" % RF.score(x_val, y_val))


Accuracy : 0.844600


In [289]:
# Random forest on the averaged Word2Vec features.
x_train, x_val, y_train, y_val = train_test_split(word2vec_train, Y_train, test_size=0.2, random_state=99)

# Seed the forest so the reported accuracy is reproducible across runs.
RF = RandomForestClassifier(n_estimators=100, random_state=99)
RF.fit(x_train, y_train)

print("Accuracy : %f" % RF.score(x_val, y_val))




Accuracy : 0.816600


In [290]:
# Shape check: one row per review, one column per Word2Vec feature.
word2vec_train.shape

(25000, 1000)

### RNN

In [293]:
import tensorflow as tf
# Seed TensorFlow's global random generator.
tf.random.set_seed(99)
# Settings for the RNN section; presumably consumed by cells outside this view.
BATCH_SIZE = 128
epochs = 10
valid = 0.2  # NOTE(review): looks like a validation-split fraction — confirm against usage


In [304]:
# A sparse matrix exposes its shape directly; converting it to a dense array
# only to read the shape allocates the full matrix for nothing.
count_train.shape

(25000, 5000)