In [6]:
import pandas as pd
import numpy as np
import os

'''
참고 URL
- https://programmers.co.kr/learn/courses/21/lessons/1693
- http://suanlab.com/assets/lectures/dpp/10.pdf

'''

'''
1. Data preprocessing
2. word2vec
3. modeling
4. 평가

'''


In [8]:
# Show which files are present in the relative 'data' directory.
os.listdir('data')


['labeledTrainData.tsv',
 'sampleSubmission.csv',
 'testData.tsv',
 'unlabeledTrainData.tsv']

In [63]:
# Load the labeled training data.
# -- quoting: tells the parser how quote characters around fields are handled.
import csv
# Use the named csv constant instead of the bare integer 3 (same value), so the
# intent is readable: quote characters stay part of the field values.
df = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter='\t',
                 quoting=csv.QUOTE_NONE)
# csv quoting constants: QUOTE_MINIMAL (0), QUOTE_ALL (1),
# QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
'''
-QUOTE_ALL(1) : Quote everything, regardless of type.(문자열처리, 모든데이터 묶음)
-QUOTE_MINIMAL(0) :Quote fields with special characters (특수 문자가 포함된 따옴표 필드)
(anything that would confuse a parser configured with the same dialect and options). This is the default
-QUOTE_NONNUMERIC(2) :Quote all fields that are not integers or floats.(숫자가 아닌 경우 묶음) 
  When used with the reader, input fields that are not quoted are converted to floats.
-QUOTE_NONE(3) : Do not quote anything on output. 데이터를 묶지 않음
 When used with the reader, quote characters are included in the field values (normally, they are treated as delimiters and stripped).
 reader와 사용하면 쌍따옴표는 필드값으로 포함된다.
'''

df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [100]:
# 데이터 전처리 -- HTML 태그, \ 특수 문자 제거
from bs4 import BeautifulSoup
import re

def preprocessing(x):
    """Strip HTML markup from a review and blank out non-word characters.

    Parameters
    ----------
    x : str
        Raw review text, possibly containing HTML tags and punctuation.

    Returns
    -------
    str
        The text with tags removed and every non-word character replaced
        by a single space (the string length after tag removal is kept).
    """
    # Remove HTML tags, keeping only the visible text.
    x = BeautifulSoup(x, 'html.parser').get_text()
    # Replace each non-word character with a space. A raw string is used so
    # that "\W" reaches the regex engine without an invalid-escape warning.
    # NOTE: on str input "\W" is Unicode-aware, so underscores and non-ASCII
    # letters (e.g. the "ñ" in "Buñuel") are kept, not only ASCII letters/digits.
    x = re.sub(r"\W", " ", x)
    return x

# Clean every review in place; the function is passed directly to map,
# so no wrapping lambda is needed.
df['review'] = df['review'].map(preprocessing)


0         With all this stuff going down at the moment ...
1           The Classic War of the Worlds   by Timothy ...
2         The film starts with a manager  Nicholas Bell...
3         It must be assumed that those who praised thi...
4         Superbly trashy and wondrously unpretentious ...
                               ...                        
24995     It seems like more consideration has gone int...
24996     I don t believe they made this film  Complete...
24997     Guy is a loser  Can t get girls  needs to bui...
24998     This 30 minute documentary Buñuel made in the...
24999     I saw this movie as a child and it broke my h...
Name: review, Length: 25000, dtype: object

In [147]:
# 토크나이징 + Stopwords 제거
# 장점 : 노이즈를 줄일 수 있음, 단점 : 문장 구조 모델링시 정보 유실 발생

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Build the English stopword set once, instead of re-reading it from the NLTK
# corpus on every call (the function is mapped over every review).
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def tokenizing(words):
    """Lower-case a text, tokenize it with NLTK and drop English stopwords.

    Parameters
    ----------
    words : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` on the lower-cased text,
        in their original order, excluding NLTK's English stopwords.
    """
    tokens = word_tokenize(words.lower())
    return [tok for tok in tokens if tok not in _ENGLISH_STOP_WORDS]

# Tokenize each review into a stopword-free token list stored in 'words'.
df['words'] = df['review'].map(tokenizing)
df.head()
    

Unnamed: 0,id,sentiment,review,review2,words,vector
0,"""5814_8""",1,"""With all this stuff going down at the moment ...","""With all this stuff going down at the moment ...","[``, stuff, going, moment, mj, 've, started, l...","[16, 432, 94, 448, 8944, 70, 523, 2440, 142, 1..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...","""\""The Classic War of the Worlds\"" by Timothy ...","[``, \, '', classic, war, worlds\, '', timothy...","[16, 11, 7, 265, 248, 14492, 7, 3528, 7181, 34..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...","""The film starts with a manager (Nicholas Bell...","[``, film, starts, manager, (, nicholas, bell,...","[16, 10, 399, 2930, 13, 4346, 3834, 12, 610, 2..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi...","""It must be assumed that those who praised thi...","[``, must, assumed, praised, film, (, \, '', g...","[16, 131, 4773, 5646, 10, 13, 11, 7, 686, 703,..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...","""Superbly trashy and wondrously unpretentious ...","[``, superbly, trashy, wondrously, unpretentio...","[16, 3370, 4155, 44772, 11262, 986, 8, 2153, 1..."


In [133]:
'''

Tokenizing 방법 리뷰
1.  Split 함수
2.  NLTK 활용 
   - Tokenizing → Index로 벡터화 해야하는데 NLTK는 Tokenizing 까지만
3.  keras.preprocessing 활용
   - Keras는 Vector화 까지 가능, 

'''
# https://inuplace.tistory.com/536

# Import through the public tensorflow.keras namespace; tensorflow.python.* is
# a private implementation path that is not part of the supported API.
from tensorflow.keras.preprocessing.text import Tokenizer

token = Tokenizer()
# Learn the word -> integer index vocabulary from the token lists.
token.fit_on_texts(df['words'])
# Convert every token list into its sequence of vocabulary indices.
df['vector'] = token.texts_to_sequences(df['words'])


In [134]:
df.head()

Unnamed: 0,id,sentiment,review,review2,words,vector
0,"""5814_8""",1,"""With all this stuff going down at the moment ...","""With all this stuff going down at the moment ...","[``, stuff, going, moment, mj, 've, started, l...","[16, 432, 94, 448, 8944, 70, 523, 2440, 142, 1..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...","""\""The Classic War of the Worlds\"" by Timothy ...","[``, \, '', classic, war, worlds\, '', timothy...","[16, 11, 7, 265, 248, 14492, 7, 3528, 7181, 34..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...","""The film starts with a manager (Nicholas Bell...","[``, film, starts, manager, (, nicholas, bell,...","[16, 10, 399, 2930, 13, 4346, 3834, 12, 610, 2..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi...","""It must be assumed that those who praised thi...","[``, must, assumed, praised, film, (, \, '', g...","[16, 131, 4773, 5646, 10, 13, 11, 7, 686, 703,..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...","""Superbly trashy and wondrously unpretentious ...","[``, superbly, trashy, wondrously, unpretentio...","[16, 3370, 4155, 44772, 11262, 986, 8, 2153, 1..."


In [145]:
# Inspect the vocabulary.
# Work on a copy: token.word_index is the tokenizer's own dict, so adding the
# padding entry to it directly would silently modify the fitted tokenizer.
vocab = dict(token.word_index)
vocab["<PAD>"] = 0  # index 0 is the value pad_sequences fills with by default
len(vocab)

120745

In [158]:
# Maximum sentence length: the third quartile of the per-review token counts
# is printed here so it can be chosen as the fixed sequence length.
# Compute the per-review lengths once and reuse them for every statistic.
review_lengths = df['vector'].map(len)
print("최소 단어수", review_lengths.min())
print("1사분위", review_lengths.quantile(0.25))
print("2사분위", review_lengths.quantile(0.50))
print("3사분위", review_lengths.quantile(0.75))
print("최대 단어수", review_lengths.max())



최소 단어수 7
1사분위 92.0
2사분위 130.0
3사분위 214.0
최대 단어수 1790


In [164]:
# Padding: variable-length sequences -> fixed-length sequences.

# Import through the public tensorflow.keras namespace; tensorflow.python.* is
# a private implementation path that is not part of the supported API.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length (the third-quartile review length computed earlier).
max_padding = 214

# Shorter sequences are zero-padded at the end ('post'). Longer ones are cut to
# max_padding using pad_sequences' default truncation side, since no
# `truncating` argument is given.
X_train = pad_sequences(df['vector'], maxlen=max_padding, padding='post')
Y_train = df['sentiment']
print(X_train)

[[  756  2365    18 ...  1412     2     7]
 [   16    11     7 ...     0     0     0]
 [  826 24276 59374 ...  6154     2     7]
 ...
 [   16   146  3353 ...     0     0     0]
 [   16   974   792 ...     0     0     0]
 [   16   128     9 ...     0     0     0]]


In [163]:
# Vector 화 
'''
1. BOW를 이용해서 Vector Sequence로 변환(위에 완료)
2. TF-IDF (완료)
3. Countvectorizer
4. Word2vec
5. Onehotencoding
'''

'\n1. BOW를 이용해서 Vector Sequence로 변환(위에 완료)\n2. TF-IDF\n3. Countvectorizer\n4. Word2vec\n'

In [None]:
# TF-IDF
# The vectorizer must be fed raw text documents (not token-index sequences).
from sklearn.feature_extraction.text import TfidfVectorizer

# Parameter notes:
#   analyzer     - "word" builds features from words, "char" from characters
#   ngram_range  - range of n-gram sizes the analyzer generates automatically
#   max_features - upper bound on the length of each output vector
#   min_df       - tokens whose document frequency is below this are dropped
#   sublinear_tf - whether term frequencies are sublinearly scaled
#   stop_words   - built-in English stopword list
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    max_features=5000,
    min_df=0.0,
    sublinear_tf=True,
    stop_words='english',
)

# NOTE(review): this rebinds X_train, replacing the padded index matrix that an
# earlier cell stored under the same name.
review_texts = df['review'].tolist()
X_train = vectorizer.fit_transform(review_texts)
X_train

In [None]:
# Show the dimensions of the TF-IDF matrix, then the entries of its first row.
print(X_train.shape)
print(X_train[0]) 
# Each row holds the TF-IDF weights of one review over the vectorizer's
# features (the vectorizer above caps them with max_features=5000).


In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available and fall back on old versions.
if hasattr(vectorizer, "get_feature_names_out"):
    # The newer method returns an ndarray; convert so a plain list is shown.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names