In [2]:
# 데이터 다운로드
# !kaggle competitions download -c word2vec-nlp-tutorial

### 필요한 모듈 import

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

warnings.filterwarnings('ignore')

### 데이터 불러오기

- id - Unique ID of each review
- sentiment - Sentiment of the review; 1 for positive reviews and 0 for negative reviews
- review - Text of the review

In [2]:
data = "./word2vec-nlp-tutorial/"
SEED = 0

In [3]:
train = pd.read_csv(os.path.join(data, 'labeledTrainData.tsv'), delimiter='\t') # tsv 파일
test = pd.read_csv(os.path.join(data, 'testData.tsv'), delimiter='\t')
unlabeled_train = pd.read_csv(os.path.join(data, 'unlabeledTrainData.tsv'), delimiter='\t', error_bad_lines = False)

b'Skipping line 43043: expected 2 fields, saw 3\n'


In [4]:
print(train.shape)
train.head()

(25000, 3)


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
print(test.shape)
test.head()

(25000, 2)


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [6]:
print(unlabeled_train.shape)
unlabeled_train.head()

(49998, 2)


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


### 전처리
- html태그, 불용어, 특수기호 등 제거

In [7]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

sample 데이터로 프로세스 확인 후 함수로 묶어주기

In [8]:
# 특수기호 제거
import re

In [9]:
# 불용어 사전
eng_stopwords = stopwords.words('english')

### Lemmatizer
- ex) run, ran runs -> run 으로 통일해줌
- 문장이 아닌 단어 하나씩 넣어줘야함

In [10]:
from nltk.stem import WordNetLemmatizer

In [11]:
lemmatizer = WordNetLemmatizer()

In [12]:
def process_lemma(sentence):
    return [lemmatizer.lemmatize(word,'v') for word in sentence]

In [13]:
def preprocessing(sentence):
    soup = BeautifulSoup(sentence, 'html.parser')
    cleaned = re.sub('[^a-zA-Z]',' ', soup.text)
    cleaned =  cleaned.lower()
    cleaned = [word for word in cleaned.split() if word not in eng_stopwords]
    cleaned = process_lemma(cleaned)
    return ' '.join(cleaned) # 단어를 문장으로 결합하여 리턴

In [15]:
all_review = pd.concat([train['review'], unlabeled_train['review'], test['review']])
all_review_clean = all_review.apply(preprocessing)

In [16]:
all_review_clean.head()

0    stuff go moment mj start listen music watch od...
1    classic war worlds timothy hines entertain fil...
2    film start manager nicholas bell give welcome ...
3    must assume praise film greatest film opera ev...
4    superbly trashy wondrously unpretentious explo...
Name: review, dtype: object

### Tokenizer
단어 사전 생성

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
tokenizer = Tokenizer(oov_token='<OOV>') # oov_token : 단어 사전에 없는 데이터 다음 문자열로 치환  

In [19]:
tokenizer.fit_on_texts(all_review_clean)

In [20]:
len(tokenizer.word_index)

126312

In [21]:
train_sentences = all_review_clean[:len(train)]
test_sentences = all_review_clean[-len(test):]
train_sentences.shape, test_sentences.shape

((25000,), (25000,))

In [22]:
# 단어 -> 숫자로 변환
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [23]:
# 딥러닝 모델에 적용하기 위해 문장 길이 맞추기
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
MAX_LENGTH = 150

In [25]:
train_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, truncating = 'post', padding = 'post')
test_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, truncating = 'post', padding = 'post')

In [26]:
train_padded.shape, test_padded.shape

((25000, 150), (25000, 150))

### train test split

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
train_labels = train['sentiment']

In [29]:
X_train,X_valid,y_train,y_valid = train_test_split(train_padded, train_labels, stratify=train_labels, test_size=0.1, random_state=SEED)

### Word2Vec

In [30]:
from gensim.models import KeyedVectors

In [31]:
word2vec = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [33]:
EMBEDDING_DIM = 300
VOCAB_SIZE = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((VOCAB_SIZE, 300))

In [34]:
for word, idx in tokenizer.word_index.items():
    embedding_vector = word2vec[word] if word in word2vec else None
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

#### Embedding Layer
- 단어의 차원을 줄여주는 역할
- 차원을 줄일수록 정보 손실 발생   
`Embedding(vocab_size, embedding_dim, input_length=max_length)`

### Model

In [35]:
EMBEDDING_DIM = 300
VOCAB_SIZE = len(tokenizer.word_index) + 1 # padding = 0 이므로

In [36]:
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint

In [37]:
model = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH,
             weights = [embedding_matrix],
             trainable=False,),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(128)),
    Dropout(0.25),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [38]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          37893900  
                                                                 
 bidirectional (Bidirectiona  (None, 150, 256)         439296    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              394240    
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 32)                8224      
                                                                 
 dense_1 (Dense)             (None, 1)                 3

In [39]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [52]:
checkpoint_path = 'tmp/checkpoint.ckpt'
checkpoint = ModelCheckpoint(filepath=checkpoint_path,
                            save_best_only=True,
                            save_weights_only=True,
                            monitor='val_loss',
                            verbose=1,)
# 학습 시 제일 잘 나온 모델의 weight 저장하고 있다가 해당 구간 load 해주는 유틸 클래스

In [53]:
model.fit(X_train, y_train,
         validation_data = (X_valid, y_valid),
         batch_size=128,
         epochs=10,
         callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.37551, saving model to tmp\checkpoint.ckpt
Epoch 2/10
Epoch 2: val_loss improved from 0.37551 to 0.36831, saving model to tmp\checkpoint.ckpt
Epoch 3/10
Epoch 3: val_loss improved from 0.36831 to 0.33973, saving model to tmp\checkpoint.ckpt
Epoch 4/10
Epoch 4: val_loss did not improve from 0.33973
Epoch 5/10
Epoch 5: val_loss did not improve from 0.33973
Epoch 6/10
Epoch 6: val_loss did not improve from 0.33973
Epoch 7/10
Epoch 7: val_loss improved from 0.33973 to 0.33669, saving model to tmp\checkpoint.ckpt
Epoch 8/10
Epoch 8: val_loss improved from 0.33669 to 0.33485, saving model to tmp\checkpoint.ckpt
Epoch 9/10
Epoch 9: val_loss improved from 0.33485 to 0.32886, saving model to tmp\checkpoint.ckpt
Epoch 10/10
Epoch 10: val_loss did not improve from 0.32886


<keras.callbacks.History at 0x2946bf80c70>

In [54]:
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x29443d9cb50>

In [55]:
model.evaluate(X_valid, y_valid)



[0.328861802816391, 0.8668000102043152]

In [56]:
prediction = model.predict(test_padded)

In [60]:
prediction[prediction >= 0.5] = 1
prediction[prediction < 0.5] = 0
prediction

array([[1.],
       [0.],
       [1.],
       ...,
       [0.],
       [1.],
       [0.]], dtype=float32)

### Submission

In [61]:
submission = pd.read_csv(os.path.join(data,'sampleSubmission.csv'))
submission.head()

Unnamed: 0,id,sentiment
0,12311_10,0
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0


In [62]:
submission['sentiment'] = prediction
submission['sentiment'] = submission['sentiment'].astype('int')
submission['sentiment'].value_counts()

0    12952
1    12048
Name: sentiment, dtype: int64

In [63]:
import datetime

In [64]:
timestring = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

In [65]:
filename = f'submission/submission-{timestring}.csv'

In [66]:
submission.to_csv(filename, index=False)