In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip
/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv
/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip
/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec


# 데이터 불러오기

In [2]:
train = pd.read_table("/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip")
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
test = pd.read_table("/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip")
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


# 데이터 전처리

In [4]:
alldata = pd.concat([train,test])
alldata.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1.0
1,2,1,A series of escapades demonstrating the adage ...,2.0
2,3,1,A series,2.0
3,4,1,A,2.0
4,5,1,series,2.0


In [5]:
alldata["Sentiment"].value_counts()
# multi-classification 다중분류 대회

2.0    79582
3.0    32927
1.0    27273
4.0     9206
0.0     7072
Name: Sentiment, dtype: int64

# 텍스트 마이닝

In [6]:
# 텍스트 마이닝

# 1. 단어 등록

from keras.preprocessing.text import Tokenizer
tk = Tokenizer()
tk.fit_on_texts(alldata["Phrase"])
tk.word_index

Using TensorFlow backend.


{'the': 1,
 'a': 2,
 'of': 3,
 'and': 4,
 'to': 5,
 "'s": 6,
 'in': 7,
 'is': 8,
 'that': 9,
 'it': 10,
 'as': 11,
 'with': 12,
 'for': 13,
 'its': 14,
 'film': 15,
 'an': 16,
 'movie': 17,
 'this': 18,
 'but': 19,
 'be': 20,
 'on': 21,
 'you': 22,
 'by': 23,
 "n't": 24,
 'more': 25,
 'his': 26,
 'not': 27,
 'one': 28,
 'than': 29,
 'about': 30,
 'at': 31,
 'from': 32,
 'or': 33,
 'all': 34,
 'like': 35,
 'are': 36,
 'have': 37,
 'has': 38,
 'so': 39,
 "'": 40,
 'out': 41,
 'story': 42,
 'who': 43,
 'rrb': 44,
 'up': 45,
 'too': 46,
 'good': 47,
 'most': 48,
 'into': 49,
 'lrb': 50,
 'time': 51,
 'much': 52,
 'what': 53,
 'if': 54,
 'characters': 55,
 'no': 56,
 'comedy': 57,
 'their': 58,
 'just': 59,
 'i': 60,
 'some': 61,
 'can': 62,
 'even': 63,
 'life': 64,
 'your': 65,
 'little': 66,
 'does': 67,
 "''": 68,
 'way': 69,
 'well': 70,
 'will': 71,
 'make': 72,
 'been': 73,
 'funny': 74,
 'only': 75,
 'very': 76,
 'he': 77,
 'do': 78,
 'director': 79,
 'any': 80,
 'enough': 81,
 'us'

In [7]:
# 2. text 맵핑
all_text = tk.texts_to_sequences(alldata["Phrase"])

In [8]:
# 3. padding하기
from keras.preprocessing.sequence import pad_sequences

pad_sequence = pad_sequences(all_text)
pad_sequence

array([[   0,    0,    0, ...,    3,    2,   42],
       [   0,    0,    0, ...,   13,    1, 3940],
       [   0,    0,    0, ...,    0,    2,  315],
       ...,
       [   0,    0,    0, ...,    2,  118, 4456],
       [   0,    0,    0, ...,    2,  118, 4456],
       [   0,    0,    0, ...,    0,  343, 1623]], dtype=int32)

In [9]:
train_pad = pad_sequence[:len(train)]
test_pad = pad_sequence[len(train):]

# pretrain embedding


In [10]:
# pretrain embedding
# 이미 학습된 사전정보로 학습하는 것, ex) 1.GloVe / 2.fasttext

# train,test를 토대로 딥러닝 학습시킨 것은 결국, 이 데이터셋에 대해 과적합되있다는 것

def load_embeddings(file_name):
    embeddings = {}
    with open(file_name) as f:
        for line in f:
            values = line.rstrip().split()  # 특수문자 제거, 공백기준으로 단어가져오기
            word = values[0] # 단어1개
            vector = np.asarray(values[1:], dtype = "float32") # 숫자차원 300개 , but 200만x300개 다가져올시 많다->asarray로, float형32로 크기줄이기
            
            embeddings[word] = vector # 키word : 값value
            
    return embeddings #200만개의 딕셔녀리 값
    

# 200만개의 데이터에 대해 있지만, 이 대회의 데이터셋은 17만개이므로 17만개를 뽑아와야함(filtering)
def filter_embeddings(embeddings, word_index, vocab_size , dim = 300):           # 단어, 단어종류, 단어종류개수, 단어차원
    embedding_matrix = np.zeros([vocab_size,dim]) # 17781x300차원의 0행렬 생성
    
    for word,i in word_index.items():  # 딕셔너리의 key,value를 묶어서 가져옴 -> word에 the, i에 1을 각각가져옴
        vector = embeddings.get(word)     # get(word)은 word에 해당하는 딕셔너리 value를 가져옴 
        
        if vector is not None:
            embedding_matrix[i] = vector
            
    return embedding_matrix


In [11]:
%%time
embeddings = load_embeddings("/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec")
embeddings

CPU times: user 4min 6s, sys: 5.06 s, total: 4min 11s
Wall time: 4min 11s


{'2000000': array([ 2.0600e-02,  1.9530e-01, -9.0400e-02, -3.5390e-01, -6.2700e-02,
        -1.4600e-02, -1.3150e-01,  5.8600e-02,  5.9930e-01,  6.3100e-02,
        -9.3200e-02,  7.1720e-01, -3.4950e-01, -6.1100e-02, -3.0790e-01,
         3.6940e-01, -2.5880e-01, -3.0210e-01, -1.2800e-02,  3.2680e-01,
         6.9900e-02,  8.9400e-02, -1.1910e-01, -1.1900e-01, -1.2200e-01,
        -4.5400e-02, -2.5100e-02, -1.7630e-01,  7.6370e-01, -8.4900e-02,
        -4.1930e-01,  3.0050e-01, -9.8500e-02,  1.6770e-01, -2.0570e-01,
        -1.8150e-01, -2.7360e-01,  3.7540e-01, -6.6400e-02,  1.7150e-01,
         4.6900e-01,  3.0410e-01,  1.8830e-01,  1.0950e-01,  4.7590e-01,
         1.1540e-01, -2.9670e-01, -3.4600e-02,  1.4270e-01,  4.1870e-01,
         1.2770e-01,  1.7650e-01,  7.1670e-01,  5.0060e-01, -5.6000e-03,
        -1.0700e-01, -3.6280e-01,  5.4290e-01,  1.0060e-01, -4.4410e-01,
        -6.2800e-02, -3.6000e-03,  8.1900e-02, -4.5830e-01,  8.3400e-02,
        -9.4100e-02,  7.5100e-02,  3.737

In [12]:
embedding_matrix = filter_embeddings(embeddings,tk.word_index,len(tk.word_index)+1, 300)
embedding_matrix[1]

array([ 2.30999999e-02,  1.70000009e-02,  1.56999994e-02, -7.72999972e-02,
        1.08800001e-01,  3.10000009e-03, -1.48699999e-01, -2.67199993e-01,
       -3.57000008e-02, -4.87000011e-02,  8.07000026e-02,  1.53200001e-01,
       -7.38999993e-02, -2.91000009e-02, -4.45000008e-02, -1.39999995e-03,
        1.01400003e-01,  1.86000001e-02, -2.52999999e-02,  1.99999996e-02,
       -2.60000001e-03, -1.78999994e-02,  5.00000024e-04,  5.40000014e-03,
       -1.33999996e-02,  2.32999995e-02, -7.54999965e-02, -1.55999996e-02,
        4.14999984e-02, -4.98499990e-01,  4.10000011e-02, -6.15999997e-02,
        4.69999993e-03,  3.24999988e-02, -1.62000004e-02, -1.72000006e-02,
        9.88000035e-02,  7.66000003e-02, -7.95999989e-02, -3.44999991e-02,
        1.24000004e-02, -1.00699998e-01, -2.92000007e-02, -7.62000009e-02,
       -1.26100004e-01, -5.31000011e-02,  4.23999988e-02,  1.43999998e-02,
       -6.83000013e-02,  2.85899997e-01,  3.99000011e-02,  2.00999994e-02,
        3.24000001e-01, -

# 딥러닝 모델 설계

In [13]:
from keras import Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping,ModelCheckpoint
# 단어의미 이해 : 텍스트에선 embbeding 층

model = Sequential()

#model.add(Embedding(len(tk.word_index)+1, 10, input_length = 52)) 
model.add(Embedding(len(tk.word_index)+1, 300, input_length = 52, trainable = False, weights = [embedding_matrix])) #trainable = False 임베딩층안쓰고 가져온 것 쓰기 

#model.add(Flatten())

#model.add(SimpleRNN(32)) # RNN모델
model.add(LSTM(32))       # LSTN모델, 문맥적 정보 파악

model.add(Dense(5,activation = "softmax"))

model.compile(metrics = ["acc"], loss = "sparse_categorical_crossentropy", optimizer = "adam")

# sparse vs crossentropy
# 1,2,3,4를 분류 vs 0001 0010 0100 1000 으로 되있는 것 분류

es = EarlyStopping(patience = 3, verbose = 1)
mc = ModelCheckpoint("best.h5", save_best_only = True, verbose = 1)

model.fit(train_pad, train["Sentiment"], batch_size = 512, validation_split = 0.1, epochs = 20, callbacks = [es,mc])

result = model.predict(test_pad)


# RNN 
# 시간에 따른 패턴을 학습한다.
# *문맥적인 정보를 파악할 수 있다, 하지만 단어의 갯수가 많아질수록 학습을 못한다.
# 음성이나 텍스트처리에 사용

# LongShortermMemory (LSTM)
# 예전 내용 기억하는 모델
# 문맥적 정보 파악


Train on 140454 samples, validate on 15606 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.94871, saving model to best.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.94871 to 0.92204, saving model to best.h5
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.92204
Epoch 4/20

Epoch 00004: val_loss improved from 0.92204 to 0.90496, saving model to best.h5
Epoch 5/20

Epoch 00005: val_loss improved from 0.90496 to 0.90385, saving model to best.h5
Epoch 6/20

Epoch 00006: val_loss improved from 0.90385 to 0.90189, saving model to best.h5
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.90189
Epoch 8/20

Epoch 00008: val_loss improved from 0.90189 to 0.90149, saving model to best.h5
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.90149
Epoch 10/20

Epoch 00010: val_loss did not improve from 0.90149
Epoch 11/20

Epoch 00011: val_loss did not improve from 0.90149
Epoch 00011: early stopping


In [14]:
result = result.argmax(1)
result
# 인자1은 axis가 1임을 뜻함, 가장 높은 값을 갖는 label을 가져옴

array([3, 3, 2, ..., 1, 1, 1])

In [15]:
# 제출 전, train과 test에서 같은 경우는, 예측말고 값 대입하기

mapping = {i:j for _,_,i,j in train.values}
mapping

for i,phrase in enumerate(test["Phrase"]):
    if phrase in mapping:
        result[i] = mapping[phrase]

result

array([3, 3, 2, ..., 1, 1, 1])

# 제출

In [16]:
sub = pd.read_csv("/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv")
sub.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [17]:
sub["Sentiment"] = result
sub.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3


In [18]:
sub.to_csv("sub.csv", index=False)

# Score : 0.68570