# 0530 imdb review

#### text

## import modules
  #### embedding, global average pooling 1d, pad_sequences

In [1]:
# TensorFlow and tf.keras
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

### load data 
- #### dataset (x: IMDB movie review -> y: sentiment label)
y: 0 = bad (negative), 1 = good (positive)
- #### word2idx dict (key: word; value: index)

In [3]:
# Size of the vocabulary the model will see; passed to load_data() as num_words
# so that only this many distinct word indices are kept in the encoded reviews.
vocab_size = 10000

# Lookup table of the IMDB dataset: word (str) -> integer index.
word2idx = tensorflow.keras.datasets.imdb.get_word_index()

# Integer-encoded reviews and their 0/1 labels, already split into train and test.
(x_train, y_train), (x_test, y_test) = tensorflow.keras.datasets.imdb.load_data(
    num_words=vocab_size,
)

num_words = vocab_size = 10000
- input node의 수를 직접 설정 (10000개에만 index를 매기고 나머지는 unknown)

word2idx = 사전: 단어와 그 단어의 번호를 정의

In [4]:
# First training sample, printed on two lines:
#   1) the review as a sequence of word indices (one number per word)
#   2) its label: 0 or 1 = bad or good
print(x_train[0], y_train[0], sep='\n')

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
1


### modify word2idx dict (to reflect x_train, x_test)

##### modify 하는 이유

In [5]:
# Number of entries in the word -> index dictionary, before the special
# tokens are added in the next cell.
len(word2idx)

88584

In [6]:
# Align the dictionary with the encoded reviews: load_data() by default offsets
# every word index by 3 and reserves the low indices for special markers
# (1 = start of review, 2 = out-of-vocabulary word), so the raw indices from
# get_word_index() must be shifted by the same amount.
#
# The shift rebinds word2idx from itself, so it must happen only once: the
# guard keeps the cell idempotent when it is re-run without reloading the
# dictionary (otherwise every word would move by another 3 on each run).
if '<PAD>' not in word2idx:
    word2idx = {k: v + 3 for k, v in word2idx.items()}

# Special tokens occupying the four indices freed by the shift.
word2idx['<PAD>'] = 0      # filler used when padding reviews to a common length
word2idx['<START>'] = 1    # marks the beginning of a review
word2idx['<UNK>'] = 2      # any word outside the kept vocabulary
word2idx['<UNUSED>'] = 3

1. word2idx.items() = 각 단어의 dict
    - 앞에 있는게 key, 뒤에 있는게 value
2. for ... in word2idx.items() = 88584번 돌면서 dict를 만듦 (그 뒤 특수 토큰 4개를 추가해서 최종 크기는 88588)
    - 그냥 k:v 했으면 똑같은 dict가 나올 것
    - 단, k:v+3 을 했기 때문에 원래 dict의 value값에 3을 더해주었음 ->PAD, START, UNK, UNUSED를 넣기 위해서

이 네줄을 넣어서 사전을 update(modify)했다

### create idx2word: inverse of word2idx

    위에서 사전update를 한 것의 inverse version
    단어->숫자 에서 숫자값을 알면 그 단어가 뭐였는지 알도록 재참조를 하는 것

In [7]:
# Reverse lookup table (index -> word), built by swapping every (word, index)
# pair of word2idx; used to turn an encoded review back into text.
idx2word = {
    index: word
    for word, index in word2idx.items()
}

### padding x_train, x_test

In [8]:
# Length of the first review before pad_sequences is applied.
len(x_train[0])

218

In [9]:
# Bring every review in both splits to exactly 256 tokens: shorter reviews are
# filled at the end ('post') with the <PAD> index, longer ones are cut down to
# 256 by pad_sequences. The same settings are applied to train and test.
x_train, x_test = (
    pad_sequences(split, value=word2idx['<PAD>'], padding='post', maxlen=256)
    for split in (x_train, x_test)
)

1. maxlen=256: 모든 리뷰의 길이를 256으로 맞춤 (256보다 길면 기본값 truncating='pre'에 따라 앞부분을 잘라냄)
2. padding='post': 뒷부분에 padding을 해줘라
3. value=word2idx['<PAD>'] 앞에서 정해준 value값으로 padding

In [10]:
# Length of the first review after pad_sequences: it is now 256, with the
# extra positions filled by the <PAD> value 0 (visible in the content below).
len(x_train[0])

256

In [11]:
x_train[0]
# These 256 word indices are the input of the Embedding layer below, which
# replaces each index with a 128-dimensional vector, so the 1-D sequence becomes
# a 2-D 256x128 matrix (equivalent to multiplying a 256x10000 one-hot encoding
# by a 10000x128 weight matrix).

array([   1,   14,   22,   16,   43,  530,  973, 1622, 1385,   65,  458,
       4468,   66, 3941,    4,  173,   36,  256,    5,   25,  100,   43,
        838,  112,   50,  670,    2,    9,   35,  480,  284,    5,  150,
          4,  172,  112,  167,    2,  336,  385,   39,    4,  172, 4536,
       1111,   17,  546,   38,   13,  447,    4,  192,   50,   16,    6,
        147, 2025,   19,   14,   22,    4, 1920, 4613,  469,    4,   22,
         71,   87,   12,   16,   43,  530,   38,   76,   15,   13, 1247,
          4,   22,   17,  515,   17,   12,   16,  626,   18,    2,    5,
         62,  386,   12,    8,  316,    8,  106,    5,    4, 2223, 5244,
         16,  480,   66, 3785,   33,    4,  130,   12,   16,   38,  619,
          5,   25,  124,   51,   36,  135,   48,   25, 1415,   33,    6,
         22,   12,  215,   28,   77,   52,    5,   14,  407,   16,   82,
          2,    8,    4,  107,  117, 5952,   15,  256,    4,    2,    7,
       3766,    5,  723,   36,   71,   43,  530,  4

### create model

#### Embedding
- embedding의 input: 256개의 숫자
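- embedding의 역할: 256개의 단어 index 각각을 128차원 vector로 바꿈 (256x10000 one-hot에 10000x128 weight를 곱하는 것과 같음) -> 출력은 256x128
- 그 다음 GlobalAveragePooling1D(gap)가 256개의 단어 vector를 평균내어 리뷰 하나당 128차원 vector 하나로 만듦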

In [12]:
# Averaged-embedding sentiment classifier, declared as a list of layers.
model = Sequential([
    # Trainable lookup table: one 128-dimensional vector per word index
    # (vocab_size rows).
    Embedding(vocab_size, 128),
    # Average the word vectors over the sequence axis, giving a single
    # 128-dimensional vector per review regardless of its length.
    GlobalAveragePooling1D(),
    # One sigmoid unit: a score in (0, 1) to be compared with the 0/1 label.
    Dense(1, activation='sigmoid'),
])

Instructions for updating:
Colocations handled automatically by placer.


In [13]:
# Print each layer's output shape and parameter count, plus the totals.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1280000   
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________
