# NLP Example: Imdb comments

## Step1: Prepare Data 
- #### Goal1: 下載Dataset並解壓縮

In [0]:
import tensorflow as tf
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True)

- #### Goal2: 定義一個取得評論路徑的函數，並將內容與情緒正負整理成DataFrame

In [0]:
import os
import glob
import pandas as pd

dn = os.path.dirname(dataset)
dn = os.path.join(dn, "aclImdb")

def get_data(n):
    train_dn = os.path.join(dn, n)
    contents = []
    sentiment = []

    pos_fn = os.path.join(train_dn, "pos", "*.txt")
    for fn in glob.glob(pos_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(1)

    neg_fn = os.path.join(train_dn, "neg", "*.txt")
    for fn in glob.glob(neg_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(0)
    df = pd.DataFrame({
        "content": contents,
        "sentiment": sentiment
    },columns=["content", "sentiment"])
    return df

In [17]:
train_df = get_data("train")
test_df = get_data("test")
test_df

# train、test各有25000筆包含正負評的資料

Unnamed: 0,content,sentiment
0,The evil Professor Moriarty plots to gain cont...,1
1,"As a French, i found it very pleasant to be ab...",1
2,Mary Pickford plays Annie Rooney--the daughter...,1
3,I don't recall a film which so deftly shows th...,1
4,When is ART going to overcome racism? I believ...,1
...,...,...
24995,This film is a perfect example that a movie ca...,0
24996,"First, I should say that I've seen the '39 ver...",0
24997,Oh God. Why is it that Nickelodeon has such a ...,0
24998,"Lots of flames, thousands of extras in battle ...",0


## Step2-1: Tokenize - 找出詞種類數 (NLP步驟一)
- 此步驟類似CountVectorizer的fit (找出多少種詞)

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
# num_words: 抓出出現率高的幾種詞

tok.fit_on_texts(train_df["content"])

In [0]:
# 製作詞和其index正向轉換、反向轉換的字典
index_2_word = tok.index_word
word_2_index = {v:k for k, v in tok.index_word.items()}

# 順序會按照出現次數多寡降冪列出
# len(tok.index_word) = 3000

## Step2-2: Sequence - 文字對應數字(NLP步驟二)
#### 將每篇評論出現的詞，依語句順序列出該詞對應的index
- 此步驟類似CountVectorizer的transform

In [23]:
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])

pd.DataFrame(x_train_seq)
# DataFrame中所有數字(index)都在1-3000內(上步共選了3000個詞)
# 25000 rows × 1816 columns : 最多的那篇評論有1816個詞在3000單字裡
# 而其他不足長(小於1816)的尾則是NaN空值

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1776,1777,1778,1779,1780,1781,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,10,2278,316,11,19,20,245,46,150.0,593.0,2.0,21.0,2640.0,365.0,688.0,276.0,57.0,1009.0,1.0,83.0,317.0,35.0,10.0,382.0,5.0,1.0,1171.0,12.0,9.0,13.0,750.0,2.0,117.0,1146.0,10.0,869.0,5.0,9.0,233.0,311.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1650,2735,8,3,670,8,16,538,20.0,24.0,953.0,266.0,5.0,94.0,278.0,4.0,48.0,237.0,40.0,221.0,2078.0,3.0,2289.0,8.0,1.0,5.0,800.0,40.0,571.0,20.0,265.0,32.0,295.0,1404.0,20.0,1.0,4.0,24.0,8.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,13,578,140,655,338,51,56,119,11.0,19.0,735.0,13.0,747.0,1.0,347.0,4.0,24.0,6.0,8.0,318.0,808.0,2.0,2932.0,32.0,731.0,15.0,1422.0,667.0,6.0,2297.0,732.0,40.0,343.0,15.0,24.0,214.0,7.0,7.0,1.0,62.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10,101,1,19,6,187,9,993,5.0,1689.0,1.0,1829.0,60.0,35.0,73.0,1340.0,79.0,1.0,19.0,993.0,5.0,2588.0,48.0,1.0,1408.0,191.0,1429.0,13.0,1045.0,5.0,373.0,1.0,179.0,48.0,33.0,162.0,261.0,10.0,97.0,21.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2,11,6,3,84,17,8,407,54.0,548.0,86.0,9.0,30.0,210.0,109.0,3.0,17.0,41.0,9.0,1051.0,53.0,14.0,28.0,4.0,1.0,2086.0,99.0,8.0,60.0,359.0,178.0,5.0,735.0,43.0,18.0,1.0,1404.0,8.0,65.0,93.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,9,13,40,3,391,17,54,28,141.0,435.0,65.0,55.0,137.0,64.0,139.0,331.0,11.0,17.0,6.0,206.0,3.0,821.0,28.0,4.0,1.0,246.0,99.0,10.0,25.0,123.0,107.0,8.0,58.0,110.0,44.0,22.0,178.0,5.0,64.0,3.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,157,17,12,722,1,43,1592,4,1.0,1165.0,1651.0,822.0,1.0,17.0,184.0,3.0,623.0,6.0,5.0,719.0,1.0,348.0,142.0,5.0,110.0,10.0,121.0,10.0,121.0,871.0,112.0,553.0,12.0,28.0,156.0,4.0,262.0,14.0,29.0,4.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,161,67,22,15,157,2317,11,55,42.0,109.0,835.0,5.0,22.0,31.0,1.0,112.0,1796.0,1536.0,14.0,227.0,14.0,2225.0,99.0,137.0,11.0,28.0,149.0,14.0,1063.0,1216.0,42.0,1041.0,29.0,12.0,72.0,64.0,6.0,32.0,32.0,1558.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,204,107,46,2136,99,8,58,110,18.0,11.0,28.0,212.0,27.0,790.0,1.0,52.0,246.0,1322.0,1242.0,815.0,12.0,6.0,7.0,7.0,72.0,791.0,104.0,1.0,1509.0,2049.0,14.0,2.0,14.0,2.0,1.0,2329.0,14.0,2354.0,2.0,14.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Step2-3: Padding - 讓所有資料長度一致
pad_sequences(maxlen=, padding='pre', truncating='pre')  
- padding='pre': 在不足maxlen的資料前面補0
- truncating='pre': 從頭截掉超過maxlen的資料

> ### 為何要做Padding?
> 因為後面要接MLP，input一定要是一樣的

> ### maxlen(資料長度)要選多長? 
> 依據問題的複雜度做調整；而此例為情緒分析(正、負)，因此maxlen不必選擇太長 (ex. 256)

In [24]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_padseq = pad_sequences(x_train_seq, maxlen=256)
x_test_padseq = pad_sequences(x_test_seq, maxlen=256)

pd.DataFrame(x_train_padseq)
# DataFrame中的數字多了一種: 0；因此變為在0-3000內 (3001個)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,446,2,119,10,757,12,1,753,610,6,84,14,70,70,9,6,2,11,17,141,27,107,31,17,448,2576,7,7,313,141,64,11,17,15,1,581,40,89,532,3
1,620,5,2903,175,8,1,4,2,2502,7,7,9,6,79,239,1,115,19,487,64,8,1,434,11,288,7,7,42,40,35,1588,2735,555,5,2178,11,215,46,62,11,...,335,109,364,31,9,7,7,16,91,1588,202,267,1321,2095,1780,766,2,1086,4,1,90,8,1,400,4,2079,2502,1,2735,198,6,28,12,2338,1,4,260,556,6,91
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,794,59,21,920,48,33,23,19,124,3,49,289,976,3,62,162,886,3,751,5,27,90,30,1,127,148,597,123,13,90,148,395,721,1,1928,59,27,3,49,422
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,218,110,100,29,9,405,2616,80,295,194,20,2197,8,1,1114,437,33,78,46,20,11,1425,872,5,927,20,110,72,200,25,3,2211,46,248,100,29,1255,2161,15,2211
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,34,97,61,374,316,528,4,9,140,3,30,1,1275,8,2,26,12,11,6,28,4,1,84,99,5,27,146,1816,21,1,219,15,1,1331,968,1906,2,1,84,928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,54,28,141,435,65,55,137,64,139,331,11,17,6,206,3,821,28,4,1,246,99,10,25,123,107,8,58,110,44,22,178,5,64,3,49,17,89,64,90,346
24996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,224,2570,352,37,108,99,4,11,997,1,323,6,324,18,1,2594,2,1,226,6,1499,21,1,246,186,17,204,107,54,1,238,128,301,1,18,21,28,4,1,342
24997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,2359,20,52,309,18,12,128,584,94,11,28,84,471,37,5,210,43,157,19,443,1,31,1237,60,1,2040,4,9,135,347,2497,412,202,99,89,70,8,935,803
24998,893,2354,2078,215,96,249,44,24,6,607,35,237,394,9,31,3,1037,7,7,147,10,66,329,1,142,4,1,285,18,58,978,158,2,56,43,100,41,888,232,1240,...,1,1758,135,423,33,40,398,1,367,20,12,15,1551,232,7,7,28,59,532,50,36,11,174,258,63,494,18,22,188,63,1819,95,924,899,20,22,7,7,297,155


## Step3-1: 建立模型
#### !!!無論是什麼NLP，一定從嵌入層 (Embedding Layer) 著手

tensorflow.keras.layers.Embedding(input_dim=input input有幾種(此例:3001), mask_zero="True", input_length= )
- mask_zero="True": 是否屏蔽(因為0只是因padding產生的，沒有意義)
- input_length=?: 有幾個詞要進去(此例:256)

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, Dense, Dropout

INPUT_DIM = 3000 + 1
EMBEDDING_DIM = 64
INPUT_LENGTH = 256

# EMBEDDING_DIM = 64，output會是64種特徵(語意、感情) -> 電腦自己決定的

model = Sequential()
model.add(Embedding(INPUT_DIM,
                    EMBEDDING_DIM,
                    mask_zero=True,
                    input_length=INPUT_LENGTH))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(2, activation="softmax"))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 256, 64)           192064    
_________________________________________________________________
flatten (Flatten)            (None, 16384)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               2097280   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 2,289,602
Trainable params: 2,289,602
Non-trainable params: 0
_________________________________________________________________


### Param
embedding_1層param: 192064 = 3001 * 64


## Step3-2: 決定模型訓練方式 

In [0]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(), 
              optimizer="adam",
              metrics=["accuracy"])

## Step3-3: 訓練模型

In [29]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
y_train = train_df["sentiment"]
y_test = test_df["sentiment"]

stop_callback = EarlyStopping(patience=3, restore_best_weights=True)
save_callback = ModelCheckpoint("nlp_embedding.h5", save_best_only=True)

model.fit(x_train_padseq, 
          y_train,
          batch_size=200,
          epochs=100,
          verbose=2,
          validation_split=0.1,
          callbacks=[stop_callback, save_callback])

Epoch 1/100
113/113 - 2s - loss: 0.5288 - accuracy: 0.7122 - val_loss: 0.3428 - val_accuracy: 0.8452
Epoch 2/100
113/113 - 1s - loss: 0.2066 - accuracy: 0.9180 - val_loss: 0.3038 - val_accuracy: 0.8660
Epoch 3/100
113/113 - 1s - loss: 0.0676 - accuracy: 0.9812 - val_loss: 0.5162 - val_accuracy: 0.8164
Epoch 4/100
113/113 - 1s - loss: 0.0146 - accuracy: 0.9979 - val_loss: 0.5069 - val_accuracy: 0.8496
Epoch 5/100
113/113 - 1s - loss: 0.0039 - accuracy: 0.9998 - val_loss: 0.5622 - val_accuracy: 0.8508


<tensorflow.python.keras.callbacks.History at 0x7f856b14e128>

## Step4: Evaluate

In [30]:
model.evaluate(x_test_padseq, y_test)



[0.3163883090019226, 0.867680013179779]

### 訓練完完整模型，但只需要其中一層(Embedding層)
- Embedding層: 想像為可產生詞向量的機器

In [31]:
infer = Sequential()
infer.add(Embedding(INPUT_DIM,
                    EMBEDDING_DIM))

# 把訓練好的weights拿來用
w = model.layers[0].get_weights()
infer.set_weights(w)
infer.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 64)          192064    
Total params: 192,064
Trainable params: 192,064
Non-trainable params: 0
_________________________________________________________________


### 把輸入詞彙轉換成詞向量

In [33]:
# 輸入的長相: data = [[1, 2, 3], [4, 5, 6]]
w = input("你要轉換哪的詞? ")
data = [[word_2_index[w]]]
print("詞向量: ", infer.predict(data))

你要轉換哪的詞? the
詞向量:  [[[-0.01138842 -0.0244815  -0.01297793  0.02080549  0.00363106
   -0.03247054 -0.04346778  0.03906973  0.01139266 -0.0397626
   -0.0081189  -0.0265445  -0.02179519  0.02952045 -0.00529313
   -0.0132681   0.02774608  0.03252506 -0.01963926 -0.03194793
   -0.01348299  0.0430054   0.01899976  0.02063266 -0.00278283
    0.01864485  0.01894192  0.03434108 -0.00574317 -0.02737129
    0.01581663  0.01658983 -0.01045542  0.02538743 -0.01701528
   -0.02005273 -0.02911117  0.01746017 -0.03761707  0.03061545
    0.01950994  0.02936767 -0.03756617  0.03319656  0.01959328
    0.03415286 -0.01248208  0.00382367 -0.03062304 -0.00460404
   -0.00131209 -0.0091872  -0.04910583  0.00215393  0.02670221
   -0.00791561  0.00944699 -0.00720445  0.00163057 -0.00496887
   -0.03803299 -0.02136712  0.03516245  0.02360376]]]


- #### 詞向量內64個值，表示該詞與64個特徵(語意感受)的距離(接近程度)
> ### a、the這些詞需要在訓練前事先排除嗎?
> 不用，模型會自行做調整、決定它的重要性