<a href="https://colab.research.google.com/github/allenwang0713/tebaml0315/blob/main/imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [2]:

import os
import glob
import pandas as pd
def getdata(mid):
    dn = os.path.dirname(dataset)
    posfn = glob.glob(os.path.join(dn, "aclImdb", mid, "pos", "*"))
    negfn = glob.glob(os.path.join(dn, "aclImdb", mid, "neg", "*"))
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content":contents,
        "sentiment":[1] * len(posfn) + [0] * len(negfn)
    })
    return df
train_df = getdata("train")
test_df = getdata("test")

In [3]:
test_df

Unnamed: 0,content,sentiment
0,ROCK STAR / (2001) *** (out of four)<br /><br ...,1
1,The Cure is a fantastic film about a boy with ...,1
2,"Not having seen this film in quite some time, ...",1
3,One of the flat-out drollest movies of all-tim...,1
4,"NVA combines eastalgia-humor, military comedy ...",1
...,...,...
24995,Strangers with candy overacts in all the wrong...,0
24996,"What a disaster! Normally, when one critiques ...",0
24997,The Robot vs. the Aztec Mummy was one of the s...,0
24998,I'm not prone to ranting and my expectations w...,0


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
# 3000種常用詞彙+1padding(0): 美一篇文章進入的時候只取512在常用詞彙列表的詞, 每一個詞化做100維度的語意像量
layers = [
    # 沒有激活, 3001(種詞彙) * 100 -> 300100
    Embedding(input_dim=3001, output_dim=100, mask_zero=True, input_length=512),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 100)          300100    
                                                                 
 global_average_pooling1d (G  (None, 100)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [5]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
       metrics=["accuracy"],
       optimizer="adam")

In [6]:
# Tokenize: 詞彙換成數字, 建立一個3000常用詞彙辭典
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])

In [None]:
# tok.word_index
# tok.index_word
# 檢查: 這個case, 標點和換行是可以去掉的
# tok.word_index["?"]
# 停用詞(忽略一些無意義的): 不用, 根據答案就會把無意義的東西調整出來

In [7]:
import pandas as pd
# Sequenize: 把我的字轉換成數字(利用剛剛列表)
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,389,1357,7,7,51,10,605,30,1.0,15.0,...,,,,,,,,,,
1,57,148,11,17,6,176,151,54,548.0,86.0,...,,,,,,,,,,
2,10,1056,3,224,4,11,20,1067,695.0,2.0,...,,,,,,,,,,
3,10,1816,120,16,3,324,1918,705,18.0,204.0,...,,,,,,,,,,
4,3,758,4,1059,309,392,1534,294,3.0,2735.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,2,419,38,93,140,11,1976,706,60.0,44.0,...,,,,,,,,,,
24996,45,10,97,199,9,3,1454,471,650.0,58.0,...,,,,,,,,,,
24997,100,109,2177,31,440,236,14,3,8.0,1.0,...,,,,,,,,,,
24998,11,17,6,32,5,19,228,10,188.0,262.0,...,,,,,,,,,,


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=512)
x_test_pad = pad_sequences(x_test_seq, maxlen=512)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,...,5,1,372,1706,2,190,1,233,1370,341
1,0,0,0,0,0,0,0,0,0,0,...,37,543,108,405,34,25,293,9,487,27
2,0,0,0,0,0,0,0,0,0,0,...,2,96,487,178,5,398,9,742,772,2095
3,0,0,0,0,0,0,0,0,0,0,...,48,56,6,2246,4,900,76,142,5,12
4,0,0,0,0,0,0,0,0,0,0,...,24,2547,1,1605,624,124,1,2863,49,250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,32,4,1199,250,8,1,223,297,238,36
24996,0,0,0,0,0,0,0,0,0,0,...,21,167,5,78,70,452,141,27,443,302
24997,0,0,0,0,0,0,0,0,0,0,...,344,4,805,119,94,69,1822,287,690,979
24998,0,0,0,0,0,0,0,0,0,0,...,43,47,12,188,76,2,33,120,11,592


In [9]:
import numpy as np
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])