In [1]:
import urllib.request
import os
import tarfile

In [2]:
# Download the IMDb review archive once; the download is skipped when the
# archive file is already present locally.
url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="data/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    # The target directory must exist before urlretrieve can write the file,
    # so create it on a fresh checkout instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

In [3]:
# Unpack the archive only when the extracted dataset directory is missing.
if not os.path.exists("data/aclImdb"):
    # The context manager closes the archive handle even if extraction fails
    # part-way (the original left the handle open).
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # on Python versions that support it, consider passing filter='data'.
    with tarfile.open("data/aclImdb_v1.tar.gz", 'r:gz') as tfile:
        result=tfile.extractall('data/')

# 1. Import Library

In [4]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# 資料準備

# 讀取檔案

In [5]:
import re
def rm_tags(text):
    """Return ``text`` with every ``<...>`` markup tag deleted.

    A tag is a ``<``, followed by one or more characters that are not ``>``,
    followed by a closing ``>``. Everything else is left untouched.
    """
    return re.sub(r'<[^>]+>', '', text)

In [6]:
import os
def read_files(filetype):
    """Load one split of the extracted aclImdb dataset.

    Parameters
    ----------
    filetype : str
        Name of the sub-directory under ``data/aclImdb/`` to read
        (the notebook calls this with ``"train"`` and ``"test"``).

    Returns
    -------
    (all_labels, all_texts) : tuple of lists
        ``all_labels[i]`` is 1 for a file from ``pos/`` and 0 for a file from
        ``neg/``; ``all_texts[i]`` is that file's content with its lines
        joined by a space and markup tags removed by ``rm_tags``. All ``pos/``
        entries come first, followed by all ``neg/`` entries.
    """
    path = "data/aclImdb/"

    # NOTE: os.listdir returns names in arbitrary order, so the ordering of
    # reviews inside each class is whatever the file system reports.
    positive_path = path + filetype + "/pos/"
    positive_files = [positive_path + f for f in os.listdir(positive_path)]

    negative_path = path + filetype + "/neg/"
    negative_files = [negative_path + f for f in os.listdir(negative_path)]

    file_list = positive_files + negative_files
    print('read',filetype, 'files:',len(file_list))

    # Derive the labels from the files actually found instead of assuming
    # exactly 12500 per class, so labels and texts always stay aligned.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    all_texts = []
    for fi in file_list:
        with open(fi,encoding='utf8') as file_input:
            all_texts.append(rm_tags(" ".join(file_input.readlines())))

    return all_labels,all_texts

In [7]:
# Load the training split: y_train holds the labels, train_text the tag-stripped review strings.
y_train,train_text=read_files("train")

read train files: 25000


In [8]:
# Load the test split the same way: y_test holds the labels, test_text the tag-stripped review strings.
y_test,test_text=read_files("test")

read test files: 25000


# 查看正面評價的影評

In [9]:
# First training review; read_files lists the pos/ directory before neg/.
train_text[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [10]:
# Label paired with train_text[0] (1 is the value read_files assigns to pos/ reviews).
y_train[0]

1

# 查看最後一筆正面評價的影評（索引 12499 的標籤仍為 1；負面評價從索引 12500 開始）

In [11]:
# Index 12499 is the last of the 12500 entries that read_files labels 1;
# the entries labelled 0 start at index 12500.
train_text[12499]

'Working-class romantic drama from director Martin Ritt is as unbelievable as they come, yet there are moments of pleasure due mostly to the charisma of stars Jane Fonda and Robert De Niro (both terrific). She\'s a widow who can\'t move on, he\'s illiterate and a closet-inventor--you can probably guess the rest. Adaptation of Pat Barker\'s novel "Union Street" (a better title!) is so laid-back it verges on bland, and the film\'s editing is a mess, but it\'s still pleasant; a rosy-hued blue-collar fantasy. There are no overtures to serious issues (even the illiteracy angle is just a plot-tool for the ensuing love story) and no real fireworks, though the characters are intentionally a bit colorless and the leads are toned down to an interesting degree. The finale is pure fluff--and cynics will find it difficult to swallow--though these two characters deserve a happy ending and the picture wouldn\'t really be satisfying any other way. *** from ****'

In [12]:
# Label for index 12499: still inside the leading block of 1s (index 12500 is the first 0).
y_train[12499]

1

# 先讀取所有文章建立字典，限制字典的數量為num_words=2000

In [13]:
# Build the vocabulary from the training texts only.
# num_words=2000 limits the vocabulary used when texts are converted to
# sequences (the sequences printed later contain only indices below 2000).
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)

# Tokenizer屬性

# fit_on_texts 讀取多少文章

In [14]:
# Number of documents the tokenizer has seen via fit_on_texts.
print(token.document_count)

25000


In [15]:
# word_index maps each word to its integer index. Printing the whole dict
# floods the notebook output, so show only the first ten entries.
# NOTE(review): word_index appears to hold the full vocabulary rather than
# only the num_words most frequent words — confirm against the Keras docs.
print(list(token.word_index.items())[:10])



# 將每一篇文章的文字轉換一連串的數字
# 只有在字典中的文字會轉換為數字

In [16]:
# Convert every review into a list of integer word indices with the fitted
# tokenizer; the test texts reuse the vocabulary fitted on the training texts.
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq  = token.texts_to_sequences(test_text)

In [17]:
# Original text of the first training review, for comparison with its index sequence.
print(train_text[0])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [18]:
# The same review expressed as a sequence of word indices.
print(x_train_seq[0])

[308, 6, 3, 1068, 208, 8, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 6, 72, 5, 631, 70, 6, 1, 5, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 222, 899, 28, 68, 4, 1, 9, 693, 2, 64, 1530, 50, 9, 215, 1, 386, 7, 59, 3, 1470, 798, 5, 176, 1, 391, 9, 1235, 29, 308, 3, 352, 343, 142, 129, 5, 27, 4, 125, 1470, 5, 308, 9, 532, 11, 107, 1466, 4, 57, 554, 100, 11, 308, 6, 226, 47, 3, 11, 8, 214]


# 讓轉換後的數字長度相同

# 文章內的文字轉換為數字後，每一篇文章所產生的數字長度都不同；因為之後需要進行類神經網路的訓練，所以每一篇文章所產生的數字長度必須相同
# 以下列程式碼為例，maxlen=100，所以每一篇文章轉換後的數字長度都必須為100

In [19]:
# Force every sequence to length 100; sequences longer than that lose their
# leading entries (compare the before/after outputs printed further down).
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen=100)

# 如果文章轉成數字後長度大於100，pad_sequences處理後，會截去（truncate）前面的數字

In [20]:
# Length and contents of the first review's sequence before padding/truncation.
print('before pad_sequences length=',len(x_train_seq[0]))
print(x_train_seq[0])

before pad_sequences length= 106
[308, 6, 3, 1068, 208, 8, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 6, 72, 5, 631, 70, 6, 1, 5, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 222, 899, 28, 68, 4, 1, 9, 693, 2, 64, 1530, 50, 9, 215, 1, 386, 7, 59, 3, 1470, 798, 5, 176, 1, 391, 9, 1235, 29, 308, 3, 352, 343, 142, 129, 5, 27, 4, 125, 1470, 5, 308, 9, 532, 11, 107, 1466, 4, 57, 554, 100, 11, 308, 6, 226, 47, 3, 11, 8, 214]


In [21]:
# Length and contents of the first review after pad_sequences.
print('after pad_sequences length=',len(x_train[0]))
print(x_train[0])

after pad_sequences length= 100
[  29    1  168   54   13   45   81   40  391  109  137   13   57  149
    7    1  481   68    5  260   11    6   72    5  631   70    6    1
    5    1 1530   33   66   63  204  139   64 1229    1    4    1  222
  899   28   68    4    1    9  693    2   64 1530   50    9  215    1
  386    7   59    3 1470  798    5  176    1  391    9 1235   29  308
    3  352  343  142  129    5   27    4  125 1470    5  308    9  532
   11  107 1466    4   57  554  100   11  308    6  226   47    3   11
    8  214]


# 如果文章轉成數字不足100,pad_sequences處理後，前面會加上0（註：下方範例 x_train_seq[1] 的長度為335，超過100，實際示範的仍是截斷前面數字的情形）

In [22]:
# Length and contents of the second review's sequence before padding/truncation.
print('before pad_sequences length=',len(x_train_seq[1]))
print(x_train_seq[1])

before pad_sequences length= 335
[38, 13, 739, 43, 73, 31, 1828, 14, 149, 17, 111, 3, 1338, 5, 335, 144, 19, 1, 886, 11, 67, 276, 1190, 402, 33, 118, 282, 35, 166, 5, 391, 153, 38, 14, 1, 546, 87, 80, 100, 4, 1, 13, 39, 3, 412, 1199, 133, 40, 179, 137, 13, 1, 321, 19, 358, 5, 1, 38, 44, 25, 371, 5, 126, 52, 19, 1, 1980, 17, 47, 44, 21, 67, 344, 3, 5, 408, 19, 1, 1980, 14, 3, 205, 1, 21, 276, 65, 35, 3, 340, 1, 719, 725, 3, 1264, 19, 1, 1506, 3, 1220, 2, 282, 21, 276, 5, 63, 47, 41, 36, 5, 25, 11, 6, 33, 33, 379, 13, 294, 3, 1022, 128, 33, 43, 282, 7, 1, 178, 362, 5, 93, 3, 15, 3, 5, 63, 44, 26, 66, 408, 7, 1, 1980, 14, 499, 205, 1, 44, 26, 66, 78, 47, 26, 490, 15, 3, 701, 1181, 4, 227, 49, 1, 19, 117, 6, 1367, 19, 1, 886, 15, 3, 19, 23, 5, 23, 171, 843, 117, 26, 187, 1483, 121, 1, 236, 344, 1, 30, 3, 99, 41, 394, 19, 23, 117, 888, 81, 101, 582, 3, 251, 30, 1, 399, 4, 1956, 31, 1230, 33, 184, 154, 37, 340, 2, 37, 2, 33, 22, 454, 338, 5, 1, 1980, 502, 214, 236, 20, 338, 5, 36, 26, 276, 1

In [23]:
# Length and contents of the second review after pad_sequences.
print('after pad_sequences length=',len(x_train[1]))
print(x_train[1])

after pad_sequences length= 100
[ 544   38  511   38   25  551  133    1  115  196    2  281 1660    5
  110    9  254  109    5   25   27    4  104  117    5  108    3  208
    8  283    3  495 1074    5   23  153  137   13  181   38   14    1
  546    5  119   47   41   36  256  138  154    8    1  371  247   38
   19    1   81  503  227    3  374   36   28 1022   80   78   50   32
   88  120   47    5   78   15   64  274  275   32  140  198    8    5
    1  300    4  767    8   36  274   38  275   10   18   76   21    5
  335  404]


# 資料預處理

In [24]:
# Recap of the preprocessing pipeline, step 1: rebuild the tokenizer with the
# same settings used earlier in this notebook and fit it on the training texts.
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)

In [25]:
# Recap, step 2: convert both splits to sequences of word indices.
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq  = token.texts_to_sequences(test_text)

In [26]:
# Recap, step 3: bring every sequence to a fixed length of 100.
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen=100)