In [1]:
import urllib.request
import os
import tarfile

In [2]:
# Download the IMDb dataset archive to the target path.
# Skipped when the archive is already present.
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath = "/Users/PChomeIM/pywork/Dataset/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    # urlretrieve does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Fetch into a temporary name and rename once complete, so an interrupted
    # download never leaves a truncated archive that the isfile() guard accepts.
    partial_path = filepath + ".part"
    _, headers = urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, filepath)
    # Same (filename, headers) pair urlretrieve would return for the final path.
    result = (filepath, headers)
    print('download', result)

In [3]:
# Extract the downloaded archive.
# Skipped when the extracted folder already exists.
if not os.path.exists("/Users/PChomeIM/pywork/Dataset/aclImdb"):
    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): the archive is fetched over plain HTTP and extracted without
    # member filtering; consider tarfile's extraction filters on Python versions
    # that support them.
    with tarfile.open(filepath, 'r:gz') as tfile:
        tfile.extractall("/Users/PChomeIM/pywork/Dataset/")

In [4]:
# 將評論的htmltag替換成空字串的函數
import re
def rm_tags(text):
    """Return ``text`` with every ``<...>`` tag removed.

    A tag is a ``<``, followed by one or more characters other than ``>``,
    followed by ``>``. Each match is replaced with the empty string.
    """
    return re.sub(r'<[^>]+>', '', text)

In [5]:
# Reader for the training or test split of the extracted dataset.
def read_files(filetype, path="/Users/PChomeIM/pywork/Dataset/aclImdb/"):
    """Load every review of one dataset split together with its sentiment label.

    Args:
        filetype: Name of the split sub-directory, e.g. 'train' or 'test'.
        path: Root folder that contains the split directories. Defaults to the
            location this notebook extracts the archive to.

    Returns:
        A tuple ``(all_texts, all_labels)``. ``all_texts`` holds the review
        texts with tags stripped by ``rm_tags``, files from ``pos`` first and
        files from ``neg`` after them. ``all_labels`` is a parallel list with
        1 for each ``pos`` file and 0 for each ``neg`` file.
    """
    positive_path = os.path.join(path, filetype, "pos")
    negative_path = os.path.join(path, filetype, "neg")

    positive_files = [os.path.join(positive_path, f) for f in os.listdir(positive_path)]
    negative_files = [os.path.join(negative_path, f) for f in os.listdir(negative_path)]
    file_list = positive_files + negative_files

    print('read', filetype, 'files:', len(file_list))

    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            # Lines are joined with a space, then tags are stripped.
            all_texts.append(rm_tags(" ".join(file_input.readlines())))

    # Derive the labels from the actual file counts so texts and labels stay
    # aligned even when a folder does not hold exactly 12500 reviews.
    all_labels = [1] * len(positive_files) + [0] * len(negative_files)

    return all_texts, all_labels

In [6]:
# Load the training split: the review texts and their 1/0 labels from read_files.
train_text, train_label = read_files('train')

read train files: 25000


In [7]:
# Load the test split: the review texts and their 1/0 labels from read_files.
test_text, test_label = read_files('test')

read test files: 25000


In [8]:
# Display the training review at index 1 as returned by read_files.
train_text[1]

'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they\'ll be next to end up on the streets.But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it\'s like to be homeless? That is Goddard Bolt\'s lesson.Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days without the luxuries; if Bolt 

In [9]:
# Display the label stored at the same index (1) of the training labels.
train_label[1]

1

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

Using TensorFlow backend.


In [11]:
# Build the token dictionary from the training reviews only.
# Original note: the 3800 most frequent words in the reviews serve as tokens.
# NOTE(review): confirm the exact cut-off implied by num_words against the Keras docs.
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

In [12]:
# Inspect the word -> index mapping built by fit_on_texts.
# The recorded output contains indices far above 3800, so this mapping is not
# limited by num_words.
token.word_index

{'anchía': 60328,
 'posers': 37578,
 'drekish': 75268,
 'smiting': 57993,
 'blurry': 11522,
 'calamai': 17142,
 'oragami': 71128,
 'inflection': 25369,
 'buckwheat': 44121,
 'østbye': 69277,
 'deliberation': 27795,
 'ging': 67741,
 'chucklethis': 56159,
 'reisman': 61947,
 'entrée': 30121,
 'summation': 17134,
 'transparent': 7835,
 'mileage': 22548,
 'barreled': 55606,
 '31st': 33139,
 'rojar': 71570,
 'gouge': 23287,
 'kippei': 52231,
 'stretta': 79352,
 'suburbanite': 31058,
 'exec': 13193,
 "'intelligence'": 51172,
 'couldve': 78070,
 'disagreeing': 31968,
 'usefully': 78713,
 'auster': 85359,
 "mess'": 48855,
 'curiosity': 3615,
 'isabel\x97who': 69906,
 'sat1': 40793,
 'yvaine': 15437,
 'fortuneately': 78371,
 "'s'": 51550,
 'pits': 10563,
 'asserts': 15714,
 'mccall': 29506,
 "cheese'": 88491,
 'feverishly': 36099,
 "o'shea": 16176,
 "pbs's": 55834,
 'destructed': 42887,
 'perfection': 3198,
 'gum': 10417,
 'clincher': 30312,
 'reg': 22238,
 'ria': 53370,
 'anakin': 12538,
 'sta

In [13]:
# Convert each review into a list of numbers using the dictionary.
# Words that are not in the dictionary are not converted.
train_seq = token.texts_to_sequences(train_text)
test_seq = token.texts_to_sequences(test_text)

In [14]:
# Display the number sequence produced for the training review at index 1.
train_seq[1]

[38,
 13,
 739,
 3413,
 43,
 73,
 31,
 1828,
 14,
 149,
 17,
 111,
 3,
 1338,
 5,
 335,
 144,
 19,
 1,
 886,
 11,
 67,
 276,
 1190,
 402,
 33,
 118,
 282,
 35,
 166,
 5,
 391,
 153,
 38,
 2304,
 14,
 1,
 546,
 87,
 80,
 100,
 4,
 1,
 3263,
 13,
 39,
 3,
 412,
 1199,
 133,
 40,
 179,
 137,
 13,
 3080,
 1,
 321,
 19,
 358,
 5,
 3107,
 2126,
 1,
 38,
 44,
 3656,
 25,
 371,
 5,
 126,
 52,
 19,
 1,
 1980,
 17,
 47,
 44,
 21,
 67,
 344,
 3,
 2128,
 5,
 408,
 19,
 1,
 1980,
 14,
 3,
 3229,
 205,
 1,
 21,
 276,
 65,
 35,
 3,
 340,
 1,
 719,
 725,
 3,
 1264,
 19,
 1,
 1506,
 3,
 1220,
 2,
 282,
 21,
 276,
 2523,
 5,
 63,
 47,
 41,
 36,
 5,
 25,
 3263,
 11,
 6,
 2030,
 3763,
 3208,
 33,
 33,
 379,
 13,
 294,
 3,
 1022,
 128,
 33,
 43,
 282,
 7,
 1,
 178,
 362,
 5,
 93,
 3,
 2128,
 15,
 3,
 2995,
 5,
 63,
 44,
 26,
 66,
 408,
 7,
 1,
 1980,
 14,
 3247,
 499,
 205,
 1,
 44,
 2875,
 26,
 66,
 78,
 47,
 26,
 490,
 15,
 3,
 701,
 1181,
 4,
 227,
 49,
 1,
 19,
 117,
 6,
 1367,
 19,
 1,
 886,
 15,
 3,


In [15]:
# Each review's number list has a different length, but deep learning needs a
# fixed length, so every sequence is brought to maxlen=380 entries.
train_feature = sequence.pad_sequences(train_seq, maxlen=380)
test_feature = sequence.pad_sequences(test_seq, maxlen=380)

In [16]:
# Display the first padded training sequence; the recorded output begins with zeros.
train_feature[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [17]:
# Check the length of the first padded sequence (380 in the recorded output).
len(train_feature[0])

380