In [12]:
from keras.datasets import imdb
import numpy as np
import os
from keras.preprocessing import sequence
import pickle

def load_data(max_features=5000, skip_top=0, maxlen=400):
    """
    Return the padded IMDB review dataset, building and caching it on demand.

    On a cache miss the data is fetched with ``imdb.load_data``, padded with
    ``sequence.pad_sequences`` and the labels are one-hot encoded; the results
    are written under ``./data`` (relative to the current working directory).
    On a cache hit everything is read back from those files.

    The cache is reused only when every cache file is present AND it was
    built with the same ``max_features`` / ``skip_top`` / ``maxlen``. The
    arguments used for a build are recorded in ``data/load_params.pkl``; a
    cache without that file (for example one written by an earlier version of
    this function) or with different arguments is rebuilt and overwritten.

    Parameters
    ----------
    max_features : int
        Forwarded to ``imdb.load_data`` as ``num_words``.
    skip_top : int
        Forwarded to ``imdb.load_data`` as ``skip_top``.
    maxlen : int
        Forwarded to ``sequence.pad_sequences`` as ``maxlen``.

    Returns
    -------
    x_train, y_train, x_val, y_val : numpy.ndarray
        Padded id sequences and two-column one-hot labels for the two splits
        returned by ``imdb.load_data`` (the second split is named ``*_val``).
    id_to_word : dict
        Maps integer ids to words, including ``<PAD>`` (0), ``<START>`` (1)
        and ``<UNK>`` (2).
    """
    data_dir = 'data'
    array_paths = [
        os.path.join(data_dir, name + '.npy')
        for name in ('x_train', 'y_train', 'x_val', 'y_val')
    ]
    vocab_path = os.path.join(data_dir, 'id_to_word.pkl')
    params_path = os.path.join(data_dir, 'load_params.pkl')
    requested = {
        'max_features': max_features,
        'skip_top': skip_top,
        'maxlen': maxlen,
    }

    os.makedirs(data_dir, exist_ok=True)

    # The cache is valid only if it is complete and was built with the
    # arguments requested now; otherwise stale arrays would be returned.
    cache_ok = all(
        os.path.isfile(path) for path in array_paths + [vocab_path, params_path]
    )
    if cache_ok:
        # NOTE: pickle.load can execute arbitrary code from the file; these
        # files must only ever be caches written by this function.
        try:
            with open(params_path, 'rb') as f:
                cache_ok = pickle.load(f) == requested
        except (EOFError, pickle.UnpicklingError):
            # A truncated/corrupt params file means the cache cannot be trusted.
            cache_ok = False

    if not cache_ok:
        print('Loading data...')
        (x_train, y_train), (x_val, y_val) = imdb.load_data(
            num_words=max_features, skip_top=skip_top, index_from=3)

        # The sequences are offset by index_from=3, so shift the raw word
        # index by the same amount; ids 0, 1 and 2 are the special tokens.
        word_to_id = {k: (v + 3) for k, v in imdb.get_word_index().items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        id_to_word = {value: key for key, value in word_to_id.items()}

        print(len(x_train), 'train sequences')
        print(len(x_val), 'test sequences')

        print('Pad sequences (samples x time)')
        x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
        x_val = sequence.pad_sequences(x_val, maxlen=maxlen)
        # Indexing the 2x2 identity with the 0/1 labels yields one-hot rows.
        y_train = np.eye(2)[y_train]
        y_val = np.eye(2)[y_val]

        for path, array in zip(array_paths, (x_train, y_train, x_val, y_val)):
            np.save(path, array)
        with open(vocab_path, 'wb') as f:
            pickle.dump(id_to_word, f)
        # Written last so that an interrupted build never looks complete.
        with open(params_path, 'wb') as f:
            pickle.dump(requested, f)

    else:
        x_train, y_train, x_val, y_val = [np.load(path) for path in array_paths]
        # NOTE: see the pickle warning above; only load self-written caches.
        with open(vocab_path, 'rb') as f:
            id_to_word = pickle.load(f)

    return x_train, y_train, x_val, y_val, id_to_word

In [13]:
# Fetch the padded splits and the id -> word mapping; maxlen is left at the
# load_data default.
x_train, y_train, x_val, y_val, id_to_word = load_data(
    max_features=5000,
    skip_top=0,
)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)


In [18]:
# Sanity check: display the (rows, columns) shape of the padded second split.
x_val.shape

(25000, 400)

In [20]:
# Stack both padded input matrices row-wise into a single array.
x = np.concatenate((x_train, x_val), axis=0)

In [21]:
# Sanity check: the stacked array should have the rows of both splits combined.
x.shape

(50000, 400)