# imdb_fasttext.py
[imdb_fasttext.py][1]  

[1]: https://github.com/keras-team/keras/blob/master/examples/imdb_fasttext.py

In [18]:
'''This example demonstrates the use of fasttext for text classification
Based on Joulin et al's paper:
Bag of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759
Results on IMDB datasets with uni and bi-gram embeddings:
    Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 cpu.
    Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTx 980M gpu.
'''

from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.datasets import imdb


def create_ngram_set(input_list, ngram_value=2):
    """
    Collect the distinct n-grams of a sequence as a set of tuples.

    The sequence is zipped against copies of itself shifted by
    0 .. ngram_value - 1 positions, so each resulting tuple is one window
    of ``ngram_value`` consecutive items. Sequences shorter than
    ``ngram_value`` yield an empty set.

    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2))
    [(1, 4), (4, 1), (4, 9), (9, 4)]
    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3))
    [(1, 4, 9), (4, 1, 4), (4, 9, 4), (9, 4, 1)]
    """
    shifted_views = []
    for offset in range(ngram_value):
        shifted_views.append(input_list[offset:])
    # zip stops at the shortest view, which is what bounds the window count.
    windows = zip(*shifted_views)
    return set(windows)


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.

    For every n from 2 to ``ngram_range`` (inclusive) a window of n
    consecutive items slides over the ORIGINAL tokens of each sequence.
    Every window that is a key of ``token_indice`` contributes its mapped
    index to the end of the output sequence. Windows never include indices
    appended for a smaller n, so higher-order n-grams are always built from
    real tokens only.

    Args:
        sequences: iterable of integer sequences (e.g. list of lists).
        token_indice: dict mapping n-gram tuples to their integer index.
        ngram_range: largest n-gram order to add; values below 2 add nothing.

    Returns:
        A new list of lists; the input sequences are not modified.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # Freeze the original tokens: windows must not run into the n-gram
        # indices that get appended to the output below.
        tokens = list(input_list)
        n_tokens = len(tokens)
        new_list = tokens[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(n_tokens - ngram_value + 1):
                ngram = tuple(tokens[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

# Set parameters:
# ngram_range = 2 will add bi-grams features
# (larger values also add the higher-order n-grams; a value of 1 skips the
# n-gram augmentation step, which only runs when ngram_range > 1).
ngram_range = 2
# Vocabulary size: the commented-out loader passes this as num_words, and the
# n-gram step later raises it so it also covers the appended n-gram ids.
max_features = 20000
# Every sequence is brought to this many entries by pad_sequences.
maxlen = 400
# Mini-batch size handed to model.fit.
batch_size = 32
# Width of each embedding vector in the Embedding layer.
embedding_dims = 50
# Number of epochs handed to model.fit.
epochs = 5

In [19]:
# print('Loading data...')
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

Loading data...


In [20]:
# import pickle

# pickle_file="./imdb_fasttext.pickle"

# with open(pickle_file, 'wb') as f:
#     pickle.dump(x_train, f)
#     pickle.dump(y_train, f)
#     pickle.dump(x_test, f)
#     pickle.dump(y_test, f)

In [21]:
import pickle

# Cached IMDB splits, written earlier as four consecutive pickle records
# (x_train, y_train, x_test, y_test). pickle.load can execute arbitrary code,
# so only point this at a file you created yourself.
pickle_file = "./imdb_fasttext.pickle"

with open(pickle_file, 'rb') as f:
    # Each pickle.load call consumes the next record in the file, in order.
    x_train, y_train, x_test, y_test = (pickle.load(f) for _ in range(4))

In [22]:
# Report how many sequences each split holds.
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
# dtype=int makes NumPy report the mean length as a whole number.
print('Average train sequence length: {}'.format(
    np.mean([len(seq) for seq in x_train], dtype=int)))
print('Average test sequence length: {}'.format(
    np.mean([len(seq) for seq in x_test], dtype=int)))

25000 train sequences
25000 test sequences
Average train sequence length: 238
Average test sequence length: 230


In [23]:
%%time
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Gather every distinct n-gram (n = 2 .. ngram_range) occurring in the
    # training sequences; only the training split is scanned.
    ngram_set = set()
    for train_seq in x_train:
        for n in range(2, ngram_range + 1):
            ngram_set.update(create_ngram_set(train_seq, ngram_value=n))

    # Give each n-gram its own integer id, numbering upwards from
    # max_features + 1 so the ids cannot clash with existing word indices.
    start_index = max_features + 1
    token_indice = {
        ngram: start_index + position
        for position, ngram in enumerate(ngram_set)
    }
    # Reverse lookup: id -> n-gram tuple.
    indice_token = {index: ngram for ngram, index in token_indice.items()}

    # The vocabulary size must now cover the largest id in use, hence + 1.
    max_features = np.max(list(indice_token)) + 1

    # Append the matching n-gram ids to every training and test sequence.
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(
        np.mean([len(seq) for seq in x_train], dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean([len(seq) for seq in x_test], dtype=int)))

print('Pad sequences (samples x time)')
# Bring every sequence to exactly maxlen entries so each split becomes a
# rectangular array (the recorded run shows shapes of (25000, 400)).
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
# fastText-style classifier, declared as a single layer stack:
#   1. Embedding              - one embedding_dims-wide vector per index
#   2. GlobalAveragePooling1D - averages those vectors over the sequence
#   3. Dense(1, sigmoid)      - single-unit output squashed by a sigmoid
model = Sequential([
    Embedding(max_features, embedding_dims, input_length=maxlen),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The test split is passed as validation data, so it is evaluated after
# every epoch.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

Adding 2-gram features
Average train sequence length: 476
Average test sequence length: 428
Pad sequences (samples x time)
x_train shape: (25000, 400)
x_test shape: (25000, 400)
Build model...
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
  736/25000 [..............................] - ETA: 20:57 - loss: 0.6926 - acc: 0.5503

KeyboardInterrupt: 

# 結果
## imdb_fasttext.py(オリジナル。ngram=1)
acc:92.5% & val_acc:88.8%  total: 4min 49s
## imdb_fasttext.py(オリジナル。ngram=2)
acc:99.5% & val_acc:90.4%  total: 3h 4s
## imdb_fasttext.py(colab。ngram=2)
acc:99.5% & val_acc:90.4%  total:5m 5s 

## fasttextロジックでロイターデータ処理
```
print("max_features: ",max_features,"embedding_dims: ",embedding_dims,"input_length: ",maxlen,"batch_size: ",batch_size,"epochs: ",epochs)
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(GlobalAveragePooling1D())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
max_features:  110894 embedding_dims:  50 input_length:  400 batch_size:  32 epochs:  5
Train on 8982 samples, validate on 2246 samples
Epoch 1/5
8982/8982 [==============================] - 4s 401us/step - loss: 2.8477 - acc: 0.3599 - val_loss: 2.2456 - val_acc: 0.3980
Epoch 2/5
8982/8982 [==============================] - 3s 347us/step - loss: 2.1460 - acc: 0.3957 - val_loss: 2.0688 - val_acc: 0.4239
Epoch 3/5
8982/8982 [==============================] - 3s 361us/step - loss: 1.9780 - acc: 0.4505 - val_loss: 1.9322 - val_acc: 0.4835
Epoch 4/5
8982/8982 [==============================] - 3s 361us/step - loss: 1.8313 - acc: 0.5137 - val_loss: 1.8073 - val_acc: 0.5508
Epoch 5/5
8982/8982 [==============================] - 3s 356us/step - loss: 1.6894 - acc: 0.5916 - val_loss: 1.6923 - val_acc: 0.6046
<keras.callbacks.History at 0x7f2b21207320>
```

## fasttextロジックでロイターデータ処理
```
embedding_dims=92
print("max_features: ",max_features,"embedding_dims: ",embedding_dims,"input_length: ",maxlen,"batch_size: ",batch_size,"epochs: ",epochs)
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(GlobalAveragePooling1D())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
max_features:  110894 embedding_dims:  92 input_length:  400 batch_size:  32 epochs:  5
Train on 8982 samples, validate on 2246 samples
Epoch 1/5
8982/8982 [==============================] - 5s 560us/step - loss: 2.6932 - acc: 0.3784 - val_loss: 2.1609 - val_acc: 0.4127
Epoch 2/5
8982/8982 [==============================] - 5s 507us/step - loss: 2.0320 - acc: 0.4378 - val_loss: 1.9462 - val_acc: 0.4728
Epoch 3/5
8982/8982 [==============================] - 5s 517us/step - loss: 1.8201 - acc: 0.5331 - val_loss: 1.7741 - val_acc: 0.5761
Epoch 4/5
8982/8982 [==============================] - 5s 526us/step - loss: 1.6302 - acc: 0.6240 - val_loss: 1.6287 - val_acc: 0.6327
Epoch 5/5
8982/8982 [==============================] - 5s 528us/step - loss: 1.4653 - acc: 0.6716 - val_loss: 1.5118 - val_acc: 0.6621
<keras.callbacks.History at 0x7f2b232df1d0>
```

## fasttextロジックでロイターデータ処理
```
embedding_dims=184
print("max_features: ",max_features,"embedding_dims: ",embedding_dims,"input_length: ",maxlen,"batch_size: ",batch_size,"epochs: ",epochs)
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(GlobalAveragePooling1D())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
max_features:  110894 embedding_dims:  184 input_length:  400 batch_size:  32 epochs:  5
Train on 8982 samples, validate on 2246 samples
Epoch 1/5
8982/8982 [==============================] - 9s 970us/step - loss: 2.5318 - acc: 0.3890 - val_loss: 2.0511 - val_acc: 0.4479
Epoch 2/5
8982/8982 [==============================] - 8s 865us/step - loss: 1.8905 - acc: 0.5070 - val_loss: 1.7928 - val_acc: 0.5712
Epoch 3/5
8982/8982 [==============================] - 8s 867us/step - loss: 1.6199 - acc: 0.6264 - val_loss: 1.5926 - val_acc: 0.6438
Epoch 4/5
8982/8982 [==============================] - 8s 866us/step - loss: 1.4012 - acc: 0.6818 - val_loss: 1.4466 - val_acc: 0.6687
Epoch 5/5
8982/8982 [==============================] - 8s 872us/step - loss: 1.2287 - acc: 0.7147 - val_loss: 1.3363 - val_acc: 0.6910
<keras.callbacks.History at 0x7f2b22c4fcf8>
```

## fasttextロジックでロイターデータ処理
```
embedding_dims=184
print("max_features: ",max_features,"embedding_dims: ",embedding_dims,"input_length: ",maxlen,"batch_size: ",batch_size,"epochs: ",epochs)
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(GlobalAveragePooling1D())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
max_features:  93394 embedding_dims:  184 input_length:  400 batch_size:  32 epochs:  5
Train on 8982 samples, validate on 2246 samples
Epoch 1/5
8982/8982 [==============================] - 108s 12ms/step - loss: 2.5311 - acc: 0.3899 - val_loss: 2.0695 - val_acc: 0.4417
Epoch 2/5
8982/8982 [==============================] - 104s 12ms/step - loss: 1.9066 - acc: 0.4923 - val_loss: 1.8054 - val_acc: 0.5654
Epoch 3/5
8982/8982 [==============================] - 107s 12ms/step - loss: 1.6344 - acc: 0.6181 - val_loss: 1.6014 - val_acc: 0.6345
Epoch 4/5
8982/8982 [==============================] - 107s 12ms/step - loss: 1.4095 - acc: 0.6777 - val_loss: 1.4489 - val_acc: 0.6719
Epoch 5/5
8982/8982 [==============================] - 108s 12ms/step - loss: 1.2326 - acc: 0.7144 - val_loss: 1.3383 - val_acc: 0.6901
<keras.callbacks.History at 0x7f5fa6702470>
```

## fasttextロジックでロイターデータ処理
```
max_features:  93394 embedding_dims:  184 input_length:  400 batch_size:  32 epochs:  5
Train on 8982 samples, validate on 2246 samples
Epoch 1/5
8982/8982 [==============================] - 108s 12ms/step - loss: 2.5311 - acc: 0.3899 - val_loss: 2.0695 - val_acc: 0.4417
Epoch 2/5
8982/8982 [==============================] - 104s 12ms/step - loss: 1.9066 - acc: 0.4923 - val_loss: 1.8054 - val_acc: 0.5654
Epoch 3/5
8982/8982 [==============================] - 107s 12ms/step - loss: 1.6344 - acc: 0.6181 - val_loss: 1.6014 - val_acc: 0.6345
Epoch 4/5
8982/8982 [==============================] - 107s 12ms/step - loss: 1.4095 - acc: 0.6777 - val_loss: 1.4489 - val_acc: 0.6719
Epoch 5/5
8982/8982 [==============================] - 108s 12ms/step - loss: 1.2326 - acc: 0.7144 - val_loss: 1.3383 - val_acc: 0.6901
<keras.callbacks.History at 0x7f5fa6702470>
max_features:  93394 embedding_dims:  368 input_length:  400 batch_size:  32 epochs:  5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, 400, 368)          34368992  
_________________________________________________________________
global_average_pooling1d_2 ( (None, 368)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 46)                16974     
=================================================================
Total params: 34,385,966
Trainable params: 34,385,966
Non-trainable params: 0
_________________________________________________________________
Train on 8982 samples, validate on 2246 samples
Epoch 1/5
8982/8982 [==============================] - 14s 2ms/step - loss: 2.3918 - acc: 0.4105 - val_loss: 1.9521 - val_acc: 0.4884
Epoch 2/5
8982/8982 [==============================] - 12s 1ms/step - loss: 1.7406 - acc: 0.5765 - val_loss: 1.6333 - val_acc: 0.6269
Epoch 3/5
8982/8982 [==============================] - 12s 1ms/step - loss: 1.4137 - acc: 0.6798 - val_loss: 1.4235 - val_acc: 0.6750
Epoch 4/5
8982/8982 [==============================] - 12s 1ms/step - loss: 1.1830 - acc: 0.7267 - val_loss: 1.2888 - val_acc: 0.7008
Epoch 5/5
8982/8982 [==============================] - 12s 1ms/step - loss: 1.0108 - acc: 0.7735 - val_loss: 1.1930 - val_acc: 0.7302
<keras.callbacks.History at 0x7f1d5259ec50>
```

## fasttextロジックでロイターデータ処理
```
from keras.layers import GlobalMaxPooling1D
embedding_dims=368
print("max_features: ",max_features,"embedding_dims: ",embedding_dims,"input_length: ",maxlen,"batch_size: ",batch_size,"epochs: ",epochs)
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
# model.add(GlobalAveragePooling1D())
model.add(GlobalMaxPooling1D())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
max_features:  93394 embedding_dims:  368 input_length:  400 batch_size:  32 epochs:  5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_5 (Embedding)      (None, 400, 368)          34368992  
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 368)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 46)                16974     
=================================================================
Total params: 34,385,966
Trainable params: 34,385,966
Non-trainable params: 0
_________________________________________________________________
Train on 8982 samples, validate on 2246 samples
Epoch 1/5
8982/8982 [==============================] - 14s 2ms/step - loss: 2.2130 - acc: 0.4660 - val_loss: 1.7213 - val_acc: 0.5726
Epoch 2/5
8982/8982 [==============================] - 12s 1ms/step - loss: 1.4644 - acc: 0.6672 - val_loss: 1.3551 - val_acc: 0.6848
Epoch 3/5
8982/8982 [==============================] - 13s 1ms/step - loss: 1.0976 - acc: 0.7552 - val_loss: 1.1304 - val_acc: 0.7453
Epoch 4/5
8982/8982 [==============================] - 12s 1ms/step - loss: 0.8324 - acc: 0.8201 - val_loss: 0.9905 - val_acc: 0.7725
Epoch 5/5
8982/8982 [==============================] - 12s 1ms/step - loss: 0.6292 - acc: 0.8595 - val_loss: 0.9165 - val_acc: 0.7845
<keras.callbacks.History at 0x7f1d18481908>
```

## fasttextロジックでロイターデータ処理
```
from keras.layers import GlobalMaxPooling1D
embedding_dims=368; epochs=10
print("max_features: ",max_features,"embedding_dims: ",embedding_dims,"input_length: ",maxlen,"batch_size: ",batch_size,"epochs: ",epochs)
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
# model.add(GlobalAveragePooling1D())
model.add(GlobalMaxPooling1D())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
max_features:  93394 embedding_dims:  368 input_length:  400 batch_size:  32 epochs:  10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_7 (Embedding)      (None, 400, 368)          34368992  
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 368)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 46)                16974     
=================================================================
Total params: 34,385,966
Trainable params: 34,385,966
Non-trainable params: 0
_________________________________________________________________
Train on 8982 samples, validate on 2246 samples
Epoch 1/10
8982/8982 [==============================] - 14s 2ms/step - loss: 2.2225 - acc: 0.4787 - val_loss: 1.7340 - val_acc: 0.5557
Epoch 2/10
8982/8982 [==============================] - 12s 1ms/step - loss: 1.4701 - acc: 0.6730 - val_loss: 1.3616 - val_acc: 0.6906
Epoch 3/10
8982/8982 [==============================] - 12s 1ms/step - loss: 1.0971 - acc: 0.7561 - val_loss: 1.1406 - val_acc: 0.7413
Epoch 4/10
8982/8982 [==============================] - 13s 1ms/step - loss: 0.8271 - acc: 0.8169 - val_loss: 0.9997 - val_acc: 0.7783
Epoch 5/10
8982/8982 [==============================] - 12s 1ms/step - loss: 0.6232 - acc: 0.8559 - val_loss: 0.9194 - val_acc: 0.7832
Epoch 6/10
8982/8982 [==============================] - 12s 1ms/step - loss: 0.4669 - acc: 0.8967 - val_loss: 0.8753 - val_acc: 0.7925
Epoch 7/10
8982/8982 [==============================] - 12s 1ms/step - loss: 0.3441 - acc: 0.9255 - val_loss: 0.8531 - val_acc: 0.7956
Epoch 8/10
8982/8982 [==============================] - 12s 1ms/step - loss: 0.2522 - acc: 0.9432 - val_loss: 0.8475 - val_acc: 0.7979
Epoch 9/10
8982/8982 [==============================] - 12s 1ms/step - loss: 0.1897 - acc: 0.9532 - val_loss: 0.8557 - val_acc: 0.8001
Epoch 10/10
8982/8982 [==============================] - 12s 1ms/step - loss: 0.1489 - acc: 0.9548 - val_loss: 0.8640 - val_acc: 0.7983
<keras.callbacks.History at 0x7f1d181039b0>
```

## fasttextロジックでロイターデータ処理
```
from keras.layers import GlobalMaxPooling1D
embedding_dims=368; epochs=20
print("max_features: ",max_features,"embedding_dims: ",embedding_dims,"input_length: ",maxlen,"batch_size: ",batch_size,"epochs: ",epochs)
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
# model.add(GlobalAveragePooling1D())
model.add(GlobalMaxPooling1D())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
max_features:  93394 embedding_dims:  368 input_length:  400 batch_size:  32 epochs:  20
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_8 (Embedding)      (None, 400, 368)          34368992  
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 368)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 46)                16974     
=================================================================
Total params: 34,385,966
Trainable params: 34,385,966
Non-trainable params: 0
_________________________________________________________________
Train on 8982 samples, validate on 2246 samples
Epoch 1/20
8982/8982 [==============================] - 14s 2ms/step - loss: 2.2099 - acc: 0.4986 - val_loss: 1.7325 - val_acc: 0.5508
Epoch 2/20
8982/8982 [==============================] - 12s 1ms/step - loss: 1.4735 - acc: 0.6650 - val_loss: 1.3595 - val_acc: 0.6923
Epoch 3/20
8982/8982 [==============================] - 12s 1ms/step - loss: 1.1021 - acc: 0.7585 - val_loss: 1.1299 - val_acc: 0.7467
Epoch 4/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.8327 - acc: 0.8174 - val_loss: 0.9942 - val_acc: 0.7765
Epoch 5/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.6300 - acc: 0.8577 - val_loss: 0.9140 - val_acc: 0.7827
Epoch 6/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.4737 - acc: 0.8926 - val_loss: 0.8680 - val_acc: 0.7939
Epoch 7/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.3505 - acc: 0.9240 - val_loss: 0.8449 - val_acc: 0.7974
Epoch 8/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.2576 - acc: 0.9446 - val_loss: 0.8421 - val_acc: 0.7961
Epoch 9/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.1926 - acc: 0.9540 - val_loss: 0.8446 - val_acc: 0.8001
Epoch 10/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.1499 - acc: 0.9555 - val_loss: 0.8555 - val_acc: 0.7974
Epoch 11/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.1237 - acc: 0.9549 - val_loss: 0.8676 - val_acc: 0.7979
Epoch 12/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.1081 - acc: 0.9537 - val_loss: 0.8920 - val_acc: 0.7943
Epoch 13/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.0976 - acc: 0.9542 - val_loss: 0.9020 - val_acc: 0.7970
Epoch 14/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.0908 - acc: 0.9561 - val_loss: 0.9260 - val_acc: 0.7939
Epoch 15/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.0871 - acc: 0.9542 - val_loss: 0.9303 - val_acc: 0.7934
Epoch 16/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.0835 - acc: 0.9559 - val_loss: 0.9565 - val_acc: 0.7890
Epoch 17/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.0830 - acc: 0.9540 - val_loss: 0.9502 - val_acc: 0.8001
Epoch 18/20
8982/8982 [==============================] - 13s 1ms/step - loss: 0.0791 - acc: 0.9556 - val_loss: 0.9794 - val_acc: 0.7952
Epoch 19/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.0775 - acc: 0.9550 - val_loss: 0.9802 - val_acc: 0.7996
Epoch 20/20
8982/8982 [==============================] - 12s 1ms/step - loss: 0.0772 - acc: 0.9539 - val_loss: 0.9943 - val_acc: 0.7930
<keras.callbacks.History at 0x7f1d180509b0>
```

## fasttextロジックでロイターデータ処理
```
embedding_dims=736; epochs=10
print("max_features: ",max_features,"embedding_dims: ",embedding_dims,"input_length: ",maxlen,"batch_size: ",batch_size,"epochs: ",epochs)
model = Sequential()
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
# model.add(GlobalAveragePooling1D())
model.add(GlobalMaxPooling1D())
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))
max_features:  93394 embedding_dims:  736 input_length:  400 batch_size:  32 epochs:  10
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_9 (Embedding)      (None, 400, 736)          68737984  
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 736)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 46)                33902     
=================================================================
Total params: 68,771,886
Trainable params: 68,771,886
Non-trainable params: 0
_________________________________________________________________
Train on 8982 samples, validate on 2246 samples
Epoch 1/10
8982/8982 [==============================] - 27s 3ms/step - loss: 2.0450 - acc: 0.5249 - val_loss: 1.5911 - val_acc: 0.6483
Epoch 2/10
8982/8982 [==============================] - 24s 3ms/step - loss: 1.2851 - acc: 0.7197 - val_loss: 1.2060 - val_acc: 0.7275
Epoch 3/10
8982/8982 [==============================] - 24s 3ms/step - loss: 0.8895 - acc: 0.8045 - val_loss: 0.9875 - val_acc: 0.7752
Epoch 4/10
8982/8982 [==============================] - 24s 3ms/step - loss: 0.6182 - acc: 0.8567 - val_loss: 0.8814 - val_acc: 0.7943
Epoch 5/10
8982/8982 [==============================] - 24s 3ms/step - loss: 0.4265 - acc: 0.9056 - val_loss: 0.8412 - val_acc: 0.7996
Epoch 6/10
8982/8982 [==============================] - 24s 3ms/step - loss: 0.2886 - acc: 0.9370 - val_loss: 0.8272 - val_acc: 0.8085
Epoch 7/10
8982/8982 [==============================] - 24s 3ms/step - loss: 0.1989 - acc: 0.9505 - val_loss: 0.8363 - val_acc: 0.8059
Epoch 8/10
8982/8982 [==============================] - 24s 3ms/step - loss: 0.1475 - acc: 0.9544 - val_loss: 0.8427 - val_acc: 0.8063
Epoch 9/10
8982/8982 [==============================] - 24s 3ms/step - loss: 0.1213 - acc: 0.9539 - val_loss: 0.8640 - val_acc: 0.8072
Epoch 10/10
8982/8982 [==============================] - 24s 3ms/step - loss: 0.1068 - acc: 0.9550 - val_loss: 0.8905 - val_acc: 0.8041
<keras.callbacks.History at 0x7f1d17b609e8>
```

# データの比較

In [4]:
import pickle

# Cached dataset splits; the "_i" names hold IMDB, the "_r" names Reuters.
# pickle.load can execute arbitrary code, so only read files you created.
pickle_file_i = "./imdb_fasttext.pickle"
pickle_file_r = "./reuters.pickle"

# Each file holds four consecutive records: x_train, y_train, x_test, y_test.
with open(pickle_file_i, 'rb') as f:
    x_train_i, y_train_i, x_test_i, y_test_i = (
        pickle.load(f) for _ in range(4))
with open(pickle_file_r, 'rb') as f:
    x_train_r, y_train_r, x_test_r, y_test_r = (
        pickle.load(f) for _ in range(4))

In [18]:
# Side-by-side size statistics for the IMDB ("_i") and Reuters ("_r") splits.
# NOTE(review): the labels spell "ibmdb" and "lenght"; they are kept as-is so
# the printed text still matches the recorded output below.
print("ibmdb   lenght(train and test): ",len(x_train_i), len(x_test_i))
print("reuters lenght(train and test): ",len(x_train_r), len(x_test_r),"\n")

# Mean sequence length per split (dtype=int reports it as a whole number).
print("ibmdb     ave lenght(train and test): ",
    np.mean([len(seq) for seq in x_train_i], dtype=int),
    np.mean([len(seq) for seq in x_test_i], dtype=int))
print("reuters   ave lenght(train and test): ",
    np.mean([len(seq) for seq in x_train_r], dtype=int),
    np.mean([len(seq) for seq in x_test_r], dtype=int))

# Longest sequence per split.
print("ibmdb     max lenght(train and test): ",
    np.max([len(seq) for seq in x_train_i]),
    np.max([len(seq) for seq in x_test_i]))
print("reuters   max lenght(train and test): ",
    np.max([len(seq) for seq in x_train_r]),
    np.max([len(seq) for seq in x_test_r]))

# Shortest sequence per split.
print("ibmdb     min lenght(train and test): ",
    np.min([len(seq) for seq in x_train_i]),
    np.min([len(seq) for seq in x_test_i]))
print("reuters   min lenght(train and test): ",
    np.min([len(seq) for seq in x_train_r]),
    np.min([len(seq) for seq in x_test_r]))

ibmdb   lenght(train and test):  25000 25000
reuters lenght(train and test):  8982 2246 

ibmdb     ave lenght(train and test):  238 230
reuters   ave lenght(train and test):  145 147
ibmdb     max lenght(train and test):  2494 2315
reuters   max lenght(train and test):  2376 1032
ibmdb     min lenght(train and test):  11 7
reuters   min lenght(train and test):  13 2


## ロイターデータについて

In [41]:
# Length of every Reuters test sequence. (A placeholder list that used to be
# assigned here was overwritten immediately, so it has been dropped.)
lst = list(map(len, x_test_r))
print("len of x_test_r: ", len(lst))


def bigger(_l):
    """Return True when the length ``_l`` is strictly greater than 7."""
    return _l > 7


# Number of test sequences longer than 7 tokens; as the last expression of
# the cell, its value is what the notebook displays.
np.sum(list(map(bigger, lst)))

len of x_test_r:  2246


2245

# reuters_mlpデータをロード

In [30]:
'''This example demonstrates the use of fasttext for text classification
Based on Joulin et al's paper:
Bag of Tricks for Efficient Text Classification
https://arxiv.org/abs/1607.01759
Results on IMDB datasets with uni and bi-gram embeddings:
    Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 cpu.
    Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTx 980M gpu.
'''

from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.layers import GlobalMaxPooling1D
from keras.datasets import imdb


def create_ngram_set(input_list, ngram_value=2):
    """
    Collect the distinct n-grams of a sequence as a set of tuples.

    The sequence is zipped against copies of itself shifted by
    0 .. ngram_value - 1 positions, so each resulting tuple is one window
    of ``ngram_value`` consecutive items. Sequences shorter than
    ``ngram_value`` yield an empty set.

    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2))
    [(1, 4), (4, 1), (4, 9), (9, 4)]
    >>> sorted(create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3))
    [(1, 4, 9), (4, 1, 4), (4, 9, 4), (9, 4, 1)]
    """
    shifted_views = []
    for offset in range(ngram_value):
        shifted_views.append(input_list[offset:])
    # zip stops at the shortest view, which is what bounds the window count.
    windows = zip(*shifted_views)
    return set(windows)


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.

    For every n from 2 to ``ngram_range`` (inclusive) a window of n
    consecutive items slides over the ORIGINAL tokens of each sequence.
    Every window that is a key of ``token_indice`` contributes its mapped
    index to the end of the output sequence. Windows never include indices
    appended for a smaller n, so higher-order n-grams are always built from
    real tokens only.

    Args:
        sequences: iterable of integer sequences (e.g. list of lists).
        token_indice: dict mapping n-gram tuples to their integer index.
        ngram_range: largest n-gram order to add; values below 2 add nothing.

    Returns:
        A new list of lists; the input sequences are not modified.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # Freeze the original tokens: windows must not run into the n-gram
        # indices that get appended to the output below.
        tokens = list(input_list)
        n_tokens = len(tokens)
        new_list = tokens[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(n_tokens - ngram_value + 1):
                ngram = tuple(tokens[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

# Set parameters:
# ngram_range = 2 will add bi-grams features
# (the n-gram augmentation step only runs when ngram_range > 1).
ngram_range = 2
# max_features = 20000
# Vocabulary size; n-gram ids are numbered from max_features + 1 upwards.
# NOTE(review): the Reuters loader in this notebook uses num_words=1000
# (max_words), so 2500 looks larger than the word vocabulary needs - confirm
# it is intentional.
max_features = 2500
# Every sequence is brought to this many entries by pad_sequences.
maxlen = 400
# Mini-batch size handed to model.fit.
batch_size = 32
# Width of each embedding vector in the Embedding layer.
embedding_dims = 50
# Number of epochs handed to model.fit.
epochs = 5

In [25]:
from keras.datasets import reuters
# Vocabulary cap passed to the Reuters loader as num_words.
# NOTE(review): presumably this keeps only the most frequent words, per the
# Keras dataset documentation - confirm.
max_words = 1000
# test_split=0.2 holds out a fifth of the data; the recorded run shows
# 8982 train and 2246 test sequences.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.2)

In [26]:
import pickle
# Cache the Reuters splits on disk so later cells can reload them from the
# pickle file instead of calling the dataset loader again.
pickle_file_r = "./reuters.pickle"
with open(pickle_file_r, 'wb') as f:
    # Written as four separate records, in the same order the loading cell
    # reads them back: x_train, y_train, x_test, y_test.
    for split in (x_train, y_train, x_test, y_test):
        pickle.dump(split, f)

In [31]:
import pickle

pickle_file = "./reuters.pickle"

# The file holds four consecutive pickle records; read them back in the
# order they were written: x_train, y_train, x_test, y_test.
# Only unpickle files you created yourself - pickle can execute code on load.
with open(pickle_file, 'rb') as f:
    x_train, y_train, x_test, y_test = tuple(
        pickle.load(f) for _ in range(4))

In [32]:
# Number of samples in each split.
l_x_train = len(x_train)
l_x_test = len(x_test)

# Mean sequence length per split; dtype=int yields an integer mean
# (145 / 147 in the recorded output).
m_x_train = np.mean([len(seq) for seq in x_train], dtype=int)
m_x_test = np.mean([len(seq) for seq in x_test], dtype=int)

print(l_x_train, 'train sequences')
print(l_x_test, 'test sequences')
print('Average train sequence length: {}'.format(m_x_train))
print('Average test sequence length: {}'.format(m_x_test))

8982 train sequences
2246 test sequences
Average train sequence length: 145
Average test sequence length: 147


In [33]:
%%time
# Augment both splits with n-gram features (only when ngram_range > 1).
# NOTE: this cell rebinds max_features, x_train and x_test, so it is not
# idempotent - re-running it without reloading the data augments the
# already augmented sequences and shifts the n-gram ids again.
if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    # Only x_train contributes: n-grams that occur solely in x_test get no
    # id and are therefore never appended to the test sequences.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    # Reverse mapping: integer id -> n-gram tuple.
    indice_token = {token_indice[k]: k for k in token_indice}

    print("max_features before np.max()",max_features)
    # max_features becomes the largest assigned n-gram id plus one; it is
    # later passed as the first argument of the Embedding layer.
    # NOTE(review): np.max raises on an empty list, so this presumably fails
    # if the training set yields no n-grams at all - confirm if that matters.
    max_features = np.max(list(indice_token.keys())) + 1
    print("max_features after  np.max()",max_features)

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    # Report the mean sequence lengths after augmentation.
    print('Average train sequence length: {}'.format(
        np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, x_test)), dtype=int)))

Adding 2-gram features
max_features before np.max() 2500
max_features after  np.max() 93394
Average train sequence length: 290
Average test sequence length: 289
CPU times: user 1.94 s, sys: 5.85 ms, total: 1.95 s
Wall time: 1.95 s


In [34]:
# The length statistics are deliberately NOT recomputed here: m_x_train and
# m_x_test still hold the means measured before the n-gram augmentation, so
# this cell reports the pre-augmentation lengths for comparison.
# NOTE: "befoe" in the label below is a typo for "before"; it is left as-is
# so the code keeps matching the recorded output.
print("\nbefoe ngram")
print('Average train sequence length: {}'.format(m_x_train))
print('Average test sequence length: {}'.format(m_x_test))
# Largest class label in the training split (45 in the recorded output).
print("max(y_train): ", max(y_train))


befoe ngram
Average train sequence length: 145
Average test sequence length: 147
max(y_train):  45


In [35]:
print('Pad sequences (samples x time)')
# Bring every sequence to exactly maxlen entries so each split becomes a
# 2-D (samples, maxlen) array, as the printed shapes show.
# NOTE(review): with pad_sequences' default arguments, padding and truncation
# are both applied at the start of a sequence - confirm; if so, sequences
# longer than maxlen lose their leading tokens rather than the n-gram ids
# appended at the end.
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (8982, 400)
x_test shape: (2246, 400)


In [36]:
# Echo the hyper-parameters that the model-building cell relies on,
# one "label:  value" line each.
for _label, _value in (
        ("max_features: ", max_features),
        ("embedding_dims: ", embedding_dims),
        ("maxlen: ", maxlen),
        ("batch_size: ", batch_size),
        ("epochs: ", epochs)):
    print(_label, _value)

max_features:  93394
embedding_dims:  50
maxlen:  400
batch_size:  32
epochs:  5


In [37]:
import keras
# Number of classes = largest training label + 1 (46, given the recorded
# max(y_train) of 45).
# NOTE(review): only y_train is inspected; a larger label appearing only in
# y_test would presumably make the y_test conversion below fail - confirm.
num_classes = np.max(y_train) + 1
# Replace the integer labels by their categorical (one-hot) encoding.
# NOTE: y_train / y_test are rebound here, so re-running this cell without
# reloading the labels would derive num_classes from the encoded matrix.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [38]:
# GlobalMaxPooling1D is not part of the layer imports at the top of the
# file (only GlobalAveragePooling1D is), so import it here to keep this
# cell runnable on its own.
from keras.layers import GlobalMaxPooling1D

# Overrides the embedding_dims = 50 set with the other parameters.
embedding_dims = 184
print("max_features: ", max_features,
      "embedding_dims: ", embedding_dims,
      "input_length: ", maxlen,
      "batch_size: ", batch_size,
      "epochs: ", epochs)

# fastText-style classifier: embed every token / n-gram id, pool over the
# sequence axis, then classify with a single softmax layer.
model = Sequential()
# max_features is the embedding's input size; after the n-gram step it is
# one more than the largest n-gram id.
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
# Max pooling is used here instead of the average pooling variant:
# model.add(GlobalAveragePooling1D())
model.add(GlobalMaxPooling1D())
# One output unit per class; labels are one-hot, hence categorical
# cross-entropy as the loss.
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# The test split doubles as validation data, so the reported validation
# accuracy is the test accuracy.
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))

max_features:  93394 embedding_dims:  184 input_length:  400 batch_size:  32 epochs:  5
Train on 8982 samples, validate on 2246 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5fa6702470>

In [17]:
# Sanity check: rank and shape of the inputs and labels of both splits.
# NOTE(review): the recorded output shows 1-D labels, so it was presumably
# produced before the labels were one-hot encoded (this cell's execution
# count, 17, is lower than that of the cells above) - re-run to refresh.
print(x_train.ndim,y_train.ndim,x_train.shape,y_train.shape)
print(x_test.ndim,y_test.ndim,x_test.shape,y_test.shape)

2 1 (8982, 400) (8982,)
2 1 (2246, 400) (2246,)
