In [1]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Activation, Dropout, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model,Sequential
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import random
import sys

# Notebook-wide configuration: data locations and preprocessing sizes.
BASE_DIR = '.'
GLOVE_DIR = BASE_DIR + '/GloVe-1.2/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/books'
MAX_NB_WORDS = 20000 # Number of distinct words we will handle
EMBEDDING_DIM = 100 # Dimensionality of the embedding used
LENGTH_OF_CHUNKS = 100 # Length of each sample (chunk) cut from the text

Using TensorFlow backend.


In [2]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 coefficient vector.
embeddings_index = {}
# The context manager guarantees the file is closed even if parsing a line raises.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()  # "tokenize" the line
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]  # the word itself
        coefs = np.asarray(values[1:], dtype='float32')  # the embedding coordinates that follow the word
        embeddings_index[word] = coefs

print('Betöltött beágyazások száma:', len(embeddings_index))

Betöltött beágyazásaok száma: 400000
A `the` beágyazó vektorának első 10 eleme: [-0.038194   -0.24487001  0.72812003 -0.39961001  0.083172    0.043953
 -0.39140999  0.3344     -0.57545     0.087459  ]


In [20]:
# Read every book under TEXT_DATA_DIR. Each sub-directory is treated as one
# author; every file inside it is one book by that author.
texts = []         # text of books
labels_index = {}  # author (directory) name -> integer ID
labels = []        # author ID of each book, parallel to `texts`

for name in sorted(os.listdir(TEXT_DATA_DIR)):
    print(name)
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        print(label_id)
        labels_index[name] = label_id  # new ID for a new author
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            print(fpath)
            # `with` closes the file even if read() raises.
            with open(fpath) as f:
                texts.append(f.read())
            labels.append(label_id)

print('Number of books: ', len(texts))

austen
0
./books/austen/mansfield.txt
./books/austen/pride.txt
./books/austen/sense.txt
poe
1
./books/poe/cask.txt
./books/poe/fall.txt
./books/poe/masque.txt
wells
2
./books/wells/island.txt
./books/wells/redRoom.txt
./books/wells/timeMachine.txt
Number of books:  9


In [4]:
# Character length of every loaded book. Iterating over `texts` directly means
# the cell no longer assumes that exactly 9 books were loaded.
for text in texts:
    print(len(text))

926649
725522
705197
32530
62470
33227
266106
41319
201262


In [29]:
# Fit a Keras Tokenizer on all books and convert each book into a sequence of
# integer word ids (one sequence per entry of `texts`).
# NOTE(review): nb_words=MAX_NB_WORDS presumably caps the vocabulary used when
# converting texts to sequences -- confirm against the installed Keras version.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Mapping word -> integer id built by the tokenizer.
word_index = tokenizer.word_index
print('Különböző szavak száma az összes szövegben: ', len(word_index))

# NOTE(review): `labels` is rebound in place from the list of author IDs to the
# to_categorical() result, so re-running this cell without first re-running the
# book-loading cell feeds the already-converted array back into to_categorical().
labels = to_categorical(np.asarray(labels))
print('A label tenzor alakja:', labels.shape)

Különböző szavak száma az összes szövegben:  19909
A label tenzor alakja: (9, 3)


In [30]:
# Number of word ids in every tokenized book. Iterating over `sequences`
# directly means the cell no longer assumes exactly 9 books.
for seq in sequences:
    print(len(seq))

169515
131382
128410
5765
10637
5711
50524
7365
36951


In [31]:
# Show the label matrix produced by to_categorical() (one row per book).
print(labels)

[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 0.  0.  1.]
 [ 0.  0.  1.]]


In [32]:
def cutUp(s, chunk_len=None):
    """Split a sequence of word ids into consecutive, non-overlapping full chunks.

    Only complete chunks are kept; a trailing remainder shorter than
    ``chunk_len`` is discarded. The previous implementation tested the
    remainder with ``/`` instead of ``%`` and therefore also threw away the
    last chunk when the sequence length was an exact multiple of the chunk
    size.

    Parameters
    ----------
    s : sequence of int
        Word ids of one text.
    chunk_len : int, optional
        Number of ids per chunk. Defaults to the notebook-level
        ``LENGTH_OF_CHUNKS``.

    Returns
    -------
    data : numpy.ndarray
        1-D array of dtype ``object``; each element is a 1-D array holding
        one chunk of ``chunk_len`` ids.
    count : int
        Number of chunks in ``data``.

    Raises
    ------
    ValueError
        If ``chunk_len`` is not positive.
    """
    if chunk_len is None:
        chunk_len = LENGTH_OF_CHUNKS
    if chunk_len <= 0:
        raise ValueError("chunk_len must be a positive integer")

    # atleast_1d keeps len() valid when squeeze() collapses a one-element input.
    sequence = np.atleast_1d(np.squeeze(np.asarray(s)))
    n_chunks = len(sequence) // chunk_len

    # An explicit object array gives the same container type for any input
    # length, so callers that np.append() the result get one element per chunk
    # rather than a flattened array of ids.
    data = np.empty(n_chunks, dtype=object)
    for k in range(n_chunks):
        data[k] = sequence[k * chunk_len:(k + 1) * chunk_len]
    return data, n_chunks

In [33]:
# Cut every book into chunks and build the parallel arrays
#   x_data: 1-D array with one chunk per element
#   y_data: one label row per chunk
# Pairing `sequences` with `labels` via zip() removes the hard-coded book count.
x_data = []     # input data
y = []          # labels, one entry per chunk
for book_sequence, book_label in zip(sequences, labels):
    data_raw, length = cutUp(book_sequence)
    # np.append without an axis flattens its inputs, so this relies on
    # data_raw being a 1-D array whose elements are the individual chunks.
    x_data = np.append(x_data, np.squeeze(np.asarray(data_raw)))
    x_data = np.squeeze(np.asarray(x_data))

    # Every chunk of this book gets the book's label row.
    y.extend([book_label] * length)
    y_data = np.squeeze(np.asarray(y))

    print("X: ", x_data.shape)
    print("Y: ", y_data.shape)

X:  (1695,)
Y:  (1695, 3)
X:  (3008,)
Y:  (3008, 3)
X:  (4292,)
Y:  (4292, 3)
X:  (4349,)
Y:  (4349, 3)
X:  (4455,)
Y:  (4455, 3)
X:  (4512,)
Y:  (4512, 3)
X:  (5017,)
Y:  (5017, 3)
X:  (5090,)
Y:  (5090, 3)
X:  (5459,)
Y:  (5459, 3)


In [34]:
# Spot-check three chunks together with their label rows. The indices are
# hard-coded, so this assumes x_data / y_data hold more than 5400 entries.
print(x_data[10], y_data[10])
print(x_data[4500], y_data[4500])
print(x_data[5400], y_data[5400])

[    3  1050     5    11     4   980    44    18   109  3028    21     1
  4583     5     1  2734    67  1226   276  1147    21     9   793   124
    83    72   168    56   198    67  1212    13  2843   156   973    20
     1   684    32     2   131   231    35    15   167    33  5734     3
  8800     6  1607   210 18978     3  1498    11    10     6   695   716
     6   506    33   370  1666    62    18 18559  1107    20    39    52
    44    18  2783  8538     5   500     8   637     9    38     9   185
    17   134     5    22    88  3372   616     5    22   128  2824     5
   871     8   228  2077] [ 1.  0.  0.]
[  54   15  162 2858    4   14   20 2286 1590 3686 3873 4175  359 4238 1463
  149   63   14  167  532    5    1 1087    5  599 2508    2    2  164  617
  630 1080  279    5 1189   39  704   63   14 1888   98 1492    8   36  298
  138  288 2498  319    5 1013   11   14 1156  760    6  704    5    1  689
   63   57   14  678   20   11   30 1611   98  775 1099    4    1  339  

In [78]:
# Shuffle chunks and labels with one shared permutation, then split them
# 70% / 15% / remainder into train / validation / test.
indices = np.arange(x_data.shape[0])
np.random.shuffle(indices)
data = x_data[indices]
# Stored under its own name: rebinding `labels` here would replace the per-book
# label matrix that the chunking cell indexes, so re-running that cell would
# silently attach wrong labels.
labels_shuffled = y_data[indices]

nb_train_samples = int(len(data)*0.7)
nb_val_samples = int(len(data)*0.15)
# The test split is whatever remains after train and validation, so report its
# real size rather than a second rounded-down 15%.
nb_test_samples = len(data) - nb_train_samples - nb_val_samples

train_end = nb_train_samples
val_end = nb_train_samples + nb_val_samples

x_tr = data[:train_end]
y_train = labels_shuffled[:train_end]
x_v = data[train_end:val_end]
y_val = labels_shuffled[train_end:val_end]
x_t = data[val_end:]
y_test = labels_shuffled[val_end:]

In [79]:
# Shapes of the train / validation / test inputs and their label arrays.
print(x_tr.shape, y_train.shape)
print(x_v.shape, y_val.shape)
print(x_t.shape, y_test.shape)

(3821,) (3821, 3)
(818,) (818, 3)
(820,) (820, 3)


In [80]:
# Sanity check: number of word ids in the last training chunk. Uses x_tr,
# which exists at this point; x_train is only created in the following cell,
# so referencing it here raises NameError on a fresh kernel. Index -1 replaces
# the hard-coded last index 3820.
x_tr[-1].size

100

In [84]:
def _chunks_to_matrix(chunks):
    """Copy a 1-D array of chunks into a dense (chunks.size, LENGTH_OF_CHUNKS) matrix."""
    matrix = np.zeros((chunks.size, LENGTH_OF_CHUNKS))
    for row in range(chunks.size):
        matrix[row] = chunks[row]
    return matrix


# Convert each split from an array of per-chunk arrays into a 2-D matrix.
x_train = _chunks_to_matrix(x_tr)
x_val = _chunks_to_matrix(x_v)
x_test = _chunks_to_matrix(x_t)

print("Train data shape: ", x_train.shape)
print("Validation data shape: ", x_val.shape)
print("Test data shape: ", x_test.shape)

Train data shape:  (3821, 100)
Validation data shape:  (818, 100)
Test data shape:  (820, 100)


In [85]:
# Build the embedding matrix: row `k` holds the pre-trained vector of the word
# whose tokenizer id is `k`. Rows for words without a pre-trained vector (and
# row 0) stay all-zero.
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    # Ids above MAX_NB_WORDS are ignored entirely.
    vector = embeddings_index.get(token) if row <= MAX_NB_WORDS else None
    if vector is not None:
        embedding_matrix[row] = vector

print('A szövegre alkalmazott beágyazási mátrix sorainak száma:', embedding_matrix.shape[0])

A szövegre alkalmazott beágyazási mátrix sorainak száma: 19910


In [90]:
# Classifier: embedding -> two Conv1D/MaxPooling1D/Dropout stages -> LSTM -> softmax.
model = Sequential()

# Embedding layer initialised from embedding_matrix and kept fixed
# (trainable=False); each input is a sequence of LENGTH_OF_CHUNKS word ids.
model.add(Embedding(nb_words + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=LENGTH_OF_CHUNKS,
                            trainable=False))

# First convolutional stage: 128 filters of length 5, subsample_length=2,
# followed by max pooling over 5 steps and 50% dropout.
model.add(Conv1D(nb_filter=128,filter_length=5, border_mode='valid', activation='relu', subsample_length=2))
model.add(MaxPooling1D(pool_length=5))
model.add(Dropout(0.5))

# Second convolutional stage: same convolution settings, max pooling over 3 steps.
model.add(Conv1D(nb_filter=128,filter_length=5, border_mode='valid', activation='relu', subsample_length=2))
model.add(MaxPooling1D(pool_length=3))
model.add(Dropout(0.5))

# Recurrent layer with 50 units, then a 3-way softmax matching the 3-column label rows.
model.add(LSTM(50))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [91]:
# Train for 5 epochs in batches of 50, reporting metrics on the validation split.
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=5, batch_size=50)

# Last expression of the cell: the evaluation result on the test split is
# shown as the cell output.
model.evaluate(x_test, y_test)

Train on 3821 samples, validate on 818 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.2104394991223405, 0.91463414663221776]

In [92]:
# Second training phase: mark the first layer (the Embedding added first to the
# Sequential model) as trainable and continue training the same model object.
model.layers[0].trainable=True

# NOTE(review): the model is compiled again with the same settings, presumably
# so that the changed `trainable` flag takes effect -- confirm against the
# Keras documentation for the installed version.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

# Another 5 epochs with the same data and batch size.
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=5, batch_size=50)

# Last expression of the cell: the evaluation result on the test split is
# shown as the cell output.
model.evaluate(x_test, y_test)

Train on 3821 samples, validate on 818 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.1839982146170081, 0.91463414663221776]