In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import os.path as op
import shutil
from zipfile import ZipFile
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from utils import get_image_paths, word_from_image_path, preprocess_image, print_im, TextTransform, N_CHARS, SEQUENCE_LENGTH, IMAGE_DIMENSIONS

%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


In [2]:
from multi_gpu import make_parallel

In [3]:
# Root directory that is handed to get_image_paths() to locate the images.
base_dir = '/mnt/mjsynth/mnt/ramdisk/max/90kDICT32px/'

In [4]:
%%time
# Build the sequence of image paths with the get_image_paths helper from utils
# (how the directory is traversed is defined in that helper, not here).
images_paths = get_image_paths(base_dir)

CPU times: user 14.4 s, sys: 9.64 s, total: 24 s
Wall time: 45.9 s


In [5]:
# Sanity check: number of image paths returned by get_image_paths().
len(images_paths)

8919273

In [6]:
# TextTransform (from utils) is used further down through make_batch_labels()
# and word_from_matrix().
text_transformer = TextTransform()

### Creating X and Y

In [7]:
from tqdm import tqdm, tqdm_notebook

In [15]:
# Load and preprocess every image as grayscale. The index of each file whose
# loading raises OSError is recorded in errors_1 so that the matching path
# (and therefore label) can be dropped later.
ims = []
errors_1, errors_2 = [], []  # errors_2 is never filled in this cell
# Iterate over images_paths directly: slicing it with [:] would first build a
# full copy of the multi-million entry list without changing the result.
for i, im in tqdm_notebook(enumerate(images_paths), total=len(images_paths)):
    try:
        ims.append(preprocess_image(img_to_array(load_img(im, grayscale=True))))
    except OSError:
        # Loading failed for this file: remember its position and carry on.
        errors_1.append(i)





### Save on disk

In [20]:
# Split the preprocessed images into nine arrays of at most one million images
# each, so that every chunk can be written to its own file afterwards.
# The chunk size is defined before its first use and used consistently.
one_million = 1000000

# After each chunk is converted to an array, the same entries are removed from
# the front of `ims` in place, so the Python list shrinks as the arrays grow.
batch1 = np.array(ims[:one_million])
del ims[:one_million]
batch2 = np.array(ims[:one_million])
del ims[:one_million]
batch3 = np.array(ims[:one_million])
del ims[:one_million]
batch4 = np.array(ims[:one_million])
del ims[:one_million]
batch5 = np.array(ims[:one_million])
del ims[:one_million]
batch6 = np.array(ims[:one_million])
del ims[:one_million]
batch7 = np.array(ims[:one_million])
del ims[:one_million]
batch8 = np.array(ims[:one_million])
del ims[:one_million]
# Whatever remains after eight full chunks forms the last batch; `ims` keeps
# those remaining entries.
batch9 = np.array(ims)

In [32]:
import h5py

# Write every image chunk to its own HDF5 file, /mnt/x1.h5 ... /mnt/x9.h5,
# each holding a single dataset named "x".
# All chunks use the ".h5" extension: the first four used to be written with a
# ".py" extension, which did not match the '/mnt/x1.h5' path read back later.
image_batches = (batch1, batch2, batch3, batch4, batch5,
                 batch6, batch7, batch8, batch9)
for batch_number, image_batch in enumerate(image_batches, start=1):
    with h5py.File('/mnt/x{}.h5'.format(batch_number), 'w') as hf:
        hf.create_dataset("x",  data=image_batch)
# Drop the extra references so that deleting batch1..batch9 later really
# releases the arrays.
del image_batches, image_batch

# Indices of the images that could not be loaded, stored next to the chunks.
with h5py.File('/mnt/errors.h5', 'w') as hf:
    hf.create_dataset("x",  data=np.array(errors_1))

In [58]:
# Drop the nine chunk arrays from the kernel namespace to release their memory.
del batch1, batch2, batch3, batch4, batch5, batch6, batch7, batch8, batch9

In [None]:
# Drop the list of preprocessed images as well.
del ims

In [17]:
# Keep only the paths whose image was loaded successfully, i.e. whose index
# is not listed in errors_1.
image_paths_without_errors = []

# Membership tests against a list (or array) are a linear scan; with millions
# of paths that becomes quadratic. A set gives constant-time lookups and the
# same result.
failed_indices = set(errors_1)

for i, im in tqdm_notebook(enumerate(images_paths), total=len(images_paths)):
    if i not in failed_indices:
        image_paths_without_errors.append(im)




In [19]:
%%time
# Build the labels for every path that survived loading, using
# TextTransform.make_batch_labels (label layout is defined in utils).
batch_y = text_transformer.make_batch_labels(image_paths_without_errors)

CPU times: user 4min 33s, sys: 3min 33s, total: 8min 7s
Wall time: 8min 19s


In [30]:
# Cast the labels to float32 before they are written to disk.
b_y = batch_y.astype(np.float32)

In [31]:
# Store the float32 labels in /mnt/y.h5.
# NOTE(review): the dataset is named "x" here, while the later "save data" cell
# writes the same file with a dataset named "y" — confirm which name readers expect.
with h5py.File('/mnt/y.h5', 'w') as hf:
    hf.create_dataset("x",  data=np.array(b_y))

### Read data

In [6]:
import h5py
import numpy as np

In [2]:
# Read the stored error indices back into memory as an array.
with h5py.File('/mnt/errors.h5', 'r') as hf:
    errors_1 = hf['x'][:]

In [4]:
# Load the whole "x" dataset of the first image chunk into memory.
with h5py.File('/mnt/x1.h5', 'r') as hf:
    x1 = hf['x'][:]

In [20]:
# Append image chunks 5..8 to x1.
# The chunks are gathered first and concatenated once: calling np.concatenate
# inside the loop re-copies the whole accumulated array on every iteration.
# NOTE(review): range(5, 9) skips chunks 2-4 and 9 — confirm this is intended.
# NOTE(review): the cell is not idempotent; re-running it appends the chunks again.
chunks = [x1]
for i in range(5, 9):
    print(i)
    with h5py.File('/mnt/x{}.h5'.format(i), 'r') as hf:
        chunks.append(hf['x'][:])
x1 = np.concatenate(chunks)
# Release the references to the individual chunks (including the old x1).
del chunks

5
6
7
8


MemoryError: 

In [None]:
# Inspect the shape of the accumulated image array.
x1.shape

In [24]:
%%time
# Dump the accumulated image array to a single .npy file.
with open('/mnt/x.npy', 'wb+') as f:
    np.save(f, x1[:])

CPU times: user 16.8 s, sys: 1min 12s, total: 1min 28s
Wall time: 3min 25s


In [21]:
# Shape of x1; the first axis indexes the images.
x1.shape

(7000000, 32, 100)

In [9]:
# Stack x1 on top of itself along the first axis (twice as many rows).
x2 = np.concatenate((x1, x1))

In [10]:
# Shape of the doubled array.
x2.shape

(2000000, 32, 100)

In [None]:
%%time
batch = batch - batch.mean(axis=(1, 2)).reshape((-1, 1, 1))
batch = batch / batch.std(axis=(1, 2)).reshape((-1, 1, 1))

### Clean data (remove errors from y)

### Save data

In [None]:
# Write the labels to /mnt/y.h5 as dataset "y". Mode 'w' replaces any existing file.
# NOTE(review): an earlier cell writes this same path with a dataset named "x"
# (float32 copy of the labels) — confirm which version should be kept.
with h5py.File('/mnt/y.h5', 'w') as hf:
    hf.create_dataset("y",  data=batch_y)

### Model creation

In [25]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Dense, Reshape, Activation
from keras.layers import Flatten

Using TensorFlow backend.


In [29]:
from utils import IMAGE_DIMENSIONS, SEQUENCE_LENGTH, N_CHARS

In [30]:
# Number of filters for each convolution layer and the matching (square)
# kernel size, position by position.
convolutions = [64, 128, 256, 512, 512]
kernels = [5, 5, 3, 3, 3]

model = Sequential()
# Batch dimension left open; a single channel is appended to the image dimensions.
input_shape = (None,) + IMAGE_DIMENSIONS + (1,)

# First convolution: the only layer that declares the input shape.
model.add(Conv2D(nb_filter=convolutions[0],
                 nb_row=kernels[0],
                 nb_col=kernels[0],
                 activation='relu',
                 border_mode='same',
                 batch_input_shape=input_shape, name="convo" + str(0)))

model.add(MaxPooling2D(pool_size=(2, 2), border_mode='same'))

# Remaining convolutions. The tuple is unpacked in the same order as the zip
# arguments (filters first, kernel second); with the names swapped the layers
# were built with 5/3 filters and 128/256/512-pixel kernels.
for i, (convolution_size, kernel) in enumerate(zip(convolutions[1:], kernels[1:])):
    model.add(Conv2D(nb_filter=convolution_size,
                     nb_row=kernel,
                     nb_col=kernel,
                     activation='relu',
                     border_mode='same',
                     name="convo" + str(i + 1)))

    # i only takes the values 0..3 here, so a pooling layer currently follows
    # every convolution.
    if i <= 3:
        model.add(MaxPooling2D(pool_size=(2, 2), border_mode='same'))

model.add(Flatten())
# model.add(Dense(128, activation='relu'))
model.add(Dense(4096, activation='relu'))
model.add(Dense(4096, activation='relu'))

# One score per (sequence position, character) pair ...
model.add(Dense(SEQUENCE_LENGTH * N_CHARS))

# ... reshaped to one row of N_CHARS scores per position before the softmax.
model.add(Reshape((SEQUENCE_LENGTH, N_CHARS)))
model.add(Activation('softmax'))


model.compile(optimizer='adam', loss='categorical_crossentropy')

In [232]:
# Wrap the model with make_parallel (imported from multi_gpu); the second
# argument is presumably the number of GPUs — confirm against multi_gpu.
p_model = make_parallel(model, 4)
# The wrapped model is compiled with the same optimizer and loss as `model`.
p_model.compile(optimizer='adam', loss='categorical_crossentropy')

In [32]:
# Persist the single-device model; the file name suggests it holds the
# untrained (randomly initialised) weights.
model.save('model_random_weights.keras')

In [None]:
# Largest multiple of 32 that does not exceed the number of samples in `batch`.
biggest_batches = 32 * (batch.shape[0] // 32)

In [None]:
# Truncate inputs and labels to that sample count.
# NOTE: `b_y` is rebound here; an earlier cell used the same name for the
# float32 copy of the labels.
b = batch[:biggest_batches]
b_y = batch_y[:biggest_batches]

In [247]:
import pickle

In [269]:
import datetime

In [None]:
# Train for ten epochs, one fit() call per epoch, so that after every epoch
# the loss can be appended to the "state" log file and a checkpoint saved.
train_x = b.reshape((b.shape[0],) + IMAGE_DIMENSIONS + (1,))

for epoch in range(10):
    history = p_model.fit(train_x, b_y, nb_epoch=1)

    # ISO timestamp with the fractional seconds cut off.
    timestamp = datetime.datetime.now().isoformat().split('.')[0]
    epoch_loss = history.history['loss'][0]
    log_line = "epoch={}, loss={}, now={}\n".format(epoch, epoch_loss, timestamp)

    with open("state", 'a+') as state_file:
        state_file.write(log_line)

    # One checkpoint file per epoch.
    p_model.save('p_model_{}.h5'.format(epoch))

In [None]:
# NOTE(review): pickle.dump needs at least the object to serialise and an open
# binary file; called without arguments, as here, it raises TypeError.
# Fill in the intended arguments or remove this cell.
pickle.dump()

In [147]:
# Print the decoded prediction next to the decoded label for every sample.
# NOTE(review): `res` is not defined in any visible cell — it must already
# exist in the kernel (presumably the model predictions; confirm).
skipped = 0
for i, by in enumerate(batch_y):
    try:
        print(text_transformer.word_from_matrix(res[i]), text_transformer.word_from_matrix(by))
    except Exception:
        # Best effort: a row that cannot be decoded is skipped, but counted.
        # Catching Exception (rather than a bare except) keeps the loop
        # interruptible with Ctrl-C / kernel interrupt.
        skipped += 1

if skipped:
    print("skipped {} rows that could not be decoded".format(skipped))

sarti                   crustal                
sart                    paths                  
sart                    pace                   
sereiii                 arapahoes              
seeeiiie                retorts                
sereiii                 corrosively            
sarti                   betas                  
seeeiiii                interpenetration       
sarte                   temps                  
sait                    pb                     
seeeriiin               transliteration        
sereii                  buskin                 
sartie                  briton                 
seeeiii                 interacted             
sartie                  chained                
seeeiii                 regularizing           
sereiii                 cadenzas               
seeeiii                 teaspoons              
seeeiiie                
sartie                  overcoats              
sartie                  eulogizes              
seeeriiin      

  
