In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import os.path as op
import shutil
from zipfile import ZipFile
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from utils import get_image_paths, word_from_image_path, preprocess_image, print_im, TextTransform, N_CHARS, SEQUENCE_LENGTH, IMAGE_DIMENSIONS
from utils import NgramTransform, load_trained_CNN_weights, base_cnn_in_keras
from multi_gpu import make_parallel
import time 
import datetime

%load_ext autoreload
%autoreload 2

Using TensorFlow backend.


#### Building N-grams Neural Network

In [2]:
from keras.models import Sequential, Model, load_model
from keras.layers import Conv2D, MaxPooling2D, Dropout, Dense, Reshape, Activation, GlobalAveragePooling2D
from keras.layers import Flatten
from utils import IMAGE_DIMENSIONS, SEQUENCE_LENGTH, N_CHARS



In [3]:
# loading base CNN model - no pre trained weights
base_cnn = base_cnn_in_keras()
import pickle
# loading weights pretrained
with open('cnn_weights.pickle', 'rb') as handler:
    trained_weights = pickle.load(handler)

In [4]:
layers_names = ['convo0', 'convo1', 'convo2', 'convo3', 'convo4', 'Dense1', 'Dense2']
# 

for layer in layers_names:
    get_layer = base_cnn.get_layer(name=layer)
    get_layer.set_weights([trained_weights[layer][0], trained_weights[layer][1]])

In [5]:
# N-grams model
model_ngrams = base_cnn
model_ngrams.add(Dense(10000, activation='sigmoid', name= "10kdense")) 
model_ngrams.compile(optimizer='adam', loss='categorical_crossentropy')

In [6]:
## Parellelize model 
p_model_ngrams = make_parallel(model_ngrams, 4)
p_model_ngrams.compile(optimizer='adam', loss='categorical_crossentropy')

### Loading Images and Batch

In [8]:
import h5py, pickle
# %%time
with h5py.File('/mnt/x_5m_2ndprocess.h5', 'r') as hf:
    x = hf['x'][:]
# with h5py.File('/mnt/y_5m_2ndprocess.h5', 'r') as hf:
#     y = hf['x'][:]
with open('/mnt/y_ngrams_5m_2ndprocess.h5', 'rb') as f:
    y_final = pickle.load(f)

In [10]:
with open('tt_new.pickle', 'rb') as f:
    tt = pickle.load(f)

In [11]:
### NGram model output vector 
ngram_transformer = NgramTransform()

In [12]:
# y_list =[]
# for i in range(y.shape[0]):
#     y_list.append(ngram_transformer.transform(tt.word_from_matrix(y[i])))

In [13]:
# from scipy.sparse import vstack
# y_final = vstack(y_list)

In [14]:
# y_final.shape

In [15]:
# import pickle
# with open('/mnt/y_ngrams_5m_2ndprocess.h5', 'wb') as f:
#     pickle.dump(y_final,f)

### Scaling gradients/classes to labels frequencies

In [16]:
# check which are the most frequent ngrams
ngram_transformer.ngram_from_matrix(ngram_transformer.X_tf > 5000000)

['a', 'e', 'i', 'n', 'r', 's']

In [17]:
# Creating labels dictionnary
labels_dict = {i: ngram_transformer.X_tf[i] for i in range(10000) }

In [18]:
## small batches 
one_million = 1000000
hundred_thousands = 100000
thousands = 1000
ten_thousands = 10000

In [19]:
x.shape, y_final.shape

((4998656, 32, 100), (4998656, 10000))

In [20]:
# fitting with respect to the labels scaling
# p_model_ngrams.fit(batch, batch_y, nb_epoch=10, class_weight=labels_dict)

In [None]:
for i in range(0,2500):
    begin = ((i - 1) % 500 ) * ten_thousands
    if begin == 4990000:
        end = y_final.shape[0]
    else :
        end = (((i - 1) % 500) +1) * ten_thousands
    run = i // 500 + 1
    x_batch = x[begin:end]
    y_batch = y_final[begin:end,:]
    y_batch = y_batch.toarray()
    
#     model_ngrams.fit(x_batch.reshape((x_batch.shape[0],) + IMAGE_DIMENSIONS + (1,)), y_batch, nb_epoch=1, class_weight=labels_dict)
    
    history = p_model_ngrams.fit(x_batch.reshape((x_batch.shape[0],) + IMAGE_DIMENSIONS + (1,)), y_batch, nb_epoch=1, class_weight=labels_dict)
        
    now = datetime.datetime.now().isoformat().split('.')[0]
    
    if i and not i % 10:
        with open("state_ngrams", 'a+') as f:
            f.write("run={} epoch={} loss={} now={} from={} to={}\n".format(run, (i - 1) % 500, history.history['loss'][0], now, begin, end))
    
    if i and not i % 100:
        p_model_ngrams.save('p_model_ngrams_{}.h5'.format(i))

Epoch 1/1
 544/8656 [>.............................] - ETA: 221s - loss: 737310366.1176

In [43]:
p_model_ngrams.save('p_model_ngrams_final.h5')

In [87]:
model_ngrams.save("n_grams_model_final.h5")

### Predictions

In [50]:
idxs = np.random.randint(0,x.shape[0],32)
x_test = x[idxs]
y_test = y_final[idxs].toarray()

In [62]:
predictions = p_model_ngrams.predict(x_test.reshape((x_test.shape[0],) + IMAGE_DIMENSIONS + (1,)))

In [91]:
predictions_top_10 = predictions.argmax(axis=0, out=10)

TypeError: output must be an array

In [85]:
predictions_ = predictions > 0.01
ngram_transformer.ngram_from_matrix(predictions_[0])

['a',
 'ar',
 'art',
 'arti',
 'at',
 'ate',
 'ates',
 'c',
 'ci',
 'cipa',
 'cu',
 'cul',
 'cula',
 'd',
 'e',
 'es',
 'i',
 'ic',
 'icip',
 'icu',
 'icul',
 'ipa',
 'ipat',
 'l',
 'la',
 'lat',
 'late',
 'lia',
 'll',
 'lla',
 'o',
 'p',
 'pa',
 'par',
 'part',
 'pat',
 'pate',
 'pi',
 'r',
 'ri',
 'rs',
 'rt',
 'rti',
 'rtic',
 's',
 't',
 'te',
 'tes',
 'ti',
 'tic',
 'tici',
 'ticu',
 'tul',
 'tula',
 'u',
 'ul',
 'ula',
 'ular',
 'ulat']

In [86]:
ngram_transformer.ngram_from_matrix(y_test[0])

['ar',
 'art',
 'arti',
 'at',
 'ate',
 'ates',
 'c',
 'cu',
 'cul',
 'cula',
 'e',
 'es',
 'i',
 'ic',
 'icu',
 'icul',
 'l',
 'la',
 'lat',
 'late',
 'p',
 'pa',
 'par',
 'part',
 'r',
 'rt',
 'rti',
 'rtic',
 's',
 'te',
 'tes',
 'ti',
 'tic',
 'ticu',
 'u',
 'ul',
 'ula',
 'ulat']

In [88]:
# ### Y construction

# import h5py
# with h5py.File('/mnt/y.h5', 'r') as hf:
#     y = hf['x'][:]

# import pickle
# with open('/datadrive/tt_new.pickle', 'rb') as f:
#     tt = pickle.load(f)

# with open('y_ngrams_m5_2.pickle', 'rb') as f:
#     y2 = pickle.load(f)
# with open('y_ngrams_m5.pickle', 'rb') as f:
#     y1 = pickle.load(f)

# y_ngrams = vstack([y1,y2])

# with open('y_ngrams.pickle', 'wb') as f:
#     pickle.dump(y_ngrams,f)

# %%time 
# for i in range(1468,1469):
#     t = tt.word_from_matrix(y[i])

# count_wrong_labels = 0
# blank_labels = []
# for i in range(y.shape[0]): 
#     if tt.word_from_matrix(y[i]) == "" :
#         blank_labels.append(i)

# with open('y_idx_blanks.pickle', 'wb') as hf: 
#     pickle.dump(blank_labels, hf)

