In [66]:
import os
import numpy as np
import pandas as pd
from collections import Counter


from keras import backend as K
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.applications.mobilenet import MobileNet
from keras.layers import Input, Dense, Lambda, Dropout, Activation
from keras.optimizers import rmsprop, Adam, SGD, Adagrad


from keras.utils import to_categorical
from keras.models import Model


def get_str2numb_numb2dict(vect):
    str_to_ind_dict = {}
    count = 0
    for v in vect:
        if v not in str_to_ind_dict.keys():
            str_to_ind_dict[v] = count
            count += 1
    reverse_dict = {v:k for k, v in str_to_ind_dict.items()}
    return str_to_ind_dict, reverse_dict

    
def apply_dict(dict_keys, X):
    res = []
    for x in X:
        res.append(dict_keys[x])
    return res
    
    
class Classification_model:
    def __init__(self, alpha, input_shape, num_classes, cache_dir, train_head, batch_size=32):
        self.alpha = alpha
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.cache_dir = cache_dir
        self.batch_size = batch_size
        if not os.path.isdir(self.cache_dir):
            os.makedirs(self.cache_dir)
        self.build_model(train_head)
    
    def build_model(self, train_head):        
        # If imagenet weights are being loaded, alpha can be one of`0.25`, `0.50`, `0.75` or `1.0` only.
        base_model = MobileNet(input_shape=self.input_shape, alpha=self.alpha, weights=None, 
                               include_top=False, pooling='avg') 
        
        # Use to pretrain head
        if train_head == True:
            for layer in base_model.layers[:-4]:
                layer.trainable = False
            
        # base_model.summary()
        op = Dense(128, activation='relu')(base_model.output)
        op = Dropout(0.00001)(op)
        output_tensor = Dense(self.num_classes, activation='softmax')(op)
       
        self.model = Model(inputs=base_model.input, outputs=output_tensor)
        self.model.summary()
        
    def train(self, train_dir, train_csv, epochs, learning_rate=0.00001):
        train = pd.read_csv(train_csv)
        train_x, train_y = train['file_name'].as_matrix(), train['label'].as_matrix()
        
        self.str2ind_dict, self.ind2str_dict = get_str2numb_numb2dict(train_y)
        train_y = np.array(apply_dict(self.str2ind_dict, train_y))

        train_generator = WordsSequence(img_dir=train_dir,
                                        input_shape = self.input_shape,
                                        x_set=train_x,
                                        y_set=train_y,
                                        batch_size=self.batch_size,
                                        classification=True)
                                        
        optimize = rmsprop(lr=learning_rate, decay=1e-6)
        # optimize = Adam(lr=0.00000001) 
        # optimize = SGD()
        # optimize = Adagrad(lr=0.0001)
        
        self.model.compile(loss='categorical_crossentropy', optimizer=optimize, metrics=['categorical_accuracy'])
        file_path_to_checkpoint = self.cache_dir + '/checkpoint-{epoch:02d}.h5'
        print(file_path_to_checkpoint)
        self.model.fit_generator(train_generator,
                                 steps_per_epoch=len(train_x)//self.batch_size,
                                 shuffle=True,
                                 epochs=epochs,
                                 verbose=1, 
                                 callbacks=[ModelCheckpoint(filepath=file_path_to_checkpoint, save_weights_only=True)])
        path_to_save_model = self.cache_dir + '/' + 'final_model.h5'
        path_to_save_weights = self.cache_dir + '/' + 'final_weights.h5'
        self.model.save(path_to_save_cache)
        self.save_weights(path_to_save_weights)  
        
    def save_weights(self, filename):
        self.model.save_weights(filename)
        
    def load_weights(self, filename):
        self.model.load_weights(filename, by_name=True, skip_mismatch=True)
        
    def predict(self, test_dir, test_csv): 

        test = pd.read_csv(test_csv)
        test_x, test_y = test['file_name'].as_matrix(), test['label'].as_matrix()
        self.str2ind_dict, self.ind2str_dict = get_str2numb_numb2dict(test_y)
        test_generator = WordsSequence(img_dir=test_dir,
                                        input_shape = self.input_shape,
                                        x_set=test_x,
                                        batch_size=self.batch_size,
                                        classification=True)
                                        
        pred = np.argmax(self.model.predict_generator(test_generator, verbose=1), axis=1)  
        res = np.array(apply_dict(self.ind2str_dict, pred))
        
        count = 0
        for i,j in zip(res, test_y):
            if i == j:
                count += 1
        print('word accuracy: ', count / len(test_y))
        
        count = 0
        autors = np.unique(test_y)
        autor_ind = [np.argwhere(test_y == a) for a in autors]
        for i,inds in enumerate(autor_ind):
            p = Counter(np.ravel(res[inds])).most_common(1)[0][0]
            if p == autors[i]:
                count += 1

        print('top-1 autor accuracy: ', count / len(autors))
        
        сount = 0
        for i,inds in enumerate(autor_ind):
            p = [pair[0] for pair in Counter(np.ravel(res[inds])).most_common(5)]
            if autors[i] in p:
                сount += 1

        print('top-5 autor accuracy: ', сount / len(autors))


In [3]:
import sys 
sys.path.insert(0, '../')
import numpy as np
import pandas as pd
from keras.utils import Sequence
from sklearn.utils import shuffle

from keras.utils import to_categorical

    
class WordsSequence(Sequence):
    def __init__(self, img_dir, input_shape, x_set, y_set=None, batch_size=1, classification=False):
        if classification:
            if y_set is not None:
                self.x, self.y = x_set, y_set
                self.dataset = pd.DataFrame(data={'x': self.x, 'y': self.y, 'used': np.zeros_like(self.y)})
            else:
                self.x, self.y = x_set, None
        else:
            if y_set is not None:
                self.x, self.y = x_set, y_set
                self.x, self.y = shuffle(self.x, self.y)
            else:
                self.x, self.y = x_set, None

        self.img_dir = img_dir
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.classification = classification


    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        if self.classification:
            if self.y is None:
                batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
                return np.array([self.preprocess(fetch(self.img_dir, name)) for name in batch_x])
            
            unused = self.dataset.loc[self.dataset['used'] == 0]
            if len(unused) >= self.batch_size:
                batch_indices = unused.sample(n=self.batch_size).index
            else:
                batch_indices = unused.sample(n=self.batch_size, replace=True).index

            self.dataset.loc[batch_indices, 'used'] = 1
            batch_x = self.dataset.iloc[batch_indices]['x'].values
            batch_y = self.dataset.iloc[batch_indices]['y'].values 
            return np.array([self.preprocess(fetch(self.img_dir, name)) for name in batch_x]), to_categorical(batch_y, 95)

        if self.y is None:
            x = self.x[idx]
            return np.expand_dims(self.preprocess(fetch(self.img_dir, x)), axis=0)
            
        
        curr_x = self.x[idx]
        curr_y = self.y[idx]

        x_1_images = self.preprocess(fetch(self.img_dir, curr_x[0]))
        x_2_images = self.preprocess(fetch(self.img_dir, curr_x[1]))
        return [np.expand_dims(x_1_images, axis=0), np.expand_dims(x_2_images, axis=0)], np.array([curr_y])
       

    def preprocess(self, img):
        assert len(img.shape) == 3

        h, w, _ = img.shape
        if h / w <= self.input_shape[0] / self.input_shape[1]:
            img = resize(img, (self.input_shape[1], int(self.input_shape[1] * h / w)))
        else:
            img = resize(img, (int(self.input_shape[0] * w / h), self.input_shape[0]))

        img = pad(img, (self.input_shape[1], self.input_shape[0]))
        return img / 255. 

    def on_epoch_end(self):
        if (not self.classification) and (self.y is not None):
            self.x, self.y = shuffle(self.x, self.y)
        
        if self.classification and self.y is not None:
            self.dataset = pd.DataFrame(data={'x': self.x, 'y': self.y, 'used': np.zeros_like(self.y)})
            self.dataset = self.dataset.sample(n=len(self.dataset))


In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import cv2
from os.path import join


def fetch(img_dir, name):
    img = cv2.imread(join(img_dir, name))
    if img.shape == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.shape == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img


def resize(img, size=(1024, 768)):
    assert len(size) == 2
    return cv2.resize(img, size, interpolation=cv2.INTER_CUBIC)


def pad(img, size=(1024, 768)):
    assert len(img.shape) == 3
    assert len(size) == 2
    h, w, _ = img.shape
    assert w <= size[0] and h <= size[1]
    pad_vert = np.ceil((size[1]-h) / 2).astype(np.uint32)
    pad_hor = np.ceil((size[0]-w) / 2).astype(np.uint32)

    padded = np.full((size[1], size[0], 3), 255).astype(np.uint8)
    padded[pad_vert:pad_vert+h, pad_hor:pad_hor+w, :] = img.copy()
    return padded


In [42]:


cache_dir = 'Literature data/classification_cache'
train_dir = 'Literature data/train_set'
# validation_dir = 'C:/Users/Anastasia/Pictures/words_validation'
test_dir = 'Literature data/test_set'

# Train
model = Classification_model(alpha=1, input_shape=(160,160,3), num_classes=39, cache_dir=cache_dir, train_head=False)
#model.load_weights('writer_identification-master/classification/final_weigths_alpha_0.75/final.h5')
model.train(train_dir, "train.csv", epochs=100)

# Predict
#model = Classification_model(alpha=1, input_shape=(160,160,3), num_classes=95, cache_dir=cache_dir, train_head=False)
#model.load_weights('final_weigths_alpha_1/final.h5') #42
#model.predict(test_dir, "../data/test.csv")

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 160, 160, 3)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 161, 161, 3)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 80, 80, 32)        864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 80, 80, 32)        128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 80, 80, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 80, 80, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 80, 80, 32)        128



Epoch 1/100


ValueError: Error when checking target: expected dense_20 to have shape (39,) but got array with shape (95,)

In [17]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

def filter_word(image):
    img = image[:,:,0]
    hist = np.sum(1 - img/255, axis=1)
    indices_black = np.ravel(np.argwhere(hist > 0))
    start = indices_black[0]
    black_areas = []
    for i, ind in enumerate(indices_black[:-1]):
        if ind != indices_black[i + 1] - 1:
            black_areas.append((start, ind))
            start = indices_black[i + 1]
        if indices_black[i + 1] == indices_black[-1]:
            black_areas.append((start, indices_black[i + 1]))
    if len(black_areas) == 0:
        return image
    
    max_length = -1
    word_area = ()
    for area in black_areas:
        if (area[1] - area[0]) > max_length:
            max_length = area[1] - area[0]
            word_area = area
    n, m, _ = image.shape
    start = 0 if (word_area[0] - 5 < 0 ) else word_area[0] - 5
    end = n if (word_area[1] + 5 >= n ) else word_area[1] + 5
    return image[start:end, :]
    
    
def move_in_one_folder(path, save_path, csv_name):
    names = []
    autor_inds = []
    for variant in os.listdir(path):
        autor_path = os.path.join(path, variant)
        for autor in os.listdir(autor_path):
            words_path = os.path.join(autor_path, autor)
            for word in os.listdir(words_path):
                img = cv2.imread(os.path.join(words_path, word))
                new_name = str(variant[-1]) + '_' + str(autor) + '_' + word
                if np.sum(1 - (img[:,:, 0] /255)) > 150:
                    img = filter_word(img)
                    cv2.imwrite(os.path.join(save_path, new_name), img)
                    names.append(new_name) 
                    autor_inds.append(str(variant[-1]) + '_' + str(autor))
    pd.DataFrame({"file_name": names, "label": autor_inds}) \
        .to_csv(csv_name, index=False, header=True, columns = ["file_name", "label"])    
        
        
def compute_max_shape(path):
    height, width = 0, 0 
    for word in os.listdir(path):
        word_path = os.path.join(path, word)
        img = cv2.imread(word_path)
        n, m, _ = img.shape
        if n > height:
            height = n
        if m >  width:
             width = m
    return (height, width)

In [16]:
train_path = 'Literature Data/train_set'
words_path = train_path
csv_name = 'train.csv'
names = []
authors = []
for word_file in os.listdir(words_path):
    label =  word_file[:word_file.find('word' )-1]
    names.append(word_file)
    authors.append(label)
pd.DataFrame({"file_name": names, "label": authors}) \
        .to_csv(csv_name, index=False, header=True, columns = ["file_name", "label"])    

In [48]:
test_path = 'Literature Data/test_set'
words_path = test_path
csv_name = 'test.csv'
names = []
authors = []
for word_file in os.listdir(words_path):
    label =  word_file[:word_file.find('word' )-1]
    names.append(word_file)
    authors.append(label)
pd.DataFrame({"file_name": names, "label": authors}) \
        .to_csv(csv_name, index=False, header=True, columns = ["file_name", "label"])    

In [24]:
model1 = Classification_model(alpha=1, input_shape=(160,160,3), num_classes=39, cache_dir=cache_dir, train_head=False)

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 160, 160, 3)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 161, 161, 3)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 80, 80, 32)        864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 80, 80, 32)        128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 80, 80, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 80, 80, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 80, 80, 32)        128 

In [25]:
model1.cache_dir

'Literature data/classification_cache'

In [33]:
model2 = model1.build_model(model1 )

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 160, 160, 3)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 161, 161, 3)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 80, 80, 32)        864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 80, 80, 32)        128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 80, 80, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 80, 80, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 80, 80, 32)        128 

In [41]:
model1.load_weights('Literature Data/classification_cache')

OSError: Unable to open file (unable to open file: name = 'Literature Data/classification_cache', errno = 13, error message = 'Permission denied', flags = 0, o_flags = 0)

In [36]:
model1

<__main__.Classification_model at 0x2068e4ffdd8>

In [45]:
try:
    model.predict(test_dir, 'test.csv')
except (TypeError, AttributeError):
    print('None type Exception')

None type Exception




In [64]:
self = model1
test_csv = 'test.csv'
test = pd.read_csv(test_csv)
test_x, test_y = test['file_name'].as_matrix(), test['label'].as_matrix()
self.str2ind_dict, self.ind2str_dict = get_str2numb_numb2dict(test_y)
test_generator = WordsSequence(img_dir=test_dir,
                                input_shape = self.input_shape,
                                x_set=test_x,
                                batch_size=self.batch_size,
                                classification=True)

pred = np.argmax(self.model.predict_generator(test_generator, verbose=1), axis=1)  
res = np.array(apply_dict(self.ind2str_dict, pred))

count = 0
for i,j in zip(res, test_y):
    if i == j:
        count += 1
print('word accuracy: ', count / len(test_y))

count = 0
autors = np.unique(test_y)
autor_ind = [np.argwhere(test_y == a) for a in autors]
for i,inds in enumerate(autor_ind):
    p = Counter(np.ravel(res[inds])).most_common(1)[0][0]
    if p == autors[i]:
        count += 1

print('top-1 autor accuracy: ', count / len(autors))

сount = 0
for i,inds in enumerate(autor_ind):
    p = [pair[0] for pair in Counter(np.ravel(res[inds])).most_common(5)]
    if autors[i] in p:
        сount += 1

print('top-5 autor accuracy: ', сount / len(autors))

  after removing the cwd from sys.path.


word accuracy:  0.020572450805008944
top-1 autor accuracy:  0.05405405405405406
top-5 autor accuracy:  0.05405405405405406


In [53]:
indexes = model1.ind2str_dict


In [67]:
def get_str2numb_numb2dict(vect):
    str_to_ind_dict = {}
    count = 0
    for v in vect:
        if v not in str_to_ind_dict.keys():
            str_to_ind_dict[v] = count
            count += 1
    reverse_dict = {v:k for k, v in str_to_ind_dict.items()}
    return str_to_ind_dict, reverse_dict

    
def apply_dict(dict_keys, X):
    res = []
    for x in X:
        if x in dict_keys.keys():
            res.append(dict_keys[x])
        else:
            res.append(-1)
    return res

In [54]:
for x in X: 
    print(res = np.array(apply_dict(indexes, pred))

KeyError: 38

In [56]:
indexes[0]

'1_0'

In [58]:
dist_res = []
for p in pred:
    if p not in dist_res:
        dist_res.append(p)
print(dist_res)

[28, 34, 5, 27, 4, 23, 10, 12, 32, 19, 0, 38, 1, 13]


In [61]:
indexes.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36])

In [65]:
count

2

In [68]:
cache_dir = 'Literature data/classification_cache'
train_dir = 'Literature data/train_set'
# validation_dir = 'C:/Users/Anastasia/Pictures/words_validation'
test_dir = 'Literature data/test_set'

# Train
model10 = Classification_model(alpha=1, input_shape=(160,160,3), num_classes=36, cache_dir=cache_dir, train_head=False)
#model.load_weights('writer_identification-master/classification/final_weigths_alpha_0.75/final.h5')
model.train(train_dir, "train.csv", epochs=100)

# Predict
model10 = Classification_model(alpha=1, input_shape=(160,160,3), num_classes=36, cache_dir=cache_dir, train_head=False)
#model.load_weights('final_weigths_alpha_1/final.h5') #42
model.predict(test_dir, 'test.csv')

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 160, 160, 3)       0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 161, 161, 3)       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 80, 80, 32)        864       
_________________________________________________________________
conv1_bn (BatchNormalization (None, 80, 80, 32)        128       
_________________________________________________________________
conv1_relu (ReLU)            (None, 80, 80, 32)        0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 80, 80, 32)        288       
_________________________________________________________________
conv_dw_1_bn (BatchNormaliza (None, 80, 80, 32)        128



Literature data/classification_cache/checkpoint-{epoch:02d}.h5
Epoch 1/100


ValueError: Error when checking target: expected dense_20 to have shape (39,) but got array with shape (95,)