In [35]:
from keras.applications.vgg19 import VGG19
from keras.preprocessing import image as k_image
from keras.applications.vgg19 import preprocess_input
from keras.models import Model
import numpy as np
from keras.utils import plot_model

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import math
import h5py

import os
import random
import pandas
import time

import collections

In [36]:
# Full VGG19 network initialised with the 'imagenet' weights. The truncated
# per-block models built in the following cell all share this one instance.
base_model = VGG19(weights='imagenet')

In [3]:
def _pool_feature_model(pool_layer_name):
    """Return a model mapping ``base_model``'s input to the named layer's output."""
    pool_layer = base_model.get_layer(pool_layer_name)
    return Model(inputs=base_model.input, outputs=pool_layer.output)


# One truncated network per VGG19 block, each ending at that block's pooling layer.
model_1 = _pool_feature_model('block1_pool')
model_2 = _pool_feature_model('block2_pool')
model_3 = _pool_feature_model('block3_pool')
model_4 = _pool_feature_model('block4_pool')
model_5 = _pool_feature_model('block5_pool')

<keras.engine.training.Model object at 0x138dcfd30>


In [6]:
def get_block_features(img_path, model_):
    """Load an image and run it through a (truncated) VGG19 model.

    Parameters
    ----------
    img_path : str
        Path of the image file to load; it is resized to 224x224 on load.
    model_ : object with a ``predict`` method
        Model applied to the preprocessed single-image batch.

    Returns
    -------
    tuple
        ``(raw, features)``: ``raw`` is the un-preprocessed image batch with a
        leading batch axis of length 1, and ``features`` is
        ``model_.predict`` of the preprocessed batch.
    """
    img = k_image.load_img(img_path, target_size=(224, 224))
    raw = np.expand_dims(k_image.img_to_array(img), axis=0)
    # preprocess_input may overwrite its argument in place, so give it a copy;
    # otherwise the "raw" image returned to the caller would silently contain
    # the mean-subtracted, channel-swapped data instead of the original pixels.
    batch = preprocess_input(raw.copy())
    return raw, model_.predict(batch)

#     block1_pool_features = model_1.predict(x)
#     block2_pool_features = model_2.predict(x)
#     block3_pool_features = model_3.predict(x)
#     block4_pool_features = model_4.predict(x)
#     block5_pool_features = model_5.predict(x)
    
#     return (block1_pool_features,
#             block2_pool_features,
#             block3_pool_features,
#             block4_pool_features,
#             block5_pool_features)


In [44]:
# Draw a random subset of the photo directory and extract block-5 pooled
# features for every sampled image. `foo` (the sampled file names) keeps its
# name because a later cell maps class members back to files through it.
photo_dir = 'yelp_photos/photos'
# os.listdir returns entries in arbitrary order; sorting makes the sample
# repeatable whenever the `random` module has been seeded.
photo_names = sorted(os.listdir(photo_dir))
# random.sample raises ValueError when asked for more items than exist, so
# cap the sample size at the number of files actually present.
foo = random.sample(photo_names, min(1000, len(photo_names)))
images = []
block_features = []
s_t = time.time()
for img_path in foo:
    image, block_feature = get_block_features(os.path.join(photo_dir, img_path), model_5)
    images.append(image)
    block_features.append(block_feature)
# Elapsed wall-clock seconds for the whole extraction loop.
print(time.time()-s_t)

1055.9451739788055


TypeError: Error when checking model : data should be a Numpy array, or list/dict of Numpy arrays. Found: yelp_photos/photos/mgrsnF_xwXOF3DzuxvinqA.jpg...

In [8]:
def get_index_by_convolution(block_features):
    """Return the first batch item with its last axis moved to the front.

    The input is indexed as ``[batch, d1, d2, d3]``; the result satisfies
    ``result[k, i, j] == block_features[0, i, j, k]``.
    """
    # Swapping axes (1, 2) and then (1, 3) reorders the axes to (0, 3, 1, 2).
    reordered = np.swapaxes(np.swapaxes(block_features, 1, 2), 1, 3)
    return reordered[0]

def get_max_values(block_features):
    """Return the maximum value found in each slice along the first axis.

    Parameters
    ----------
    block_features : array-like
        Array whose first axis enumerates the slices; all remaining axes are
        flattened together before taking the maximum.

    Returns
    -------
    np.ndarray
        1-D array of length ``block_features.shape[0]``.
    """
    features = np.asarray(block_features)
    # reshape (unlike np.resize) never repeats or drops elements, so every
    # value of a slice takes part in the maximum whatever the input rank is.
    flat_size = int(np.prod(features.shape[1:]))
    return features.reshape(features.shape[0], flat_size).max(axis=1)

def get_df_from_blocks(blocks):
    """Flatten a sequence of feature blocks into a long-format DataFrame.

    Each block is passed through ``get_index_by_convolution`` and then
    ``get_max_values``; every resulting value becomes one row with the keys
    ``doc_id`` (position of the block in ``blocks``), ``word_id`` (position of
    the value within that block's result) and ``count`` (the value itself).
    """
    rows = [
        {'doc_id': doc_index, 'word_id': pool_index, 'count': pool_feature}
        for doc_index, block in enumerate(blocks)
        for pool_index, pool_feature in enumerate(
            get_max_values(get_index_by_convolution(block)))
    ]
    return pandas.DataFrame(rows)


In [10]:
def bmm_gibbs(doc_label, word_id, count, W, α, γ, K):
    """Collapsed Gibbs sampler assigning each document to one of ``K`` classes.

    Parameters
    ----------
    doc_label, word_id, count : 1-D array-likes of equal length
        Row ``j`` states that document ``doc_label[j]`` contains word
        ``word_id[j]`` (an integer in ``[0, W)``) with weight ``count[j]``.
        Repeated (document, word) rows are summed.
    W : int
        Number of distinct word ids (width of the occurrence table).
    α : float
        Pseudo-count added to each class's document count.
    γ : float
        Pseudo-count added to each (class, word) occurrence count.
    K : int
        Number of classes.

    Yields
    ------
    (z, occurrences) : tuple of np.ndarray
        After every full sweep over the documents: a copy of ``z`` (``z[i]`` is
        the class of the i-th distinct document label, in ``np.unique`` order)
        and a copy of the ``(K, W)`` table where ``occurrences[k, w]`` is the
        total weight of word ``w`` in documents currently of class ``k``.
        The generator never terminates on its own.
    """
    # doc_labels = distinct values of doc_label
    # doc_index  = per-row position such that doc_labels[doc_index[j]] == doc_label[j]
    doc_labels, doc_index = np.unique(doc_label, return_inverse=True)
    n_docs = len(doc_labels)
    word_ids = np.asarray(word_id)
    counts = np.asarray(count, dtype=float)

    # Extract each document's words once, outside the endless sweep loop.
    # Duplicate word ids inside a document are merged: the in-place fancy-index
    # updates below apply only one of several duplicate indices, which would
    # otherwise let `occurrences` drift away from `word_count`.
    documents = []
    for i in range(n_docs):
        rows = doc_index == i
        w, merged = np.unique(word_ids[rows], return_inverse=True)
        c = np.bincount(merged, weights=counts[rows], minlength=len(w))
        documents.append((w, c, c.sum()))

    # z[i] = class of document i; doc_count[k] = number of documents of class k
    z = np.random.choice(K, n_docs)
    doc_count = np.bincount(z, minlength=K)

    # occurrences[k, w] = total weight of word w in documents of class k
    # word_count[k]     = total weight of all words in documents of class k
    occurrences = np.zeros((K, W))
    for (w, c, _), k in zip(documents, z):
        occurrences[k, w] += c
    word_count = occurrences.sum(axis=1)

    while True:
        for i, (w, c, total) in enumerate(documents):
            # Remove document i from the statistics of its current class.
            old_class = z[i]
            occurrences[old_class, w] -= c
            word_count[old_class] -= total
            doc_count[old_class] -= 1

            # Log probability that this document belongs to each class k,
            # marginalized over θ and β, evaluated for all classes at once.
            log_word = np.log(occurrences[:, w] + γ)
            log_norm = np.log(W * γ + word_count)[:, None]
            logp = (c * (log_word - log_norm)).sum(axis=1)
            logp += np.log(doc_count + α)

            # Subtract the maximum before exponentiating to avoid underflow.
            p = np.exp(logp - logp.max())
            p /= p.sum()

            # Draw the new class and add the document's counts back.
            new_class = np.random.choice(K, p=p)
            z[i] = new_class
            occurrences[new_class, w] += c
            word_count[new_class] += total
            doc_count[new_class] += 1

        yield np.copy(z), np.copy(occurrences)

In [45]:
# Long-format table with the columns doc_id / word_id / count, one row per
# image per pooled feature value; its columns are fed to the Gibbs sampler below.
A = get_df_from_blocks(block_features)

In [56]:
# Number of full Gibbs sweeps to draw from the sampler.
NUM_ITERATIONS = 20

# NOTE(review): W=512 presumably matches the channel count of the pooled
# features used to build A -- confirm against the model output.
g = bmm_gibbs(
    doc_label=A['doc_id'],
    word_id=A['word_id'],
    count=A['count'],
    W=512, α=10, γ=.1, K=20,
)

# Each sweep yields (class assignments, occurrence table); only the assignments
# are kept, giving a matrix with one row per iteration and one column per
# unique doc_id.
sweeps = (next(g) for _ in range(NUM_ITERATIONS))
res = np.stack([assignments for assignments, _ in sweeps])

In [58]:
def show_images(images, savepath, cols = 1, titles = None):
    """Lay out a list of images on a single matplotlib figure and save it.

    Parameters
    ----------
    images : list of np.ndarray
        Images compatible with ``imshow``; 2-D arrays are drawn in grayscale.
    savepath : str
        File the rendered figure is written to.
    cols : int, default 1
        Number of columns in the grid; the number of rows is
        ``ceil(n_images / cols)``.
    titles : list of str, optional
        One title per image; must have the same length as ``images``.
        Defaults to ``'Image (1)'``, ``'Image (2)'``, ...

    Raises
    ------
    AssertionError
        If ``titles`` is given and its length differs from ``images``.
    """
    assert (titles is None) or (len(images) == len(titles))
    n_images = len(images)
    if titles is None:
        titles = ['Image (%d)' % i for i in range(1, n_images + 1)]

    # add_subplot takes (nrows, ncols, index) and requires plain integers;
    # np.ceil returns a float, so convert explicitly.
    n_cols = int(cols)
    n_rows = int(np.ceil(n_images / float(n_cols)))

    fig = plt.figure()
    for n, (image, title) in enumerate(zip(images, titles)):
        ax = fig.add_subplot(n_rows, n_cols, n + 1)
        if image.ndim == 2:
            # Set the colormap on this axes only, instead of changing the
            # process-wide default with plt.gray().
            ax.imshow(image, cmap='gray')
        else:
            ax.imshow(image)
        ax.set_title(title)
    # The default figure size is scaled by the number of images in both directions.
    fig.set_size_inches(np.array(fig.get_size_inches()) * n_images)
    fig.savefig(savepath, bbox_inches='tight')
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [59]:
# Array copy of the class assignments from the last Gibbs iteration.
a = np.array(res[-1])

# Invert the assignment vector: class label -> list of positions of the
# documents assigned to that class, in ascending position order.
topic_to_index = {}
for img_index, topic in enumerate(res[-1]):
    topic_to_index.setdefault(topic, []).append(img_index)

In [62]:
plt.close('all')
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('class_img', exist_ok=True)
for topic in topic_to_index:
    members = topic_to_index[topic]
    print(topic, len(members))
    # Pick the (at most 16) members to draw before touching the disk, so large
    # classes do not decode hundreds of images only to discard most of them.
    if len(members) > 16:
        members = random.sample(members, 16)
    img_to_show = [
        mpimg.imread(os.path.join('yelp_photos/photos', foo[images_index]))
        for images_index in members
    ]

    # Roughly square grid: ceil(sqrt(n)) columns.
    num_cols = math.ceil(math.sqrt(len(img_to_show)))
    show_images(img_to_show, savepath='class_img/class_'+str(topic)+'.png', cols=num_cols)
        
        

3 227
8 116
15 99
7 53
6 142
4 95
11 174
5 48
19 33
14 1
12 1
17 1
10 4
1 1
16 1
2 1
9 1
0 1
18 1


In [None]:
# For every pair of consecutive Gibbs iterations, count how many documents kept
# the same class. `res` holds one row of class assignments per iteration, so
# whole rows are compared; indexing a row with [0] would compare only the first
# document and produce nothing but 0 or 1.
agree = [int(np.equal(res[i], res[i-1]).sum()) for i in range(1, len(res))]
plt.xlabel('iteration pair')
plt.ylabel('documents with unchanged class')
plt.plot(range(len(agree)), agree)