In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from subprocess import check_output
print(check_output(["ls", "/mnt/Cdiscount"]).decode("utf8"))

test.bson
train.bson



In [2]:
import io
import bson                       # this is installed with the pymongo package
import matplotlib.pyplot as plt
from skimage.data import imread   # or, whatever image library you prefer
from skimage import transform
import multiprocessing as mp      # will come in handy due to the size of the data
import random

%matplotlib inline

def transform_image(pic_data):
    picture = imread(io.BytesIO(pic_data))
    img = transform.resize(picture, (192, 192))
    return img

In [3]:
def get_category_dict():
    category = pd.read_csv('Cdiscount/category_names.csv')
    mp = dict()
    C = category.shape[0]
    for i in range(C):
        mp[category["category_id"][i]] = i
    return mp

category_dict = get_category_dict()
num_classes = len(category_dict)

In [4]:
import tensorflow as tf

import models.resnet as resnet

with tf.device('/gpu:0'):
    x = tf.placeholder(tf.float32, shape=(None, 192, 192, 3))
    y_real = tf.placeholder(tf.float32, shape=(None, num_classes))

    y_net = resnet.inference(x, True, num_classes, [3, 4, 6, 3])

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_real, logits=y_net))
    
    regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

    loss = tf.add_n([cross_entropy] + regularization_losses)
    tf.summary.scalar('loss', loss)
    
    train_step = tf.train.AdamOptimizer(1e-5, name='adam2').minimize(loss)
    correct_prediction = tf.equal(tf.argmax(y_net, 1), tf.argmax(y_real, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))


In [5]:
sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

sess.run(tf.global_variables_initializer())

writer = tf.summary.FileWriter("graph/Cdiscount")
writer.add_graph(sess.graph)

In [6]:
# scan dataset and generate testing set

def generate_teseting_set(filename = '/mnt/Cdiscount/train.bson'):
    data = bson.decode_file_iter(open(filename, 'rb'))
    test_img_list = list()
    test_lab_list = list()
    test_ids = set()
    for c, d in enumerate(data):
        if random.random() > 1 / 700.0:
            continue
        product_id = d['_id']
        test_ids.add(product_id)
        category_id = d['category_id']
        for e, pic in enumerate(d['imgs']):
            img_data = transform_image(pic['picture'])
            test_img_list.append(img_data)
            test_lab_list.append(category_dict[category_id])
    test_img = np.ndarray((len(test_img_list), 160, 160, 3), buffer=np.asarray(test_img_list), dtype = float)
    test_lab = np.ndarray((len(test_lab_list),), buffer=np.asarray(test_lab_list), dtype = int)
    return test_ids, test_img, test_lab

%time test_ids, test_img, test_lab = generate_teseting_set()

  warn("The default mode, 'constant', will be changed to 'reflect' in "


CPU times: user 3min 3s, sys: 5min 42s, total: 8min 46s
Wall time: 3min 30s


In [7]:
print(len(test_ids))
print(test_img.shape)
print(test_lab.shape)

10071
(17422, 160, 160, 3)
(17422,)


In [8]:
# define next function

iter_counter = 1

def get_train_data_iter():
    global iter_counter
    data = bson.decode_file_iter(open('/mnt/Cdiscount/train.bson', 'rb'))
    for c, d in enumerate(data):
        product_id = d['_id']
        if product_id in test_ids:
            continue
        category_id = d['category_id']
        for e, pic in enumerate(d['imgs']):
            img_data = transform_image(pic['picture'])
            yield img_data, category_dict[category_id]
    iter_counter += 1
        
train_iter = get_train_data_iter()

INIT_OUTS = [0] * num_classes
        
def next_batch(bsize = 50):
    global train_iter
    global INIT_OUTS
    batch_img_list = list()
    batch_out_list = list()
    for _ in range(bsize):
        try:
            img, lab = next(train_iter)
        except StopIteration:
            train_iter = get_train_data_iter()
            img, lab = next(train_iter)
        batch_img_list.append(img)
        out = np.asarray(INIT_OUTS, dtype = float)
        out[lab] = 1
        batch_out_list.append(out)
    batch_img = np.ndarray((bsize, 160, 160, 3), buffer=np.asarray(batch_img_list), dtype = float)
    batch_out = np.ndarray((bsize, num_classes), buffer=np.asarray(batch_out_list), dtype = float)
    return batch_img, batch_out

In [12]:
import sys

for i in range(sys.maxsize):
    img, lab = next_batch(200)
    _, loss_value = sess.run([train_step, cross_entropy],
                             feed_dict={x: img, y_real: lab})
    if i % 100 == 0:
        train_accuracy = accuracy.eval(feed_dict={x: img, y_real: lab})
        print('iter %d, step %d, training accuracy %g, loss %g'
              % (iter_counter, i, train_accuracy, loss_value))

  warn("The default mode, 'constant', will be changed to 'reflect' in "


iter 1, step 0, training accuracy 0.13, loss 5.63124
iter 1, step 100, training accuracy 0.13, loss 5.70038


KeyboardInterrupt: 