In [1]:
"""In this assignment, you will train your own network for CIFAR-10 classification using the deep learning framework MXNet.
   With MXNet, you only need to define the network by connecting symbols and then set the hyperparameters to train
   it. You can also save your model and load a pretrained model to fine-tune the network. Make sure to use GPU
   mode. You should achieve at least 80% accuracy on the validation set."""

"""Visit http://mxnet.io/get_started/index.html to get familiar with MXNet!"""
   
import sys
import os
import numpy as np
import mxnet as mx
import logging

# download data if necessary
def _download(data_dir):
    if not os.path.isdir(data_dir):
        os.system("mkdir " + data_dir)
    os.chdir(data_dir)
    if (not os.path.exists('train.rec')) or \
       (not os.path.exists('test.rec')) :
        os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip")
        os.system("unzip -u cifar10.zip")
        os.system("mv cifar/* .; rm -rf cifar; rm cifar10.zip")
    os.chdir("..")


# data
def get_iterator(data_shape=(3, 28, 28)):
    """Create the training and validation record iterators.

    Reads the module-level ``data_dir`` and ``batch_size``; both must be
    defined before this function is called. When ``data_dir`` does not
    contain ``'://'`` it is treated as a local folder and
    ``_download(data_dir)`` is invoked first.

    Args:
        data_shape: Forwarded as ``data_shape`` to both iterators.

    Returns:
        A ``(train, val)`` tuple of ``mx.io.ImageRecordIter`` objects. The
        training iterator is created with ``rand_crop`` and ``rand_mirror``
        enabled; the validation iterator has both disabled.
    """
    is_local_dir = '://' not in data_dir
    if is_local_dir:
        _download(data_dir)

    # Settings that are the same for both iterators.
    shared_kwargs = dict(
        mean_img=os.path.join(data_dir, "mean.bin"),
        data_shape=data_shape,
        batch_size=batch_size,
    )

    train_iter = mx.io.ImageRecordIter(
        path_imgrec=os.path.join(data_dir, "train.rec"),
        rand_crop=True,
        rand_mirror=True,
        **shared_kwargs)

    val_iter = mx.io.ImageRecordIter(
        path_imgrec=os.path.join(data_dir, "test.rec"),
        rand_crop=False,
        rand_mirror=False,
        **shared_kwargs)

    return (train_iter, val_iter)


def get_net(num_classes=10):
    """Build the classification network as an MXNet symbol.

    Layout, in order:
      * group 1: conv(64)  + ReLU, 2x2 max pool with stride 2
      * group 2: conv(128) + ReLU, 2x2 max pool with stride 2
      * group 3: conv(256) + ReLU, conv(256) + ReLU, 2x2 max pool with stride 2
      * flatten, then two 4096-unit fully connected layers, each followed by
        ReLU and dropout (p=0.5)
      * a ``num_classes``-unit fully connected layer feeding ``SoftmaxOutput``

    All convolutions use a 3x3 kernel with 1-pixel padding. Layer names
    (``conv1_1`` ... ``fc8``, ``softmax``) are kept stable because saved
    checkpoints are keyed by them.

    Args:
        num_classes: Number of units in the final fully connected layer.

    Returns:
        The ``SoftmaxOutput`` symbol that terminates the network.
    """
    #####################################################################################
    # TODO: define your net                                                             #
    # Define symbols that using convolution and max pooling to extract better features  #
    # from input image.                                                                 #
    #####################################################################################
    sym = mx.symbol

    def conv_relu(bottom, filters, conv_name, relu_name):
        """3x3 convolution (pad 1) with ``filters`` outputs, followed by ReLU."""
        conv = sym.Convolution(
            data=bottom, kernel=(3, 3), pad=(1, 1), num_filter=filters, name=conv_name)
        return sym.Activation(data=conv, act_type="relu", name=relu_name)

    def max_pool(bottom, pool_name):
        """2x2 max pooling with stride 2."""
        return sym.Pooling(
            data=bottom, pool_type="max", kernel=(2, 2), stride=(2, 2), name=pool_name)

    def fc_relu_drop(bottom, fc_name, relu_name, drop_name):
        """4096-unit fully connected layer, then ReLU, then dropout with p=0.5."""
        fc = sym.FullyConnected(data=bottom, num_hidden=4096, name=fc_name)
        act = sym.Activation(data=fc, act_type="relu", name=relu_name)
        return sym.Dropout(data=act, p=0.5, name=drop_name)

    net = sym.Variable(name="data")

    # group 1
    net = conv_relu(net, 64, "conv1_1", "relu1_1")
    net = max_pool(net, "pool1")

    # group 2
    net = conv_relu(net, 128, "conv2_1", "relu2_1")
    net = max_pool(net, "pool2")

    # group 3
    net = conv_relu(net, 256, "conv3_1", "relu3_1")
    net = conv_relu(net, 256, "conv3_2", "relu3_2")
    net = max_pool(net, "pool3")

    # Groups 4 and 5 (two 512-filter conv layers plus a pool each) are not
    # part of this network; the classifier reads directly from pool3.

    # group 6
    flat = sym.Flatten(data=net, name="flatten")
    hidden = fc_relu_drop(flat, "fc6", "relu6", "drop6")

    # group 7
    hidden = fc_relu_drop(hidden, "fc7", "relu7", "drop7")

    # output
    scores = sym.FullyConnected(data=hidden, num_hidden=num_classes, name="fc8")
    softmax = sym.SoftmaxOutput(data=scores, name='softmax')
    #####################################################################################
    #                              END OF YOUR CODE                                     #
    #####################################################################################
    return softmax

ImportError: No module named mxnet

In [19]:
# Training driver: builds the network, configures hyperparameters and paths,
# optionally restores a checkpoint, then trains with per-epoch validation.
network = get_net()

################################################################################
# TODO: this is similar to the solver                                          #
################################################################################

############################ set hyperparameters ###############################
batch_size = 128     # also read by get_iterator()
weight_decay = 0.0   # same as weight reg
num_epoch = 100
learning_rate = 0.0001
devs = mx.gpu(0)     # set device id

################################  path #########################################
data_dir = 'cifar10/'   # also read by get_iterator()
chk_dir = 'model/'
chk_prefix = chk_dir + 'net12'
load_model = True   ## set true if you want to load a pretrained model and finetune with lower learning rate

# Create the checkpoint folder without going through a shell.
if not os.path.isdir(chk_dir):
    os.makedirs(chk_dir)

# `reload` is the Python 2 builtin. Presumably used so that basicConfig below
# takes effect when this cell is re-run in the same kernel -- TODO confirm.
reload(logging)
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

eval_metrics = ['accuracy']

## TopKAccuracy only allows top_k > 1
#eval_metrics.append(mx.metric.create('top_k_accuracy', top_k = 5))

if load_model:
    model_prefix = 'model/net1'
    model_iter = 29  # which model to load

    # Keep the auxiliary states as well as the weights so that a restored
    # model is complete. Explicit names also avoid overwriting IPython's
    # `_` / `__` output-history variables.
    loaded_symbol, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, model_iter)
else:
    arg_params = None
    aux_params = None
    model_iter = 0

model = mx.model.FeedForward(
    ctx           = devs,
    symbol        = network,
    arg_params    = arg_params,
    aux_params    = aux_params,
    begin_epoch   = model_iter,   # resume epoch numbering from the loaded checkpoint
    num_epoch     = num_epoch,
    learning_rate = learning_rate,
    momentum      = 0.9,
    wd            = weight_decay,
    initializer   = mx.init.Xavier(factor_type='in', magnitude=2.34)    ## weight initialization
    )

train_ite, val_ite = get_iterator()

model.fit(
    X                  = train_ite,
    eval_data          = val_ite,
    eval_metric        = eval_metrics,
    batch_end_callback = mx.callback.Speedometer(batch_size, 50),    ## report speed every 50 batches
    epoch_end_callback = mx.callback.do_checkpoint(chk_prefix, 1)    ## save your model after every epoch
    )

################################################################################
#                              END OF YOUR CODE                                #
################################################################################

2016-10-28 01:01:29,650 Start training with [gpu(0)]
2016-10-28 01:01:33,679 Epoch[29] Batch [50]	Speed: 2042.30 samples/sec	Train-accuracy=0.808281
2016-10-28 01:01:36,853 Epoch[29] Batch [100]	Speed: 2016.93 samples/sec	Train-accuracy=0.812031
2016-10-28 01:01:40,053 Epoch[29] Batch [150]	Speed: 2000.51 samples/sec	Train-accuracy=0.820469
2016-10-28 01:01:43,251 Epoch[29] Batch [200]	Speed: 2001.40 samples/sec	Train-accuracy=0.809375
2016-10-28 01:01:46,446 Epoch[29] Batch [250]	Speed: 2003.36 samples/sec	Train-accuracy=0.815000
2016-10-28 01:01:49,645 Epoch[29] Batch [300]	Speed: 2001.21 samples/sec	Train-accuracy=0.821875
2016-10-28 01:01:52,844 Epoch[29] Batch [350]	Speed: 2001.17 samples/sec	Train-accuracy=0.819219
2016-10-28 01:01:55,469 Epoch[29] Resetting Data Iterator
2016-10-28 01:01:55,469 Epoch[29] Time cost=25.570
2016-10-28 01:01:55,629 Saved checkpoint to "model/net12-0030.params"
2016-10-28 01:01:57,621 Epoch[29] Validation-accuracy=0.804885
2016-10-28 01:02:00,787 Epo