# In the notebook, I will write a code from beginning vs cifar10

https://www.kaggle.com/c/cifar-10

In [1]:
#import needed package
import sys
sys.path.insert(0,'../../')

import os
import shutil
from mxnet import gluon, init, autograd, nd
from mxnet.gluon import data as gdata, nn, loss as gloss, utils as gutils
import datetime
import mxnet as mx

In [2]:
#First read the file, all the files are zip in the ../../kaggle_cifar10, need first unzip them
demo = True

if demo == True:
    import zipfile
    for f in ["train_tiny.zip","test_tiny.zip","trainLabels.csv.zip"]:
        with zipfile.ZipFile("../../data/kaggle_cifar10/"+f,'r') as z:
                             z.extractall("../../data/kaggle_cifar10/")
#for train.7z, use the "7z x train.7z" to unzip in the shell

In [3]:
# next, we need to reorg the data
def reorg_data(data_dir, label_file, train_dir,test_dir,input_dir,valid_ratio):
    
    #read label id:label in the file
    with open(os.path.join(data_dir,label_file)) as f:
        lines = f.readlines()[1:]
        #print(lines)
        #without rstrip, there will be \n, for example ['1', 'frog\n']
        tokens = [l.rstrip().split(",") for l in lines]
        #print(tokens)
        #idx_label= dict(((int(idx), label) for idx, label in tokens))
        idx_label= dict((int(idx), label) for idx, label in tokens)

        #print(idx_label)
    labels = set(idx_label.values())
    print(labels)
    
    # got the train data number
    n_train_all = len(os.listdir(os.path.join(data_dir,train_dir)))
    n_train = n_train_all*(1-valid_ratio)
    assert 0< n_train < n_train_all
    n_train_per_label = n_train // len(labels)
    print(n_train_per_label)
    
    def mkdir_if_no_exist(path):
        if not os.path.exists(os.path.join(*path)):
            os.makedirs(os.path.join(*path))
   
    #next we need to put the files into different folders with the folder name as the label
    #track the file number copid to the folder
    label_count={}
    
    #copy file process
    for file in os.listdir(os.path.join(data_dir,train_dir)):
        idx = int(file.split(".")[0])
        label = idx_label[idx]
        
        if label not in label_count or label_count[label] < n_train_per_label:
            
            # put the label into dict or add 1 if already exist, make sure we can the number of train samples
            label_count[label]=label_count.get(label,0)+1
            #mkdir for the label 
            mkdir_if_no_exist([data_dir,input_dir,"train",label])
            shutil.copy(os.path.join(data_dir,train_dir,file), os.path.join(data_dir,input_dir,"train",label))
        else:
            mkdir_if_no_exist([data_dir,input_dir,"valid",label])
            shutil.copy(os.path.join(data_dir,train_dir,file), os.path.join(data_dir,input_dir,"valid",label))
    
    for file in os.listdir(os.path.join(data_dir, test_dir)):
        idx = int(file.split(".")[0])
        #print(idx)
        #label = idx_label[idx]
        #print(label)
        mkdir_if_no_exist([data_dir,input_dir,"test","unknown"])
        shutil.copy(os.path.join(data_dir,test_dir,file), os.path.join(data_dir,input_dir,"test","unknown"))

In [4]:
if demo:
    # 注意：此处使用小训练集和小测试集并将批量大小相应设小。
    # 使用 Kaggle 比赛的完整数据集时可设批量大小为较大整数。
    train_dir, test_dir, input_dir = 'train_tiny', 'test_tiny', "tiny_dataset"
else:
    train_dir, test_dir, input_dir = 'train', 'test', "final_dataset"

data_dir, label_file = '../../data/kaggle_cifar10', 'trainLabels.csv'
valid_ratio = 0.1
reorg_data(data_dir, label_file, train_dir, test_dir, input_dir,
                   valid_ratio)

{'bird', 'automobile', 'cat', 'truck', 'horse', 'dog', 'airplane', 'ship', 'frog', 'deer'}
9.0


In [5]:
#image transformation, resize, crop, flip and nomorlize
transform_train = gdata.vision.transforms.Compose([
    gdata.vision.transforms.Resize(40),
    gdata.vision.transforms.RandomResizedCrop(32, scale=(0.64,1), ratio = (1.0,1.0)),
    gdata.vision.transforms.RandomFlipLeftRight(),
    #for ToTensor, check this: https://pytorch.org/docs/0.2.0/_modules/torchvision/transforms.html#ToTensor
    gdata.vision.transforms.ToTensor(),
    gdata.vision.transforms.Normalize([0.4914, 0.4822, 0.4465],
                                      [0.2023, 0.1994, 0.2010])
])

# for test, just normalize

transform_test = gdata.vision.transforms.Compose([
    gdata.vision.transforms.ToTensor(),
    gdata.vision.transforms.Normalize([0.4914, 0.4822, 0.4465],
                                      [0.2023, 0.1994, 0.2010])])

In [6]:
#load data https://mxnet.incubator.apache.org/api/python/gluon/data.html#mxnet.gluon.data.vision.datasets.ImageFolderDataset
train_ds = gdata.vision.ImageFolderDataset(os.path.join(data_dir, input_dir, "train"), flag=1)
valid_ds = gdata.vision.ImageFolderDataset(os.path.join(data_dir, input_dir, "valid"), flag=1)
test_ds = gdata.vision.ImageFolderDataset(os.path.join(data_dir, input_dir, "test"), flag=1)

In [8]:
train_data = gdata.DataLoader(train_ds.transform_first(transform_train),10, shuffle=True, last_batch='keep')
for data,label in train_data:
    print(data.shape)

(10, 3, 32, 32)
(10, 3, 32, 32)
(10, 3, 32, 32)
(10, 3, 32, 32)
(10, 3, 32, 32)
(10, 3, 32, 32)
(10, 3, 32, 32)
(10, 3, 32, 32)
(10, 3, 32, 32)


In [11]:
#def the resnet-18 network
class Residual(nn.HybridBlock):
    def __init__(self, num_channels, use_1x1conv = False, strides =1, **kargs):
        
        #for the supper, need further check why need to put the Resiual and self in the parameters
        super(Residual, self).__init__(**kargs)
        
        # conv2 will not use the strides which will keep the shape I think
        self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1, strides=strides)
        self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2D(num_channels, kernel_size=1, strides=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm()
        self.bn2 = nn.BatchNorm()
        
    def hybrid_forward(self, F, X):
        #print("X:"+str(X.shape))
        Y = F.relu(self.bn1(self.conv1(X)))
        #print("Y:"+str(Y.shape))
        Y = self.bn2(self.conv2(Y))
        
        # please note, if I use "if use_1x1conv", I will get NameError: name 'use_1x1conv' is not defined
        #if use_1x1conv:
        if self.conv3:
            X=self.conv3(X)
        # if 1x1 conv, the channel size maybe different and then there should be broadcast for the channel level.
        # for the broad cast, can broadcast from 1 to n, not 2 to n, for example:
        #Check failed: l == 1 || r == 1 operands could not be broadcast together with shapes [4,6,6,6] [4,2,6,6]
        # But it will work if [4,6,6,6] plus [4,1,6,6]<- broadcat this
        return F.relu(Y+X)

In [12]:
#shape for x and y: (1, 3, 32, 32) (1,)
#for x, y in train_data:
#    print(x.shape, y.shape)
#    print(y.astype('float32'))
#    blk = nn.HybridSequential()
#!!!there will be error if here I put the channel to 16, because the x has 3 channel and can not be broad cast to 16
#    blk.add(Residual(16))
#    blk.initialize()
    #blk.hybridize()
#    y=blk(x)
# 1. I got "module 'mxnet.symbol' has no attribute 'Relu'" error -- should be relu
# 2. How the x (1, 3, 32, 32) can be broadcast to more channels such as (1,64,x,x)? - answer: In resnet18, the first
# layter is conv2d with 64 channel

In [13]:
def resnet_18(number_classes):
    net = nn.HybridSequential()
    
    # first layer, 64 channel with batch normal and relu activate. One strange thing is there should be maxpool, 
    #nn.MaxPool2D(pool_size=3, strides=2, padding=1), but no here
    net.add(nn.Conv2D(64, kernel_size=3, padding=1, strides=1), nn.BatchNorm(), nn.Activation('relu'))
    
    def resnet_block(num_channels, resnet_size, first_block=False):
        blk = nn.HybridSequential()
        for f in range(resnet_size):
            
            #there will be 4 residula block, each has 2 residual object. The first residual object will not half
            # the width and height, other block will half the w and h by strides 2.
            if f == 0 and not first_block:
                blk.add(Residual(num_channels, use_1x1conv=True, strides=2))
            else:
                blk.add(Residual(num_channels))
        return blk
    
    #add 4 residual net object, double the channel number and half the W and H, the first object should not do the 
    #half W and H before in the previous one there should be a maxpool with strides 2, however, in this project, there
    # is no max pool, so I think we should change the W and H, need to verfiy further?????? If no 1x1conv, the  Y+X,
    # X will be broadcast
    net.add(resnet_block(64, 2, first_block=True))
    net.add(resnet_block(128, 2))
    net.add(resnet_block(256, 2))
    net.add(resnet_block(512, 2))
    
    #last add last dense layer last
    net.add(nn.GlobalAvgPool2D(), nn.Dense(number_classes))
    
    # 1 conv + 4 * 4 + 1 last dense = 18, thats how resnet18 come from
    
    return net

In [14]:
#define and initialize the net
def get_net(ctx):
    num_classes = 10
    net = resnet_18(num_classes)
    # initialize the net with context and init from mxnet
    net.initialize(ctx=ctx, init = init.Xavier())
    return net

In [15]:
# get the predict result which equal to y, and then mean the value in one batch
def train_accuracy(y_hat, y):
    #print(y_hat.shape)
    #print("y.shape"+str(y.shape))
    # In the example, its the mean(), I am not sure why, should be sum()
    # I was wrong, should be mean(), the train_data len is the iterate times, for example, if 90 samples and 5 batch
    #size, the len(train_data) should be 18 instead of 90
    return (y_hat.argmax(axis=1)==y.astype('float32')).mean().asscalar()

In [16]:
def _get_batch(batch, ctx):
    features, labels = batch
    #print(type(features))
    # guarantee the labels and features have the same time
    if (features.dtype != labels.dtype):
        labels = labels.astype(features.dtype)
    # copy all the data into ctx
    #Splits an NDArray into len(ctx_list) slices along batch_axis and loads each slice to one context in ctx_list.
    # please note, the split_and_load will return list of ndarray, so to use the list, use zip
    return (gutils.split_and_load(features, ctx), gutils.split_and_load(labels,ctx), features.shape[0])

In [17]:
def evaluate_accuracy(data_iter, net, ctx=[mx.cpu()]):
    
    #data_iter type: mxnet.gluon.data.dataloader.DataLoader
    if isinstance(ctx, mx.Context):
        ctx=[ctx]
    acc = nd.array([0])
    n = 0
    for batch in data_iter:
        features, labels, _ = _get_batch(batch, ctx)
        
        #features and labels are list type
        #use zip to convert to ndarray
        # features type<class 'list'>
        #print("features type" + str(type(features)))
        #print(labels)
        
        #!!!!!!!!!features and labels are list of ndarray, need zip to iterate the value
        for X,y in zip(features, labels):
        # in the original example, its sum the result, however, in the train_acc, its mean(), why????? should be sum
        #even for the train, right?
        # Why need to copy the cpu? Maybe because the calculation on cpu will be faster?
            # the type for X and y is <class 'mxnet.ndarray.ndarray.NDArray'>
            acc += (y.astype('float32') == net(X).argmax(axis=1)).sum().copyto(mx.cpu())
            n += y.size
    return acc.asscalar()/n

In [18]:
def try_gpu():
    """If GPU is available, return mx.gpu(0); else return mx.cpu()."""
    try:
        ctx = mx.gpu()
        _ = nd.array([0], ctx=ctx)
    except mx.base.MXNetError:
        ctx = mx.cpu()
    return ctx

In [19]:
#define the loss function
loss=gloss.SoftmaxCrossEntropyLoss()

In [20]:
# define the train function
# lr_decay will change the lr, lr_period decide the period to change the lr
def train(net, train_data, valid_data, lr, wd, epochs, lr_period, lr_decay,ctx):
    
    # Need to check the details for sgd, momentum defined how many steps to use for the weigth change, wd weight decay
    # for the normalization for the loss
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate':lr, 'momentum':0.9, 'wd':wd})
    
    prev_time = datetime.datetime.now()
    print(prev_time)
    for epoch in range(epochs):
        #update the learning rate periodly
        if epoch > 0 and epoch%lr_period==0:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        
        train_l = 0
        train_acc = 0
        loop_number = 0
        
        for X, y in train_data:
            #loop_number += 1
            #print("loop number %d"%loop_number)
            # in one for loop, the batch_size examples will be processed
            y = y.astype('float32').as_in_context(ctx)
            with autograd.record():
                y_hat = net(X.as_in_context(ctx))
                #print(y_hat)
                # this loss is very interesting, the net will output 10 output, and then SoftmaxCrossEntropyLoss will
                #calculate the loss based on the y which like a index???
                l=loss(y_hat,y)
            l.backward()
            # apply the gredient based on the batch_size
            #https://beta.mxnet.io/api/gluon/_autogen/mxnet.gluon.Trainer.step.html
            trainer.step(batch_size)
            # in one epoch, all the loss will be sum together into the train_l, will be divided by train sample number
            
            #print(l)
            train_l += l.mean().asscalar()
            
            # sum the whole acc, will be divided by train sample number
            train_acc += train_accuracy(y_hat,y)
        #The divmod() method takes two numbers and returns a pair of numbers (a tuple) consisting of their 
        #quotient and remainder.
        current_time = datetime.datetime.now()
        h,reminder = divmod((current_time-prev_time).seconds, 3600)
        m,s = divmod(reminder,60)
        #time taken time: 0:1:37 if "time: %d:%d:%d
        #time taken time: 00:01:37 if "time %02d:02d:02d"
        time_s = "time: %02d:%02d:%02d" % (h,m,s)
        
        if epoch%1 == 0:
            if valid_data is not None:
                valid_acc = evaluate_accuracy(valid_data, net, ctx)
                print("epoch %d, time taken %s, train loss %f, train_acc %f, valid_cc %f, learning_rate %f" 
                      %(epoch, time_s, train_l/len(train_data), train_acc/len(train_data), valid_acc, trainer.learning_rate))
            else:
                print("epoch %d, time taken %s, train loss %f, train_acc %f, learning_rate %f" 
                      %(epoch, time_s, train_l/len(train_data),train_acc/len(train_data), trainer.learning_rate))
        prev_time = current_time

In [21]:
#generate the train data, with the loader, the label will be the 0,1,2 which will be used to mark the label
batch_size = 256
train_data = gdata.DataLoader(train_ds.transform_first(transform_train),batch_size, shuffle=True, last_batch='keep')
valid_data = gdata.DataLoader(valid_ds.transform_first(transform_train),batch_size, shuffle=True, last_batch='keep')
test_data = gdata.DataLoader(test_ds.transform_first(transform_test),batch_size, shuffle=True, last_batch='keep')

In [None]:
ctx, num_epochs, lr, wd = try_gpu(), 100, 0.01, 5e-4, 
lr_period, lr_decay, net = 80, 0.1, get_net(ctx)
net.hybridize()
train(net, train_data, valid_data,  lr, wd, num_epochs, lr_period,
      lr_decay, ctx)

2019-09-28 23:42:25.104644
epoch 0, time taken time: 00:00:02, train loss 2.811000, train_acc 0.100000, valid_cc 0.200000, learning_rate 0.010000
epoch 1, time taken time: 00:00:01, train loss 2.598645, train_acc 0.155556, valid_cc 0.100000, learning_rate 0.010000
epoch 2, time taken time: 00:00:00, train loss 2.265529, train_acc 0.144444, valid_cc 0.100000, learning_rate 0.010000
epoch 3, time taken time: 00:00:00, train loss 2.151677, train_acc 0.222222, valid_cc 0.100000, learning_rate 0.010000
epoch 4, time taken time: 00:00:00, train loss 2.038168, train_acc 0.288889, valid_cc 0.100000, learning_rate 0.010000
epoch 5, time taken time: 00:00:00, train loss 2.028507, train_acc 0.277778, valid_cc 0.100000, learning_rate 0.010000
epoch 6, time taken time: 00:00:00, train loss 1.979931, train_acc 0.255556, valid_cc 0.100000, learning_rate 0.010000
epoch 7, time taken time: 00:00:00, train loss 1.885706, train_acc 0.333333, valid_cc 0.100000, learning_rate 0.010000
epoch 8, time taken t

epoch 69, time taken time: 00:00:00, train loss 0.016444, train_acc 1.000000, valid_cc 0.200000, learning_rate 0.010000
epoch 70, time taken time: 00:00:00, train loss 0.035818, train_acc 0.988889, valid_cc 0.200000, learning_rate 0.010000
epoch 71, time taken time: 00:00:00, train loss 0.013187, train_acc 1.000000, valid_cc 0.200000, learning_rate 0.010000
epoch 72, time taken time: 00:00:00, train loss 0.012391, train_acc 1.000000, valid_cc 0.200000, learning_rate 0.010000
epoch 73, time taken time: 00:00:00, train loss 0.023558, train_acc 1.000000, valid_cc 0.200000, learning_rate 0.010000
epoch 74, time taken time: 00:00:00, train loss 0.014689, train_acc 1.000000, valid_cc 0.200000, learning_rate 0.010000
epoch 75, time taken time: 00:00:00, train loss 0.012212, train_acc 1.000000, valid_cc 0.200000, learning_rate 0.010000
epoch 76, time taken time: 00:00:00, train loss 0.019275, train_acc 1.000000, valid_cc 0.200000, learning_rate 0.010000
epoch 77, time taken time: 00:00:00, tra

In [None]:
#save the parameter
filename = os.path.join("../../data/", "resnet18.params")
net.save_parameters(filename)

In [None]:
#load the parameters and continue the training.
net2 = get_net(mx.gpu())
net2.load_parameters(filename, ctx=mx.gpu())

In [None]:
ctx, num_epochs, lr, wd = try_gpu(), 100, 0.01, 5e-4, 
lr_period, lr_decay = 80, 0.1
train(net2, train_data, valid_data,  lr, wd, num_epochs, lr_period,
      lr_decay, ctx)

# the continue training working, 

#net last output:

#epoch 14, time taken time: 0:1:36, train loss 0.272228, train_acc 0.905361, valid_cc 0.818800, learning_rate 0.010000
#epoch 15, time taken time: 0:1:36, train loss 0.259643, train_acc 0.908694, valid_cc 0.804000, learning_rate 0.010000
        
#net2 first ouptu:
#epoch 0, time taken time: 0:1:35, train loss 0.227526, train_acc 0.920060, valid_cc 0.839400, learning_rate 0.010000
#epoch 1, time taken time: 0:1:38, train loss 0.212250, train_acc 0.925165, valid_cc 0.845400, learning_rate 0.010000

In [None]:
# summary will not work for hybridized model, there will be error:
# |      The network must have been initialized, and must not have been hybridized.
#net2.summary()

In [None]:
#save the parameter
filename = os.path.join("../../data/", "resnet18_2019_09_01_10_14.params")
net2.save_parameters(filename)