**Focus on how to feed data into a training or inference program**

- Data Iterators in MXNet are similar to Python iterator objects.
    - Return a batch of data as `DataBatch` on each call to `next`
    - A `DataBatch` contains *n* (`batch_size`) training examples and their corresponding labels
    - Information such as the name, shape, type, and layout of each training example and its corresponding label is exposed as `DataDesc` data descriptor objects through the `provide_data` and `provide_label` properties in `DataBatch`
- All IO in MXNet is handled via `mx.io.DataIter` and its subclasses

In [1]:
import os
import numpy as np
import mxnet as mx

**Reading data in memory**
- When data is stored in memory, backed by either an `NDArray` or a numpy `ndarray`, use `NDArrayIter` to read it

In [2]:
# NDArrayIter: iterate over in-memory arrays in fixed-size batches.
data = np.random.rand(6, 3)  # 6 examples x 3 features, uniform in [0, 1)
label = np.random.randint(0, 2, (6,))  # one label per example, drawn from {0, 1}
data_iter = mx.io.NDArrayIter(data=data, label=label, batch_size=3)
# 6 examples with batch_size=3 -> the loop body runs twice.
for batch in data_iter:
    # Print the data, label and pad fields of each batch.
    print([batch.data, batch.label, batch.pad])

[[
[[ 0.29455951  0.42215776  0.17254411]
 [ 0.11621431  0.02364128  0.62608337]
 [ 0.47626403  0.93825299  0.45832431]]
<NDArray 3x3 @cpu(0)>], [
[ 1.  0.  1.]
<NDArray 3 @cpu(0)>], 0]
[[
[[ 0.48885286  0.49731106  0.45700079]
 [ 0.79351342  0.70532483  0.95223075]
 [ 0.70784718  0.9611398   0.44309041]]
<NDArray 3x3 @cpu(0)>], [
[ 0.  1.  1.]
<NDArray 3 @cpu(0)>], 0]


**Reading data from CSV files**
- `CSVIter`

In [3]:
# Save data into csv file
# (`data` is the 6x3 numpy array created in the NDArrayIter cell.)
np.savetxt('data.csv', data, delimiter=',')

# data_shape=(3,) matches the three comma-separated values written per row.
data_iter = mx.io.CSVIter(data_csv='data.csv', data_shape=(3,), batch_size=3)
for batch in data_iter:
    # No label file is passed to CSVIter here, so only data and pad are printed.
    print([batch.data, batch.pad])

[[
[[ 0.29455951  0.42215776  0.17254412]
 [ 0.11621431  0.02364128  0.62608337]
 [ 0.476264    0.93825305  0.45832434]]
<NDArray 3x3 @cpu(0)>], 0]
[[
[[ 0.48885289  0.49731103  0.45700079]
 [ 0.79351342  0.70532483  0.95223075]
 [ 0.70784718  0.9611398   0.44309038]]
<NDArray 3x3 @cpu(0)>], 0]


**Custom Iterator**

In [4]:
class SimpleIter(mx.io.DataIter):
    """Iterator that synthesizes a fixed number of batches from callables.

    Each call to ``next`` invokes every data/label generator with the shape
    paired with it and wraps the results in a ``mx.io.DataBatch``.

    Parameters
    ----------
    data_names : list of str
        Names of the data inputs.
    data_shapes : list of tuple
        Shape of each data input, paired positionally with ``data_names``.
    data_gen : list of callable
        One callable per data input; called as ``gen(shape)`` and expected to
        return something ``mx.nd.array`` accepts.
    label_names : list of str
        Names of the label inputs.
    label_shapes : list of tuple
        Shape of each label input, paired positionally with ``label_names``.
    label_gen : list of callable
        One callable per label input; called as ``gen(shape)``.
    num_batches : int, optional
        Number of batches produced before ``StopIteration`` is raised.
    """

    def __init__(self, data_names, data_shapes, data_gen, label_names,
                 label_shapes, label_gen, num_batches=10):
        super(SimpleIter, self).__init__()
        # Materialize the (name, shape) pairs: on Python 3 a bare zip() is a
        # one-shot iterator and would be empty after the first read.
        self._provide_data = list(zip(data_names, data_shapes))
        self._provide_label = list(zip(label_names, label_shapes))
        self.num_batches = num_batches
        self.data_gen = data_gen
        self.label_gen = label_gen
        self.cur_batch = 0  # Number of batches handed out since the last reset

    def __iter__(self):
        return self

    def reset(self):
        """Rewind the iterator so that it yields ``num_batches`` batches again."""
        self.cur_batch = 0

    def __next__(self):
        # Python 3 iterator protocol; delegates to next().
        return self.next()

    @property
    def provide_data(self):
        """List of ``(name, shape)`` pairs describing the data inputs."""
        return self._provide_data

    @property
    def provide_label(self):
        """List of ``(name, shape)`` pairs describing the label inputs."""
        return self._provide_label

    def next(self):
        """Return the next generated ``mx.io.DataBatch``.

        Raises
        ------
        StopIteration
            Once ``num_batches`` batches have been returned since the last
            ``reset``.
        """
        if self.cur_batch >= self.num_batches:
            raise StopIteration
        self.cur_batch += 1
        # One NDArray per generator: desc is a (name, shape) pair, so desc[1]
        # is the shape handed to the generator.
        data = [mx.nd.array(gen(desc[1]))
                for desc, gen in zip(self._provide_data, self.data_gen)]
        label = [mx.nd.array(gen(desc[1]))
                 for desc, gen in zip(self._provide_label, self.label_gen)]
        return mx.io.DataBatch(data, label)

In [5]:
# Train a simple MLP
num_classes = 10

# Define network: data -> fc1 (64 units) -> relu -> fc2 (num_classes) -> softmax
inputs = mx.sym.Variable('data')
hidden = mx.sym.FullyConnected(data=inputs, name='fc1', num_hidden=64)
hidden = mx.sym.Activation(data=hidden, name='relu1', act_type='relu')
logits = mx.sym.FullyConnected(data=hidden, name='fc2', num_hidden=num_classes)
net = mx.sym.SoftmaxOutput(data=logits, name='softmax')

# Inspect the names the symbol exposes.
print('Net arguments: ', net.list_arguments())
print('Net inputs: ', net.list_inputs())
print('Net outputs: ', net.list_outputs())

('Net arguments: ', ['data', 'fc1_weight', 'fc1_bias', 'fc2_weight', 'fc2_bias', 'softmax_label'])
('Net inputs: ', ['data', 'fc1_weight', 'fc1_bias', 'fc2_weight', 'fc2_bias', 'softmax_label'])
('Net outputs: ', ['softmax_output'])


- 4 variables that are learnable parameters
    - fc1_weight
    - fc1_bias
    - fc2_weight
    - fc2_bias
    
- 2 variables are inputs that must be supplied with data (called `free variables` in MXNet's Symbol API)
    - data
    - softmax_label
    
- To execute a Symbol, its data variables need to be bound with data.

In [7]:
import logging
logging.basicConfig(level=logging.INFO)

n = 32  # Number of examples per generated batch
data_iter = SimpleIter(data_names=['data'], data_shapes=[(n, 100)],
                       data_gen=[lambda s: np.random.uniform(-1, 1, s)],
                       label_names=['softmax_label'], label_shapes=[(n,)],
                       # Softmax labels are class indices, so draw integers in
                       # [0, num_classes) rather than continuous floats.
                       label_gen=[lambda s: np.random.randint(0, num_classes, s)])

mod = mx.mod.Module(symbol=net)
# Training call is left disabled, as in the original notebook.
# mod.fit(data_iter, num_epoch=5)

# Scratch

In [56]:
# Test: drop every UID whose rows all have empty content.
import pandas as pd

# (UID, content) pairs expanded into one record per row.
l = [
    {'UID': uid, 'content': content}
    for uid, content in [
        ('A', 'Apple'),
        ('A', ''),
        ('E', ''),
        ('B', 'Boy'),
        ('B', ''),
        ('C', 'Cat'),
        ('C', 'Cow'),
        ('D', ''),
        ('E', ''),
    ]
]

df = pd.DataFrame(l)
df['len'] = df['content'].str.len()

# A UID whose longest content has length 0 has no non-empty row at all.
longest_per_uid = df.groupby('UID')['len'].max()
blanks = longest_per_uid[longest_per_uid == 0]

keep_row = ~df['UID'].isin(blanks.index)
df = df[keep_row]
df

Unnamed: 0,UID,content,len
0,A,Apple,5
1,A,,0
3,B,Boy,3
4,B,,0
5,C,Cat,3
6,C,Cow,3
