In [1]:
import numpy as np
import pandas as pd

In [2]:
# Fixed seed so the toy data set is the same on every fresh run.
np.random.seed(0)
# 30 rows x 3 feature columns holding 0..89, so every value identifies its row.
X = np.arange(90).reshape(30, 3)
# One label in 1..7 per row. randint's upper bound is exclusive, hence 8; this
# replaces the deprecated np.random.random_integers(1, 7, 30).
# NOTE(review): random_integers is documented as a wrapper around randint, so
# the drawn labels should be unchanged for this seed -- confirm on re-run.
labels = np.random.randint(1, 8, 30)

In [3]:
# Names of the feature columns and of the label column(s) used as targets.
vgg_cols = ['f1', 'f2', 'f3']
categories = ['business_id']

# Place the features and the label side by side (both frames carry the default
# integer index, so rows line up), then order the rows by label.
data_train = pd.concat(
    [pd.DataFrame(X, columns=vgg_cols),
     pd.DataFrame(labels, columns=['business_id'])],
    axis=1
).sort_values('business_id')
data_train

Unnamed: 0,f1,f2,f3,business_id
19,57,58,59,1
2,6,7,8,1
28,84,85,86,1
13,39,40,41,1
12,36,37,38,1
24,72,73,74,1
22,66,67,68,2
16,48,49,50,2
6,18,19,20,2
20,60,61,62,2


In [4]:
# Distinct business ids present in the training frame; each one is a "case".
train_cases = data_train['business_id'].unique()
train_cases

array([1, 2, 3, 4, 5, 6, 7])

In [30]:
# Validation reuses the training frame and case ids. These are aliases of the
# same objects, not copies.
data_val, val_cases = data_train, train_cases

In [36]:
def train_generator_chunked(n_batches, maxlen, dimin, dimout,
                            data=None, cases=None,
                            feature_cols=None, label_cols=None):
  """Yield ``(X, Y)`` training batches forever, reshuffling the cases each pass.

  Every case (a distinct ``business_id`` value) becomes one sequence of exactly
  ``maxlen`` rows:

  * a case with fewer than ``maxlen`` rows has its (shuffled) rows copied in
    and the remaining positions left at the padding value ``-1``;
  * otherwise its shuffled rows are split into ``maxlen`` chunks with
    ``np.array_split`` and each chunk is replaced by its column-wise mean.

  Args:
    n_batches: number of batches the shuffled case ids are split into per pass.
    maxlen: number of rows in every output sequence.
    dimin: number of feature values per row (must match ``feature_cols``).
    dimout: number of label values per case (must match ``label_cols``).
    data: frame with a ``business_id`` column plus the feature and label
      columns. Defaults to the notebook-level ``data_train``.
    cases: array of case ids to iterate over. Defaults to ``train_cases``.
    feature_cols: feature column names. Defaults to ``vgg_cols``.
    label_cols: label column names. Defaults to ``categories``.

  Yields:
    ``(X, Y)`` where ``X`` is a float array of shape
    ``(len(batch), maxlen, dimin)`` and ``Y`` is a float array of shape
    ``(len(batch), dimout)`` holding the label columns of each case's first row.

  Side effects:
    Prints the list of batches at the start of every pass and draws from the
    global ``np.random`` state.
  """
  # Fall back to the notebook globals so existing four-argument calls behave
  # exactly as before.
  if data is None:
    data = data_train
  if cases is None:
    cases = train_cases
  if feature_cols is None:
    feature_cols = vgg_cols
  if label_cols is None:
    label_cols = categories

  while True:
    # Shuffle a copy so the caller's array keeps its order.
    shuffled = np.array(cases)
    np.random.shuffle(shuffled)

    batches = np.array_split(shuffled, n_batches)
    print(batches)

    for batch in batches:
      batch_x = np.full((len(batch), maxlen, dimin), -1.0)  # -1 marks padding
      batch_y = np.zeros((len(batch), dimout))

      for idx, case in enumerate(batch):
        res = data[data['business_id'] == case]

        # The label is read from the case's first row.
        batch_y[idx] = res[label_cols].iloc[0].values

        # Shuffle the rows so the chunk means differ from pass to pass.
        res = res.sample(len(res))
        rows = np.array(res[feature_cols])

        if len(rows) < maxlen:
          # Short case: copy what there is, the tail stays at -1.
          batch_x[idx, :len(rows)] = rows
        else:
          # Long case: compress to maxlen rows by averaging each chunk.
          for pos, chunk in enumerate(np.array_split(rows, maxlen)):
            batch_x[idx, pos] = chunk.mean(axis=0)

      yield (batch_x, batch_y)


def val_generator_chunked(n_batches, maxlen, dimin, dimout,
                          data=None, cases=None,
                          feature_cols=None, label_cols=None):
  """Yield ``(X, Y)`` validation batches forever, in a fixed order.

  The cases and their rows are not shuffled, so every pass yields the same
  batches. Every case (a distinct ``business_id`` value) becomes one sequence
  of exactly ``maxlen`` rows:

  * a case with fewer than ``maxlen`` rows has its rows copied in and the
    remaining positions left at the padding value ``-1``;
  * otherwise its rows are split into ``maxlen`` chunks with
    ``np.array_split`` and each chunk is replaced by its column-wise mean.

  Args:
    n_batches: number of batches the case ids are split into per pass.
    maxlen: number of rows in every output sequence.
    dimin: number of feature values per row (must match ``feature_cols``).
    dimout: number of label values per case (must match ``label_cols``).
    data: frame with a ``business_id`` column plus the feature and label
      columns. Defaults to the notebook-level ``data_val``.
    cases: array of case ids to iterate over. Defaults to ``val_cases``.
    feature_cols: feature column names. Defaults to ``vgg_cols``.
    label_cols: label column names. Defaults to ``categories``.

  Yields:
    ``(X, Y)`` where ``X`` is a float array of shape
    ``(len(batch), maxlen, dimin)`` and ``Y`` is a float array of shape
    ``(len(batch), dimout)`` holding the label columns of each case's first row.

  Side effects:
    Prints the list of batches at the start of every pass.
  """
  # Fall back to the notebook globals so existing four-argument calls behave
  # exactly as before.
  if data is None:
    data = data_val
  if cases is None:
    cases = val_cases
  if feature_cols is None:
    feature_cols = vgg_cols
  if label_cols is None:
    label_cols = categories

  while True:
    batches = np.array_split(cases, n_batches)
    print(batches)

    for batch in batches:
      batch_x = np.full((len(batch), maxlen, dimin), -1.0)  # -1 marks padding
      batch_y = np.zeros((len(batch), dimout))

      for idx, case in enumerate(batch):
        res = data[data['business_id'] == case]

        # The label is read from the case's first row.
        batch_y[idx] = res[label_cols].iloc[0].values

        rows = np.array(res[feature_cols])

        if len(rows) < maxlen:
          # Short case: copy what there is, the tail stays at -1.
          batch_x[idx, :len(rows)] = rows
        else:
          # Long case: compress to maxlen rows by averaging each chunk.
          for pos, chunk in enumerate(np.array_split(rows, maxlen)):
            batch_x[idx, pos] = chunk.mean(axis=0)

      yield (batch_x, batch_y)

In [37]:
# Generator settings, in order: rows per output sequence, feature values per
# row, label values per case.
maxlen, dimin, dimout = 4, 3, 1

In [38]:
# Validation generator that splits the case ids into 2 batches per pass.
t = val_generator_chunked(n_batches=2, maxlen=maxlen, dimin=dimin, dimout=dimout)

In [45]:
# Pull the next (X, Y) batch; every execution of this cell advances the
# generator by one batch, so re-running it changes what `i` holds.
i = next(t)

In [49]:
# Source rows of case 7, for comparison with the generated batch.
data_train.loc[data_train['business_id'] == 7]

Unnamed: 0,f1,f2,f3,business_id
17,51,52,53,7
11,33,34,35,7
18,54,55,56,7


In [47]:
# X half of the (X, Y) batch tuple: one block of maxlen rows per case.
i[0]

array([[[ 78.,  79.,  80.],
        [  0.,   1.,   2.],
        [ 30.,  31.,  32.],
        [ 42.,  43.,  44.]],

       [[ 63.,  64.,  65.],
        [ 69.,  70.,  71.],
        [ 24.,  25.,  26.],
        [  3.,   4.,   5.]],

       [[ 51.,  52.,  53.],
        [ 33.,  34.,  35.],
        [ 54.,  55.,  56.],
        [ -1.,  -1.,  -1.]]])

In [24]:
# X half of the (X, Y) batch tuple.
# NOTE(review): this cell's execution count (24) is lower than that of the
# cells above, so the output shown here presumably comes from an earlier run
# -- confirm by restarting the kernel and running all cells.
i[0]

array([[[ 69. ,  70. ,  71. ],
        [ 24. ,  25. ,  26. ],
        [ 63. ,  64. ,  65. ],
        [  3. ,   4. ,   5. ]],

       [[ 39. ,  40. ,  41. ],
        [ 75. ,  76. ,  77. ],
        [ 66. ,  67. ,  68. ],
        [ 48. ,  49. ,  50. ]],

       [[ 46.5,  47.5,  48.5],
        [ 78. ,  79. ,  80. ],
        [ 39. ,  40. ,  41. ],
        [  6. ,   7. ,   8. ]]])