In [41]:
import numpy as np
import pandas as pd
import io
import bson
import matplotlib.pyplot as plt
from skimage.data import imread
import multiprocessing as mp
import pickle

In [42]:
NCORE = 8
all_categories = mp.Manager().list()

all_categories_array = np.array([])

#categories to int dictionary
categ_to_int = {}
int_to_categ = {}

#total number of items in the list
n_train = 7069896 #from kaggle page
n_test = 1768182 #from kaggle page
n_example = 100 #from kaggle page

all_categories_filename_format = 'allcategoriesdata_{0}.p'
train_data_batch_file_format = 'training_batches/{0}/train_{0}_{1}_{2}.jpeg'
test_data_batch_file_format = 'testing_batches/{0}/test_{0}_{1}_{2}.jpeg'

train_category_folder_path_format = 'training_batches/{0}'
test_category_folder_path_format = 'testing_batches/{0}'
test_category_folder_name_format = 'folder_{0}'

show_every = 10000

In [43]:
import time

In [44]:
import os.path

def process_all_categories(filepath):
    """
    processes all categories and forms the list
    : filepath: file path
    """
    process_filename = filepath[filepath.rfind('/')+1:]
    filename_suffix = process_filename.replace('.bson','')
    categories_filename = all_categories_filename_format.format(filename_suffix)
    if os.path.isfile(categories_filename):
        print('File already exists. Seems already it is processed.')
        return

    def process_record_multicore_category(queue, iolock):
        while True:
            record = queue.get()
            if record is None:
                break
            
            all_categories.append(record['category_id'])
    
    queue = mp.Queue(maxsize=NCORE)
    iolock = mp.Lock()
    pool = mp.Pool(NCORE, initializer=process_record_multicore_category, initargs=(queue, iolock))
    
    #loading data from file
    data = bson.decode_file_iter(open(filepath, 'rb'))
    
    print('Starting to go through the file. Time: {0}'.format(time.ctime()))
    for c, record in enumerate(data):
        queue.put(record)
        if c % 100000 ==0:
            print ('records processed: {0}, time: {1}'.format(c, time.ctime()))
    
    # tell workers we're done and join the stuff
    for _ in range(NCORE):
        queue.put(None)
    pool.close()
    pool.join()
    print('File is processed. Time: {0}'.format(time.ctime()))
    
    all_categories_array = np.array(list(set(all_categories)))

    #process the categories and save them
    process_all_categories_array(all_categories_array, categories_filename)
    print('all categories processed.')

In [58]:
#data record preprocess sub-function 
def process_record_train(record):
    """
    processes each record from the training / test file during preprocessing function execution for training dataset
    : record: record to be processed
    : return: void
    """ 
    product_id = record['_id']
    category_id = record['category_id']
    for e, pic in enumerate(record['imgs']):
        picture = pic['picture']
        filepath = train_data_batch_file_format.format(category_id,product_id, e)
        if os.path.isfile(filepath):
            continue
        with open(filepath, 'wb') as w:
            w.write(picture)

In [60]:
#data record preprocess sub-function for test data set
def process_record_test(record, folder_id):
    """
    processes each record from the training / test file during preprocessing function execution for test data set
    : record: record to be processed
    : return: void
    """
    product_id = record['_id']  
    for e, pic in enumerate(record['imgs']):
        picture = pic['picture']
        filepath = test_data_batch_file_format.format(folder_id,product_id, e)
        if os.path.isfile(filepath):
            continue
        with open(filepath, 'wb') as w:
            w.write(picture)

In [56]:
#data preprocess function 
def process_training_file(data, enum_start=None, limit = None, file_suffix=''):
    """
    processes the training file and saves them to batch files for loading them later
    : filepath: path of the training file
    : return: void
    """
    #create all folders for categories
    for categ, i in categ_to_int.items():
        directory = train_category_folder_path_format.format(categ)
        if not os.path.exists(directory):
            os.makedirs(directory)
    
    #loading data from file
    print('Starting to go through the Set. Time: {0}'.format(time.ctime()))
    
    init =  0 if enum_start == None else enum_start
    for c, record in enumerate(data, start=init):
        if(c % show_every ==0):
            print('processed records: {0}'.format(c))
        if(c > limit):
            break
        process_record_train(record)
        
    print('File is processed. Time: {0}'.format(time.ctime()))
    print('Preprocessing is done and saved. Time: {0}'.format(time.ctime()))

In [48]:
#test data preprocess function 
def process_test_file(data, enum_start=None, limit=None, file_suffix=''):
    """
    processes test file and saves the output to batch files for loading them later
    : filepath: path of the test file on disk
    : return: void
    """
    #create folder for each 10000 images
    folder_name = test_category_folder_name_format.format(file_suffix)
    directory = test_category_folder_path_format.format(folder_name)
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    #loading data from file
    print('TestFile: Starting to go through the Set. Time: {0}'.format(time.ctime()))
    
    init =  0 if enum_start == None else enum_start
    
    for c, record in enumerate(data, start=init):
        if(c % show_every==0):
            print('processed records: {0}'.format(c))
        if(c > limit):
            break
        process_record_test(record, folder_name)
    
    print('TestFile: File is processed. Time: {0}'.format(time.ctime()))
    print('Preprocessing is done and saved. Time: {0}'.format(time.ctime()))

In [49]:
def process_all_categories_array(all_categories_array, processed_filename):
    """
    processes all categories found in training data and creates dictionaries for faster reference
    : all_categories_array: array that contains all categories to form one hot encoding
    : return: void
    """
    global categ_to_int, int_to_categ
    categories_length = len(all_categories_array)
    categ_to_int = { categ:idx for idx, categ in enumerate(all_categories_array) }
    int_to_categ = { idx:categ for idx, categ in enumerate(all_categories_array) }
    
    pickle.dump((categ_to_int, int_to_categ), open(processed_filename, 'wb'))

In [50]:
def load_categ_to_int_dicts(data_file_path):
    """
    restores categ_to_int and int_to_categ object dictionaries from saved state files if exist
    : data_file_path: actual data file path - to represent the mode (train or train example)
    """
    process_filename = data_file_path[data_file_path.rfind('/')+1:]
    filename_suffix = process_filename.replace('.bson','')
    categories_filename = all_categories_filename_format.format(filename_suffix)
    
    with open(categories_filename, 'rb') as f:
        
        global categ_to_int, int_to_categ
        
        categ_to_int, int_to_categ = pickle.load(f)

In [51]:
def create_one_hot_label(original_label, label_length, one_hot_labels):
    """
    creates one hot label for a given original label value. A sub function for multi core processing of one hot encode function
    : label_length: length of label to initialize the array
    : one_hot_labels: the array that contains all one hot label
    : return: void
    """
    one_hot_label = np.zeros(label_length, dtype='int16')
    
    #commenting below line since now conversion to index happens while preparing the matrix.
    #one_hot_label[categ_to_int[original_label]] = 1
    #so changing it to
    one_hot_label[original_label] = 1
    
    one_hot_labels.append(one_hot_label)

def one_hot_encode(data_batch):
    """
    creates one hot encoded label for the given data batch using multi-core processing
    : data_batch: the sub-section of original final training data
    : return: array of one hot encoded label
    """
    one_hot_labels = list()
    label_length = len(categ_to_int)
    
    #print(data_batch)
    
    for i in range(len(data_batch)):
        original_label = data_batch[i][1] # category column
        create_one_hot_label(original_label, label_length, one_hot_labels)

    one_hot_labels = np.array(list(one_hot_labels))
    
    return one_hot_labels

In [14]:
#process_all_categories('data/train_example.bson')

File already exists. Seems already it is processed.


In [37]:
#test with training example file

#process_training_file('data/train_example.bson')

Multicore processing Queue, Lock, and Pool have been initialized and set up.
The data file has been loaded.
Starting to go through the file. Time: Tue Oct 10 19:52:25 2017
File is processed. Time: Tue Oct 10 19:52:25 2017
Preprocessing is done and saved. Time: Tue Oct 10 19:52:25 2017


In [None]:
#final_data_array[:,1]

In [None]:
#process_training_file('data/train.bson')

In [85]:
#load_categ_to_int_dicts('data/train_example.bson')

In [88]:
process_all_categories('data/train.bson')

Starting to go through the file. Time: Tue Oct 10 20:47:24 2017
records processed: 0, time: Tue Oct 10 20:47:24 2017
records processed: 100000, time: Tue Oct 10 20:47:30 2017
records processed: 200000, time: Tue Oct 10 20:47:37 2017
records processed: 300000, time: Tue Oct 10 20:47:44 2017
records processed: 400000, time: Tue Oct 10 20:47:50 2017
records processed: 500000, time: Tue Oct 10 20:47:57 2017
records processed: 600000, time: Tue Oct 10 20:48:03 2017
records processed: 700000, time: Tue Oct 10 20:48:10 2017
records processed: 800000, time: Tue Oct 10 20:48:16 2017
records processed: 900000, time: Tue Oct 10 20:48:23 2017
records processed: 1000000, time: Tue Oct 10 20:48:29 2017
records processed: 1100000, time: Tue Oct 10 20:48:36 2017
records processed: 1200000, time: Tue Oct 10 20:48:43 2017
records processed: 1300000, time: Tue Oct 10 20:48:49 2017
records processed: 1400000, time: Tue Oct 10 20:48:56 2017
records processed: 1500000, time: Tue Oct 10 20:49:02 2017
records

In [52]:
#Load dictionaries - categ_to_int and int_to_categ from files to objects
load_categ_to_int_dicts('data/train.bson')

In [32]:
len(categ_to_int)

5270

In [53]:
def preprocess_test_batches(filepath, override_batch=None):
    """
    preprocesses batches and saves them in batches to end up losing data due to long running processes
    : filepath: path of file to be processed
    """
    input_data = bson.decode_file_iter(open(filepath, 'rb'))
    
    limit = 10000
    batches_count = int(n_test / limit)
    batch_range = batches_count if override_batch is None else override_batch
    for batch_idx in range(batch_range):
        print('starting with batch: {0}'.format(batch_idx))
        process_test_file(input_data, enum_start=batch_idx*limit, limit=(batch_idx+1)*limit, file_suffix=batch_idx)
        
    print('all test files are preprocessed. cool!')

In [54]:
def preprocess_training_batches(filepath, override_batch=None):
    """
    preprocesses batches and saves them in batches to end up losing data due to long running processes
    : filepath: path of file to be processed
    """
    input_data = bson.decode_file_iter(open(filepath, 'rb'))
    
    limit = 10000
    batches_count = int(n_train / limit)
    batch_range = batches_count if override_batch is None else override_batch
    for batch_idx in range(batch_range):
        print('starting with batch: {0}'.format(batch_idx))
        process_training_file(input_data, enum_start=batch_idx*limit, limit=(batch_idx+1)*limit, file_suffix=batch_idx)

    print('all training files are preprocessed. cool!')

In [61]:
preprocess_training_batches('data/train.bson')

starting with batch: 0
Starting to go through the Set. Time: Fri Oct 13 22:26:28 2017
processed records: 0
processed records: 10000
File is processed. Time: Fri Oct 13 22:26:28 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:26:28 2017
starting with batch: 1
Starting to go through the Set. Time: Fri Oct 13 22:26:28 2017
processed records: 10000
processed records: 20000
File is processed. Time: Fri Oct 13 22:26:29 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:26:29 2017
starting with batch: 2
Starting to go through the Set. Time: Fri Oct 13 22:26:29 2017
processed records: 20000
processed records: 30000
File is processed. Time: Fri Oct 13 22:26:30 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:26:30 2017
starting with batch: 3
Starting to go through the Set. Time: Fri Oct 13 22:26:30 2017
processed records: 30000
processed records: 40000
File is processed. Time: Fri Oct 13 22:26:31 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:26:31 2017
star

processed records: 340000
File is processed. Time: Fri Oct 13 22:27:30 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:27:30 2017
starting with batch: 34
Starting to go through the Set. Time: Fri Oct 13 22:27:30 2017
processed records: 340000
processed records: 350000
File is processed. Time: Fri Oct 13 22:27:37 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:27:37 2017
starting with batch: 35
Starting to go through the Set. Time: Fri Oct 13 22:27:37 2017
processed records: 350000
processed records: 360000
File is processed. Time: Fri Oct 13 22:27:43 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:27:43 2017
starting with batch: 36
Starting to go through the Set. Time: Fri Oct 13 22:27:43 2017
processed records: 360000
processed records: 370000
File is processed. Time: Fri Oct 13 22:27:49 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:27:49 2017
starting with batch: 37
Starting to go through the Set. Time: Fri Oct 13 22:27:49 2017
processed reco

processed records: 670000
File is processed. Time: Fri Oct 13 22:30:10 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:30:10 2017
starting with batch: 67
Starting to go through the Set. Time: Fri Oct 13 22:30:10 2017
processed records: 670000
processed records: 680000
File is processed. Time: Fri Oct 13 22:30:17 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:30:17 2017
starting with batch: 68
Starting to go through the Set. Time: Fri Oct 13 22:30:17 2017
processed records: 680000
processed records: 690000
File is processed. Time: Fri Oct 13 22:30:22 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:30:22 2017
starting with batch: 69
Starting to go through the Set. Time: Fri Oct 13 22:30:22 2017
processed records: 690000
processed records: 700000
File is processed. Time: Fri Oct 13 22:30:26 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:30:26 2017
starting with batch: 70
Starting to go through the Set. Time: Fri Oct 13 22:30:26 2017
processed reco

processed records: 1000000
File is processed. Time: Fri Oct 13 22:32:57 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:32:57 2017
starting with batch: 100
Starting to go through the Set. Time: Fri Oct 13 22:32:57 2017
processed records: 1000000
processed records: 1010000
File is processed. Time: Fri Oct 13 22:33:03 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:33:03 2017
starting with batch: 101
Starting to go through the Set. Time: Fri Oct 13 22:33:03 2017
processed records: 1010000
processed records: 1020000
File is processed. Time: Fri Oct 13 22:33:07 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:33:07 2017
starting with batch: 102
Starting to go through the Set. Time: Fri Oct 13 22:33:07 2017
processed records: 1020000
processed records: 1030000
File is processed. Time: Fri Oct 13 22:33:13 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:33:13 2017
starting with batch: 103
Starting to go through the Set. Time: Fri Oct 13 22:33:13 2017
pro

processed records: 1330000
File is processed. Time: Fri Oct 13 22:35:39 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:35:39 2017
starting with batch: 133
Starting to go through the Set. Time: Fri Oct 13 22:35:39 2017
processed records: 1330000
processed records: 1340000
File is processed. Time: Fri Oct 13 22:35:45 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:35:45 2017
starting with batch: 134
Starting to go through the Set. Time: Fri Oct 13 22:35:45 2017
processed records: 1340000
processed records: 1350000
File is processed. Time: Fri Oct 13 22:35:49 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:35:49 2017
starting with batch: 135
Starting to go through the Set. Time: Fri Oct 13 22:35:49 2017
processed records: 1350000
processed records: 1360000
File is processed. Time: Fri Oct 13 22:35:54 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:35:54 2017
starting with batch: 136
Starting to go through the Set. Time: Fri Oct 13 22:35:54 2017
pro

processed records: 1660000
File is processed. Time: Fri Oct 13 22:38:20 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:38:20 2017
starting with batch: 166
Starting to go through the Set. Time: Fri Oct 13 22:38:20 2017
processed records: 1660000
processed records: 1670000
File is processed. Time: Fri Oct 13 22:38:25 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:38:25 2017
starting with batch: 167
Starting to go through the Set. Time: Fri Oct 13 22:38:25 2017
processed records: 1670000
processed records: 1680000
File is processed. Time: Fri Oct 13 22:38:31 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:38:31 2017
starting with batch: 168
Starting to go through the Set. Time: Fri Oct 13 22:38:31 2017
processed records: 1680000
processed records: 1690000
File is processed. Time: Fri Oct 13 22:38:36 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:38:36 2017
starting with batch: 169
Starting to go through the Set. Time: Fri Oct 13 22:38:36 2017
pro

processed records: 1990000
File is processed. Time: Fri Oct 13 22:41:13 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:41:13 2017
starting with batch: 199
Starting to go through the Set. Time: Fri Oct 13 22:41:13 2017
processed records: 1990000
processed records: 2000000
File is processed. Time: Fri Oct 13 22:41:19 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:41:19 2017
starting with batch: 200
Starting to go through the Set. Time: Fri Oct 13 22:41:19 2017
processed records: 2000000
processed records: 2010000
File is processed. Time: Fri Oct 13 22:41:25 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:41:25 2017
starting with batch: 201
Starting to go through the Set. Time: Fri Oct 13 22:41:25 2017
processed records: 2010000
processed records: 2020000
File is processed. Time: Fri Oct 13 22:41:31 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:41:31 2017
starting with batch: 202
Starting to go through the Set. Time: Fri Oct 13 22:41:31 2017
pro

processed records: 2320000
File is processed. Time: Fri Oct 13 22:44:25 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:44:25 2017
starting with batch: 232
Starting to go through the Set. Time: Fri Oct 13 22:44:25 2017
processed records: 2320000
processed records: 2330000
File is processed. Time: Fri Oct 13 22:44:32 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:44:32 2017
starting with batch: 233
Starting to go through the Set. Time: Fri Oct 13 22:44:32 2017
processed records: 2330000
processed records: 2340000
File is processed. Time: Fri Oct 13 22:44:37 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:44:37 2017
starting with batch: 234
Starting to go through the Set. Time: Fri Oct 13 22:44:37 2017
processed records: 2340000
processed records: 2350000
File is processed. Time: Fri Oct 13 22:44:43 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:44:43 2017
starting with batch: 235
Starting to go through the Set. Time: Fri Oct 13 22:44:43 2017
pro

processed records: 2650000
File is processed. Time: Fri Oct 13 22:47:42 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:47:42 2017
starting with batch: 265
Starting to go through the Set. Time: Fri Oct 13 22:47:42 2017
processed records: 2650000
processed records: 2660000
File is processed. Time: Fri Oct 13 22:47:48 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:47:48 2017
starting with batch: 266
Starting to go through the Set. Time: Fri Oct 13 22:47:48 2017
processed records: 2660000
processed records: 2670000
File is processed. Time: Fri Oct 13 22:47:55 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:47:55 2017
starting with batch: 267
Starting to go through the Set. Time: Fri Oct 13 22:47:55 2017
processed records: 2670000
processed records: 2680000
File is processed. Time: Fri Oct 13 22:48:00 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:48:00 2017
starting with batch: 268
Starting to go through the Set. Time: Fri Oct 13 22:48:00 2017
pro

processed records: 2980000
File is processed. Time: Fri Oct 13 22:51:02 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:51:02 2017
starting with batch: 298
Starting to go through the Set. Time: Fri Oct 13 22:51:02 2017
processed records: 2980000
processed records: 2990000
File is processed. Time: Fri Oct 13 22:51:07 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:51:07 2017
starting with batch: 299
Starting to go through the Set. Time: Fri Oct 13 22:51:07 2017
processed records: 2990000
processed records: 3000000
File is processed. Time: Fri Oct 13 22:51:13 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:51:13 2017
starting with batch: 300
Starting to go through the Set. Time: Fri Oct 13 22:51:13 2017
processed records: 3000000
processed records: 3010000
File is processed. Time: Fri Oct 13 22:51:20 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:51:20 2017
starting with batch: 301
Starting to go through the Set. Time: Fri Oct 13 22:51:20 2017
pro

processed records: 3310000
File is processed. Time: Fri Oct 13 22:54:22 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:54:22 2017
starting with batch: 331
Starting to go through the Set. Time: Fri Oct 13 22:54:22 2017
processed records: 3310000
processed records: 3320000
File is processed. Time: Fri Oct 13 22:54:28 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:54:28 2017
starting with batch: 332
Starting to go through the Set. Time: Fri Oct 13 22:54:28 2017
processed records: 3320000
processed records: 3330000
File is processed. Time: Fri Oct 13 22:54:34 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:54:34 2017
starting with batch: 333
Starting to go through the Set. Time: Fri Oct 13 22:54:34 2017
processed records: 3330000
processed records: 3340000
File is processed. Time: Fri Oct 13 22:54:41 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:54:41 2017
starting with batch: 334
Starting to go through the Set. Time: Fri Oct 13 22:54:41 2017
pro

processed records: 3640000
File is processed. Time: Fri Oct 13 22:57:47 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:57:47 2017
starting with batch: 364
Starting to go through the Set. Time: Fri Oct 13 22:57:47 2017
processed records: 3640000
processed records: 3650000
File is processed. Time: Fri Oct 13 22:57:53 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:57:53 2017
starting with batch: 365
Starting to go through the Set. Time: Fri Oct 13 22:57:53 2017
processed records: 3650000
processed records: 3660000
File is processed. Time: Fri Oct 13 22:57:59 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:57:59 2017
starting with batch: 366
Starting to go through the Set. Time: Fri Oct 13 22:57:59 2017
processed records: 3660000
processed records: 3670000
File is processed. Time: Fri Oct 13 22:58:05 2017
Preprocessing is done and saved. Time: Fri Oct 13 22:58:05 2017
starting with batch: 367
Starting to go through the Set. Time: Fri Oct 13 22:58:05 2017
pro

processed records: 3970000
File is processed. Time: Fri Oct 13 23:01:12 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:01:12 2017
starting with batch: 397
Starting to go through the Set. Time: Fri Oct 13 23:01:12 2017
processed records: 3970000
processed records: 3980000
File is processed. Time: Fri Oct 13 23:01:18 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:01:18 2017
starting with batch: 398
Starting to go through the Set. Time: Fri Oct 13 23:01:18 2017
processed records: 3980000
processed records: 3990000
File is processed. Time: Fri Oct 13 23:01:25 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:01:25 2017
starting with batch: 399
Starting to go through the Set. Time: Fri Oct 13 23:01:25 2017
processed records: 3990000
processed records: 4000000
File is processed. Time: Fri Oct 13 23:01:31 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:01:31 2017
starting with batch: 400
Starting to go through the Set. Time: Fri Oct 13 23:01:31 2017
pro

processed records: 4300000
File is processed. Time: Fri Oct 13 23:04:37 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:04:37 2017
starting with batch: 430
Starting to go through the Set. Time: Fri Oct 13 23:04:37 2017
processed records: 4300000
processed records: 4310000
File is processed. Time: Fri Oct 13 23:04:44 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:04:44 2017
starting with batch: 431
Starting to go through the Set. Time: Fri Oct 13 23:04:44 2017
processed records: 4310000
processed records: 4320000
File is processed. Time: Fri Oct 13 23:04:50 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:04:50 2017
starting with batch: 432
Starting to go through the Set. Time: Fri Oct 13 23:04:50 2017
processed records: 4320000
processed records: 4330000
File is processed. Time: Fri Oct 13 23:04:57 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:04:57 2017
starting with batch: 433
Starting to go through the Set. Time: Fri Oct 13 23:04:57 2017
pro

processed records: 4630000
File is processed. Time: Fri Oct 13 23:08:04 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:08:04 2017
starting with batch: 463
Starting to go through the Set. Time: Fri Oct 13 23:08:04 2017
processed records: 4630000
processed records: 4640000
File is processed. Time: Fri Oct 13 23:08:10 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:08:10 2017
starting with batch: 464
Starting to go through the Set. Time: Fri Oct 13 23:08:10 2017
processed records: 4640000
processed records: 4650000
File is processed. Time: Fri Oct 13 23:08:17 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:08:17 2017
starting with batch: 465
Starting to go through the Set. Time: Fri Oct 13 23:08:17 2017
processed records: 4650000
processed records: 4660000
File is processed. Time: Fri Oct 13 23:08:23 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:08:23 2017
starting with batch: 466
Starting to go through the Set. Time: Fri Oct 13 23:08:23 2017
pro

processed records: 4960000
File is processed. Time: Fri Oct 13 23:11:31 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:11:31 2017
starting with batch: 496
Starting to go through the Set. Time: Fri Oct 13 23:11:31 2017
processed records: 4960000
processed records: 4970000
File is processed. Time: Fri Oct 13 23:11:37 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:11:37 2017
starting with batch: 497
Starting to go through the Set. Time: Fri Oct 13 23:11:37 2017
processed records: 4970000
processed records: 4980000
File is processed. Time: Fri Oct 13 23:11:44 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:11:44 2017
starting with batch: 498
Starting to go through the Set. Time: Fri Oct 13 23:11:44 2017
processed records: 4980000
processed records: 4990000
File is processed. Time: Fri Oct 13 23:11:50 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:11:50 2017
starting with batch: 499
Starting to go through the Set. Time: Fri Oct 13 23:11:50 2017
pro

processed records: 5290000
File is processed. Time: Fri Oct 13 23:14:58 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:14:58 2017
starting with batch: 529
Starting to go through the Set. Time: Fri Oct 13 23:14:58 2017
processed records: 5290000
processed records: 5300000
File is processed. Time: Fri Oct 13 23:15:04 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:15:04 2017
starting with batch: 530
Starting to go through the Set. Time: Fri Oct 13 23:15:04 2017
processed records: 5300000
processed records: 5310000
File is processed. Time: Fri Oct 13 23:15:18 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:15:18 2017
starting with batch: 531
Starting to go through the Set. Time: Fri Oct 13 23:15:18 2017
processed records: 5310000
processed records: 5320000
File is processed. Time: Fri Oct 13 23:15:37 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:15:37 2017
starting with batch: 532
Starting to go through the Set. Time: Fri Oct 13 23:15:37 2017
pro

processed records: 5620000
File is processed. Time: Fri Oct 13 23:23:33 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:23:33 2017
starting with batch: 562
Starting to go through the Set. Time: Fri Oct 13 23:23:33 2017
processed records: 5620000
processed records: 5630000
File is processed. Time: Fri Oct 13 23:23:49 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:23:49 2017
starting with batch: 563
Starting to go through the Set. Time: Fri Oct 13 23:23:49 2017
processed records: 5630000
processed records: 5640000
File is processed. Time: Fri Oct 13 23:24:04 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:24:04 2017
starting with batch: 564
Starting to go through the Set. Time: Fri Oct 13 23:24:04 2017
processed records: 5640000
processed records: 5650000
File is processed. Time: Fri Oct 13 23:24:19 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:24:19 2017
starting with batch: 565
Starting to go through the Set. Time: Fri Oct 13 23:24:19 2017
pro

processed records: 5950000
File is processed. Time: Fri Oct 13 23:32:18 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:32:18 2017
starting with batch: 595
Starting to go through the Set. Time: Fri Oct 13 23:32:18 2017
processed records: 5950000
processed records: 5960000
File is processed. Time: Fri Oct 13 23:32:35 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:32:35 2017
starting with batch: 596
Starting to go through the Set. Time: Fri Oct 13 23:32:35 2017
processed records: 5960000
processed records: 5970000
File is processed. Time: Fri Oct 13 23:32:52 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:32:52 2017
starting with batch: 597
Starting to go through the Set. Time: Fri Oct 13 23:32:52 2017
processed records: 5970000
processed records: 5980000
File is processed. Time: Fri Oct 13 23:33:08 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:33:08 2017
starting with batch: 598
Starting to go through the Set. Time: Fri Oct 13 23:33:08 2017
pro

processed records: 6280000
File is processed. Time: Fri Oct 13 23:41:08 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:41:08 2017
starting with batch: 628
Starting to go through the Set. Time: Fri Oct 13 23:41:08 2017
processed records: 6280000
processed records: 6290000
File is processed. Time: Fri Oct 13 23:41:23 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:41:23 2017
starting with batch: 629
Starting to go through the Set. Time: Fri Oct 13 23:41:23 2017
processed records: 6290000
processed records: 6300000
File is processed. Time: Fri Oct 13 23:41:38 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:41:38 2017
starting with batch: 630
Starting to go through the Set. Time: Fri Oct 13 23:41:38 2017
processed records: 6300000
processed records: 6310000
File is processed. Time: Fri Oct 13 23:41:57 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:41:57 2017
starting with batch: 631
Starting to go through the Set. Time: Fri Oct 13 23:41:57 2017
pro

processed records: 6610000
File is processed. Time: Fri Oct 13 23:50:01 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:50:01 2017
starting with batch: 661
Starting to go through the Set. Time: Fri Oct 13 23:50:01 2017
processed records: 6610000
processed records: 6620000
File is processed. Time: Fri Oct 13 23:50:16 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:50:16 2017
starting with batch: 662
Starting to go through the Set. Time: Fri Oct 13 23:50:16 2017
processed records: 6620000
processed records: 6630000
File is processed. Time: Fri Oct 13 23:50:35 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:50:35 2017
starting with batch: 663
Starting to go through the Set. Time: Fri Oct 13 23:50:35 2017
processed records: 6630000
processed records: 6640000
File is processed. Time: Fri Oct 13 23:50:50 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:50:50 2017
starting with batch: 664
Starting to go through the Set. Time: Fri Oct 13 23:50:50 2017
pro

processed records: 6940000
File is processed. Time: Fri Oct 13 23:58:58 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:58:58 2017
starting with batch: 694
Starting to go through the Set. Time: Fri Oct 13 23:58:58 2017
processed records: 6940000
processed records: 6950000
File is processed. Time: Fri Oct 13 23:59:14 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:59:14 2017
starting with batch: 695
Starting to go through the Set. Time: Fri Oct 13 23:59:14 2017
processed records: 6950000
processed records: 6960000
File is processed. Time: Fri Oct 13 23:59:29 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:59:29 2017
starting with batch: 696
Starting to go through the Set. Time: Fri Oct 13 23:59:29 2017
processed records: 6960000
processed records: 6970000
File is processed. Time: Fri Oct 13 23:59:48 2017
Preprocessing is done and saved. Time: Fri Oct 13 23:59:48 2017
starting with batch: 697
Starting to go through the Set. Time: Fri Oct 13 23:59:48 2017
pro

In [63]:
preprocess_test_batches('data/test.bson')

starting with batch: 0
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:22:21 2017
processed records: 0
processed records: 10000
TestFile: File is processed. Time: Sat Oct 14 00:22:22 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:22:22 2017
starting with batch: 1
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:22:22 2017
processed records: 10000
processed records: 20000
TestFile: File is processed. Time: Sat Oct 14 00:22:23 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:22:23 2017
starting with batch: 2
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:22:23 2017
processed records: 20000
processed records: 30000
TestFile: File is processed. Time: Sat Oct 14 00:22:24 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:22:24 2017
starting with batch: 3
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:22:24 2017
processed records: 30000
processed records: 40000
TestFile: File is processed. Time: Sat Oct 14 00

processed records: 310000
TestFile: File is processed. Time: Sat Oct 14 00:22:51 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:22:51 2017
starting with batch: 31
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:22:51 2017
processed records: 310000
processed records: 320000
TestFile: File is processed. Time: Sat Oct 14 00:22:52 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:22:52 2017
starting with batch: 32
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:22:52 2017
processed records: 320000
processed records: 330000
TestFile: File is processed. Time: Sat Oct 14 00:22:53 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:22:53 2017
starting with batch: 33
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:22:53 2017
processed records: 330000
processed records: 340000
TestFile: File is processed. Time: Sat Oct 14 00:22:54 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:22:54 2017
starting with batch: 34
TestFil

processed records: 620000
TestFile: File is processed. Time: Sat Oct 14 00:23:46 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:23:46 2017
starting with batch: 62
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:23:46 2017
processed records: 620000
processed records: 630000
TestFile: File is processed. Time: Sat Oct 14 00:23:47 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:23:47 2017
starting with batch: 63
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:23:47 2017
processed records: 630000
processed records: 640000
TestFile: File is processed. Time: Sat Oct 14 00:23:48 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:23:48 2017
starting with batch: 64
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:23:48 2017
processed records: 640000
processed records: 650000
TestFile: File is processed. Time: Sat Oct 14 00:23:49 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:23:49 2017
starting with batch: 65
TestFil

processed records: 930000
TestFile: File is processed. Time: Sat Oct 14 00:24:37 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:24:37 2017
starting with batch: 93
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:24:37 2017
processed records: 930000
processed records: 940000
TestFile: File is processed. Time: Sat Oct 14 00:24:38 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:24:38 2017
starting with batch: 94
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:24:38 2017
processed records: 940000
processed records: 950000
TestFile: File is processed. Time: Sat Oct 14 00:24:39 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:24:39 2017
starting with batch: 95
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:24:39 2017
processed records: 950000
processed records: 960000
TestFile: File is processed. Time: Sat Oct 14 00:24:40 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:24:40 2017
starting with batch: 96
TestFil

processed records: 1230000
TestFile: File is processed. Time: Sat Oct 14 00:25:10 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:25:10 2017
starting with batch: 123
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:25:10 2017
processed records: 1230000
processed records: 1240000
TestFile: File is processed. Time: Sat Oct 14 00:25:14 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:25:14 2017
starting with batch: 124
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:25:14 2017
processed records: 1240000
processed records: 1250000
TestFile: File is processed. Time: Sat Oct 14 00:25:18 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:25:18 2017
starting with batch: 125
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:25:18 2017
processed records: 1250000
processed records: 1260000
TestFile: File is processed. Time: Sat Oct 14 00:25:19 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:25:19 2017
starting with batch: 

processed records: 1530000
TestFile: File is processed. Time: Sat Oct 14 00:25:59 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:25:59 2017
starting with batch: 153
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:25:59 2017
processed records: 1530000
processed records: 1540000
TestFile: File is processed. Time: Sat Oct 14 00:26:00 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:26:00 2017
starting with batch: 154
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:26:00 2017
processed records: 1540000
processed records: 1550000
TestFile: File is processed. Time: Sat Oct 14 00:26:00 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:26:00 2017
starting with batch: 155
TestFile: Starting to go through the Set. Time: Sat Oct 14 00:26:00 2017
processed records: 1550000
processed records: 1560000
TestFile: File is processed. Time: Sat Oct 14 00:26:01 2017
Preprocessing is done and saved. Time: Sat Oct 14 00:26:01 2017
starting with batch: 