This notebook contains a generator class for Keras called `BSONIterator` that can read directly from the BSON data. You can use it in combination with `ImageDataGenerator` for doing data augmentation.

In [1]:
import gc
import os

import pandas as pd


In [2]:
# Directory holding the competition data files. Set the BSON_DATA_DIR
# environment variable to point somewhere else; otherwise the original
# location is used.
data_dir = os.environ.get("BSON_DATA_DIR", "/home/manojps/Documents/deep_learning/Project")

# Part 1: Create lookup tables

The generator uses several lookup tables that describe the layout of the BSON file, which products and images are part of the training/validation sets, and so on.

You only need to generate these tables once, as they get saved to CSV files. If you already have these CSV files, skip to part 2.

## Lookup table for categories

In [16]:
# Load the raw category names; category_id stays an ordinary column for now.
categories_path = os.path.join(data_dir, "category_names.csv")
categories_df = pd.read_csv(categories_path)

# Number the categories consecutively (0 .. N-1). This integer index is what
# the labels are one-hot encoded with.
categories_df["category_idx"] = range(len(categories_df))

categories_df.head()

Unnamed: 0,category_id,category_level1,category_level2,category_level3,category_idx
0,1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0
1,1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1
2,1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2
3,1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3
4,1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4


In [17]:
# Lookup frame for category level 1: one row per distinct level-1 name,
# each paired with a consecutive integer index (cat1_idx).
cat1_level_df = pd.DataFrame({"category_level1": categories_df["category_level1"].unique()})
cat1_level_df["cat1_idx"] = range(len(cat1_level_df))
cat1_level_df.head()

Unnamed: 0,category_level1,cat1_idx
0,ABONNEMENT / SERVICES,0
1,AMENAGEMENT URBAIN - VOIRIE,1
2,ANIMALERIE,2
3,APICULTURE,3
4,ART DE LA TABLE - ARTICLES CULINAIRES,4


In [18]:
# Join cat1_level_df onto categories_df by level-1 name so every category row
# carries its cat1_idx; the result replaces categories_df.
categories_df = categories_df.merge(cat1_level_df, how="inner", on="category_level1")
categories_df.head()

Unnamed: 0,category_id,category_level1,category_level2,category_level3,category_idx,cat1_idx
0,1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0,0
1,1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1,1
2,1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2,1
3,1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3,1
4,1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4,1


In [19]:
# Lookup frame for category level 2: one row per distinct level-2 name,
# each paired with a consecutive integer index (cat2_idx).
cat2_level_df = pd.DataFrame({"category_level2": categories_df["category_level2"].unique()})
cat2_level_df["cat2_idx"] = range(len(cat2_level_df))
cat2_level_df.head()

Unnamed: 0,category_level2,cat2_idx
0,CARTE PREPAYEE,0
1,AMENAGEMENT URBAIN,1
2,SIGNALETIQUE,2
3,SIGNALISATION ROUTIERE,3
4,VOIRIE,4


In [20]:
# Join cat2_level_df onto categories_df by level-2 name so every category row
# carries its cat2_idx; the result replaces categories_df.
categories_df = categories_df.merge(cat2_level_df, how="inner", on="category_level2")
categories_df.head()

Unnamed: 0,category_id,category_level1,category_level2,category_level3,category_idx,cat1_idx,cat2_idx
0,1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0,0,0
1,1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1,1,1
2,1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2,1,1
3,1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3,1,1
4,1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4,1,2


In [21]:
# Promote category_id from a regular column to the row index, so rows can be
# looked up by category id.
categories_df = categories_df.set_index(keys="category_id")
categories_df.head()

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx,cat1_idx,cat2_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0,0,0
1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1,1,1
1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2,1,1
1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3,1,1
1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4,1,2


In [22]:
# Persist the finished category lookup table to categories.csv.
categories_df.to_csv(path_or_buf="categories.csv")

Create dictionaries for quick lookups in both directions: `category_id` → `category_idx` and `category_idx` → `category_id`.

In [23]:
def make_category_tables(categories=None):
    """Build the lookup dictionaries between category ids and category indices.

    Args:
        categories: DataFrame whose index holds the category ids and which has
            a ``category_idx`` column. When omitted, the notebook-level
            ``categories_df`` is used.

    Returns:
        A ``(cat2idx, idx2cat)`` tuple. ``cat2idx`` maps each category id to
        its category_idx; ``idx2cat`` maps each category_idx back to its
        category id.
    """
    if categories is None:
        categories = categories_df

    # Read the column by name rather than by tuple position, so the result
    # does not depend on where `category_idx` sits among the columns.
    category_ids = categories.index
    category_indices = categories["category_idx"]

    cat2idx = dict(zip(category_ids, category_indices))
    idx2cat = dict(zip(category_indices, category_ids))
    return cat2idx, idx2cat

In [24]:
# Build both lookup dictionaries (category_id -> category_idx and the reverse).
cat2idx, idx2cat = make_category_tables()

In [25]:
# Test if it works: look one category up in each direction
# (category_id -> category_idx, then category_idx -> category_id).
cat2idx[1000012755], idx2cat[4]

(4, 1000012755)

In [None]:
# Round-trip check: reload the saved CSV, using its first column as the index,
# and preview the result.
categories_df = pd.read_csv(filepath_or_buffer="categories.csv", index_col=0)
categories_df.head()

In [26]:
# Garbage collection
# https://stackoverflow.com/questions/32247643/how-to-delete-multiple-pandas-python-dataframes-from-memory-to-save-ram
# Unbind the intermediate frames directly. No temporary container is needed,
# and naming one `list` would shadow the built-in `list` type.
# NOTE: make_category_tables() reads the global categories_df by default, so
# the frame must be reloaded before that function is called again.
del cat1_level_df, cat2_level_df, categories_df
# Ask the collector to reclaim anything only reachable through reference
# cycles; the trailing semicolon suppresses the returned count in the output.
gc.collect();

In [27]:
# Check that the name was unbound by the `del` above. A NameError is the
# expected outcome here, so it is caught and shown instead of aborting a
# "Run All". This confirms the name is gone, not how much memory was released.
try:
    cat1_level_df.head()
except NameError as err:
    print(f"{type(err).__name__}: {err}")

NameError: name 'cat1_level_df' is not defined