# About this Notebook:

This notebook detailed the exploratory data analysis on the Open Images Classes. So we can prepared the ourself to how to use the data.

Notebook Enviroment

- Kernel: Python 2
- 

In [1]:
!python --version

Python 2.7.12


## Question to be answered:

1. How many classes can we use?
2. How many images perclass?

## TODO

# Load Modules

In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np
import json
import os
import time
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

import seaborn as sns



# Commonly Shared Statics

In [3]:
data_labels_description_dir_path = '../../data/doc/labels/'

In [4]:
data_dir_path = '../../data/'

# EDA on the class-descriptions.csv

In [5]:
class_descriptions_file_path = data_labels_description_dir_path + 'class-descriptions.csv'


In [6]:
df_class_descriptions = pd.read_csv(class_descriptions_file_path,names=['LabelName', 'Description'])

In [7]:
df_class_descriptions.head(10)

Unnamed: 0,LabelName,Description
0,/m/0100nhbf,Sprenger's tulip
1,/m/0104x9kv,Vinegret
2,/m/0105jzwx,Dabu-dabu
3,/m/0105ld7g,Pistachio ice cream
4,/m/0105lxy5,Woku
5,/m/0105n86x,Pastila
6,/m/0105ts35,Burasa
7,/m/0108_09c,Summer snowflake
8,/m/01_097,Airmail
9,/m/010dmf,Isle of man tt


In [8]:
df_class_descriptions.tail(10)

Unnamed: 0,LabelName,Description
19985,/m/0zdntfj,Candy cane sorrel
19986,/m/0zg7256,Gingerbread house
19987,/m/0zghq,Moonshine
19988,/m/0zkm,Arthropod
19989,/m/0zlm,Conservation-restoration
19990,/m/0zrpfhj,Throwing
19991,/m/0zrrls2,Red dahlia
19992,/m/0zrthkd,Brine
19993,/m/0zrv5th,Lamborghini huracán
19994,/m/0zvk5,Helmet


In [9]:
df_class_descriptions.describe()

Unnamed: 0,LabelName,Description
count,19995,19995
unique,19995,19712
top,/m/02qg91d,Trunk
freq,1,4


In [10]:
df_class_descriptions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19995 entries, 0 to 19994
Data columns (total 2 columns):
LabelName      19995 non-null object
Description    19995 non-null object
dtypes: object(2)
memory usage: 468.6+ KB


Replace space with underscore for S3 bucket prefix 

In [11]:
df_class_descriptions.Description = df_class_descriptions.Description.str.replace(' ', '_')

In [12]:
df_class_descriptions.tail(10)

Unnamed: 0,LabelName,Description
19985,/m/0zdntfj,Candy_cane_sorrel
19986,/m/0zg7256,Gingerbread_house
19987,/m/0zghq,Moonshine
19988,/m/0zkm,Arthropod
19989,/m/0zlm,Conservation-restoration
19990,/m/0zrpfhj,Throwing
19991,/m/0zrrls2,Red_dahlia
19992,/m/0zrthkd,Brine
19993,/m/0zrv5th,Lamborghini_huracán
19994,/m/0zvk5,Helmet


## Compose the list of 600 original labels 

In [13]:
! ls ../../data/doc/labels/



bbox_labels_600_hierarchy.json	test-annotations-human-imagelabels.csv
class-descriptions.csv		test-images-with-rotation.csv
class-descriptions.csv.1	train-annotations-human-imagelabels.csv
classes-trainable.txt		train-images-with-labels-with-rotation.csv
classes-trainable.txt.1		validation-annotations-human-imagelabels.csv
classes.txt			validation-images-with-rotation.csv


In [14]:
bbox_labels_600_hierarchy_json_file_path = data_labels_description_dir_path + 'bbox_labels_600_hierarchy.json'

In [15]:
bbox_labels_600_json_data=open(bbox_labels_600_hierarchy_json_file_path).read()

In [16]:
bbox_labels_600_json = json.loads(bbox_labels_600_json_data)

In [17]:
def item_generator(json_input, lookup_key):
    if isinstance(json_input, dict):
        for k, v in json_input.iteritems():
            if k == lookup_key:
                yield v
            else:
                for child_val in item_generator(v, lookup_key):
                    yield child_val
    elif isinstance(json_input, list):
        for item in json_input:
            for item_val in item_generator(item, lookup_key):
                yield item_val

In [18]:
subcategory_set = set()
for label in item_generator(bbox_labels_600_json, "LabelName"):  
    subcategory_set.add(label)

In [19]:
subcategory_list =list(subcategory_set)

In [20]:
df_bbox_600_labels = pd.DataFrame(subcategory_list,columns=['LabelName'])

In [21]:
df_bbox_600_labels.head()

Unnamed: 0,LabelName
0,/m/04kkgm
1,/m/01bl7v
2,/m/0p833
3,/m/09kx5
4,/m/0cffdh


In [22]:
df_bbox_600_labels.describe()

Unnamed: 0,LabelName
count,602
unique,602
top,/m/02ctlc
freq,1


## Filter out 600 classes from class descriptions

In [23]:
df_trainables = pd.merge(df_bbox_600_labels,
                 df_class_descriptions[['LabelName', 'Description']],
                 on='LabelName', 
                 how='inner')

In [24]:
df_trainables.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 601 entries, 0 to 600
Data columns (total 2 columns):
LabelName      601 non-null object
Description    601 non-null object
dtypes: object(2)
memory usage: 14.1+ KB


In [25]:
df_trainables.head(10)

Unnamed: 0,LabelName,Description
0,/m/04kkgm,Bowl
1,/m/01bl7v,Boy
2,/m/0p833,Perfume
3,/m/09kx5,Deer
4,/m/0cffdh,Maple
5,/m/021sj1,Office_building
6,/m/0175cv,Tick
7,/m/01dwwc,Pancake
8,/m/03vt0,Insect
9,/m/01d40f,Dress


In [26]:
df_trainables.to_csv( data_dir_path + '600_trainable_LabelNames.csv', sep=',')

Listing all the labels 

In [27]:
for i in list(df_trainables['Description']):
    print (i)

Bowl
Boy
Perfume
Deer
Maple
Office_building
Tick
Pancake
Insect
Dress
Dishwasher
Dagger
Whisk
Bathroom_accessory
Missile
Candle
Bathtub
Piano
Pasta
Chopsticks
Face_powder
Television
Bee
Camera
Goggles
Pencil_case
Clothing
Bed
Sports_equipment
Leopard
Jaguar
Frog
Tennis_racket
Cream
Dairy
Vase
Vehicle
Street_light
Cannon
Cabbage
Canary
Ball
Home_appliance
Vehicle_registration_plate
Banjo
Bicycle
Tool
Snowman
Coffee
Shorts
Honeycomb
Chest_of_drawers
Suitcase
Hair_spray
Human_arm
Human_nose
Dice
Pitcher
Cosmetics
Curtain
Cupboard
Baked_goods
Lizard
Dumbbell
Ladybug
Paper_towel
Seat_belt
Girl
Tank
Cheese
Rocket
Wine_rack
Bottle_opener
Container
Food_processor
Pizza_cutter
Stapler
Van
Boot
Shrimp
Hedgehog
Surfboard
Platter
Golf_cart
Jellyfish
Ipod
Sink
Antelope
Plastic_bag
Bidet
Ski
Kangaroo
Handbag
Laptop
Houseplant
Marine_invertebrates
Human_beard
Hot_dog
Plumbing_fixture
Bat
Animal
Studio_couch
Bench
Coffee_table
Land_vehicle
Human_ear
Taxi
Submarine_sandwich
French_fries
Artichoke
Bow_a

### Create a dictionary from from the df_trainables

In [28]:
trainables_dict = df_trainables.set_index("LabelName").to_dict(orient="index")

In [29]:
trainables_dict

{u'/m/011k07': {'Description': 'Tortoise'},
 u'/m/011q46kg': {'Description': 'Container'},
 u'/m/012074': {'Description': 'Magpie'},
 u'/m/0120dh': {'Description': 'Sea_turtle'},
 u'/m/01226z': {'Description': 'Football'},
 u'/m/012n7d': {'Description': 'Ambulance'},
 u'/m/012w5l': {'Description': 'Ladder'},
 u'/m/012xff': {'Description': 'Toothbrush'},
 u'/m/012ysf': {'Description': 'Syringe'},
 u'/m/0130jx': {'Description': 'Sink'},
 u'/m/0138tl': {'Description': 'Toy'},
 u'/m/013y1f': {'Description': 'Organ'},
 u'/m/01432t': {'Description': 'Cassette_deck'},
 u'/m/014j1m': {'Description': 'Apple'},
 u'/m/014sv8': {'Description': 'Human_eye'},
 u'/m/014trl': {'Description': 'Cosmetics'},
 u'/m/014y4n': {'Description': 'Paddle'},
 u'/m/0152hh': {'Description': 'Snowman'},
 u'/m/01599': {'Description': 'Beer'},
 u'/m/015h_t': {'Description': 'Human_beard'},
 u'/m/015p6': {'Description': 'Bird'},
 u'/m/015qbp': {'Description': 'Parking_meter'},
 u'/m/015qff': {'Description': 'Traffic_li

In [30]:
for key, value in trainables_dict.iteritems():
    print(value['Description'])

Bowl
Furniture
Perfume
Deer
Maple
Office_building
Cart
Pancake
Roller_skates
Insect
Dress
Dishwasher
Dagger
Whisk
Suitcase
Missile
Candle
Bathtub
Piano
Lemon
Chopsticks
Peach
Face_powder
Television
Bee
Camera
Goggles
Pencil_case
Clothing
Plant
Sports_equipment
Leopard
Tower
Jaguar
Earrings
Tennis_racket
Cream
Cantaloupe
Vase
Vehicle
Street_light
Cannon
Cabbage
Rugby_ball
Ball
Home_appliance
Vehicle_registration_plate
Banjo
Bicycle
Snowman
Coffee
Shorts
Honeycomb
Chest_of_drawers
Bathroom_accessory
Hair_spray
Human_arm
Human_nose
Dice
Pitcher
Cosmetics
Curtain
Cupboard
Baked_goods
Lizard
Barrel
Paper_towel
Seat_belt
Girl
Tank
Cheese
Rocket
Wine_rack
Bottle_opener
Container
Food_processor
Pizza_cutter
Stapler
Boot
Shrimp
Hedgehog
Surfboard
Platter
Golf_cart
Jellyfish
Ipod
Sink
Plastic_bag
Baseball_bat
Ski
Kangaroo
Handbag
Laptop
Houseplant
Marine_invertebrates
Human_beard
Hot_dog
Plumbing_fixture
Bat
Animal
Studio_couch
Bench
Coffee_table
Land_vehicle
Human_ear
Taxi
Submarine_sandwich
Fr

Adding label '/m/0bl9f' with description "Entity"  

In [31]:
trainables_dict['/m/0bl9f'] = {'Description': 'Entity'}

In [32]:
trainables_dict['/m/0bl9f']

{'Description': 'Entity'}

In [33]:
trainables_dict['/m/0152hh']

{'Description': 'Snowman'}

## Convert labels to human readable description

In [34]:
bbox_labels_600_json = json.loads(bbox_labels_600_json_data)

In [35]:
def label_to_description(json_input, lookup_key):
    if isinstance(json_input, dict):
        for k, v in json_input.iteritems():
            if k == lookup_key:
                json_input[k] = trainables_dict[v]['Description']
 
                yield v
            else:
                for child_val in label_to_description(v, lookup_key):
                      
                    yield child_val
    elif isinstance(json_input, list):
        for item in json_input:
            for item_val in label_to_description(item, lookup_key):
                yield item_val

In [36]:
for label in label_to_description(bbox_labels_600_json, "LabelName"):  
    print("")

# label_to_description(bbox_labels_600_json, "LabelName")



















































































































































































































































































































































































































































































































































































































































































































































































































































































In [37]:
bbox_labels_600_json

{u'LabelName': 'Entity',
 u'Subcategory': [{u'LabelName': 'Coin'},
  {u'LabelName': 'Flag'},
  {u'LabelName': 'Light_bulb'},
  {u'LabelName': 'Toy',
   u'Subcategory': [{u'LabelName': 'Doll'},
    {u'LabelName': 'Balloon'},
    {u'LabelName': 'Dice'},
    {u'LabelName': 'Flying_disc'},
    {u'LabelName': 'Kite'},
    {u'LabelName': 'Teddy_bear'}]},
  {u'LabelName': 'Home_appliance',
   u'Subcategory': [{u'LabelName': 'Washing_machine'},
    {u'LabelName': 'Toaster'},
    {u'LabelName': 'Oven'},
    {u'LabelName': 'Blender'},
    {u'LabelName': 'Gas_stove'},
    {u'LabelName': 'Mechanical_fan'},
    {u'LabelName': 'Heater'},
    {u'LabelName': 'Kettle'},
    {u'LabelName': 'Hair_dryer'},
    {u'LabelName': 'Refrigerator'},
    {u'LabelName': 'Wood-burning_stove'},
    {u'LabelName': 'Humidifier'},
    {u'LabelName': 'Mixer'},
    {u'LabelName': 'Coffeemaker'},
    {u'LabelName': 'Microwave_oven'},
    {u'LabelName': 'Dishwasher'},
    {u'LabelName': 'Sewing_machine'},
    {u'LabelName':

## EDA on the hierchy of 600 original labels by classes¶

In [38]:
bbox_labels_600_json

{u'LabelName': 'Entity',
 u'Subcategory': [{u'LabelName': 'Coin'},
  {u'LabelName': 'Flag'},
  {u'LabelName': 'Light_bulb'},
  {u'LabelName': 'Toy',
   u'Subcategory': [{u'LabelName': 'Doll'},
    {u'LabelName': 'Balloon'},
    {u'LabelName': 'Dice'},
    {u'LabelName': 'Flying_disc'},
    {u'LabelName': 'Kite'},
    {u'LabelName': 'Teddy_bear'}]},
  {u'LabelName': 'Home_appliance',
   u'Subcategory': [{u'LabelName': 'Washing_machine'},
    {u'LabelName': 'Toaster'},
    {u'LabelName': 'Oven'},
    {u'LabelName': 'Blender'},
    {u'LabelName': 'Gas_stove'},
    {u'LabelName': 'Mechanical_fan'},
    {u'LabelName': 'Heater'},
    {u'LabelName': 'Kettle'},
    {u'LabelName': 'Hair_dryer'},
    {u'LabelName': 'Refrigerator'},
    {u'LabelName': 'Wood-burning_stove'},
    {u'LabelName': 'Humidifier'},
    {u'LabelName': 'Mixer'},
    {u'LabelName': 'Coffeemaker'},
    {u'LabelName': 'Microwave_oven'},
    {u'LabelName': 'Dishwasher'},
    {u'LabelName': 'Sewing_machine'},
    {u'LabelName':

In [69]:
def hierchy_generator(json_input, lookup_key, parent_key):
    if isinstance(json_input, dict):
        
        new_parent_key = json_input[lookup_key]
        
        print("parent_key: ",parent_key )
        
        for k, v in json_input.iteritems():
            if k == lookup_key:
                classes_hierchy_dict[v] = {}
                yield v
                
            else:
                for child_val in hierchy_generator(v, lookup_key, new_parent_key):
                    
#                     print("else k: ", k)
#                     print("json_input[lookup_key]: ", json_input[lookup_key])
#                     print("children key: ", child_val)
    
                    
                    classes_hierchy_dict[child_val]['parent'] = parent_key
#                     classes_hierchy_dict[child_val[lookup_key]]['parent'] = json_input[lookup_key]
                    yield child_val
    elif isinstance(json_input, list):
#         classes_hierchy_dict[parent_key]['children'] = []
        for item in json_input:
            for item_val in hierchy_generator(item, lookup_key, parent_key):
#                 classes_hierchy_dict[parent_key]['children'].append(item_val)
                yield item_val

In [70]:
classes_hierchy_dict = {}

for label in hierchy_generator(bbox_labels_600_json, "LabelName", None):  
    print("Do nothing")

parent_key:  Entity
parent_key:  Coin
Do nothing
parent_key:  Flag
Do nothing
parent_key:  Light_bulb
Do nothing
parent_key:  Toy
parent_key:  Doll
Do nothing
parent_key:  Balloon
Do nothing
parent_key:  Dice
Do nothing
parent_key:  Flying_disc
Do nothing
parent_key:  Kite
Do nothing
parent_key:  Teddy_bear
Do nothing
Do nothing
parent_key:  Home_appliance
parent_key:  Washing_machine
Do nothing
parent_key:  Toaster
Do nothing
parent_key:  Oven
Do nothing
parent_key:  Blender
Do nothing
parent_key:  Gas_stove
Do nothing
parent_key:  Mechanical_fan
Do nothing
parent_key:  Heater
Do nothing
parent_key:  Kettle
Do nothing
parent_key:  Hair_dryer
Do nothing
parent_key:  Refrigerator
Do nothing
parent_key:  Wood-burning_stove
Do nothing
parent_key:  Humidifier
Do nothing
parent_key:  Mixer
Do nothing
parent_key:  Coffeemaker
Do nothing
parent_key:  Microwave_oven
Do nothing
parent_key:  Dishwasher
Do nothing
parent_key:  Sewing_machine
Do nothing
parent_key:  Hand_dryer
Do nothing
parent_ke

Do nothing
Do nothing
parent_key:  Man
Do nothing
parent_key:  Woman
Do nothing
parent_key:  Boy
Do nothing
parent_key:  Girl
Do nothing
Do nothing
parent_key:  Food
parent_key:  Fast_food
parent_key:  Hot_dog
Do nothing
parent_key:  French_fries
Do nothing
Do nothing
parent_key:  Waffle
Do nothing
parent_key:  Pancake
Do nothing
parent_key:  Burrito
Do nothing
parent_key:  Snack
parent_key:  Pretzel
Do nothing
parent_key:  Popcorn
Do nothing
parent_key:  Cookie
Do nothing
Do nothing
parent_key:  Dessert
parent_key:  Muffin
Do nothing
parent_key:  Cookie
Do nothing
parent_key:  Ice_cream
Do nothing
parent_key:  Cake
Do nothing
parent_key:  Candy
Do nothing
Do nothing
parent_key:  Guacamole
Do nothing
parent_key:  Fruit
parent_key:  Apple
Do nothing
parent_key:  Grape
Do nothing
parent_key:  Common_fig
Do nothing
parent_key:  Pear
Do nothing
parent_key:  Strawberry
Do nothing
parent_key:  Tomato
Do nothing
parent_key:  Lemon
Do nothing
parent_key:  Banana
Do nothing
parent_key:  Orange


parent_key:  Harp
Do nothing
parent_key:  Harpsichord
Do nothing
parent_key:  Harmonica
Do nothing
parent_key:  Musical_keyboard
Do nothing
parent_key:  Oboe
Do nothing
parent_key:  Piano
Do nothing
parent_key:  Saxophone
Do nothing
parent_key:  Trombone
Do nothing
parent_key:  Trumpet
Do nothing
parent_key:  Violin
Do nothing
parent_key:  Chime
Do nothing
parent_key:  Flute
Do nothing
parent_key:  Accordion
Do nothing
parent_key:  Maracas
Do nothing
Do nothing
parent_key:  Sports_equipment
parent_key:  Paddle
Do nothing
parent_key:  Ball
parent_key:  Football
Do nothing
parent_key:  Cricket_ball
Do nothing
parent_key:  Volleyball
Do nothing
parent_key:  Tennis_ball
Do nothing
parent_key:  Rugby_ball
Do nothing
Do nothing
parent_key:  Bicycle
parent_key:  Bicycle_wheel
Do nothing
Do nothing
parent_key:  Surfboard
Do nothing
parent_key:  Bow_and_arrow
Do nothing
parent_key:  Hiking_equipment
Do nothing
parent_key:  Roller_skates
Do nothing
parent_key:  Flying_disc
Do nothing
parent_key:

In [68]:
classes_hierchy_dict

{'Accordion': {'parent': 'Entity'},
 'Adhesive_tape': {'parent': 'Entity'},
 'Aircraft': {'parent': 'Entity'},
 'Airplane': {'parent': 'Entity'},
 'Alarm_clock': {'parent': 'Entity'},
 'Alpaca': {'parent': 'Entity'},
 'Ambulance': {'parent': 'Entity'},
 'Animal': {'parent': 'Entity'},
 'Ant': {'parent': 'Entity'},
 'Antelope': {'parent': 'Entity'},
 'Apple': {'parent': 'Entity'},
 'Armadillo': {'parent': 'Entity'},
 'Artichoke': {'parent': 'Entity'},
 'Asparagus': {'parent': 'Entity'},
 'Auto_part': {'parent': 'Entity'},
 'Axe': {'parent': 'Entity'},
 'Backpack': {'parent': 'Entity'},
 'Bagel': {'parent': 'Entity'},
 'Baked_goods': {'parent': 'Entity'},
 'Balance_beam': {'parent': 'Entity'},
 'Ball': {'parent': 'Entity'},
 'Balloon': {'parent': 'Entity'},
 'Banana': {'parent': 'Entity'},
 'Band-aid': {'parent': 'Entity'},
 'Banjo': {'parent': 'Entity'},
 'Barge': {'parent': 'Entity'},
 'Barrel': {'parent': 'Entity'},
 'Baseball_bat': {'parent': 'Entity'},
 'Baseball_glove': {'parent': 

In [51]:
classes_hierchy_dict['Accordion']['parent']

'Entity'

## EDA on the classes-trainable.txt

In [117]:
classes_trainable_file_path = data_labels_description_dir_path + 'classes-trainable.txt'



In [118]:
df_classes_trainable = pd.read_csv(classes_trainable_file_path,names=['LabelName'])

In [119]:
df_classes_trainable.head(10)

Unnamed: 0,LabelName
0,/m/0108_09c
1,/m/010dmf
2,/m/010jjr
3,/m/010l12
4,/m/010lq47b
5,/m/010ls_cv
6,/m/01127
7,/m/01_12b
8,/m/0117_25k
9,/m/0117z


In [120]:
df_classes_trainable.describe()

Unnamed: 0,LabelName
count,7186
unique,7186
top,/m/02qg91d
freq,1


In [15]:
df_classes_trainable.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7186 entries, 0 to 7185
Data columns (total 1 columns):
LabelName    7186 non-null object
dtypes: object(1)
memory usage: 112.3+ KB


## Filter out trainable classes from class descriptions

In [16]:
df_trainables = pd.merge(df_classes_trainable,
                 df_class_descriptions[['LabelName', 'Description']],
                 on='LabelName', 
                 how='inner')

In [17]:
df_trainables.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7186 entries, 0 to 7185
Data columns (total 2 columns):
LabelName      7186 non-null object
Description    7186 non-null object
dtypes: object(2)
memory usage: 168.4+ KB


In [18]:
df_trainables.head()

Unnamed: 0,LabelName,Description
0,/m/0108_09c,Summer snowflake
1,/m/010dmf,Isle of man tt
2,/m/010jjr,Amusement park
3,/m/010l12,Roller coaster
4,/m/010lq47b,Witch hat


## EDA on train-annotations-human-imagelabels.csv

In [19]:
train_annotations_human_imagelabels_file_path = data_labels_description_dir_path + 'train-annotations-human-imagelabels.csv'

In [20]:
df_train_annotations_human_imagelabels = pd.read_csv(train_annotations_human_imagelabels_file_path,header=0)

In [21]:
df_train_annotations_human_imagelabels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27894289 entries, 0 to 27894288
Data columns (total 4 columns):
ImageID       object
Source        object
LabelName     object
Confidence    int64
dtypes: int64(1), object(3)
memory usage: 1.0+ GB


In [22]:
df_train_annotations_human_imagelabels.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence
0,000002b66c9c498e,crowdsource-verification,/m/01kcnl,1
1,000002b66c9c498e,verification,/m/010l12,0
2,000002b66c9c498e,verification,/m/012c4n,0
3,000002b66c9c498e,verification,/m/012mj,1
4,000002b66c9c498e,verification,/m/012yh1,1


In [23]:
df_train_annotations_human_imagelabels_confident = df_train_annotations_human_imagelabels[df_train_annotations_human_imagelabels['Confidence']== 1
                                                                                         ]

In [24]:
df_train_annotations_human_imagelabels_confident.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13444569 entries, 0 to 27894288
Data columns (total 4 columns):
ImageID       object
Source        object
LabelName     object
Confidence    int64
dtypes: int64(1), object(3)
memory usage: 512.9+ MB


In [26]:
df_train_annotations_human_imagelabels_confident.head(20)

Unnamed: 0,ImageID,Source,LabelName,Confidence
0,000002b66c9c498e,crowdsource-verification,/m/01kcnl,1
3,000002b66c9c498e,verification,/m/012mj,1
4,000002b66c9c498e,verification,/m/012yh1,1
7,000002b66c9c498e,verification,/m/014sv8,1
14,000002b66c9c498e,verification,/m/016c68,1
16,000002b66c9c498e,verification,/m/016q19,1
23,000002b66c9c498e,verification,/m/019nj4,1
24,000002b66c9c498e,verification,/m/019_nn,1
25,000002b66c9c498e,verification,/m/019sc6,1
30,000002b66c9c498e,verification,/m/01bl7v,1


In [28]:
df_train_annotations_human_imagelabels_imgIDs_Labels = df_train_annotations_human_imagelabels[['ImageID','LabelName']]

In [29]:
df_train_annotations_human_imagelabels_imgIDs_Labels.head()

Unnamed: 0,ImageID,LabelName
0,000002b66c9c498e,/m/01kcnl
1,000002b66c9c498e,/m/010l12
2,000002b66c9c498e,/m/012c4n
3,000002b66c9c498e,/m/012mj
4,000002b66c9c498e,/m/012yh1


## Mergin with ImageIDs

In [38]:
df_trainables_imageIds_label_descriptions = pd.merge(df_train_annotations_human_imagelabels_imgIDs_Labels,
                 df_trainables[['LabelName', 'Description']],
                 on='LabelName', 
                 how='inner')

In [39]:
df_trainables_imageIds_label_descriptions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21214833 entries, 0 to 21214832
Data columns (total 3 columns):
ImageID        object
LabelName      object
Description    object
dtypes: object(3)
memory usage: 647.4+ MB


In [41]:
df_trainables_imageIds_label_descriptions.head(100)

Unnamed: 0,ImageID,LabelName,Description
0,000002b66c9c498e,/m/01kcnl,Birthday
1,000004f4400f6ec5,/m/01kcnl,Birthday
2,0000071d71a0a6f6,/m/01kcnl,Birthday
3,000007e5752f8f1c,/m/01kcnl,Birthday
4,00001bc2c4027449,/m/01kcnl,Birthday
5,000025ea48cab6fc,/m/01kcnl,Birthday
6,00002b368e91b947,/m/01kcnl,Birthday
7,000060e3121c7305,/m/01kcnl,Birthday
8,000069a0b17c906e,/m/01kcnl,Birthday
9,00006af3eaaad9b6,/m/01kcnl,Birthday


In [42]:
df_trainables_imageIds_label_descriptions.to_csv( data_dir_path + 'imageIDs_LabelNames.csv', sep=',')

## EDA on train-annotations-machine-imagelabels.csv

In [30]:
train_annotations_machine_imagelabels_file_path = data_labels_description_dir_path + 'train-annotations-machine-imagelabels.csv'

In [None]:
df_train_annotations_machine_imagelabels = pd.read_csv(train_annotations_machine_imagelabels_file_path,header=0)

In [None]:
df_train_annotations_machine_imagelabels.head()

In [None]:
## EDA on train-annotations-human-imagelabels.csv