## Exploratory data analysis

In [14]:
import jsonlines
import pandas as pd

### Load dataset

In [15]:
# Read the whole JSON-lines dataset into memory as a list of records.
data_image = []  # rebound below once the file has been read
with jsonlines.open('data/dataset.jsonl') as dataset_reader:
    data_image = [record for record in dataset_reader]


In [16]:
# Show the first record to see which fields an image entry carries.
data_image[0]

{'filename': '2009_004203.jpg',
 'folder': 'VOC2012',
 'object': [{'bndbox_xmax': 483,
   'bndbox_xmin': 23,
   'bndbox_ymax': 222,
   'bndbox_ymin': 113,
   'difficult': '0',
   'name': 'aeroplane',
   'occluded': '0',
   'pose': 'Right',
   'truncated': '0'}],
 'segmented': False,
 'size_depth': 3,
 'size_height': 333,
 'size_width': 500,
 'source_annotation': 'PASCAL VOC2009',
 'source_database': 'The VOC2009 Database',
 'source_image': 'flickr'}

In [21]:
def process(image):
    """Attach an ``object_names`` list to an image record.

    The names are taken, in order, from the ``name`` field of every entry
    in ``image['object']``.

    Args:
        image: dict with an ``'object'`` key holding a list of dicts, each
            of which has a ``'name'`` key.

    Returns:
        The same dict that was passed in (it is modified in place), now
        carrying the extra ``'object_names'`` key.
    """
    names = []
    for annotation in image['object']:
        names.append(annotation['name'])
    image['object_names'] = names
    return image

# Add the derived ``object_names`` field to every image record.
data_image = list(map(process, data_image))
# Flatten the per-image annotation lists into one list of objects.
data_object = [
    annotation
    for record in data_image
    for annotation in record['object']
]

In [24]:
# One row per image record.
df_image = pd.DataFrame(data=data_image)
# One row per annotated object, flattened across all images.
df_object = pd.DataFrame(data=data_object)

In [19]:
# Preview the first rows of the per-image table.
df_image.head()

Unnamed: 0,filename,folder,object,segmented,size_depth,size_height,size_width,source_annotation,source_database,source_image,object_names
0,2009_004203.jpg,VOC2012,"[{'bndbox_xmax': 483, 'bndbox_xmin': 23, 'bndb...",False,3,333,500,PASCAL VOC2009,The VOC2009 Database,flickr,[aeroplane]
1,2011_004526.jpg,VOC2011,"[{'actions_jumping': '0', 'actions_other': '0'...",False,3,375,500,PASCAL VOC2011,The VOC2011 Database,flickr,"[person, person]"
2,2010_002177.jpg,VOC2012,"[{'bndbox_xmax': 342, 'bndbox_xmin': 1, 'bndbo...",False,3,500,437,PASCAL VOC2010,The VOC2010 Database,flickr,[cat]
3,2011_001252.jpg,VOC2012,"[{'bndbox_xmax': 190, 'bndbox_xmin': 125, 'bnd...",False,3,500,283,PASCAL VOC2011,The VOC2011 Database,flickr,"[boat, boat, boat]"
4,2011_002627.jpg,VOC2012,"[{'bndbox_xmax': 375, 'bndbox_xmin': 1, 'bndbo...",False,3,500,375,PASCAL VOC2011,The VOC2011 Database,flickr,[person]


In [26]:
# Preview the first rows of the per-object table.
df_object.head()

Unnamed: 0,bndbox_xmax,bndbox_xmin,bndbox_ymax,bndbox_ymin,difficult,name,occluded,pose,truncated,actions_jumping,...,actions_reading,actions_ridingbike,actions_ridinghorse,actions_running,actions_takingphoto,actions_usingcomputer,actions_walking,point_x,point_y,part
0,483,23,222,113,0,aeroplane,0.0,Right,0.0,,...,,,,,,,,,,
1,500,237,375,185,0,person,,Unspecified,,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,432.0,316.0,
2,233,185,155,67,0,person,,Unspecified,,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,214.0,112.0,
3,342,1,393,1,0,cat,0.0,Frontal,0.0,,...,,,,,,,,,,
4,190,125,281,23,0,boat,0.0,Rear,0.0,,...,,,,,,,,,,


### Basic analysis

Number of images and objects

In [36]:
# Row count of the per-image table.
df_image.shape[0]

17125

In [39]:
# Row count of the per-object table.
df_object.shape[0]

40138

Only 2913 of the 17125 images have segmentation data

In [43]:
# ``segmented`` holds booleans, so summing counts the rows where it is True.
df_image['segmented'].sum()

np.int64(2913)

Every image filename ends in `.jpg`

In [38]:
# Count filenames ending in '.jpg', compared in lower case so '.JPG' also matches.
df_image['filename'].str.lower().str.endswith('.jpg').sum()

np.int64(17125)

All images are from flickr

In [46]:
# Number of images per distinct ``source_image`` value.
df_image.value_counts('source_image')

source_image
flickr    17125
Name: count, dtype: int64

Annotation sources and the number of images from each

In [47]:
# Number of images per distinct ``source_annotation`` value.
df_image.value_counts('source_annotation')

source_annotation
PASCAL VOC2008    4340
PASCAL VOC2011    3640
PASCAL VOC2010    3503
PASCAL VOC2009    2722
PASCAL VOC2012    2164
PASCAL VOC2007     756
Name: count, dtype: int64

### Filter out non-segmented images

In [48]:
# Keep only the image records whose ``segmented`` flag is truthy.
data_image = list(filter(lambda record: record['segmented'], data_image))
# Re-derive the flat object list so it matches the filtered images.
data_object = [
    annotation
    for record in data_image
    for annotation in record['object']
]

In [49]:
# Rebuild both tables from the filtered lists: one row per image...
df_image = pd.DataFrame(data=data_image)
# ...and one row per annotated object.
df_object = pd.DataFrame(data=data_object)

In [50]:
# (number of images, number of objects) in the current tables.
df_image.shape[0], df_object.shape[0]

(2913, 6934)

None of the segmented images have `PASCAL VOC2012` as their annotation source

In [52]:
# Number of images per ``source_annotation`` value in the current (segmented-only) table.
df_image.value_counts('source_annotation')

source_annotation
PASCAL VOC2008    639
PASCAL VOC2010    634
PASCAL VOC2007    632
PASCAL VOC2009    618
PASCAL VOC2011    390
Name: count, dtype: int64

The most common object class among the segmented images is person

In [56]:
# Number of annotated objects per class name, most frequent first.
df_object.value_counts('name')

name
person         1734
chair           549
car             458
bottle          358
pottedplant     322
sheep           308
dog             299
cat             286
cow             284
bird            277
bus             237
boat            232
aeroplane       220
sofa            209
motorbike       204
horse           204
tvmonitor       199
bicycle         197
train           189
diningtable     168
Name: count, dtype: int64

In [57]:
# Summary statistics of the per-class object counts, to gauge class imbalance.
df_object.value_counts('name').describe()

count      20.000000
mean      346.700000
std       340.035463
min       168.000000
25%       204.000000
50%       257.000000
75%       311.500000
max      1734.000000
Name: count, dtype: float64