In [1]:
# I used Open Images Dataset V4
# Download images links, annotations and box descriptions from
# https://storage.googleapis.com/openimages/web/download.html
#
# Used files:
#    class-descriptions-boxable.csv
#    test-annotations-bbox.csv
#    test-images.csv
# I am using test files since they are much smaller and now I will only need 
# 600 images of each categories. Then I will split this to train-valid-test sets.

import pandas as pd

In [2]:
descriptions = pd.read_csv(r"D:\Downloads\class-descriptions-boxable.csv", sep=",", index_col=None, header=None)

In [3]:
descriptions.head()

Unnamed: 0,0,1
0,/m/011k07,Tortoise
1,/m/011q46kg,Container
2,/m/012074,Magpie
3,/m/0120dh,Sea turtle
4,/m/01226z,Football


In [4]:
# Find 2 category ids I am looking for

man_id = descriptions.loc[descriptions[1] == 'Man'].iloc[0][0]
woman_id = descriptions.loc[descriptions[1] == 'Woman'].iloc[0][0]
print(man_id)
print(woman_id)

/m/04yx4
/m/03bt1vf


In [5]:
annotations = pd.read_csv(r"D:\Downloads\test-annotations-bbox.csv", sep=",", index_col=None)
print(annotations.shape)
annotations.head()

(625282, 13)


Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,000026e7ee790996,freeform,/m/07j7r,1,0.071905,0.145346,0.206591,0.391306,0,1,1,0,0
1,000026e7ee790996,freeform,/m/07j7r,1,0.439756,0.572466,0.264153,0.435122,0,1,1,0,0
2,000026e7ee790996,freeform,/m/07j7r,1,0.668455,1.0,0.0,0.552825,0,1,1,0,0
3,000062a39995e348,freeform,/m/015p6,1,0.205719,0.849912,0.154144,1.0,0,0,0,0,0
4,000062a39995e348,freeform,/m/05s2s,1,0.137133,0.377634,0.0,0.884185,1,1,0,0,0


In [6]:
man_pics = annotations.loc[annotations['LabelName'] == man_id]
print(man_pics.shape)
man_pics.head()

(17514, 13)


Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
98,00094d5e8b3cb038,freeform,/m/04yx4,1,0.0,0.422093,0.0,1.0,0,1,0,0,0
99,00094d5e8b3cb038,freeform,/m/04yx4,1,0.321955,0.84189,0.120776,0.999904,0,0,0,0,0
146,000b6e5bfa3e2a34,freeform,/m/04yx4,1,0.0,0.38786,0.174753,0.966402,1,1,0,0,0
147,000b6e5bfa3e2a34,freeform,/m/04yx4,1,0.308014,1.0,0.29481,1.0,1,1,0,0,0
148,000b6e5bfa3e2a34,freeform,/m/04yx4,1,0.309429,0.66237,0.295084,0.80385,1,0,0,0,0


In [7]:
woman_pics = annotations.loc[annotations['LabelName'] == woman_id]
print(woman_pics.shape)
woman_pics.head()

(9047, 13)


Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
118,000aa0b1c8fd5ddf,freeform,/m/03bt1vf,1,0.165803,0.494572,0.261411,0.87214,0,0,1,0,0
340,0019ac4eb659f57e,freeform,/m/03bt1vf,1,0.0,0.948736,0.144778,0.968588,0,1,0,0,0
341,0019ac4eb659f57e,freeform,/m/03bt1vf,1,0.329299,0.941604,0.150688,0.962751,1,0,0,0,0
342,0019ac4eb659f57e,freeform,/m/03bt1vf,1,0.362852,0.67996,0.084708,0.608163,1,0,0,0,0
487,0021520c13029a24,freeform,/m/03bt1vf,1,0.0,0.346344,0.048508,1.0,0,1,0,0,0


In [8]:
# Find images occuring in both category, so will not be images containing both man and woman at the same time
same = pd.merge(man_pics, woman_pics, how='inner', on=['ImageID'])
same_ids = same.drop_duplicates(subset='ImageID')
print("Number of same ids in both categories: ", same_ids['ImageID'].shape[0])

same_ids = set(same_ids['ImageID'])
man_only_pics = man_pics[~man_pics['ImageID'].isin(same_ids)]
woman_only_pics = woman_pics[~woman_pics['ImageID'].isin(same_ids)]

print("Number of images of man: ", man_pics.shape[0])
print("Number of images of man only: ", man_only_pics.shape[0])

print("Number of images of woman: ", woman_pics.shape[0])
print("Number of images of woman only: ", woman_only_pics.shape[0])

Number of same ids in both categories:  1272
Number of images of man:  17514
Number of images of man only:  14489
Number of images of woman:  9047
Number of images of woman only:  6661


In [9]:
# Drop images where the person is relatively small
man_pics = man_only_pics.loc[(man_only_pics['XMax'] - man_only_pics['XMin'] > 0.3)].loc[(man_only_pics['YMax'] - man_only_pics['YMin'] > 0.4)]
print(man_pics.shape)

woman_pics = woman_only_pics.loc[(woman_only_pics['XMax'] - woman_only_pics['XMin'] >= 0.3)].loc[(woman_only_pics['YMax'] - woman_only_pics['YMin'] >= 0.4)]
print(woman_pics.shape)

(5415, 13)
(4187, 13)


In [10]:
# Drop duplicated pictures (e.g. where multiple man occurs on 1 picture)

man_pics = man_pics.drop_duplicates(subset='ImageID')
print(man_pics.shape)

woman_pics = woman_pics.drop_duplicates(subset='ImageID')
print(woman_pics.shape)

man_pics.head()

(4602, 13)
(3730, 13)


Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
98,00094d5e8b3cb038,freeform,/m/04yx4,1,0.0,0.422093,0.0,1.0,0,1,0,0,0
146,000b6e5bfa3e2a34,freeform,/m/04yx4,1,0.0,0.38786,0.174753,0.966402,1,1,0,0,0
270,0013bd1e79c796c0,freeform,/m/04yx4,1,0.614717,0.938003,0.286016,0.72454,1,0,0,0,0
306,0017d9757c6f4793,freeform,/m/04yx4,1,0.0,0.468446,0.133566,1.0,0,1,0,0,0
383,001a695ad732152b,freeform,/m/04yx4,1,0.0,0.999994,0.013495,1.0,0,1,0,0,0


In [11]:
# take first 600
man_pics = man_pics.iloc[:600]
woman_pics = woman_pics.iloc[:600]
man_pics.shape

(600, 13)

In [12]:
image_links = pd.read_csv(r"D:\Downloads\test-images.csv", sep=",", index_col=None)
image_links.head()

Unnamed: 0,image_name,image_url
0,a51796b80649e29b.jpg,https://requestor-proxy.figure-eight.com/figur...
1,4c0a784065138975.jpg,https://requestor-proxy.figure-eight.com/figur...
2,63d0fdd8bdcdfc05.jpg,https://requestor-proxy.figure-eight.com/figur...
3,6dca9eeb498da6bc.jpg,https://requestor-proxy.figure-eight.com/figur...
4,fda39fca8c499806.jpg,https://requestor-proxy.figure-eight.com/figur...


In [13]:
!mkdir man
!mkdir woman

In [14]:
import requests
import sys

for id, pic in enumerate(man_pics["ImageID"]):
    filename = "man\\" + str(id+1) + ".jpg"
    sys.stdout.write("\rDownloading: " + filename)
    sys.stdout.flush()
    
    url = image_links.loc[image_links['image_name'] == pic+".jpg"].iloc[0][1]
    content  = requests.get(url).content
    
    with open(filename, 'wb') as f:
        f.write(content)
                       
for id, pic in enumerate(woman_pics["ImageID"]):
    sys.stdout.write("\rDownloading: " + filename)
    sys.stdout.flush()
    filename = "woman\\" + str(id+1) + ".jpg"
    url = image_links.loc[image_links['image_name'] == pic+".jpg"].iloc[0][1]
    with open(filename, 'wb') as f:
        f.write(requests.get(url).content)

Downloading: woman\599.jpg

In [15]:
!mkdir "train"
!mkdir "train\\man"
!mkdir "train\\woman"

!mkdir "valid"
!mkdir "valid\\man"
!mkdir "valid\\woman"

!mkdir "test"
!mkdir "test\\man"
!mkdir "test\\woman"

In [16]:
for i in range(1,401):
    filename = str(i) + '.jpg';
    f = "man\\"+filename
    t = "train\\man\\"+filename
    !move $f $t
    f = "woman\\"+filename
    t = "train\\woman\\"+filename
    !move $f $t
    
for i in range(401,501):
    filename = str(i) + '.jpg';
    f = "man\\"+filename
    t = "valid\\man\\"+filename
    !move $f $t
    f = "woman\\"+filename
    t = "valid\\woman\\"+filename
    !move $f $t
    
for i in range(501,601):
    filename = str(i) + '.jpg';
    f = "man\\"+filename
    t = "test\\man\\"+filename
    !move $f $t
    f = "woman\\"+filename
    t = "test\\woman\\"+filename
    !move $f $t

        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.


        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.


        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.


        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
        1 file(s) moved.
