In [1]:
import os
import json
import pandas as pd
from multiprocessing import Pool
from PIL import Image

In [2]:
# Directory containing the caption TSVs and the download-report TSVs loaded by the cells below.
input_path = '../datasets/concap/download_tool/'
# Root directory that each report row's 'fname' is joined onto when process() opens the image.
image_path = '../datasets/concap/images/'

In [3]:
# Validation captions: tab-separated, first line treated as data (no header row),
# with the two columns labelled explicitly.
df1 = pd.read_csv(
    os.path.join(input_path, 'Validation_GCC-1.1.0-Validation.tsv'),
    delimiter='\t',
    header=None,
    names=['caption', 'url'],
)

In [4]:
# Validation download report: tab-separated, first line treated as data (no header row).
# One record per attempted download; columns are labelled explicitly here.
df2 = pd.read_csv(
    os.path.join(input_path, 'downloaded_validation_report.tsv'),
    delimiter='\t',
    header=None,
    names=['fname', 'split', 'ftype', 'fsize', 'response', 'url'],
)

In [5]:
# Attach the captions to the report. pandas aligns df1['caption'] to df2's row labels,
# and both frames carry the default 0..n-1 index, so row i of the caption file is paired
# with row i of the report.
# NOTE(review): assumes both files list entries in the same order - confirm, e.g. by
# comparing the two 'url' columns.
df2 = df2.assign(caption=df1['caption'])

In [6]:
# Distinct MIME types recorded in the report (NaN marks rows with no recorded type).
pd.unique(df2['ftype'])

array(['image/jpeg', nan, 'image/gif', 'text/html', 'image/png',
       'inode/x-empty', 'text/plain', 'text/xml'], dtype=object)

In [7]:
def process(row):
    """Probe one download-report record and attach the image's dimensions.

    Only records whose HTTP ``response`` is 200 and whose ``ftype`` is a string
    containing ``'image'`` are opened; every other record is returned untouched.

    Args:
        row: dict for a single report row; reads the keys ``'response'``,
            ``'ftype'`` and ``'fname'``.

    Returns:
        The same dict object, mutated in place:
        * ``'width'`` and ``'height'`` are added when the image decodes;
        * ``'exception'`` (the error message as a string) is added when opening
          or decoding fails;
        * nothing is added for records that were not eligible.
    """
    mime = row['ftype']
    # A missing MIME type arrives from pandas as float NaN. Testing `'image' in nan`
    # raises TypeError outside the try-block, which would abort the whole pool run,
    # so non-string types are treated as "not an image".
    if row['response'] != 200 or not isinstance(mime, str) or 'image' not in mime:
        return row

    try:
        # The context manager guarantees the file handle is released even when
        # decoding fails part-way (truncated / corrupt files).
        with Image.open(os.path.join(image_path, row['fname'])) as img:
            # convert() forces a full decode, so broken files raise here instead
            # of being accepted on the strength of their header alone.
            width, height = img.convert('RGB').size
        row['width'] = width
        row['height'] = height
    except Exception as ex:
        # Best-effort: record the failure on the row and let the caller decide.
        row['exception'] = str(ex)
    return row

In [8]:
# Run process() over every validation report row on 32 worker processes, handing rows
# to the workers in chunks of 10. Results arrive in completion order, not input order.
items = []
with Pool(32) as pool:
    for i, row in enumerate(
        pool.imap_unordered(process, df2.to_dict(orient='records'), chunksize=10)
    ):
        if 'width' in row:
            # Image decoded: keep the row for the JSON export.
            items.append(row)
        elif 'exception' in row:
            # Image could not be decoded: show the offending row and drop it.
            print(row)
        # Progress line every 10,000 results (skipping the very first one).
        if i > 0 and i % 10000 == 0:
            print(f'{i // 10000} out of {df2.index.size // 10000}')



1 out of 1
{'fname': 'validation/3044488379', 'split': 'validation', 'ftype': 'image/jpeg', 'fsize': 48826.0, 'response': 200, 'url': 'http://www.themalaymailonline.com/uploads/articles/2016-04/subaru_forester_1504.jpg', 'caption': 'automobile model -- picture courtesy', 'exception': 'image file is truncated (2 bytes not processed)'}


In [9]:
# Number of validation rows whose image decoded successfully.
len(items)

13565

In [10]:
# Persist the collected validation rows (those that gained 'width'/'height') as one JSON array.
with open('../datasets/concap/validation.json', mode='w') as sink:
    json.dump(items, fp=sink)

In [11]:
# Training captions: tab-separated, first line treated as data (no header row).
# Rebinds df1, replacing the validation frame loaded earlier.
df1 = pd.read_csv(
    os.path.join(input_path, 'Train_GCC-training.tsv'),
    delimiter='\t',
    header=None,
    names=['caption', 'url'],
)

In [12]:
# Training download report: tab-separated, first line treated as data (no header row).
# Rebinds df2, replacing the validation report loaded earlier.
df2 = pd.read_csv(
    os.path.join(input_path, 'downloaded_training_report.tsv'),
    delimiter='\t',
    header=None,
    names=['fname', 'split', 'ftype', 'fsize', 'response', 'url'],
)

In [13]:
# Attach the training captions to the report. pandas aligns df1['caption'] to df2's row
# labels, and both frames carry the default 0..n-1 index, so rows are paired by position.
# NOTE(review): assumes both files list entries in the same order - confirm, e.g. by
# comparing the two 'url' columns.
df2 = df2.assign(caption=df1['caption'])

In [14]:
# Run process() over every training report row on 32 worker processes, handing rows
# to the workers in chunks of 10. Results arrive in completion order, not input order.
items = []
with Pool(32) as pool:
    for i, row in enumerate(
        pool.imap_unordered(process, df2.to_dict(orient='records'), chunksize=10)
    ):
        if 'width' in row:
            # Image decoded: keep the row for the JSON export.
            items.append(row)
        elif 'exception' in row:
            # Image could not be decoded: show the offending row and drop it.
            print(row)
        # Progress line every 100,000 results (skipping the very first one).
        if i > 0 and i % 100000 == 0:
            print(f'{i // 100000} out of {df2.index.size // 100000}')



1 out of 331


  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/1425666352', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 1736374.0, 'response': 200, 'url': 'https://www.amherst.edu/system/files/styles/original/private/media/IMG_6564.JPG', 'caption': 'a view of the medieval market', 'exception': 'image file is truncated (51 bytes not processed)'}
{'fname': 'training/2968766090', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 144577.0, 'response': 200, 'url': 'http://saltwaterlife.co.uk/wp-content/uploads/2013/09/IMG_9549-001.jpg', 'caption': 'a large white shark , turns on the approach to the camera', 'exception': 'image file is truncated (13 bytes not processed)'}
{'fname': 'training/978654963', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 1835008.0, 'response': 200, 'url': 'http://inktank.fi/wp-content/uploads/2017/07/finnish-midsummer-sunset-young-man-throwing-rock-lake-landscape-markus-watkins-10.jpg', 'caption': 'young man throwing a stone into a lake , during sunset .', 'exception': 'image file is tru



{'fname': 'training/1266021156', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 302359.0, 'response': 200, 'url': 'https://www.uab.edu/studentmedia/images/SoccerVSHoward.jpg', 'caption': 'person dribbles the ball down field during the game against person .', 'exception': 'image file is truncated (11 bytes not processed)'}




3 out of 331
{'fname': 'training/905201635', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 65062.0, 'response': 200, 'url': 'http://danq.me/wp-content/uploads/2013/03/chalet-dinnertable.jpg', 'caption': "person joins person , person , person and animal at the chalet 's dining table .", 'exception': 'image file is truncated (1 bytes not processed)'}




{'fname': 'training/2759667851', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 1417965.0, 'response': 200, 'url': 'https://ivacationonline.com/uploads/pictures/8/2470.jpg', 'caption': 'play and relax on white - sand beaches and enjoy the clear turquoise waters .', 'exception': 'broken data stream when reading image file'}
{'fname': 'training/2951585730', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 448000.0, 'response': 200, 'url': 'https://www.nationalparks.org/sites/default/files/styles/wysiwyg_full_1x/public/acadianp_ocean-drive_kristi-rugg_nps.jpg?itok=u8VKRm06', 'caption': 'blue water of the ocean along the rocky and tree - lined coast', 'exception': 'image file is truncated (182 bytes not processed)'}
4 out of 331




{'fname': 'training/2376948524', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 65065.0, 'response': 200, 'url': 'http://danq.me/wp-content/uploads/2013/05/oranges2.jpg', 'caption': 'orange fruit and blossom hanging from the tree .', 'exception': 'image file is truncated (2 bytes not processed)'}


  "Palette images with Transparency expressed in bytes should be "
  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/2914276465', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 50688.0, 'response': 200, 'url': 'http://picturesofmoney.org/wp-content/uploads/2013/06/holding-a-stack-of-money-1024x602.jpg', 'caption': 'holding a stack of money', 'exception': 'image file is truncated (55 bytes not processed)'}


  " Skipping tag %s" % (size, len(data), tag)


5 out of 331
{'fname': 'training/3558814056', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 48857.0, 'response': 200, 'url': 'http://www.rhinorealty.com/listing_images/10433_29.jpg', 'caption': 'the huge sandy beach is a sure hit in the summer time with kids & parents alike !', 'exception': 'image file is truncated (50 bytes not processed)'}




6 out of 331




{'fname': 'training/1071777789', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 1690434.0, 'response': 200, 'url': 'http://www.reporternewspapers.net/wp-content/uploads/2014/11/BH-Gravel-Road-0647.jpg', 'caption': 'person is perhaps the last remaining gravel road .', 'exception': 'image file is truncated (8 bytes not processed)'}




{'fname': 'training/3094200542', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 101831.0, 'response': 200, 'url': 'https://thingdoer.com/things-to-do-photos/1/9/1/6/19161_l.jpg', 'caption': 'under the water , but just a couple of metres away from me .', 'exception': 'image file is truncated (19 bytes not processed)'}


  " Skipping tag %s" % (size, len(data), tag)


7 out of 331




{'fname': 'training/2604036423', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 215001.0, 'response': 200, 'url': 'https://thebrentwoodspirit.com/wp-content/uploads/DSC_3476-Kate-Gilmore-1st.jpg', 'caption': 'artist settles a ball in the first half against person .', 'exception': 'image file is truncated (58 bytes not processed)'}


  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/4189281855', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 65064.0, 'response': 200, 'url': 'https://danq.me/wp-content/uploads/2014/05/IMG_20140415_214903-e1400525230203.jpg', 'caption': 'temporary tattoo showing the seven of diamonds', 'exception': 'image file is truncated (2 bytes not processed)'}




8 out of 331


  "Image appears to be a malformed MPO file, it will be "


{'fname': 'training/3568605622', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 901393.0, 'response': 200, 'url': 'http://www.brixtonblog.com/wp-content/uploads/2014/11/20141119_200412-1.jpg', 'caption': 'the facade of the new hotel', 'exception': 'image file is truncated (34 bytes not processed)'}




9 out of 331


  "Palette images with Transparency expressed in bytes should be "
  " Skipping tag %s" % (size, len(data), tag)
  " Skipping tag %s" % (size, len(data), tag)
  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/4229568492', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 17127.0, 'response': 200, 'url': 'https://thumb7.shutterstock.com/display_pic_with_logo/164332994/684356974/stock-vector-black-silhouette-of-man-and-woman-the-guy-on-his-knee-and-holding-present-vector-illustration-684356974.jpg', 'caption': 'black silhouette of man and woman .', 'exception': 'image file is truncated (1 bytes not processed)'}




{'fname': 'training/3091106531', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 505391.0, 'response': 200, 'url': 'https://ineedpetcare.com/wp-content/uploads/2016/04/DSC_0638-e1462565761893.jpg', 'caption': 'person and dogs , person and film character , out for a walk on the trail .', 'exception': 'image file is truncated (17 bytes not processed)'}
10 out of 331


  % (tag, len(values))


{'fname': 'training/3142694740', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 1148954.0, 'response': 200, 'url': 'http://journalismiziko.dut.ac.za/wp-content/uploads/2013/09/SAM_0201.jpg', 'caption': 'the male actors on stage', 'exception': "cannot identify image file '../datasets/concap/images/training/3142694740'"}
{'fname': 'training/24780253', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 8311813.0, 'response': 200, 'url': 'https://www.amherst.edu/system/files/styles/original/private/20151024_AC_MUS_ChoralSociety_045_0.jpg', 'caption': 'a large crowd of students , standing with young alumni', 'exception': 'image file is truncated (12 bytes not processed)'}




11 out of 331
{'fname': 'training/4032174245', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 16596.0, 'response': 200, 'url': 'http://billcannandesign.com/oxford_properties/quaker_tower_escalator.jpg', 'caption': 'the symbol as seen through the clear glass alongside an escalator .', 'exception': "cannot identify image file '../datasets/concap/images/training/4032174245'"}
12 out of 331


  " Skipping tag %s" % (size, len(data), tag)
  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/2397536508', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 532357.0, 'response': 200, 'url': 'https://media.gettyimages.com/photos/singer-ryan-key-of-the-american-band-yellowcard-performs-live-during-picture-id467079494', 'caption': 'artist of the band artist performs live during a concert .', 'exception': 'image file is truncated (153 bytes not processed)'}
{'fname': 'training/3062176357', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 663429.0, 'response': 200, 'url': 'https://media.gettyimages.com/photos/the-green-bay-packers-celebrate-defeating-the-pittsburgh-steelers-31-picture-id108870644', 'caption': 'sports team celebrate defeating sports team to 25 .', 'exception': 'image file is truncated (119 bytes not processed)'}
{'fname': 'training/323194557', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 139141.0, 'response': 200, 'url': 'https://media.gettyimages.com/photos/actress-amy-poehler-attend-the-world-premiere-of-baby-mama-during-the-pict

  " Skipping tag %s" % (size, len(data), tag)


14 out of 331


  " Skipping tag %s" % (size, len(data), tag)
  "Palette images with Transparency expressed in bytes should be "
  " Skipping tag %s" % (size, len(data), tag)


15 out of 331




{'fname': 'training/732799096', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 457971.0, 'response': 200, 'url': 'http://www.svsugarshack.com/wp-content/uploads/2017/11/img_6030-1.jpg', 'caption': "top of hill looking down we walked down to the city which is on the water 's edge .", 'exception': 'image file is truncated (34 bytes not processed)'}




{'fname': 'training/3576358632', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 261834.0, 'response': 200, 'url': 'https://www.uab.edu/studentmedia/images/Tim_Alexander_lecture.jpg', 'caption': 'a packed audience attends lecture by artist .', 'exception': 'image file is truncated (12 bytes not processed)'}




16 out of 331
{'fname': 'training/3084249414', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 65536.0, 'response': 200, 'url': 'https://wdaly.com/wp-content/uploads/2016/07/image12.jpg', 'caption': 'person at the fashion show', 'exception': 'image file is truncated (2 bytes not processed)'}
{'fname': 'training/3870355154', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 49152.0, 'response': 200, 'url': 'http://www.lakesnwoods.com/images/Hendri1.jpg', 'caption': 'topographic map of the area', 'exception': 'image file is truncated (33 bytes not processed)'}
{'fname': 'training/1735079079', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 41925.0, 'response': 200, 'url': 'http://www.francois5.com/wp-content/uploads/2014/08/IMG_0043.jpg', 'caption': 'the rocks were covered with mussels', 'exception': 'image file is truncated (21 bytes not processed)'}




17 out of 331


  " Skipping tag %s" % (size, len(data), tag)


18 out of 331


  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/550350905', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 1186170.0, 'response': 200, 'url': 'http://www.themreport.com/wp-content/uploads/2017/10/Feature3_Art-1.jpg', 'caption': 'show them the money : assistance', 'exception': 'image file is truncated (1 bytes not processed)'}
{'fname': 'training/1798278921', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 48858.0, 'response': 200, 'url': 'http://www.rhinorealty.com/listing_images/10433_27.jpg', 'caption': 'private just steps from your home , the private access is used by residents only .', 'exception': 'image file is truncated (1 bytes not processed)'}




19 out of 331
{'fname': 'training/440777445', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 298467.0, 'response': 200, 'url': 'http://www.svsugarshack.com/wp-content/uploads/2017/11/img_6033-1.jpg', 'caption': 'person , i climbed on this beautiful wall .', 'exception': 'image file is truncated (12 bytes not processed)'}


  "Palette images with Transparency expressed in bytes should be "


20 out of 331




{'fname': 'training/1510695374', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 2982389.0, 'response': 200, 'url': 'https://dvorakexpeditions.com/wp-content/uploads/2015/09/Bear-eating-salmon.jpg', 'caption': 'late summer and early fall is when bears spend foraging for food .', 'exception': 'image file is truncated (157 bytes not processed)'}
{'fname': 'training/1083307463', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 7086958.0, 'response': 200, 'url': 'https://www.fbi.gov/image-repository/poster-2.jpg', 'caption': 'poster showing an eye over currency .', 'exception': 'image file is truncated (29 bytes not processed)'}
21 out of 331




{'fname': 'training/2725814572', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 369306.0, 'response': 200, 'url': 'http://discovernwnebraska.com/wp-content/uploads/2015/05/MariSandozHeritageCenter_13sm.jpg', 'caption': 'life - size bronze sculpture of novelist in front .', 'exception': 'image file is truncated (28 bytes not processed)'}
{'fname': 'training/3124228140', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 48898.0, 'response': 200, 'url': 'https://www.iamexpat.nl/sites/default/files/styles/article--full/public/fireworks-netherlands-amsterdam-rotterdam.jpg?itok=3dTnk8_P', 'caption': "new year 's eve firework shows", 'exception': 'image file is truncated (1 bytes not processed)'}
{'fname': 'training/1555053500', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 38692.0, 'response': 200, 'url': 'http://www.filipiknow.net/wp-content/uploads/2016/07/Manila-Hotel-in-the-1940s.jpg', 'caption': 'a city in the 1940s', 'exception': "cannot identify image file '../datasets

  "Palette images with Transparency expressed in bytes should be "


23 out of 331




24 out of 331


  "Palette images with Transparency expressed in bytes should be "
  " Skipping tag %s" % (size, len(data), tag)
  " Skipping tag %s" % (size, len(data), tag)


25 out of 331


  " Skipping tag %s" % (size, len(data), tag)
  " Skipping tag %s" % (size, len(data), tag)
  "Palette images with Transparency expressed in bytes should be "


{'fname': 'training/422230645', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 1266010.0, 'response': 200, 'url': 'https://www.thetrentonline.com/wp-content/uploads/2015/05/DSCN0168.jpg', 'caption': 'a cross section of the crowd present at the event', 'exception': 'image file is truncated (9 bytes not processed)'}
26 out of 331
{'fname': 'training/3201137933', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 293446.0, 'response': 200, 'url': 'http://jrmanley.com/wp-content/uploads/2015/07/Sunset-1600x989.jpg', 'caption': 'sunset viewed from the air', 'exception': 'image file is truncated (0 bytes not processed)'}




{'fname': 'training/2397480660', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 419311.0, 'response': 200, 'url': 'https://www.kimberlyeinmo.com/wp-content/uploads/2008/09/fishy-entree.jpg', 'caption': 'the fresh trout arrive at the table', 'exception': 'image file is truncated (25 bytes not processed)'}


  " Skipping tag %s" % (size, len(data), tag)


27 out of 331
{'fname': 'training/3220139284', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 98664.0, 'response': 200, 'url': 'https://thumb9.shutterstock.com/display_pic_with_logo/1462748/778072672/stock-photo-ho-chi-minh-portrait-on-vietnam-dong-banknote-closeup-macro-vietnamese-communist-778072672.jpg', 'caption': 'portrait on dong banknote closeup macro revolutionary leader and first president .', 'exception': 'image file is truncated (0 bytes not processed)'}
{'fname': 'training/3074471426', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 2261910.0, 'response': 200, 'url': 'https://whartonveterinaryclinic.com/wp-content/uploads/2014/08/IMG_0063-e1421259218820.jpg', 'caption': 'another view of industry and door to private patio .', 'exception': 'image file is truncated (40 bytes not processed)'}
28 out of 331
{'fname': 'training/1583096100', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 324097.0, 'response': 200, 'url': 'https://www.uab.edu/studentmedia/images/so



29 out of 331




{'fname': 'training/4191298342', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 28357.0, 'response': 200, 'url': 'http://images.slideplayer.com/18/6137298/slides/slide_10.jpg', 'caption': 'anxious to learn cars hold wide appeal already thinking about driving many have already driven a car', 'exception': 'image file is truncated (25 bytes not processed)'}




{'fname': 'training/1029301548', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 3084288.0, 'response': 200, 'url': 'https://www.hoart.cam.ac.uk/images/collections-displays-and-the-agency-of-objects-poster', 'caption': 'collections , displays and the agency of poster', 'exception': 'image file is truncated (12 bytes not processed)'}
{'fname': 'training/3941969589', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 637483.0, 'response': 200, 'url': 'http://www.svsugarshack.com/wp-content/uploads/2017/07/img_4429.jpg', 'caption': 'person to keep unwanted visitor .', 'exception': 'image file is truncated (47 bytes not processed)'}


  " Skipping tag %s" % (size, len(data), tag)


30 out of 331


  " Skipping tag %s" % (size, len(data), tag)
  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/1420450289', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 122959.0, 'response': 200, 'url': 'http://www.svsugarshack.com/wp-content/uploads/2017/11/img_6036.jpg', 'caption': 'loved this wooden door against the tattered building .', 'exception': 'image file is truncated (47 bytes not processed)'}
{'fname': 'training/703056226', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 139141.0, 'response': 200, 'url': 'http://media.gettyimages.com/photos/cheerleader-of-rhein-fire-dances-during-the-nfl-europe-match-between-picture-id52778052', 'caption': 'cheerleader of dances during the match .', 'exception': 'image file is truncated (118 bytes not processed)'}
31 out of 331


  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/583585540', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 48857.0, 'response': 200, 'url': 'http://www.rhinorealty.com/listing_images/10433_2.jpg', 'caption': 'huge windows bring in lots of natural light to the quiet front living room .', 'exception': 'image file is truncated (27 bytes not processed)'}




{'fname': 'training/1550982875', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 492962.0, 'response': 200, 'url': 'http://www.svsugarshack.com/wp-content/uploads/2017/11/img_6282.jpg', 'caption': 'beautiful colored buildings inside the old city .', 'exception': 'image file is truncated (14 bytes not processed)'}


  " Skipping tag %s" % (size, len(data), tag)


{'fname': 'training/1559786538', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 270213.0, 'response': 200, 'url': 'https://media.gettyimages.com/photos/canberra-united-players-celebrate-after-a-goal-by-michelle-heyman-picture-id460081584', 'caption': 'players celebrate after a goal by athlete during the round match .', 'exception': 'image file is truncated (33 bytes not processed)'}
{'fname': 'training/192392429', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 139141.0, 'response': 200, 'url': 'https://media.gettyimages.com/photos/director-barry-jenkins-kiki-layne-and-diego-luna-on-the-movie-set-of-picture-id871913798', 'caption': 'actor , person and actor on the movie set of .', 'exception': 'image file is truncated (94 bytes not processed)'}
32 out of 331




{'fname': 'training/432150964', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 16691.0, 'response': 200, 'url': 'http://anacortes.org/wp-content/uploads/2015/03/orca-1-632x444.jpg', 'caption': 'biological species in the waters around a city', 'exception': 'image file is truncated (1 bytes not processed)'}
{'fname': 'training/1500833971', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 149579.0, 'response': 200, 'url': 'http://morganhilllife.com/wp-content/uploads/2016/03/Coyote-Ridge-March-18-21-1.jpg', 'caption': 'years in the making will be permanently preserved', 'exception': 'image file is truncated (10 bytes not processed)'}
{'fname': 'training/1001986659', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 39824.0, 'response': 200, 'url': 'https://thumb10.shutterstock.com/display_pic_with_logo/641824/492631483/stock-photo-boy-and-his-dog-playing-in-a-pile-of-autumn-leaves-492631483.jpg', 'caption': 'boy and his dog playing in a pile of autumn leaves', 'exception': 'i



{'fname': 'training/3965891445', 'split': 'training', 'ftype': 'image/jpeg', 'fsize': 32768.0, 'response': 200, 'url': 'https://thumb7.shutterstock.com/display_pic_with_logo/162468982/757093999/stock-vector-card-invitation-cover-template-design-line-art-background-abstract-geometric-pattern-with-place-757093999.jpg', 'caption': 'card , invitation , cover template design , line art background .', 'exception': 'image file is truncated (1 bytes not processed)'}


  " Skipping tag %s" % (size, len(data), tag)


33 out of 331


In [15]:
# Number of training rows whose image decoded successfully.
len(items)

2749293

In [16]:
# Persist the collected training rows (those that gained 'width'/'height') as one JSON array.
with open('../datasets/concap/train.json', mode='w') as sink:
    json.dump(items, fp=sink)