# Exploratory Data Analysis

In [14]:
print(f'this notebook is about the {data["type"]} data only - http://visionandlanguage.net/VIST/dataset.html')

this notebook is about the story-in-sequence data only - http://visionandlanguage.net/VIST/dataset.html


### Summary:

Issues - 

1. The number of albums mentioned everywhere: `10,117`. Actual number of albums: **8031 + 998 + 1011 = 10040** from the data
2. The number of images mentioned everywhere: `210,819`. Actual number of images: **167528 + 21048 + 21075 = 209,651**
3. The json files contain `albums` information (that contains a field called `photos` indicating the number of images associated with that album). This is wrong. In actuality there are no albums in all 3 data splits (test, train, valid) that contain less than 10 images. But the json files show albums with less than 10 count. Example album-id - `616890`. More analysis below.

Conclusions - 

1. **All albums have exactly 5 corresponding stories**
2. **All stories have exactly 5 sentences**
3. **An image can be part of atmost 5 stories** (meaning it can have 5 annotations)
4. The only attribute in **order** is `story_id`, which can be used throughout the code to make other required fields (listed below), fall in place
5. **All albums have atleast 10 images**

Fields in scope - 

`photo_flickr_id`, `text`, `worker_arranged_photo_order` (just used to order the 5 images in memory at a given time step)

_____________________________________________________________________

In [2]:
import pandas as pd
import json

### Training data

In [3]:
with open('dataset/sis/train.story-in-sequence.json') as json_data:
    data = json.load(json_data)

data.keys()

dict_keys(['images', 'info', 'albums', 'type', 'annotations'])

In [4]:
actual_images = !find dataset/images/train/ -type f | sed 's/.*\.//' | sort | uniq -c
total_images = !find dataset/images/train/ -type f | wc -l
print(f'{actual_images[2].strip()}, {actual_images[3].strip()}, {actual_images[1].strip()}, {actual_images[0].strip().split(" ")[0]} other types')
print(f' = {total_images[0]} images in total')

167091 jpg, 341 png, 97 gif, 1 other types
 = 167530 images in total


In [5]:
image_jsons = data['images']
album_jsons = data['albums']
annotation_jsons = data['annotations']

In [6]:
image_jsons[0]

{'album_id': '72157605930515606',
 'datetaken': '2008-06-30 07:33:43',
 'id': '2626977325',
 'latitude': '34.414760',
 'license': '5',
 'longitude': '-119.692879',
 'media': 'photo',
 'secret': 'bec0ff3596',
 'tags': 'santabarbara',
 'text': '',
 'title': 'Moreton Bay Fig 1877',
 'url_o': 'https://farm3.staticflickr.com/2078/2626977325_2b7696990c_o.jpg'}

In [7]:
gif_jsons = []
jpg_jsons = []
png_jsons = []
other_jsons = []
for image_json in image_jsons:
    if 'url_o' in image_json:
        image_type = image_json['url_o']
    else:
        image_type = image_json['url_m']
    
    if image_type.find('.jpg') != -1:
        jpg_jsons.append(image_json)
    elif image_type.find('.png') != -1:
        png_jsons.append(image_json)
    elif image_type.find('.gif') != -1:
        gif_jsons.append(image_json)
    else:
        other_jsons.append(image_json)

In [8]:
print(f'{len(jpg_jsons)} jpg, {len(png_jsons)} png, {len(gif_jsons)} gif, {len(other_jsons)} other types')
print(f' = {len(jpg_jsons) + len(png_jsons) + len(gif_jsons) + len(other_jsons)} images in total')
print(f' = {len(image_jsons)} entries')

167090 jpg, 341 png, 97 gif, 0 other types
 = 167528 images in total
 = 167528 entries


In [9]:
len(album_jsons)

8031

In [10]:
album_jsons[0]

{'date_create': '1214980972',
 'date_update': '1432330952',
 'description': 'Believed to be the largest Moreton Bay Fig Tree in the united States',
 'farm': '4',
 'id': '72157605930515606',
 'owner': '12806074@N08',
 'photos': '13',
 'primary': '2626985925',
 'secret': '98149cd59b',
 'server': '3104',
 'title': 'Moreton Bay Fig 1877',
 'vist_label': '4th_of_july'}

In [11]:
total_images = 0
for album_json in album_jsons:
    total_images += int(album_json['photos'])
    
print(f'images count retreived from albums = {total_images}')

images count retreived from albums = 166994


In [12]:
album_ids = set()
for image_json in image_jsons:
    album_ids.add(image_json['album_id'])
    
print(f'albums count retreived from images = {len(album_ids)}')

albums count retreived from images = 8031


In [13]:
album_2_images = {}
for image_json in image_jsons:
    album_id = image_json['album_id']
    image_id = image_json['id']
    if album_id in album_2_images:
        album_2_images[album_id].append(image_id)
    else:
        album_2_images[album_id] = list()
        album_2_images[album_id].append(image_id)

print('mapping between albums and images created')

mapping between albums and images created


In [14]:
print(len(album_2_images), type(album_2_images))

8031 <class 'dict'>


In [15]:
crappy_albums = 0
for album_id, image_ids in album_2_images.items():
    actual_num_image_ids = 0
    for album_json in album_jsons:
        if album_id == album_json['id']:
            actual_num_image_ids = int(album_json['photos'])
            break

    if len(image_ids) != actual_num_image_ids:
#         print(f'some issue with album id {album_id}')
#         print(f'{actual_num_image_ids}, {len(image_ids)}')
        crappy_albums += 1

print(crappy_albums)

75


In [16]:
for image_json in image_jsons:
    if image_json['album_id'] == '616890':
        print(image_json)

{'datetaken': '2005-07-03 20:58:52', 'license': '3', 'title': 'Twilight 4th of July', 'text': '', 'album_id': '616890', 'longitude': '0', 'url_o': 'https://farm1.staticflickr.com/21/26987844_e1184ecdcb_o.jpg', 'secret': 'e1184ecdcb', 'media': 'photo', 'latitude': '0', 'id': '26987844', 'tags': 'boats eastriver manhattan newyorkcity night ny nyc sky water'}
{'datetaken': '2005-07-04 19:49:31', 'license': '3', 'title': 'Eastside', 'text': '', 'album_id': '616890', 'longitude': '0', 'url_o': 'https://farm1.staticflickr.com/22/27239809_0339d99fae_o.jpg', 'secret': '0339d99fae', 'media': 'photo', 'latitude': '0', 'id': '27239809', 'tags': 'skyline eastriver manhattan newyorkcity ny nyc sky architecture'}
{'datetaken': '2005-07-04 19:49:38', 'license': '3', 'title': 'Waiting for fireworks', 'text': '', 'album_id': '616890', 'longitude': '0', 'url_o': 'https://farm1.staticflickr.com/23/26961455_3f64820d7d_o.jpg', 'secret': '3f64820d7d', 'media': 'photo', 'latitude': '0', 'id': '26961455', 'ta

In [17]:
album_2_images['616890']

['26987844',
 '27239809',
 '26961455',
 '26961454',
 '26987845',
 '26987846',
 '27239806',
 '27239807',
 '27239808',
 '28191593',
 '28191591',
 '28191592']

In [18]:
for album_json in album_jsons:
    if album_json['id'] == '616890':
        print(album_json)
        break

{'description': '', 'title': 'Fourth of July 2005, NYC', 'farm': '1', 'date_update': '1296925051', 'primary': '28191593', 'server': '23', 'date_create': '616890', 'photos': '3', 'secret': '1fdc019791', 'owner': '20768188@N00', 'vist_label': '4th_of_july', 'id': '616890'}


In [19]:
for annotation_json in annotation_jsons:
    if annotation_json[0]['photo_flickr_id'] == '26961454':
        print(annotation_json)

[{'original_text': 'We could see the launching pad where the fireworks were to be shot off, we had the best viewing spot possible.', 'album_id': '616890', 'photo_flickr_id': '26961454', 'setting': 'first-2-pick-and-tell', 'worker_id': 'FZWJK4PVSM1P98G', 'story_id': '420', 'tier': 'story-in-sequence', 'worker_arranged_photo_order': 1, 'text': 'we could see the launching pad where the fireworks were to be shot off , we had the best viewing spot possible .', 'storylet_id': '2101'}]
[{'original_text': 'It was July 4th and we were heading into the city to see fireworks.', 'album_id': '616890', 'photo_flickr_id': '26961454', 'setting': 'first-2-pick-and-tell', 'worker_id': 'WGQHJXJ41NTZIWA', 'story_id': '421', 'tier': 'story-in-sequence', 'worker_arranged_photo_order': 0, 'text': 'it was july 4th and we were heading into the city to see fireworks .', 'storylet_id': '2105'}]
[{'original_text': 'We got our spots and watched the barges while the sun set.', 'album_id': '616890', 'photo_flickr_id

In [104]:
bad_news = False
for album_json in album_jsons:
    album_id = album_json['id']
    stories = set()
    for annotation_json in annotation_jsons:
        if annotation_json[0]['album_id'] == album_id:
            stories.add(annotation_json[0]['story_id'])
    
    if len(stories) != 5:
        print(f'album {album_id} does not have 5 stories')
        bad_news = True

if not bad_news:
    print('all albums have 5 stories. life is even!')

all albums have 5 stories. life is even!


In [106]:
stories_2_sentences = {}

for annotation_json in annotation_jsons:
    if annotation_json[0]['story_id'] in stories_2_sentences:
        stories_2_sentences[annotation_json[0]['story_id']] += 1
    else:
        stories_2_sentences[annotation_json[0]['story_id']] = 1
        
sent_count = set(stories_2_sentences.values())
if len(sent_count) == 1:
    print(f'all stories have {list(sent_count)[0]} sentences. life is still even!')
else:
    print('something bumpy')

all stories have 5 sentences. life is still even!


In [116]:
bad_news = False
for album_id, image_ids in album_2_images.items():
    if len(image_ids) < 10:
        print('something bumpy', album_id)
        bad_news = True
        break

if not bad_news:
    print('all albums have atleast 10 images. life is still even!')

all albums have atleast 10 images. life is still even!


In [42]:
bad_news = False
for album_json in album_jsons:
    _album_id = album_json['id']
    story_2_imgs = {}
    for annotation_json in annotation_jsons:
        album_id = annotation_json[0]['album_id']
        if album_id == _album_id:
            story_id = annotation_json[0]['story_id']
            image_id = annotation_json[0]['photo_flickr_id']
        
            if story_id in story_2_imgs:
                story_2_imgs[story_id].append(image_id)
            else:
                story_2_imgs[story_id] = list()
                story_2_imgs[story_id].append(image_id)
    t = list(story_2_imgs.values())
    s = set(frozenset(i) for i in t)
#     print(s)
    if len(s) != 2 and len(s) != 1:
        print(f'there is an issue with {_album_id} album')
        bad_news = True
        break

if not bad_news:
    print('life is good')

life is good


In [21]:
print(pd.DataFrame.from_dict(story_2_imgs))

        420       421       422       423       424
0  27239809  26961454  27239809  27239809  26961454
1  26961454  26961455  26961454  26961454  26961455
2  26987845  27239809  26987845  26987845  27239809
3  27239808  26987845  27239808  27239808  26987845
4  28191592  27239806  28191592  28191592  27239806


In [39]:
story_2_imgs = {}
for annotation_json in annotation_jsons:
        album_id = annotation_json[0]['album_id']
        if album_id == '72157594190376306':
            story_id = annotation_json[0]['story_id']
            image_id = annotation_json[0]['photo_flickr_id']
        
            if story_id in story_2_imgs:
                story_2_imgs[story_id].append(image_id)
            else:
                story_2_imgs[story_id] = list()
                story_2_imgs[story_id].append(image_id)
                
print(pd.DataFrame.from_dict(story_2_imgs))

       30365      30366      30367      30368      30369
0  182023789  182023793  182023789  182023789  182023789
1  182023792  182023789  182023792  182023792  182023792
2  182023793  182030838  182023793  182023793  182023793
3  182030838  182023792  182030838  182030838  182030838
4  182023794  182023794  182023794  182023794  182023794


In [41]:
t = list(story_2_imgs.values())
s = set(frozenset(i) for i in t)
if len(set(frozenset(i) for i in t)) != 2:
    print(f'there is an issue with {album_id} album')
len(s)

there is an issue with 72157624875976415 album


1

### Validation data

In [20]:
actual_images = !find dataset/images/validate/ -type f | sed 's/.*\.//' | sort | uniq -c
total_images = !find dataset/images/validate/ -type f | wc -l
print(f'{actual_images[1].strip()}, {actual_images[2].strip()}, {actual_images[0].strip().split(" ")[0]} other types')
print(f' = {total_images[0]} images in total')

21023 jpg, 24 png, 1 other types
 = 21048 images in total


In [21]:
with open('dataset/sis/val.story-in-sequence.json') as json_data:
    data = json.load(json_data)

data.keys()

dict_keys(['images', 'info', 'albums', 'type', 'annotations'])

In [22]:
image_jsons = data['images']
album_jsons = data['albums']
annotation_jsons = data['annotations']

In [23]:
gif_jsons = []
jpg_jsons = []
png_jsons = []
other_jsons = []
for image_json in image_jsons:
    if 'url_o' in image_json:
        image_type = image_json['url_o']
    else:
        image_type = image_json['url_m']
    
    if image_type.find('.jpg') != -1:
        jpg_jsons.append(image_json)
    elif image_type.find('.png') != -1:
        png_jsons.append(image_json)
    elif image_type.find('.gif') != -1:
        gif_jsons.append(image_json)
    else:
        other_jsons.append(image_json)

In [24]:
print(f'{len(jpg_jsons)} jpg, {len(png_jsons)} png, {len(gif_jsons)} gif, {len(other_jsons)} other types')
print(f' = {len(jpg_jsons) + len(png_jsons) + len(gif_jsons) + len(other_jsons)} images in total')
print(f' = {len(image_jsons)} entries')

21023 jpg, 24 png, 1 gif, 0 other types
 = 21048 images in total
 = 21048 entries


In [25]:
len(album_jsons)

998

In [26]:
album_2_images = {}
for image_json in image_jsons:
    album_id = image_json['album_id']
    image_id = image_json['id']
    if album_id in album_2_images:
        album_2_images[album_id].append(image_id)
    else:
        album_2_images[album_id] = list()
        album_2_images[album_id].append(image_id)

print('mapping between albums and images created')

mapping between albums and images created


In [27]:
for album_id, image_ids in album_2_images.items():
    actual_num_image_ids = 0
    for album_json in album_jsons:
        if album_id == album_json['id']:
            actual_num_image_ids = int(album_json['photos'])
            break

    if len(image_ids) != actual_num_image_ids:
#         print(f'some issue with album id {album_id}')
#         print(f'{actual_num_image_ids}, {len(image_ids)}')
        crappy_albums += 1

print(crappy_albums)

80


In [128]:
bad_news = False
for album_json in album_jsons:
    album_id = album_json['id']
    stories = set()
    for annotation_json in annotation_jsons:
        if annotation_json[0]['album_id'] == album_id:
            stories.add(annotation_json[0]['story_id'])
    
    if len(stories) != 5:
        print(f'album {album_id} does not have 5 stories')
        bad_news = True

if not bad_news:
    print('all albums have 5 stories. life is even!')

all albums have 5 stories. life is even!


In [129]:
stories_2_sentences = {}

for annotation_json in annotation_jsons:
    if annotation_json[0]['story_id'] in stories_2_sentences:
        stories_2_sentences[annotation_json[0]['story_id']] += 1
    else:
        stories_2_sentences[annotation_json[0]['story_id']] = 1
        
sent_count = set(stories_2_sentences.values())
if len(sent_count) == 1:
    print(f'all stories have {list(sent_count)[0]} sentences. life is still even!')
else:
    print('something bumpy')

all stories have 5 sentences. life is still even!


In [130]:
bad_news = False
for album_id, image_ids in album_2_images.items():
    if len(image_ids) < 10:
        print('something bumpy', album_id)
        bad_news = True
        break

if not bad_news:
    print('all albums have atleast 10 images. life is still even!')

all albums have atleast 10 images. life is still even!


### Test data

In [28]:
actual_images = !find dataset/images/test/ -type f | sed 's/.*\.//' | sort | uniq -c
total_images = !find dataset/images/test/ -type f | wc -l
print(f'{actual_images[0].strip()}, {actual_images[1].strip()}')
print(f' = {total_images[0]} images in total')

20991 jpg, 84 png
 = 21075 images in total


In [29]:
with open('dataset/sis/test.story-in-sequence.json') as json_data:
    data = json.load(json_data)

data.keys()

dict_keys(['images', 'info', 'albums', 'type', 'annotations'])

In [30]:
image_jsons = data['images']
album_jsons = data['albums']
annotation_jsons = data['annotations']

In [31]:
gif_jsons = []
jpg_jsons = []
png_jsons = []
other_jsons = []
for image_json in image_jsons:
    if 'url_o' in image_json:
        image_type = image_json['url_o']
    else:
        image_type = image_json['url_m']
    
    if image_type.find('.jpg') != -1:
        jpg_jsons.append(image_json)
    elif image_type.find('.png') != -1:
        png_jsons.append(image_json)
    elif image_type.find('.gif') != -1:
        gif_jsons.append(image_json)
    else:
        other_jsons.append(image_json)

In [32]:
print(f'{len(jpg_jsons)} jpg, {len(png_jsons)} png, {len(gif_jsons)} gif, {len(other_jsons)} other types')
print(f' = {len(jpg_jsons) + len(png_jsons) + len(gif_jsons) + len(other_jsons)} images in total')
print(f' = {len(image_jsons)} entries')

20991 jpg, 84 png, 0 gif, 0 other types
 = 21075 images in total
 = 21075 entries


In [33]:
len(album_jsons)

1011

In [34]:
album_2_images = {}
for image_json in image_jsons:
    album_id = image_json['album_id']
    image_id = image_json['id']
    if album_id in album_2_images:
        album_2_images[album_id].append(image_id)
    else:
        album_2_images[album_id] = list()
        album_2_images[album_id].append(image_id)

print('mapping between albums and images created')

mapping between albums and images created


In [35]:
for album_id, image_ids in album_2_images.items():
    actual_num_image_ids = 0
    for album_json in album_jsons:
        if album_id == album_json['id']:
            actual_num_image_ids = int(album_json['photos'])
            break

    if len(image_ids) != actual_num_image_ids:
#         print(f'some issue with album id {album_id}')
#         print(f'{actual_num_image_ids}, {len(image_ids)}')
        crappy_albums += 1

print(crappy_albums)

87


In [141]:
bad_news = False
for album_json in album_jsons:
    album_id = album_json['id']
    stories = set()
    for annotation_json in annotation_jsons:
        if annotation_json[0]['album_id'] == album_id:
            stories.add(annotation_json[0]['story_id'])
    
    if len(stories) != 5:
        print(f'album {album_id} does not have 5 stories')
        bad_news = True

if not bad_news:
    print('all albums have 5 stories. life is even!')

all albums have 5 stories. life is even!


In [142]:
stories_2_sentences = {}

for annotation_json in annotation_jsons:
    if annotation_json[0]['story_id'] in stories_2_sentences:
        stories_2_sentences[annotation_json[0]['story_id']] += 1
    else:
        stories_2_sentences[annotation_json[0]['story_id']] = 1
        
sent_count = set(stories_2_sentences.values())
if len(sent_count) == 1:
    print(f'all stories have {list(sent_count)[0]} sentences. life is still even!')
else:
    print('something bumpy')

all stories have 5 sentences. life is still even!


In [143]:
bad_news = False
for album_id, image_ids in album_2_images.items():
    if len(image_ids) < 10:
        print('something bumpy', album_id)
        bad_news = True
        break

if not bad_news:
    print('all albums have atleast 10 images. life is still even!')

all albums have atleast 10 images. life is still even!


In [16]:
import json
import os
import shutil

with open('dataset/sis/val.story-in-sequence.json') as json_data:
    data = json.load(json_data)
data.keys()

annotation_jsons = data['annotations']

print(len(annotation_jsons) / 5, type(annotation_jsons))

train_annotations = annotation_jsons[0:100]
len(train_annotations)

train_img_paths = ['/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/' + annotation_json[0]['photo_flickr_id'] for annotation_json in train_annotations]
len(train_img_paths)

4990.0 <class 'list'>


100

In [17]:
destination = '/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/val_sample/'
bad = 0
for file_name in train_img_paths:
    if os.path.isfile(file_name + '.jpg'):
        print(file_name + '.jpg')
        shutil.copy(file_name + '.jpg', destination)
    elif os.path.isfile(file_name + '.png'):
        print(file_name + '.png')
        shutil.copy(file_name + '.png', destination)
    else:
        bad += 1

print(bad)

/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/693397887.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/695160730.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/694227508.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/693397865.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/694227468.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/694227468.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/694227412.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/694227488.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/images/validate/694227508.jpg
/scratch/cs/imagedb/picsom/databases/vist/download/vist-baseline/dataset/