# Data Subset Creator

_By: Carlos Andres Pizarroso Troncoso_

This notebook contains the code for creating a subset from Mapillary Vistas Dataset.

## 1. Creating a subset of the main dataset

First, importing the necessary libraries 

In [1]:
# Importing necessary libraries

from __future__ import print_function
import json
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import shutil
import os

In [2]:
# Defining paths

data_path = 'Data'
train_path = data_path + '/train'
validation_path = data_path + '/validation'
testing_path = data_path + '/testing'
labels_path = '/labels'
images_path = '/images'

# Folders for subdataset (just performed once)

os.mkdir(data_path)
os.mkdir(train_path)
os.mkdir(validation_path)
os.mkdir(testing_path)
os.mkdir(train_path + labels_path)
os.mkdir(validation_path + labels_path)
os.mkdir(train_path + images_path)
os.mkdir(validation_path + images_path)
os.mkdir(testing_path + images_path)

#images_path = "C:/Users/Carlos/Documents/Carlos/Comenius University/Master's Degree/Diploma/Practice/Mapillary-vistas-dataset/mapillary_vistas_v1/training/images"
#labels_path = "C:/Users/Carlos/Documents/Carlos/Comenius University/Master's Degree/Diploma/Practice/Mapillary-vistas-dataset/mapillary_vistas_v1/training/labels"
original_path = "C:/Users/Carlos/Documents/Carlos/Comenius University/Master's Degree/Diploma/Practice/Mapillary-vistas-dataset/mapillary_vistas_v1/"

### 1.1 Training

In this section, images which contain only billboards will be first selected.

Mapillary Vistas Dataset contains several classes, including billboards. Billboards have a label ID of 35 (used below).

We increased the size of the training set from **2000** images to **5000** images.

In [3]:
# Getting the names of the image files from the original training path (Mapillary Vistas)

training_images_names = os.listdir(original_path + "training/images")
n = len(training_images_names)
for i in range(0, n):
    training_images_names[i] = training_images_names[i][0:training_images_names[i].find('.')]
print(training_images_names[25])
print(n)

-4x3QnUx01ZfYnZPVE9DNA
18000


In [4]:
# Getting the ids of images that contain billboards on them

i = 0
size = 5000 # size of the desired subset (due to storage limitations, we start by selecting 5000 images with billboards)
subset_names_train = []
while (len(subset_names_train) < size):
    image_id = training_images_names[i]
    # read in config file
    with open(original_path + 'config.json') as config_file:
        config = json.load(config_file)
        
    labels = config['labels']
    label_dic = {}
    for label_id, label in enumerate(labels):
        label_dic[label['name']] = label_id

    # set up paths for every image
    image_path = original_path + "training/images/{}.jpg".format(image_id)
    panoptic_path = original_path + "training/panoptic/{}.png".format(image_id)

    # load images
    base_image = Image.open(image_path)
    panoptic_image = Image.open(panoptic_path)
        
    # PANOPTIC HANDLING

    # read in panoptic file
    with open(original_path + "training/panoptic/panoptic_2018.json") as panoptic_file:
        panoptic = json.load(panoptic_file)
    
   # convert annotation infos to image_id indexed dictionary
    panoptic_per_image_id = {}
    for annotation in panoptic["annotations"]:
        panoptic_per_image_id[annotation["image_id"]] = annotation
        
    # convert category infos to category_id indexed dictionary
    panoptic_category_per_id = {}
    for category in panoptic["categories"]:
        panoptic_category_per_id[category["id"]] = category
        
    # convert segment infos to segment id indexed dictionary
    example_panoptic = panoptic_per_image_id[image_id]
    example_segments = {}
    for segment_info in example_panoptic["segments_info"]:
        example_segments[segment_info["id"]] = segment_info
    
    panoptic_array = np.array(panoptic_image).astype(np.uint32)
    panoptic_id_array = panoptic_array[:,:,0] + (2**8)*panoptic_array[:,:,1] + (2**16)*panoptic_array[:,:,2]
    panoptic_ids_from_image = np.unique(panoptic_id_array)
    
    for panoptic_id in panoptic_ids_from_image:
        if panoptic_id == 0:
            # void image areas don't have segments
            continue
        segment_info = example_segments[panoptic_id]
        category = panoptic_category_per_id[segment_info["category_id"]]
        
        img_label = label_dic[category['supercategory']]
        if img_label == 35: # <- Billboards label ID, this filters images which contain billboards only
            print(image_id)
            subset_names_train.append(image_id)
            break
        example_segments.pop(panoptic_id)
    i += 1

--vWKSR3Rh8quTfK4AuKOQ
--WOpVBuHlCygAUADkttpg
-0C1J9CvgFP4BTVLXNeNZA
-0GQmYRienNVqEKiQ0Mkyw
-0Q9X3KqBrhdlGzap-YToQ
-0QdotcnWZMgDLHELjMeoQ
-0xySwkm6plG4nznHLQcHg
-1Z28Rrc91syhk-y8Bp_cg
-2H8gOJp_LEvus2Y6pCIOA
-3BbEwDFdZ6kvDrbqhNShw
-3bnLdeHkonpmCJj3a1X_w
-3Ce66lOpbY2j4Tgngr-dQ
-3qDI4rcmarHQRV9FDKfeA
-3u6bh9pGFOtClGfzvMqqg
-3WWvMk6i1YgKC9ECG2HEA
-4Kw4AJqOzZG8q5le7bGlQ
-4oZ0SAVEd4ijmony0t1Zw
-4SR45KuNoLQJJMNstPwRg
-4VHR0_p6lfAJ3Id2RXozg
-4x3QnUx01ZfYnZPVE9DNA
-5Bq-gd3iHCFkWdCjjfWlA
-5g8sem4SFSvxctV9MeyIw
-5n8RNkx2tWpD7w-ZAZXZA
-6-WLs7O63-6cwx-8adk7g
-60QYVrLRufeAjbRN-z79g
-6ie-JRSyamuIYcRWTWZVw
-6kI3IofoH9vQ7kM6ake2w
-6tNQe10dgH3ByWafilXHw
-6zd1w0cgRgijS2srqkMUA
-7kDWbpzqmrQkBx39Z_Jgg
-7njPpiHROkOjrBeSzNiow
-8d4l3xicTRHZLze9Wuc4g
-8d9PIE2D6OjzC3Q_fEUmA
-8EClSWhRM4FuiWdu-kbQg
-8LfeDEADhyLcKBvhRWrCw
-9bvNvBWAX3xjKfJzS4vTA
-9s41VI1tVoMhjTwKUMbnw
-A3duJBYuHa2VtOTTsG1dQ
-ahL3qORtev5ZHXrB5NYIA
-AMluJ4RmAbAN0L9AfwFUg
-asE-g-kqSEenUxYHbzx_w
-AUC1oIt8HUa9TmU4bd0Dg
-aZs4bu6mgdPQs7Xi8xUNQ
-b3TQ65aEVg

0tFtDheFTiIIEPConav6nQ
0tKkbIJoljSQK1BoCDtH-Q
0tLCUB-LGK4JPmAKS5iIvw
0TNpCYNPOo1c5Vq-ztxd0Q
0tysbnnKZGOk-SyzweASaA
0TZ4wg5QDLO5vBbRV2y3ug
0u3FaC5WLGq_mvLKxM14fQ
0UFny3cOYkXq9J8RhLRMhw
0UH-3IizefRkP7x22udblA
0uHsL_U_sFalxsGiVpGwoA
0Utp00q-Fbq5qTX3LkOLcA
0UTXK-0rPSKXTenLg9ibeg
0v2sWrXwmQlJ80vK91Iqig
0V8P2kRlDYQWt76HuysCzg
0VbycD0Rm50flulLp2K80w
0Vfi038p2TdtPQbExh1J8Q
0vflPRzTq-FYQ0J3my2vGQ
0vp2KPqyqKxmvUR1RwhTmw
0Vsc8UPOf6xf-IKjZSG8wA
0vtS3t4ny4B2oF_nTfoUKQ
0vu-v1EUl7jIOMDsRsVsgA
0vVDvzrjPDPRzd2f_3Zl2w
0vXBHpB8tyywvQdFZb-1DA
0vXrloJm2nLIFA6natO_9w
0VZpPpnAb4F5JwCqL8sE5A
0w0eYx0kvL01xPxQ7-Zg6Q
0wCxQ_bV7onu-MtYzAaNwQ
0wNf0XgEMaB3oTOievFmgg
0wnu4N1HEvr5YqNPyVppvA
0wpMqijLv03vbJxPdaliHA
0WsBRyikf-HoBsRxJmgo6g
0X9wVunorwSPozpNUsDzYQ
0XMa8LJMOIF914PZXxEo9A
0XSaKjXc4DVoyPDYZJ-AzQ
0xxhqQ5vvmt5LBqDLPoFRw
0XzKZQy1Pw7zawsoVWB8vg
0xzvBStoXJNcp9Az_lD23Q
0Y6iyzSFXrFemKXupjt5Vw
0YNU9HxEXdaEf5T-TogA2Q
0yW6ODx0kurrFi-Dpe_XfQ
0Yx-EqG0Edv2xInd-Tpz1w
0yyUNtEVKlqHAuUO35zpHw
0Z1IdCkf1TP_7-E1OJSPyg
0Z77CZke1w_

2dH-WFUdE2s7TjgxYqN3ZQ
2dm06RRLp8igUCkWodhW9w
2dNZiSKr5SImVwNDq7-ZcA
2dP8ZTMC_fopdAWuhleNeg
2DTFX41vHqR1hwLy8kO6-A
2dZYhGkK9HVUmsZXneiR1w
2E23KMd-54oA6o3tge6eog
2EG-9qqAB81t0bjWF0iS9Q
2eipfatrXp3SeFPFGmhnLQ
2ELdGHsj7plRZEdkTH-dtQ
2el_nfBOJUFx-AC3gar0Iw
2emRJGyu96_oBCPKjlSTjg
2F347mUCp7cpMquLeeszGw
2Fcdo9pR7WbzOzCjr2Y5tg
2FDHorKcw2iwa3tR4dAFaQ
2FFtGtDdKPVRrJ-etecCcw
2FGOx9Neauy6D6w3uaAHJw
2fMA2H6ACjBh46FSPNyc8Q
2Fw7WgweQu75FnLVpR3qUA
2g6hVFDKFYpuGbK-QDmeew
2g73-N2IfHBYbLtoV4zn2w
2GIn2wFcMnSBTHH-zgBCng
2GJpm7rF-G1JY-p8CbQelg
2gKghFDkHSkJ9Gt7xqCH_Q
2GLKEXlO1JWyVfUDVoPMmA
2gLRdpU0HWZRxMywQMopjw
2GvDoNykO_gquJwWxR6Txw
2h2jSQz0moHv1g4oeLjzxQ
2HgOY0lgFzgDVwvvXjizGw
2HI5pi-fthAKQ-ZG59HYAA
2HIgWImJ7NbfGCbpPc5p-g
2hjcrGPPRq8irEsY2u-z5Q
2HoDIGydEeVC4d_8SOZw_Q
2hTjd8A8-e_ddWYG95SEzg
2hVkch2_vnjRn7GCbqWL6g
2hvoNWKlQkfrC7_jNx80FQ
2HZrl6WfgOhiSOMNYIo-UQ
2I62jcKbCg-mKJSuUZw7tw
2ICzaJfCybFCijETv1SJSw
2Ih7LIsjKQcXnwrOLyA6_A
2IHA_OHJ7mCnyk9d0831ew
2IjYPj2FSw2E3cdW3s65fQ
2IlOKz52L9UVIXvFW46OKg
2IozVYfLo74

3PaYhRUNqal4Pg2cyiLbXQ
3pBA_SQM-qTVgOxwZEYNjg
3pcgY0nCOwr1tTrR5qhkhA
3piTjXPqVrA8pFv7WWUwMg
3pi_x7iQstaNiev1mDfFdg
3PPsZLcDWI_RLNYvaAtEdw
3PsrujMDDYMve_hS_McRmA
3pwWY1hStuDHu5LLKD-NIw
3PZpBha4oNlwpGbc_HwDNA
3qFTJUNuDEfMPkOL7nZM0w
3QI-3YMMVE8rrwSc6rzx9A
3qiuZjdkbF76ip_MuKDlDg
3Qj_yZ-DBfidchtenHRdqQ
3qmDcHvPMvvQKOayYzCQCA
3qorfjHSLAvcN-dGaweMeg
3qPIV_9Sw1G42o-9qWLp1Q
3Q_o5LxVvmF6dRKT6VoyzQ
3r-t5ES3YalN8XU3yIfAzw
3r1bF3zTxw0RxJi4rNEfXA
3R4ZN_vA1qb5vjUhhcNGuQ
3r6xrzVrFaFMIDhT-gA-jw
3RB7MwMIeBcNusnU5bdl-g
3RClcfxHI27-nboPlHFuyA
3ReL18otnnfil5iLO1Ik7g
3Rl1pqh2OPUxVgCCp1zTjg
3RmIGRw0ClOYNaBo6dqGsw
3rmyf6PCGxW_Th64v8Q7Jg
3rPZ5klezNiUzS17X1BNAA
3Rqwl0WUBMW_hjj8wrunBg
3RRB6_YQbPmCWUJyPfIg-A
3RYJup1xVWPIMOk0TmvOyg
3s-Yt-qSl_P8vFcHLOAvFg
3S08KTAe8c4mFPnLWNhCeg
3S7_XWRmw5UFHxJKsLxA9A
3SdHpC8nwI4yCwF0Punhgg
3SgnxEl1WNYZqEJBV5E7gg
3shL_96xI949QC_g3ZYjSQ
3sJQM2GyHjsTOwmGgwhMTQ
3sOrQHMIeF9O0iSK7tPSuQ
3sQQEQLp5RZH1__X10Tk_A
3SYDZNRQyfi_8GiDa-6mig
3syXRsVTxpQKJ3bnGvTJyg
3TBbRZaM4Fssc5bu5lIGmg
3tC7ocA9XDG

506DEiUGGGkkY_QyDFV1MQ
50PDhqRBboi82elhfCVYpw
50sRa7oqsEktFfkYlgKqlg
50wFrzYFHPLEayIFOB41uw
50x-R_qN8J9h8b71yoM_vA
51FxR5AqweU7bDomEFtcnQ
51KV0T1-9jl58vOEF31ABQ
51lsQzsi_NiI739C6u96QQ
51RM_N9F5gGELxTCClEtOQ
528c0WcpfM2xC5DoWIFpeg
528ttqLd_waG2cttM0IBiQ
52D7w0tsSfAINI7TID-a5A
52Oz_LGAb1gZdTu1iqV4RA
52zdGPUoerz3Y_9t1ug1Ew
53-TnzoH9sgBc2W4hBRJGw
537QZnJ61xU2owhY_QHH-w
53MKAlSpzaumE4mpiVqT3g
53MQp-LugqqwMg7MER8Mpg
53OusFiU9k3FCSUAVBfHXg
5403CnaMPUprlWt5cWcf0A
54A7HnuGOYCmBE2RJpVszQ
54d1oFGJ_NC3asYhDlNt1g
54D_2MR3IgKLTXq-svf53Q
54EY5nrMZeICC8YxJL7qgQ
54jYr5Ea2hkTM3q6FWSWlg
54QyQsDeYSViz4vtiZydXA
54Y1KPMEBFrypvsOTr2Uag
54_7Yd8MSLVWOPaHXwpmOA
55ZXMxAYliqO-e41Kab6vw
569h5Li65AXmuBfzPgJ8jg
56hHEZ4hVzCtLF8yc8PqYw
56V3O7uBwUAx-I5nuMmfIg
57B36cWTm2AVpS28KnRO2A
57J-AmMRKjixLOT14FJnpA
57M3kL9OEN8BTE_d1TNCbA
57o5zE24ib1ZM41bte7qmg
57ROFdTxrsiNWEsUwCPpeA
57wlV1bx42C0sqxIx3OVpg
58NFVbyl6eejkjuAgt3RPQ
58Og4VJX--hFy92YZnqBzA
58tPSYNWxKth0J-dWsWxAg
59-5LWsN8M_ldUPzXT7vdg
596O8-JFviUWCSkBoJTEbg
5983SP4pi6f

6N1SQ9w4KsDngHylqcIM4A
6ndFjZGI0uNvokIyFc-K9A
6Nm-4-p-KhwwDVmeyDYgUg
6nM2h_LiRGGAB7Yb9ewo9w
6N_TYV6p4lgNoUp_lOXQOw
6obAf9CRQh_dBFHPAIiRFQ
6omcYu73eKatDdWUviP6mA
6ou-7BD9Cq9VRN0y-EOtdA
6OvQ8DLz41jJKPYq-NIQYw
6P1drkbxytYGuq5LhNh1dw
6P22LnbXgSdRVlt_VPmRiQ
6p7vJ6Xf0oozl6W3tmvhlg
6pdjvVg41IkkOUiLBw5P8Q
6pntMpLo0belhyeJHK6LXQ
6pzEsL3jT2ouoXED2Vevjw
6pZNlBC8S77punWJcm-zcg
6P_OPXDkbfpYhn9mtlcCsQ
6q2_pdsbQVHreaGouefyTQ
6Qm6I1uXA5Y8jgBdD9K_4w
6QNvFFRyuUhUBiWUgNoObw
6Qu86KUPbdWzNH4UqGO4Ww
6qZs4lf1XSGKNL280sSTVw
6R0B7xfXbebdlzkTtmR3QA
6R3QO9IiEEOd3CBLLTMu3Q
6r9vmmFcsBRrsItv7ZZXjg
6rdT6Mvh4bPMy7yZH6nlHg
6RfJZ9pqbkODziho1qMD7w
6RGRki5dig4GsiVuuH1VEg
6RhFKAOYs-izRGlWKQ6wXQ
6RNBI2GrQnZ22GpKqJdJUg
6rsAJDpNbB_VmH71qo9riQ
6rsnJyxDTX9haI0tO5HzMA
6RuBccaC0aJiS8xjUMJQGw
6RVeJGEfsDzlPd_RrmrLAQ
6RyBQgl2JLmdTmBvOZLo9g
6s6ZY0HhjfDHX3vWswBgaA
6SbeUsOescgM_QOcrkrNcw
6sbUWtSxM-oDXmPQJLYDOQ
6SdtUh7zE5LVvhs0FhXk9Q
6seYSUojTBUcvZJeHL2L2g
6SKcZ1Xpf5-nmLw5joBPYQ
6somCp4u7IZC2et7T_zcqA
6sOPn6bzqBZOAdzJcdJgRQ
6TzJlesX5OL

8aMuP-66xU0IGy8xl4o6Bg
8AS0l693AVIydnD_en_3tA
8aumxSldiQvA7rjJdI2R9w
8az8dNENYkKrbS0qjbMdCw
8B8cjHZgmECFbK342k0I0w
8bDedSf1Of6dJF3mGhxW8w
8Bt35G6BzpxA6VAVjD0xfw
8BV5FtKXHwgZBSFY9Bfc5g
8CDvLD5PsMVCspofF83bCQ
8CfzEhWiUKj_AZifeCmj-g
8cGCxVj03tJuZpvilksvzA
8CiPhmiqSQ7VXw8NXryahA
8cn5fzb36IyvT8OrMgYmGw
8cnR1U4fvmBVUCbERo5bUA
8CoC_9QGwaALfBiqeBDiMA
8C_sdTyjsakCkW2ec3RSJA
8D4EZDv8ndZeIuW5RcwoXg
8dcqHOkqiT9hFQEspn6XSw
8dFWN5zg88K_UvY1Nd9FuQ
8DhXZYYUVsskrW36cZ65vQ
8DKdj5UsSeD9U6lQ8p6Isw
8DluhBQQLgh21Ysy4hrpew
8DOzUVzqycHQgRGk0c0tgw
8Ds06T94LYhGWAObxHlC8A
8duGXFe_UrHQYy8_HNZMGw
8E-Z1xazTLNFPihhBDoG4Q
8e3k3ur_uBcV_d5FemJljA
8ehGfIYOq6qWoKSGzBIfdg
8ehKe7sfQs6QnYcpovU_LA
8elMC15QdDxkyjfUMDF_dw
8eOaUDyHgvY9DLLfCp91zw
8EuHdajiysc_AOH7H0cJHQ
8EuLq9FNDaaPUwW_mYnZNg
8eXjixYba2pmyuo63QS-xQ
8EZk21J4AZ6Zgg04y_8kig
8F7N_Vd6RVpJfJVlhlV02g
8fak7TSxHUlMBSH-YMeTXQ
8fapm1PBtdKHLd619IPC6w
8FfF-SlPTyMPCD900dscng
8FgB8vgfwVi-yRum0M35Nw
8frjLi6B4H8LQ25Ddhbkjw
8FtCmzuXDj8_DKyHTgg4Sw
8FtpnEsUTQmu6QLpS2vNjA
8f_Hj3yXxyt

9w_9tcmRXPOW5DuT-HEirw
9xLFWMI4N-v3RkIQzSAuTg
9xrS7d0EerODpD9k0n6Nxg
9x_H73oPdS9F_KsBRIC9cg
9Y0UaVSMQaNJUABPz87OJg
9y4NdKX15Dj_EtsylkarKw
9YafdoFo7eAad5u9NBJT1g
9YcviOim__11_oq5xRYR3g
9yez5EB8br87aEgnR_RHqg
9yfizcfToCui2oDtypC9qg
9yfn9T_hIFfaP5oPNVCd7g
9Yl-ue315oprmPG0dUpxng
9yVV2J9kJR6l_cNBW55jHw
9Y_leRSoTJuaS_tEJal5sA
9Z5GYBVvlZNZlvJ1bqZHdg
9Z5SIbdxdcvIpeQ7L0P0BA
9zjB0FqThJ4i6ZttKNbobg
9zReHTPZvcWK5lrvHF_oeQ
9ZxDGIHO6Ki-_OJ8VLdf5w
9ZYrvatwApKvERkeH6Q7_A
9_eWNxN97fsZFxrNKfLQ8w
9_hTW45gM9ZRBGHE4dFDWg
9_Yw4z913Oje909LKwbBAQ
a-9IYUqEgG0OFUlwUqOtyA
A-BA6O5BBG7AsciMn1N56A
A-BciZJvtSMOM2SpOGHb3g
A-m7oXGI8b4_Lz0MrIUjxA
A-vPFEri34xtgt4oszsDPg
a-zTY6nOk0Rr4hsWJtnTEw
a08Cr9Njn9DUTg-2fZZFUw
A0LaC3Rx1xgA-YMRc8NOpg
A0oNOKx5Ee8ZT7VYUiNGgA
a0vGGO5MIsQW1K_-sulhAg
a0_WoN7oiFP5mJhDO2Vhsw
a1e1BGjv1xKE8qeSPyB2mQ
a1FtDJvJ25qT6BTtfraANw
a1j4LovHN0xBXhkUuQcqOg
a1kBLgRPhKF-utMwetOllg
A1LXc9fujrwkzv17SnsVxA
a1Mt8iRS9Z-xBlqkGaB-yA
A23c46KmeV8VWPHeIxgmkA
a2hJXEB5GToAPC8HK2BYzg
a2n-xCGvqyZ2BZp7VKrGeA
a32yL3ZhyRE

AuRY45U0KXYNj2i4sjhsbA
aUw213FO38Az31E7s2dp6g
auZ4ZxNT3mGPk35x82Hx_w
AV43s6OBAf6ehrzQ0Istlg
av8ofnqIzk3h-oTWgXHTaA
aVf14S2H8BZN5I9GI7eTPg
avJcmjoucaYKBCUHn6-pTw
AVKUrv7-rAu7hqqBzSZlNQ
AvsbUqF_XKYSeLxTWcEoSg
avtyF7bCk6iqjssj540QtQ
aVUme2kp5ZjSEUeLkXTE_g
aVuSPwmImhhe01o_hJm7IA
Avv43l6bU2HTEVbBsgathA
aVVgN9KVGOB1iYDy8o_HUQ
aW2gLt8zrvcu0Q5-LzlckQ
aW2gnxZd-Gh_D8ygOuB3CA
AwdTFNtoT1a-c7bG2R8eFA
AwEI2ef_Qt7skMGAaefVAg
aWEyQV-Kcepv_YEgHQg1FA
aWFu1Pqa4E6zDld_fSaRyg
AWJGriMGpGD0YXCPDDH4dA
awoindXH3e88CUu276W41w
aWtNgIBMpLpIvfESgMu86A
AWVaLnACMY9lklu7JZelKA
aX75iflFCVTYmuR-WE8a9w
AXCJZb0hmMJXH63kHBrOdw
AXCXuA5b-pXNIT22DyPg0w
AXFPQeF7mmqbux7S95Kw0Q
AxGBoEJ77OUNNyKi29TdVA
AxGI44fQi4vd2Js9ehFMYA
AxMnz7b-Ne9xGQAMxkpYiA
AXmy2bUFY31Myc0fCGuh2w
axnzFvmcebKcpt4_vysp_A
Axo71eGe4m02SpwZ6Rmg3g
axoZk2H7wGAxhI9KGgR-Ow
AxUo8-sp6fZLLrs9B4RbnQ
AXzNxhiQayQpw_CwLGvAcg
AY4z_xbRcYiw2OGvQJZZOw
aycY4PShXkmEGFCgW4-ZGQ
AyFgP79d7ctTkBu3Ww_DDw
aygghkJIicXNCtuYoviOnQ
AyHfo2ljITho5vibL6bcsA
Ayi3WMxXuc6BYNrYxJlzOA
ayIQ4RbKZLQ

bRoJvJ9aBXB1sIfxitUaRQ
bRZuxa_XsxY0gFypy0fTcA
Bs-WiVP4BUczMoTe8JI2Xw
bS06_LyxIs9sZ_VtUqSLnw
BS2M5aIDo94K11oLLKQK6w
bSAmObY3_5wFx3tobnlCUw
Bsb77bPxixxVP2-O6E-blA
bsdnK6Qg8oaIrEyM2unELw
BsdSYwp2dAEtXFARX6LS5w
BsGuDZ4HaY7Yl8YPOa_pHQ
bsh63gvB7d-x-NRvh7lF6Q
BsiTE4dUYqLiYa4csVlpfA
Bsj7HjPg-s4qoanAaAH0XQ
Bsozrxr387DdA2YwI4OcKw
bSpxbQtGYTTxewR75LNjYA
BsTF0HXyEGNjYh_yTvEYeQ
bsvO0puN-QB_I-jws1IHJQ
bsX6_0dN2JJo_oTL7yInGQ
BSYM17AVAJB7tthJCbuumA
bSyoZpDpz5Xal7Dg9Xapuw
bT3ED61rhnnktxRCP0i-jQ
BT3G22uY8-VmRqC0WAwA7Q
BT43i6mz-45yZx_z4UxU-g
bT7XoGrZGGrZkpJGbN9QYA
bTAzJZn9jQLCMwOi24mxuA
Btca5mmB8EJ8Mn7TKTi_EQ
btcHG-5iXB6iEC8iEfWXgw
BTKlsdYAHqOHXhbyG-plxw
bTkx6tbW2q7vkLRcawdq_Q
bTl9vLb2pcguhui8H7BeDA
btLpGBEbbr-lqLHX-NEVuA
BtQ7gZ9hgwFtQ8B44nqDdQ
BTzr0qktDZSVjeEREQrANg
bUD9tumwBthSIExfxdTanw
bUEyIUgqwmPq9G2_IDOYPg
bUg2RVKclumWNjpONDkjEQ
bUG6eTsxFZAVbZdZB3bCSw
buHNdH2-X-DtVD3SZodkgw
bUKUAgOdiRrIMQKcSkZUuQ
BUmIufZngYLb_t_njo4EgA
bUMsgXEqSHxIwR6yleMm4A
BuNbDkBVMntr2eo_ZqUO3g
BunD1DyGUlWYyIcHOMukbQ
BUnXvLBn8v8

cNP2iYntMvf1FGFtPPnpHA
cNQplqsGDS19IiWrfxBi0Q
cnQud6KLh3-TvhMs1NoCtg
CnrznZ0Kt4zKo9QNX0mK8A
Cnu5VfO1O2gDJ7Yi_ZMn2Q
cNUYJLqUSl6N0g2emYIEsQ
cnY6gU01P2FWCOhMoRPqUg
CnZ1nic0fCHdSFu0oK-jCA
cO17FSYaIbsOIGbQooxORA
CO3iYjcbYx8AZiS3JzuAuQ
CODVionM-rbbvQGpJnTUag
COEcj92dPSnDeXS0qW4zPQ
CoG8c9yxubnjMgbm22W95A
coG8Epd2QLYbsB8Hxxw5Ww
CoGjPqLfkPaa75ic0YAQ8Q
colxMhOQELR5Blg59tOh4g
cOQ74aDqziBck0QGLvrS0Q
cosWnDtP17I1a0hZpG4_2g
CoU-Bbo25VJ4xwzdI7XqWQ
CoZAg1yX5KTG32Ik9iGUmg
CoZTwI4-LiFxZxbTagxe9g
CO_HoBu4uVUomqoCF1RPpw
cp2b41bAzpc3pz2hDU3_tA
Cp4KnNq4yUX4g0BAhThh_A
cP8BJjHXO5qMRi3QRTO2iA
Cpg5lWF9Ui0l3PXaH_zVEQ
cpmwMk4nj7zY3CvotoJwPg
cpoYcueDylNn5aKc74JJYA
cPqBZBoL4FYDMVV8KcdSaA
CPxyA3LrzE8DpYuo8wFhsQ
CqADlJ7hz1iCAbfyRIbSLA
CqHCnUa75cedqgh0dS0IPA
CQHgQ-8UJ52gPPfNFYmAcQ
cQIXK-AXnjNBgvZiFiXUhA
CqkkjCbNnWbuJkibzguBSw
cqlbG36vhaz1tbwsDrjKOw
CQMfzvnl4YbCn-5zRFCfNg
cQMy-ZtkFh_TKsUdhLydHA
cQPmxsooUVcafvOdFBKYmw
CQQSAkWYtAzaqJFJjMSSJg
CQSMR8dFe4ZUWpbZxjkp4A
cr12BA6YHzwZfuUhtA1GIQ
cRF2NA_eC721-VTAmWAWSg
crLQjWHwYtd

dIic8LM6DPjvjBns0SAbJg
diKvQF9KzCLG1PJctvpIMg
dIQ6-VPTgS3Vz6_gmqs3Ow
DisHPpXKQ_cwe_Q7Gvs7Cg
diUmbk-LKc6qptinvK3Y5g
DiVfOSxkQMu0IYc6JhJryw
di_5rah7DuywYiXN35evKQ
dj-Oh3dlZIWlAYGUtSNLVQ
dj1gclcf7D3InhUznFLaTQ
Dj6iMpl3XsxwBGLpn7VVUA
dJ7ekpoqtHAGxgYtkRPdDw
dJdO9ONA-3K-f1qP7PHPSg
DjJNPaY1IAybpJb_cVgUDw
djMjFhKXRyYZ45wtE94rWw
dJMpdEnRUuzseEsCU0TsHw
DJNQ7My9HIe-1uoRQuJDEQ
djttVCajnaxj-ntw54qdkQ
DjvcvyNjLUbnbQ3IfenYUw
DjXlcYZI-I0Xn7E9EVGIwA
djZc3SsaV09El9IhsNxBqw
djzjh9tvLSOpN9t5UAa4EQ
Dk1VUNIb4PvdbgRlHLRnyA
dk5Nv6LTxtKGKdM6UQfxSA
dk6EE9EBhXCxqu87ZGCANA
DKBZzApnse6nNHTauQNqeQ
dkdcp6rOL2omNU5n-K50rw
DKGNGabjpz7VkHMvdBGb4Q
DKi-PvQKKzXldDcsid8t1g
dkje1D7u2O0iU1DjqnnyNQ
DkkIQ_rOopb6aH-7zPTu9g
DkkrbOSfldScKzPtUevhrw
dKM4xuG0sifb7_KyQ-WdBA
DkOcZGM4ELLwppKFdX4u1g
dKsYs9qNBPer2bd8XTDmbg
DktCSf0jxSZQ4VmaA8kXUQ
dKU-D3rNQFkTkUxqhPpgpA
DKvwJNK32SX85LmBafyXHQ
DkZttH4bQcdEuvW9T1m4Vg
DL2cNWCaRrdkjCJyBANUkQ
Dl4edecrd5B7WfCBUoKgrQ
DL7oeLvLynzS6FMn6yvdLw
DLbX6WVF0k0pNG0hfRX1_g
DlgennYp4i-2o0hsZFXOLg
DLgfk3LRDYC

EBGlDAt4mZM38DY1iWFzlw
EbgswdL8OP9qAygT7lfnaw
ebHqiIRMu_UjubAqTIWG5w
EBJUpaaay3XHS1-3VvAB-w
EbLr01EJlm2lm7UpEWtY1A
EBR-Taj3xLkQ3lsTbKif7A
eBuito59H3m8YJV_IgVxEw
eBvCSwA_GGa6DzHEg-XXRQ
EBvs5DqN-aaqIIxugpmxsw
EBy44p_dvriusEAf32Y5fg
EB_2sLQRPN9VOcs-5rYjMg
ec0eJSuckZTIbiKkHP4CWg
Ec3kmAAmeZW_Y2yDv3nu5g
EcCrc-hnfzTEAcn02j3VAA
eCCuL9AWqnOnwszrfMlE9Q
EcIb8cdwAuHLhpLm2GvtTw
EcJf4RZ250ywafJKHmCJnA
eCk8zvhCqW_D_mps-Dc1tw
ECkR7zIGZL1vWOUQhyc7rQ
eClHIqN06ODpBZLPwZ-brA
eCM81hLJzECURGKhTVo0jg
EcngvT0KSg3JIof8PSU63A
ECPRr-iuEj7xwQNiY5zmog
ecT4B8cmA2E0PwIqMS8Q6Q
ECx2AvhcDEgFFcc4HXghgA
ed0xne1O1fGwEaO0lC1iag
ed1TbXwBAsdj9w9oSqsXuw
Ed2TH-LNgWEwWZuuXcX16A
ED5PFTJNJmmEY0U-tcyHIg
ED92LBTvz6LeSIwbbOyS8A
eDc-YzabrhA-qT4c-wIl_g
EdDEASrPMjWoBl7aXcP0aA
EDfKrruEc1HO4gtbBczEkw
EDjCJyPuZyxlN-Ps5GtUJw
eDJTMYz9ipTXeYYf_JphEA
edKIjD7-091MD40nE8NN4g
eDLXGB6vYtAyV71JIv6JdQ
edM3qTnV14YmyaoM89GNjw
eDP0JrbQgx7DDdByoC2bgw
EDpdFHm_LAo74abVUHxJrQ
edq_RqUdLZ72eLzI3njFaw
EdRo8u-DBmTItvJGRp4RUA
EdXF_SI6QuD2FhydMGL3wg
EDxtbKjClM_

F-tYSSw3YDjb-hArun1-1A
f-vPm48j9yCMHM0O0ryvHQ
F-_Ez-SuI-97FpaRsyxnbA
F0LaqyNyCre6AQFYrS9dfw
f0n5BiuCYMlqiIFCEo3VbQ
f0tKTPlygersQsU1Q4lf3w
F0y_iE2cZYplZANVWKe4Bg
f1199rDyMYgQDxBxX0HiXw
f19kDiGBHv_6sTXLFjQu5g
F1ath8-JIZ0qr37Tfeblxg
F1CgafYoE2jK-TP_xCzhbg
F1d2q65r7BPf1Mjy5yoLMQ
F1rqUxU2_ZNBkwggl0eolA
F1ui0Dm0geoqk4JkZ0HmOA
f1w74sD4ieFanWaymgk8Qg
F1XLWMKpdXjOwgBpa2NonA
f1_v0AeaoESYtPw8G4CmYw
F2-1NJIoI43fjYOV3cVl2g
F2gqN1RG2zxcpgAAnWQ3Aw
F2si--Dy5rcVY-mVN8GwIg
f3cxMjBr4fwLN2eNAAOyeg
f3pML85cJdKUFh7UKfbGJA
f40s3laPNoi-mZu5nE4zHg
f476k_cROcKVEDFwRiGyxg
F4CJsRWrULArpkRo-Etdcg
F4NV_urEoflu_g_vZg3WCA
f4oQotrOBQVbymAaoM5QYQ
f4Q-Ok9SgvVZ7ZjnGuct1Q
f4RxHBE_BFQOC4rBG2fbog
f4VgqWwV6wBSevf8R4cqBw
F4YJG30T7khe8rKPShnS0A
F5-Yf5q_S3koPWOCr7M_Wg
F5J0KgxlghzLrgYR2RonwA
F5rDJzWNXvVhPj5NK04KzQ
f5veV4Uiq-iTvhI0mpgeUg
f63Hb1Q4WMYRErIByNfLfA
f6Cd1P_iqaj8GCn4_YlVOA
f6G3zfqLuGEgPKNbajljiw
F6jVmKzuJGPNNJKjcj5XpA
F6R87FvEHPd7EHqy_QRoGg
F6S47YW5gTmcVFRum7g-Nw
f6sEHMZ6rJORzUYJ6Jle9A
f7-J3RlgPlU3SiCncVQdEQ
F7FmWx5SlwV

FwNndItJFNt4xI57H_QNZg
fWRWaDCdvVvKuGQy_SL9Lg


In [5]:
# Saving names in a file (the process of the previous cell took 6 hours)

subset_file_train = open('subset_names_train.txt', 'w')
for element in subset_names_train:
    subset_file_train.write(str(element) + '\n')
subset_file_train.close()

In [6]:
len(subset_names_train)

5000

In [4]:
# Read the file where the names of images were saved

loaded_names_train = []
nfile = open('subset_names_train.txt', 'r')
for line in nfile:
    loaded_names_train.append(line[0:line.find('\n')])
nfile.close()

print(len(loaded_names_train))

5000


In [5]:
# Copying the image files from the original dataset that matches the names of the saved ones

n = len(loaded_names_train)
for i in range(0, n):
    image_id = loaded_names_train[i]
    source = original_path + 'training/images/{}.jpg'.format(image_id)
    destination = 'Data/train/images/{}.jpg'.format(image_id)
    shutil.copyfile(source, destination)

### 1.2 Validation

The same process done in for the training subset will be performed for the validation subset.

We increased the size of the validation subset from **500** images to **1000** images.

In [5]:
# Getting the names of the image files from the original validation path (Mapillary Vistas)

validation_images_names = os.listdir(original_path + 'validation/images')
n = len(validation_images_names)
for i in range(0, n):
    validation_images_names[i] = validation_images_names[i][0:validation_images_names[i].find('.')]
print(validation_images_names[25])
print(n)

-YfOWQM-geslv541gdpQPQ
2000


In [13]:
# Getting the images that contain billboards on them (validation)

i = 0
size = 1000 # size of the desired subset
subset_names_val = []
while (len(subset_names_val) < size):
    image_id = validation_images_names[i]
    # read in config file
    with open(original_path + 'config.json') as config_file:
        config = json.load(config_file)
        
    labels = config['labels']
    label_dic = {}
    for label_id, label in enumerate(labels):
        label_dic[label['name']] = label_id

    # set up paths for every image
    image_path = original_path + "validation/images/{}.jpg".format(image_id)
    panoptic_path = original_path + "validation/panoptic/{}.png".format(image_id)

    # load images
    base_image = Image.open(image_path)
    panoptic_image = Image.open(panoptic_path)
        
    # PANOPTIC HANDLING

    # read in panoptic file
    with open(original_path + "validation/panoptic/panoptic_2018.json") as panoptic_file:
        panoptic = json.load(panoptic_file)
    
   # convert annotation infos to image_id indexed dictionary
    panoptic_per_image_id = {}
    for annotation in panoptic["annotations"]:
        panoptic_per_image_id[annotation["image_id"]] = annotation
        
    # convert category infos to category_id indexed dictionary
    panoptic_category_per_id = {}
    for category in panoptic["categories"]:
        panoptic_category_per_id[category["id"]] = category
        
    # convert segment infos to segment id indexed dictionary
    example_panoptic = panoptic_per_image_id[image_id]
    example_segments = {}
    for segment_info in example_panoptic["segments_info"]:
        example_segments[segment_info["id"]] = segment_info
    
    panoptic_array = np.array(panoptic_image).astype(np.uint32)
    panoptic_id_array = panoptic_array[:,:,0] + (2**8)*panoptic_array[:,:,1] + (2**16)*panoptic_array[:,:,2]
    panoptic_ids_from_image = np.unique(panoptic_id_array)
    
    for panoptic_id in panoptic_ids_from_image:
        if panoptic_id == 0:
            # void image areas don't have segments
            continue
        segment_info = example_segments[panoptic_id]
        category = panoptic_category_per_id[segment_info["category_id"]]
        
        img_label = label_dic[category['supercategory']]
        if img_label == 35:
            print(image_id)
            subset_names_val.append(image_id)
            break
        example_segments.pop(panoptic_id)
    i += 1

--BJs76vloEaiH-wppzWNA
-3-MmXdwhyIQhtb4-8NqHQ
-32tlgoydG0ZCyijh8piZQ
-4jzRzGfKmQg8RBNlNqnGQ
-9y4NjcjdoPFMs5wwC7otg
-BqO16ocxK46wM5W-QCE_A
-BYnT4s40fJHAlOumPYbyQ
-C-x3xSPFIEjqbyVC5PRaQ
-cHw9ug6U5pE2c9H1dHUOg
-DXgAnuaSe6TtQ9Hbm3G2A
-F5vhdPopdHyJjiC2hI6xg
-hUGu_wVAkoTya0Gp1cCsA
-jGzo9blhVkb-GgmTnYzHg
-jri_jG7haWRygtOb8FffQ
-lGfFLqh9n-iBv5fZDykSQ
-mqVqUNOImM1OzSXoL44rA
-nBxnPqrlKNcp4oY4U8A5A
-tpmjcVylShgHq6uKICQhQ
-uocR-Vc1j8eeDUF3H866w
-UqWx1Q0an_GDMMJs3bmOw
-xh1oIBGFpKz0J8nOBbeMA
-YfOWQM-geslv541gdpQPQ
-zwvMY-T6zh0_dZf9GW_wA
-_prubhCd03M0BLOHOGGOA
0-g5x1x9t7t6_lmXUPFazw
03x1wf0aRT9QaJnkSGCGMg
08uTEfTU4TFuM3JIFP2VGQ
0B5qssoIEl6LguVQjoRiDQ
0Be5dXzrmHWKPlOF65P-kw
0daE8mWxlKFT8kLBE5f12w
0eS0pdffaI0C3s4IvSbUYA
0hGMKuBXemoKzHp7I_hvxg
0hMunfM7UARtb1ILfDbD-g
0NAkQGTqAfm7LNzziBPBww
0Ngc6NLyTxpHwljinEgCew
0nQvP02UmANiaFb9Vp9vVg
0PkrQqg3IeAtnMH7JWpA0A
0R8Zjkw4z7-1xU_GsPPG_w
0wcA84wys1Ag1X-_L8PreA
0xtR9XkHx-pfm_JEzhq2Ug
0ZjIZyAs4I1HBgQOssdXcw
0_bg9QV4-OGi7e4hLKfnlQ
1-RJMdpt4X1knyrpLYNPAg
1-XkVjt5MGb

bWAXNbYjctScbpl_GXHMAQ
BWP4YGCaqSRElc9_JZQ-0Q
bx2Y3cJu09T7LM8uyQ9e2Q
BXhh-qqP56adAb_SPNnfFg
BXwmoGtx_jBN3Yi1Qk9pMg
byskyaa9lIooL257e00VCw
Bz5-mgMDWk6gMN_gLPwnxg
bZHp5w3WwghNf1Csgv3Bfg
B_L8LjGPPavozYu1i4La7A
c-34l_HLcapVXtlUEyt8iw
c2bG4N-fWC46PQl6i9w1rA
C3fao_-9rHFlTCldCc_neQ
C4quW-KqGLwGsQZ1b7wbrw
C7dkaoE4Fs_xJc-82mt7Dg
c8gxiaoGYO8Qccpt6VEdfA
C8Xyfj2asi5f7pj0Qrjtgw
C9EH53I9eLpBaPjPVIerDg
cAcdmuHwbTw9DLUvK__zkg
CAjAHPaEB0PYNXglida1NA
Caywm3ZvdyWtu-SA1LRh0A
cbkG7UrHcNBARMficcbAXg
CBmLbmdDpYtvT9l8ci6dBw
cbqGDgTW6QKI4NtokPyvnw
cc5dEAyQECBFF9MN3MbdZA
CCDHXNGN2T7LwMys9sp11Q
cCfJCcHXOYPBxF6SziLqng
cdCDRU6cLWCJx9hghz3T_w
cDvtKnuYMm7IXAUpybkkaQ
CEFcHgV8L6OOssaz-rdyBw
CFACScKGKv1azVab3ComDA
CFlfycyyHwUAIJfciovRdw
cFS8nrEaY2DzE9aj2ema-w
cgcD6BOK8WCMGLPvfgWqcg
CKWavXWzmxsBbQZAwKsk1g
clXXNaBwFCAmfi54g0FnEQ
cLYWTcAXc7KuN6uTZ5yFxA
CmIM22rLkCuCUX_VaICE2w
CMimK2BbqvvP71X_JSKvQA
CmTphiGKXuhdujTGy20hCQ
CNjjfRK0bogNdYjx7wxDYw
co0DZw6kSQuj0Ou7Mi-Y_A
CoBAkHgeaj5djbvAhayQ1g
CoMi13trvH0UE03Omj2INQ
cPPYM-UZsRt

jrjoyxuv_5J5mK4-R5odxA
JRvRrXk2JVojKw0jB0pH8w
jrXktbfXkBl0QangXxQcmw
JSCTB43EcE3jcrH2kZk8eA
JSmd7VpS0nXmXob3I-0GBg
jTHuwAuMdl-50usve0zP_A
jupx68Mpq0AI7-Kz304LdQ
juW3wss37XAgPmpKLAHC-w
JV53UNrvoOxyT2xpAQOIpw
jvTtbeW6AawFLsfPQAoijg
JYcWwc0EXi0T5tcwYJwWGg
jYL7T8WG6dZ-yWZS7PTlgg
jzW7IO0aja2Ww9aQtiag7Q
k0Ey_K2TwNo5cSOFY_aXrw
k4JPOarbzd56FLpbm-AOFw
k53E4nIRjAdP4Qhf_jL4og
k5cKiJ0H9z4ltK2Idn7g0Q
K6Pr4PgQLpywqYE41pScMQ
k9UCSxm_NtkoQRVNp4Mn2A
kaw-xWWzKQuUgFdjIv1Zdw
KBg0KYuwdE5o0fbkXm5fow
kBLlD8p3UN-jJ0EKMQwD7w
kCA4mZhQTMahGWqLhh4c1g
KD86ag1GZDGgldVtMAV1xw
KDJbc9eWOTpyfLmghj3BjA
KhkOVyYivfQuiymV2SHYPA
KIPTtS828jCxZfKdQ2KULw
KjZXnU33x24D1K9UiCmY5w
klHF8Cnt3x6rVG4wnOSNsw
klV2LGDKxRSj37HA_n8eeA
KlZ3XODVQ4MFjMRyjWLgCw
kLZi69SEq4eUnQqLUc7Zdw
KnhdzVvaMLeXMpJvrg4XFQ
KnM8yd2cabsAlGqxWtCYXA
KnZ1g2AW_OmV96DDgYzB2g
Kpuc81vbkJRppwBroR5WOg
kpyGZtTfNn3Ln_ADb7OOMw
kqxMTbTPAG4kG1U1i1dPVw
kReK1GrIyeegBJSwwdoRdA
KSq23J1ibgw_ufqayMOGPA
KtKQ7L5qvFVukU_X-f_d7w
kUlsHgvx7wYmzmYTOOsfaQ
KvZ1L-oKjh8hPu5ANWm42A
KX4_R-rXInA

In [14]:
# Saving names in a file (the process of the previous cell took 3 hours)

subset_file_val = open('subset_names_val.txt', 'w')
for element in subset_names_val:
    subset_file_val.write(str(element) + '\n')
subset_file_val.close()

In [6]:
# Read the file where the names of images were saved

loaded_names_val = []
nfile = open('subset_names_val.txt', 'r')
for line in nfile:
    loaded_names_val.append(line[0:line.find('\n')])
nfile.close()

print(len(loaded_names_val))

1000


In [16]:
# Copying files that matches our requirements

n = len(subset_names_val)
for i in range(0, n):
    image_id = subset_names_val[i]
    source = original_path + 'validation/images/{}.jpg'.format(image_id)
    destination = 'Data/validation/images/{}.jpg'.format(image_id)
    shutil.copyfile(source, destination)

### 1.3 Testing

Testing images do not have panoptic images (segmentation labels which contain the information of bounding boxes). Annotation of billboards has to be done manually.

We increased the size of the testing subset from **500** to **2500** images.

Since this is a testing subset, we do not need to filter images (our model has to be capable of detecting which image has billboards and where, and it case there is not billboard present, the model will explicitly metion that).

In [14]:
# Getting the names of the image files for testing data

testing_images_names = os.listdir(original_path + 'testing/images')
n = len(testing_images_names)
for i in range(0, n):
    testing_images_names[i] = testing_images_names[i][0:testing_images_names[i].find('.')]
print(testing_images_names[25])
print(n)

-gs02rx26QQIvEew6E5VCQ
5000


In [15]:
# Copying a small version of the testing dataset (including extra images)

n = 2500
for i in range(0, n):
    image_id = testing_images_names[i]
    source = original_path + 'testing/images/{}.jpg'.format(image_id)
    destination = 'Data/testing/images/{}.jpg'.format(image_id)
    shutil.copyfile(source, destination)

## 2. Creating labels for testing, validation, and testing subsets

As mentioned earlier, annotations for labels are not explicitly included (in the form of numbers or coordinates), but they are included as segmented images from which it is possible to extract the bounding boxes. The code below is in charge of processing all images and extract the billboards bounding box information and save it in a text file for training the models.

### 2.1 Training

The code below makes use of the image segmentation information and retrieves the coordinates of the bounding boxes, the information is saved in the YOLO format in a text file.

In [26]:
# Creating training labels

n = len(os.listdir('Data/train/images'))
for i in range(0, n):
    image_id = loaded_names_train[i]
    # read in config file
    with open(original_path + 'config.json') as config_file:
        config = json.load(config_file)
        
    labels = config['labels']
    label_dic = {}
    for label_id, label in enumerate(labels):
        label_dic[label['name']] = label_id

    # set up paths for every image
    image_path = original_path + "training/images/{}.jpg".format(image_id)
    panoptic_path = original_path + "training/panoptic/{}.png".format(image_id)

    # load images
    base_image = Image.open(image_path)
    panoptic_image = Image.open(panoptic_path)
        
    # PANOPTIC HANDLING

    # read in panoptic file
    with open(original_path + "training/panoptic/panoptic_2018.json") as panoptic_file:
        panoptic = json.load(panoptic_file)
    
   # convert annotation infos to image_id indexed dictionary
    panoptic_per_image_id = {}
    for annotation in panoptic["annotations"]:
        panoptic_per_image_id[annotation["image_id"]] = annotation
        
    # convert category infos to category_id indexed dictionary
    panoptic_category_per_id = {}
    for category in panoptic["categories"]:
        panoptic_category_per_id[category["id"]] = category
        
    # convert segment infos to segment id indexed dictionary
    example_panoptic = panoptic_per_image_id[image_id]
    example_segments = {}
    for segment_info in example_panoptic["segments_info"]:
        example_segments[segment_info["id"]] = segment_info
    
    label_doc = open(train_path + labels_path + '/' + loaded_names_train[i] + '.txt', 'w')
    panoptic_array = np.array(panoptic_image).astype(np.uint32)
    panoptic_id_array = panoptic_array[:,:,0] + (2**8)*panoptic_array[:,:,1] + (2**16)*panoptic_array[:,:,2]
    panoptic_ids_from_image = np.unique(panoptic_id_array)
    
    for panoptic_id in panoptic_ids_from_image:
        if panoptic_id == 0:
            # void image areas don't have segments
            continue
        segment_info = example_segments[panoptic_id]
        category = panoptic_category_per_id[segment_info["category_id"]]
        
        img_label = label_dic[category['supercategory']]
        
        if img_label == 35: # <- To double check all images contain billboards
        
            # Billboard information is stored 
            box_x_left = segment_info['bbox'][0]
            box_y_top = segment_info['bbox'][1]
            box_width = segment_info['bbox'][2]
            box_height = segment_info['bbox'][3]
            img = Image.open(image_path)
            img_width, img_height = img.size

            x_center = (box_x_left + box_width/2)/img_width
            y_center = (box_y_top + box_height/2)/img_height
            width = box_width/img_width
            height = box_height/img_height
            label_doc.write(str(0) + ' ' + str(x_center) + ' ' + str(y_center) + ' ' + str(width) + ' ' + str(height) + '\n')
            img.close()

        example_segments.pop(panoptic_id)
    
    label_doc.close()

### 2.2 Validation

The same process is repeated for the validation test.

In [8]:
# Creating validation labels

n = len(os.listdir('Data/validation/images'))
for i in range(0, n):
    image_id = loaded_names_val[i]
    # read in config file
    with open(original_path + 'config.json') as config_file:
        config = json.load(config_file)
        
    labels = config['labels']
    label_dic = {}
    for label_id, label in enumerate(labels):
        label_dic[label['name']] = label_id

    # set up paths for every image
    image_path = original_path + "validation/images/{}.jpg".format(image_id)
    panoptic_path = original_path + "validation/panoptic/{}.png".format(image_id)

    # load images
    base_image = Image.open(image_path)
    panoptic_image = Image.open(panoptic_path)
        
    # PANOPTIC HANDLING

    # read in panoptic file
    with open(original_path + "validation/panoptic/panoptic_2018.json") as panoptic_file:
        panoptic = json.load(panoptic_file)
    
   # convert annotation infos to image_id indexed dictionary
    panoptic_per_image_id = {}
    for annotation in panoptic["annotations"]:
        panoptic_per_image_id[annotation["image_id"]] = annotation
        
    # convert category infos to category_id indexed dictionary
    panoptic_category_per_id = {}
    for category in panoptic["categories"]:
        panoptic_category_per_id[category["id"]] = category
        
    # convert segment infos to segment id indexed dictionary
    example_panoptic = panoptic_per_image_id[image_id]
    example_segments = {}
    for segment_info in example_panoptic["segments_info"]:
        example_segments[segment_info["id"]] = segment_info
    
    label_doc = open(validation_path + labels_path + '/' + loaded_names_val[i] + '.txt', 'w')
    panoptic_array = np.array(panoptic_image).astype(np.uint32)
    panoptic_id_array = panoptic_array[:,:,0] + (2**8)*panoptic_array[:,:,1] + (2**16)*panoptic_array[:,:,2]
    panoptic_ids_from_image = np.unique(panoptic_id_array)
    
    for panoptic_id in panoptic_ids_from_image:
        if panoptic_id == 0:
            # void image areas don't have segments
            continue
        segment_info = example_segments[panoptic_id]
        category = panoptic_category_per_id[segment_info["category_id"]]
        
        img_label = label_dic[category['supercategory']]
        
        if img_label == 35:
        
            box_x_left = segment_info['bbox'][0]
            box_y_top = segment_info['bbox'][1]
            box_width = segment_info['bbox'][2]
            box_height = segment_info['bbox'][3]
            img = Image.open(image_path)
            img_width, img_height = img.size

            x_center = (box_x_left + box_width/2)/img_width
            y_center = (box_y_top + box_height/2)/img_height
            width = box_width/img_width
            height = box_height/img_height
            label_doc.write(str(0) + ' ' + str(x_center) + ' ' + str(y_center) + ' ' + str(width) + ' ' + str(height) + '\n')
            img.close()

        example_segments.pop(panoptic_id)
    
    label_doc.close()

### 2.3 Increasing dataset

Following the comments done in the previous semester, we decided to increase the dataset by including images which do not contain billboards).

#### 2.3.1 Increasing Training Subset

An extra of 1000 images without billboards were added to the training data subset.

In [31]:
# Getting the ids of images that DO NOT contain billboards on them (training)

i = 0
extra_size = 1000 # size of the desired subset
nobillboard_names_train = []
while (len(nobillboard_names_train) < extra_size):
    image_id = training_images_names[i]
    if image_id not in loaded_names_train:
        # read in config file
        with open(original_path + 'config.json') as config_file:
            config = json.load(config_file)

        labels = config['labels']
        label_dic = {}
        for label_id, label in enumerate(labels):
            label_dic[label['name']] = label_id

        # set up paths for every image
        image_path = original_path + "training/images/{}.jpg".format(image_id)
        panoptic_path = original_path + "training/panoptic/{}.png".format(image_id)

        # load images
        base_image = Image.open(image_path)
        panoptic_image = Image.open(panoptic_path)

        # PANOPTIC HANDLING

        # read in panoptic file
        with open(original_path + "training/panoptic/panoptic_2018.json") as panoptic_file:
            panoptic = json.load(panoptic_file)

       # convert annotation infos to image_id indexed dictionary
        panoptic_per_image_id = {}
        for annotation in panoptic["annotations"]:
            panoptic_per_image_id[annotation["image_id"]] = annotation

        # convert category infos to category_id indexed dictionary
        panoptic_category_per_id = {}
        for category in panoptic["categories"]:
            panoptic_category_per_id[category["id"]] = category

        # convert segment infos to segment id indexed dictionary
        example_panoptic = panoptic_per_image_id[image_id]
        example_segments = {}
        for segment_info in example_panoptic["segments_info"]:
            example_segments[segment_info["id"]] = segment_info

        panoptic_array = np.array(panoptic_image).astype(np.uint32)
        panoptic_id_array = panoptic_array[:,:,0] + (2**8)*panoptic_array[:,:,1] + (2**16)*panoptic_array[:,:,2]
        panoptic_ids_from_image = np.unique(panoptic_id_array)

        found = False
        for panoptic_id in panoptic_ids_from_image:
            if panoptic_id == 0:
                # void image areas don't have segments
                continue
            segment_info = example_segments[panoptic_id]
            category = panoptic_category_per_id[segment_info["category_id"]]

            img_label = label_dic[category['supercategory']]
            if img_label == 35:
                found = True
            example_segments.pop(panoptic_id)
        if found == False: # Adding images that do not contain billboards exclusively
            nobillboard_names_train.append(image_id)
            print(image_id)
        i += 1
    else:
        i += 1

--jRsD9anHdGxv4ZOCQktA
--NSVcUgfVhFd6uzkqHOOg
--SWG8TJo7vrbZ_x5YLj6A
--tczNya4_w5IhU9DeRMHw
--zBInAo8-NDrBCfvJ4fEw
-24xIsrQS_XSpHHgmmOB2g
-5Tar0aE6kVomCLac556dA
-5_vD3uQLbZa9We0ZRe64g
-6KIBA7UMZeUpAF-MreNKw
-8uEGk5o-Mm4J_WrNd-lVg
-9KGl9SjjCNY1k7e7kZAQw
-9Rwv8nwRvau53Ravsn3-A
-A0I13p2EKUmebBd_o1WAw
-Ae9jTBaJa6ZIG0_a1gKTw
-b6cxvVDgKKduy1ohAOLiw
-BXtaV58FRkenRKZro21cQ
-D7PwwMuQ0K0dH01QKD__g
-DhrBI2NUhI53j6u9k_adw
-eGpGmG2YBFiIJKbToWO0g
-ejK_lQI473xOq4kLFNPdw
-ElQvytjkvXF3jNYXoe4Og
-FDKMXaTWSNS5u9Ad2SO2Q
-FtqiD0037yomyFmsem25w
-GG2FOy_wmOsX6ImEm1uLA
-gV4nG-7GAoQOLfuxidvEw
-gV_IwkfSjb6nsiJD9hulQ
-HFT-WGo9ftuyGUUm-4zGQ
-HMM6ZkJJr4Wbdr9YS1MzA
-J48Mok3W-uXxpG-eCuECQ
-JmdruPVjc--VbGtTIogRw
-k9vlupLf7unY_ZtGrivIw
-koJxu16wZChFAi7BvOxQw
-La2XLCsqbAJz05JCLflSg
-LBkw6k6UPWAtdiWs1yV-Q
-LudG9v6lNCqqxvMp9WMyw
-Mb8uJ76AJsENmC8H_2Plw
-MVFdfakzYTZdHvvdSUXcg
-n0xk449bCVYsk6K1KScKA
-NDNpoEMRoquHngyABVewg
-Nm-gZulic8rBJCPVNOzFw
-nvTzJ-2am0mxQPqnZzZBA
-NwoP_4ghO8Uipu3Oj2cQw
-NYCQnAYXLuvk2bKlJWf4w
-oGaIbSDBcE

3Tz-rjTFD9hdXy9xsaZnCQ
3uEL0FIt91rGX4Jf2Ib64g
3UKuFDTK9OZRgBhr7871oA
3uLNgOj-4iejrDtGB5HrQQ
3ulpkm1Q-Mqi8lSSLLEWUQ
3uQlNaY-1en9s7y9Ar8rCA
3vhQjI8NQelkS28IyvYH2g
3VO01uj77j4vAZeSSICz3w
3vpvkSzWWCUvVbIo0Q5P5w
3vs2sr7NNAnm43Y32qOggg
3WQikHteQyxxyOHWzISvDw
3x4Aa27dOmI_orlI7oUxyQ
3X6Gg6X16KVIHicDgNTByg
3XaFu0oLYnlPvs0jCOAn3g
3xJot0WFWYDdjmZnUj-fqg
3XQKu6GoVHVJ33Lr4Taiww
3YvsinMPpfviEPHrfQoHyw
3z0K_SVnUAoOqaXocBm0fw
3zliytpAuiRDKIizn7HgWg
3ZMT09ZG9bGQb2y8smipgA
3_fNH6k2KclGOCnX8sgTgA
3_iBHJqqvDguqU2fxT_-9A
4-9coGfbA_Tn67Y9gJHdGQ
4-Q1zeBVtJsbRwEMIds6BQ
40Bb2Y7Eae6C-j7lRGsuiw
40k06pvYPOOsDXidUhMLxA
40ye4e3WlMSUTklWTEp8dg
41hm9XGjH71SiMOWqz9eSQ
41oz03GxpDCCxX7JhrcK0A
425RYp6cuBSkORPQ569OBQ
42ltiq6R1DcjCCBeqTNKLw
42mZAlfq3jzGX5IoHTaZSQ
42y5FzvZO7fn7OaauBfq5Q
436R2pE8mKkGyyCJH1hw7g
43hsfeRik88juOn8wuf5-w
444TemJ9wVTkl1Hr0OX2DQ
44e786MCUhlL0U5s5sDvsw
45qQGMGIFqkT88kKplDWIw
4729siAEyUQ6URwE37oz3A
47VvVp571PWR5PjAMhht0g
483XnMwD2kAXSgZ6l0YLiw
48sZ5gvwA19QW_rwwPJLmw
49khrzHNDYtXKCugEeSqmw
49uPurIZ8xh

7kZ61Nqz3U-X4rxkqbQLcA
7L-xh3hiZmF_4j9zYsQNfA
7Lh9ptt0ryingeFaamoEXQ
7lnC-M4bPLV3cnXdhTYM0g
7n2vCMgfSgRCPIM1pU0Z2w
7NPyt842d9j3FcfGw45tyQ
7OC-NYrkwv5-rj-UhAntlg
7OcKHOcv6KwUvzY7-NvoDw
7OKrlI4wTwdfe2HEIHZNpw
7Oz1c-XtyAJA3_nwm2DBGQ
7pHXhYqDki29gF3CPRLjtg
7Ppge7pEiUY00ZrYoC2NGw
7QLZYSBPKlEdJwukm4ompQ
7QphQcZZG7SoDIue8FoxWQ
7QzfS-7AGysrJEBczkKBpA
7R1IoOJdwr1dxUiHBFMNFg
7ra3mJiS43cB6baY6Qf6YQ
7rYa6scKbeW5wFa_ajk2Qg
7SOINT4Tidaksd68SrGTyg
7t41lXNxZ5P-B0fFz5uEmQ
7T9Wrc_GjqXARIVoQDtCfA
7U5xOA0w9WxT3RAOlIc0Iw
7U7dvVvQcLAIVh3mrjCJ_A
7uMCUCVjgw64M3J6LsSSzQ
7un2vpG5-Aq0NxQ6QapQtA
7uZjkhhIOrm3w_Bpy2p0DQ
7VlD0_D71FHACIHtYI_QdA
7WfR4OHGzkeSDYflD-3qrg
7wWBjAwc-ZTyyXKusBjbWA
7Xd2CWnK4JCYRsfW5h_OHA
7xqNGL854zBCxeG6mPeK_Q
7Y-VoZgutihhATPVIhsu3Q
7yqcp5hJKRsUK7xl49A6Pg
7zgCEGUNiY3U0oPid9mE9g
7ZwRJolzRtGtonSupFGPMQ
8-FbAAG-tuILtdtDuRByjg
8-gIRjawRHMTsthYW_VNgw
8-hvEEIlGGl1BLMBybT0IQ
80aCC-ikBztVNQEGXAKMbQ
81e5bRiBnr7C3_lV3slppA
82DbRqsEid67wr_-qNaB_Q
82kED-EcvmG4Ptj98rcQug
8404UtpIF0el5KScR4jMzg
84d0yKfr7a_

In [35]:
# Saving names in a file (the process of the previous cell took 2 hours)

extra_file_train = open('extra_names_train.txt', 'w')
for element in nobillboard_names_train:
    extra_file_train.write(str(element) + '\n')
extra_file_train.close()

1000


In [7]:
# Read the file where the extra names of images were saved

loaded_extranames_train = []
extra_file = open('extra_names_train.txt', 'r')
for line in extra_file:
    loaded_extranames_train.append(line[0:line.find('\n')])
extra_file.close()

#print(new_names == subset_names_train)
print(len(loaded_extranames_train))

1000


In [14]:
# Copying files that matches our requirements

n = len(loaded_extranames_train)
for i in range(0, n):
    image_id = loaded_extranames_train[i]
    source = original_path + 'training/images/{}.jpg'.format(image_id)
    destination = 'Data/train/images/{}.jpg'.format(image_id)
    shutil.copyfile(source, destination)

#### 2.3.2 Increasing validation subset

The same process is repeated.

200 extra images without billboards were added to the validation subset.

In [9]:
# Getting the ids of images that DO NOT contain billboards on them (validation)

i = 0
extra_size = 200 # size of the desired subset
nobillboard_names_val = []
while (len(nobillboard_names_val) < extra_size):
    image_id = validation_images_names[i]
    if image_id not in loaded_names_val:
        # read in config file
        with open(original_path + 'config.json') as config_file:
            config = json.load(config_file)

        labels = config['labels']
        label_dic = {}
        for label_id, label in enumerate(labels):
            label_dic[label['name']] = label_id

        # set up paths for every image
        image_path = original_path + "validation/images/{}.jpg".format(image_id)
        panoptic_path = original_path + "validation/panoptic/{}.png".format(image_id)

        # load images
        base_image = Image.open(image_path)
        panoptic_image = Image.open(panoptic_path)

        # PANOPTIC HANDLING

        # read in panoptic file
        with open(original_path + "validation/panoptic/panoptic_2018.json") as panoptic_file:
            panoptic = json.load(panoptic_file)

       # convert annotation infos to image_id indexed dictionary
        panoptic_per_image_id = {}
        for annotation in panoptic["annotations"]:
            panoptic_per_image_id[annotation["image_id"]] = annotation

        # convert category infos to category_id indexed dictionary
        panoptic_category_per_id = {}
        for category in panoptic["categories"]:
            panoptic_category_per_id[category["id"]] = category

        # convert segment infos to segment id indexed dictionary
        example_panoptic = panoptic_per_image_id[image_id]
        example_segments = {}
        for segment_info in example_panoptic["segments_info"]:
            example_segments[segment_info["id"]] = segment_info

        panoptic_array = np.array(panoptic_image).astype(np.uint32)
        panoptic_id_array = panoptic_array[:,:,0] + (2**8)*panoptic_array[:,:,1] + (2**16)*panoptic_array[:,:,2]
        panoptic_ids_from_image = np.unique(panoptic_id_array)

        found = False
        for panoptic_id in panoptic_ids_from_image:
            if panoptic_id == 0:
                # void image areas don't have segments
                continue
            segment_info = example_segments[panoptic_id]
            category = panoptic_category_per_id[segment_info["category_id"]]

            img_label = label_dic[category['supercategory']]
            if img_label == 35:
                found = True
            example_segments.pop(panoptic_id)
        if found == False:
            nobillboard_names_val.append(image_id)
            print(image_id)
        i += 1
    else:
        i += 1

-F-jXLRFKunhfJg4s-62jA
-gQj6MFElvWSGDLGgrZpmQ
-OB82zvf2k0rTOxuMEuQGA
-Ws5_rCy-VI_NsLgFIMBbQ
03G8WHFnNfiJR-457i0MWQ
03T1YjbnUSumHU5n-iYEzA
07kQvxZ_6UZoTPvH3YhiFg
0awMSNYMwfdXN1K4sGI8Cw
0kMplsNDh3pMAKXPB1AOng
0mv_D8kuyuoaxHmOn_8PvA
0RAR6R-Dxo-YSXhucEdjxw
0VGACHmnzbjqiRs6BQ4ufA
1EaLwA9alBSKntjx2w4_Wg
1MoFjt8lFydFWvuKu_jOWQ
1mu7evLfp_Deoglas9V3Gw
1ReO2qYXBTxdHVFn1gF2Ww
1sHd0W56n3Jct9zVUcMqqg
1THQtCsaZH8PqwA2-Vb9tg
1u_XW5RrYKPBy0iWkOub0g
1X56RIJI8NLVtXJlRRbbgQ
1y7aXjIMHDC4U_gp15_Nhg
1YjohJoLyEcJ_Eo2GD4w0Q
24--9XvrREjXs5RNbtvLfw
2CFMBRBizGbV4mWXMzyrTg
2d3hvB1rNaivBLVcSYHREA
2dUUdESNqgXC2CW0gHWY7A
2ehsE_e72WNQo32Cd2B8jA
2gml1pBZB9i5c0WmFp0Lxw
2NanO8tyD3DyTMhNl6UXQw
2RfwaKn8rVSFH2epy9rJqQ
2T2reH82rbd0z8gtmBekzg
2Zmmq8u0tDHyuavKnDYDWw
3BsjwktSq0pw-LQmogl5YQ
3CkIEgqnCS0PbXmx85KmRA
3E_wVcJJQNYCdWT53lMlew
3imssSXy8mHzcHEHcQi6ag
3mvMhGpAEATgBkP7U1e-ew
3pO6x_ScawsTIR9vJNpXIw
3qb2uglycvCgcK_ma22myQ
3QQrUUWoNcLmMhP76AT-TA
3qzU13RQZzLHg5ETTfciUA
3UwyvoQHgLG9LqmjX7sULA
41Pk_r4d00VDbSrAK8jABA
46P_SNQCh9O

In [10]:
print(len(nobillboard_names_val))

200


In [11]:
# Saving names in a file (the process of the previous cell took 5 hours)

extra_file_val = open('extra_names_val.txt', 'w')
for element in nobillboard_names_val:
    extra_file_val.write(str(element) + '\n')
extra_file_val.close()

In [8]:
# Read the file where the extra names of images were saved

loaded_extranames_val = []
extra_file = open('extra_names_val.txt', 'r')
for line in extra_file:
    loaded_extranames_val.append(line[0:line.find('\n')])
extra_file.close()

#print(new_names == subset_names_train)
print(len(loaded_extranames_val))

200


In [13]:
# Copying files that matches our requirements

n = len(loaded_extranames_val)
for i in range(0, n):
    image_id = loaded_extranames_val[i]
    source = original_path + 'validation/images/{}.jpg'.format(image_id)
    destination = 'Data/validation/images/{}.jpg'.format(image_id)
    shutil.copyfile(source, destination)

In [9]:
print(len(loaded_extranames_train))

1000


### 2.4 Creating Labels for the Extra Images

This section creates the labels from the images recently added.

**Note:** Obviously, since the images do not contain billboards, no information will be stored in the text files. However, this process must be done to assure the added images do not have billboards and because it is a requirement for training YOLO models.

#### 2.4.1 Extra labels for Training Subset

In [10]:
# Creating extra training labels

n = len(loaded_extranames_train)
for i in range(0, n):
    image_id = loaded_extranames_train[i]
    # read in config file
    with open(original_path + 'config.json') as config_file:
        config = json.load(config_file)
        
    labels = config['labels']
    label_dic = {}
    for label_id, label in enumerate(labels):
        label_dic[label['name']] = label_id

    # set up paths for every image
    image_path = original_path + "training/images/{}.jpg".format(image_id)
    panoptic_path = original_path + "training/panoptic/{}.png".format(image_id)

    # load images
    base_image = Image.open(image_path)
    panoptic_image = Image.open(panoptic_path)
        
    # PANOPTIC HANDLING

    # read in panoptic file
    with open(original_path + "training/panoptic/panoptic_2018.json") as panoptic_file:
        panoptic = json.load(panoptic_file)
    
   # convert annotation infos to image_id indexed dictionary
    panoptic_per_image_id = {}
    for annotation in panoptic["annotations"]:
        panoptic_per_image_id[annotation["image_id"]] = annotation
        
    # convert category infos to category_id indexed dictionary
    panoptic_category_per_id = {}
    for category in panoptic["categories"]:
        panoptic_category_per_id[category["id"]] = category
        
    # convert segment infos to segment id indexed dictionary
    example_panoptic = panoptic_per_image_id[image_id]
    example_segments = {}
    for segment_info in example_panoptic["segments_info"]:
        example_segments[segment_info["id"]] = segment_info
    
    label_doc = open(train_path + labels_path + '/' + loaded_extranames_train[i] + '.txt', 'w')
    panoptic_array = np.array(panoptic_image).astype(np.uint32)
    panoptic_id_array = panoptic_array[:,:,0] + (2**8)*panoptic_array[:,:,1] + (2**16)*panoptic_array[:,:,2]
    panoptic_ids_from_image = np.unique(panoptic_id_array)
    
    for panoptic_id in panoptic_ids_from_image:
        if panoptic_id == 0:
            # void image areas don't have segments
            continue
        segment_info = example_segments[panoptic_id]
        category = panoptic_category_per_id[segment_info["category_id"]]
        
        img_label = label_dic[category['supercategory']]
        
        if img_label == 35:
        
            box_x_left = segment_info['bbox'][0]
            box_y_top = segment_info['bbox'][1]
            box_width = segment_info['bbox'][2]
            box_height = segment_info['bbox'][3]
            img = Image.open(image_path)
            img_width, img_height = img.size

            x_center = (box_x_left + box_width/2)/img_width
            y_center = (box_y_top + box_height/2)/img_height
            width = box_width/img_width
            height = box_height/img_height
            label_doc.write(str(0) + ' ' + str(x_center) + ' ' + str(y_center) + ' ' + str(width) + ' ' + str(height) + '\n')
            img.close()

        example_segments.pop(panoptic_id)
    
    label_doc.close()

In [17]:
# Checking all images have its corresponding label

imgs_names = os.listdir("Data/train/images")
lbls_names = os.listdir("Data/train/labels")
problem = False
for i in range(len(imgs_names)):
    if imgs_names[i][:-4] != lbls_names[i][:-4]:
        print("There is a problem with image number: ", i + 1)
        problem = True

if problem == False:
    print("All good")

All good


#### 2.4.2 Creating Extra Labels for Validation Subset

The process is repeated for the validation subset

In [18]:
# Creating extra validation labels

n = len(loaded_extranames_val)
for i in range(0, n):
    image_id = loaded_extranames_val[i]
    # read in config file
    with open(original_path + 'config.json') as config_file:
        config = json.load(config_file)
        
    labels = config['labels']
    label_dic = {}
    for label_id, label in enumerate(labels):
        label_dic[label['name']] = label_id

    # set up paths for every image
    image_path = original_path + "validation/images/{}.jpg".format(image_id)
    panoptic_path = original_path + "validation/panoptic/{}.png".format(image_id)

    # load images
    base_image = Image.open(image_path)
    panoptic_image = Image.open(panoptic_path)
        
    # PANOPTIC HANDLING

    # read in panoptic file
    with open(original_path + "validation/panoptic/panoptic_2018.json") as panoptic_file:
        panoptic = json.load(panoptic_file)
    
   # convert annotation infos to image_id indexed dictionary
    panoptic_per_image_id = {}
    for annotation in panoptic["annotations"]:
        panoptic_per_image_id[annotation["image_id"]] = annotation
        
    # convert category infos to category_id indexed dictionary
    panoptic_category_per_id = {}
    for category in panoptic["categories"]:
        panoptic_category_per_id[category["id"]] = category
        
    # convert segment infos to segment id indexed dictionary
    example_panoptic = panoptic_per_image_id[image_id]
    example_segments = {}
    for segment_info in example_panoptic["segments_info"]:
        example_segments[segment_info["id"]] = segment_info
    
    label_doc = open(validation_path + labels_path + '/' + loaded_extranames_val[i] + '.txt', 'w')
    panoptic_array = np.array(panoptic_image).astype(np.uint32)
    panoptic_id_array = panoptic_array[:,:,0] + (2**8)*panoptic_array[:,:,1] + (2**16)*panoptic_array[:,:,2]
    panoptic_ids_from_image = np.unique(panoptic_id_array)
    
    for panoptic_id in panoptic_ids_from_image:
        if panoptic_id == 0:
            # void image areas don't have segments
            continue
        segment_info = example_segments[panoptic_id]
        category = panoptic_category_per_id[segment_info["category_id"]]
        
        img_label = label_dic[category['supercategory']]
        
        if img_label == 35:
        
            box_x_left = segment_info['bbox'][0]
            box_y_top = segment_info['bbox'][1]
            box_width = segment_info['bbox'][2]
            box_height = segment_info['bbox'][3]
            img = Image.open(image_path)
            img_width, img_height = img.size

            x_center = (box_x_left + box_width/2)/img_width
            y_center = (box_y_top + box_height/2)/img_height
            width = box_width/img_width
            height = box_height/img_height
            label_doc.write(str(0) + ' ' + str(x_center) + ' ' + str(y_center) + ' ' + str(width) + ' ' + str(height) + '\n')
            img.close()

        example_segments.pop(panoptic_id)
    
    label_doc.close()

In [19]:
# Checking all images have its corresponding label

imgs_names = os.listdir("Data/validation/images")
lbls_names = os.listdir("Data/validation/labels")
problem = False
for i in range(len(imgs_names)):
    if imgs_names[i][:-4] != lbls_names[i][:-4]:
        print("There is a problem with image number: ", i + 1)
        problem = True

if problem == False:
    print("All good")

All good


### 2.5 Creating Labels for Testing Data

This section was completed after annotating the testing images with Roboflow Annotation Tool.

The purpose of the code below is to check all testing images have their respective labels, and make corrections in case a problem is found.

In [3]:
# Copying the labels obtained from Roboflow

raw_labels = os.listdir("raw testing labels")
for i in range(len(raw_labels)):
    original_name = raw_labels[i]
    name = original_name.split("_jpg")[0]
    new_name = name + ".txt"
    source = "raw testing labels/{}".format(original_name)
    destination = "Data/testing/labels/{}".format(new_name)
    shutil.copyfile(source, destination)

In [11]:
# Checking the testing images have their corresponding labels

errors = []
correct_image_name = []
wrong_label_name = []
loaded_testing_images = os.listdir(testing_path + images_path)
testing_labels = os.listdir(testing_path + labels_path)
for i in range(len(loaded_testing_images)):
    if testing_labels[i][:-4] != loaded_testing_images[i][:-4]:
        print("Error in archive: ", i)
        errors.append(i)
        correct_image_name.append(loaded_testing_images[i])
        wrong_label_name.append(testing_labels[i])

#print(correct_image_name)
#print(wrong_label_name)

Error in archive:  33
Error in archive:  107
Error in archive:  563
Error in archive:  581
Error in archive:  788
Error in archive:  977
Error in archive:  979
Error in archive:  1108
Error in archive:  1870
['-KGpjq3c--Sw0mlw8rZmNw.jpg', '0f_Nk--FNh-RogLRlsKL_Q.jpg', '5_MjVb--4PpKb1Pw1RsuuQ.jpg', '6BPZ9kEX--1sUfa3-iSJ5w.jpg', '8ovcV2zbA733SLWCkBr--w.jpg', 'aijPEVXs--_uhTD-IWwQLw.jpg', 'aj7VincEdaT--QEfKGGXqA.jpg', 'bf--JqMENhi0yeKyhi0jxg.jpg', 'gcFrvCPjHTcE--vOkyI24w.jpg']
['-KGpjq3c-Sw0mlw8rZmNw.txt', '0f_Nk-FNh-RogLRlsKL_Q.txt', '5_MjVb-4PpKb1Pw1RsuuQ.txt', '6BPZ9kEX-1sUfa3-iSJ5w.txt', '8ovcV2zbA733SLWCkBr-w.txt', 'aijPEVXs-_uhTD-IWwQLw.txt', 'aj7VincEdaT-QEfKGGXqA.txt', 'bf-JqMENhi0yeKyhi0jxg.txt', 'gcFrvCPjHTcE-vOkyI24w.txt']


After carefully analyzing the differences, we found out that the names provided by the Annotation tool did not copy exactly the image original names for the following cases:

If images that contained more than one minus symbol "-" were copied as a single "-" sign. For instance, the image original name was "-KGpjq3c--Sw0mlw8rZmNw.jpg" but it was copied as "-KGpjq3c-Sw0mlw8rZmNw.txt". This problem was generated by the Annotation Tool but it is corrected for the affected files as shown below:

In [13]:
for i in range(len(errors)):
    os.rename(testing_path + labels_path + "/" + wrong_label_name[i], testing_path + labels_path + "/" + correct_image_name[i][:-4] + ".txt")

In [15]:
# Checking the testing images have their corresponding labels

errors = []
correct_image_name = []
wrong_label_name = []
loaded_testing_images = os.listdir(testing_path + images_path)
testing_labels = os.listdir(testing_path + labels_path)
for i in range(len(loaded_testing_images)):
    if testing_labels[i][:-4] != loaded_testing_images[i][:-4]:
        print("Error in archive: ", i)
        errors.append(i)
        correct_image_name.append(loaded_testing_images[i])
        wrong_label_name.append(testing_labels[i])

if len(errors) == 0:
    print("All good")

All good


In [9]:
print(raw_labels[33], loaded_testing_images[33], testing_labels[33])
print(raw_labels[107], loaded_testing_images[107], testing_labels[107])
print(raw_labels[563], loaded_testing_images[563], testing_labels[563])
print(raw_labels[581], loaded_testing_images[581], testing_labels[581])
print(raw_labels[788], loaded_testing_images[788], testing_labels[788])
print(raw_labels[977], loaded_testing_images[979], testing_labels[979])
print(raw_labels[1108], loaded_testing_images[1108], testing_labels[1108])
print(raw_labels[1870], loaded_testing_images[1870], testing_labels[1870])

-KGpjq3c-Sw0mlw8rZmNw_jpg.rf.c88a036f4289ed26772c6e1a72d3ea77.txt -KGpjq3c--Sw0mlw8rZmNw.jpg -KGpjq3c-Sw0mlw8rZmNw.txt
0f_Nk-FNh-RogLRlsKL_Q_jpg.rf.3d9a2930ff34dcd02ed691fdf277a989.txt 0f_Nk--FNh-RogLRlsKL_Q.jpg 0f_Nk-FNh-RogLRlsKL_Q.txt
5_MjVb-4PpKb1Pw1RsuuQ_jpg.rf.5f5fccade55454e2f8fb7273e051b2fe.txt 5_MjVb--4PpKb1Pw1RsuuQ.jpg 5_MjVb-4PpKb1Pw1RsuuQ.txt
6BPZ9kEX-1sUfa3-iSJ5w_jpg.rf.c2fb459e73fa79b93825f36729480fb2.txt 6BPZ9kEX--1sUfa3-iSJ5w.jpg 6BPZ9kEX-1sUfa3-iSJ5w.txt
8ovcV2zbA733SLWCkBr-w_jpg.rf.60682ba6d34a4fa3ec4960fbdda273ac.txt 8ovcV2zbA733SLWCkBr--w.jpg 8ovcV2zbA733SLWCkBr-w.txt
aijPEVXs-_uhTD-IWwQLw_jpg.rf.4e70ea66eaf93c60b3c3baa3bc147b8d.txt aj7VincEdaT--QEfKGGXqA.jpg aj7VincEdaT-QEfKGGXqA.txt
bf-JqMENhi0yeKyhi0jxg_jpg.rf.ae9477d453eb1f2a31bf279376d34a78.txt bf--JqMENhi0yeKyhi0jxg.jpg bf-JqMENhi0yeKyhi0jxg.txt
gcFrvCPjHTcE-vOkyI24w_jpg.rf.060c4e5c0808bf8b7666641f9ab318a4.txt gcFrvCPjHTcE--vOkyI24w.jpg gcFrvCPjHTcE-vOkyI24w.txt


In [16]:
print(len(loaded_testing_images[33]), len(testing_labels[33]))

26 26
