## Download and extract original dataset

In [1]:
%pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=99fcf7c3f0fd36610c8188c1454dc48bd077147802fbdf27a8e2c0c6c75c30e7
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
import wget
# Direct-download link (docs.google.com) for the original dataset archive.
DATASET_URL = 'https://docs.google.com/uc?export=download&id=1d1ni3fdETzC6S7itu7IwAhSsYAypCoBC'
# Destination of the archive, relative to the current working directory.
ARCHIVE_PATH = './original-dataset.zip'

wget.download(DATASET_URL, ARCHIVE_PATH)

'./original-dataset.zip'

In [3]:
!unzip original-dataset.zip -d ./original-dataset
!rm -rf ./original-dataset.zip

Archive:  original-dataset.zip
  inflating: ./original-dataset/309.jpg  
  inflating: ./original-dataset/306 (2).jpg  
  inflating: ./original-dataset/299 (3).jpg  
  inflating: ./original-dataset/293 (2).jpg  
  inflating: ./original-dataset/292.jpg  
  inflating: ./original-dataset/291 (2).jpg  
  inflating: ./original-dataset/279.jpg  
  inflating: ./original-dataset/278.jpg  
  inflating: ./original-dataset/277 (2).jpg  
  inflating: ./original-dataset/275 (2).jpg  
  inflating: ./original-dataset/266 (2).jpg  
  inflating: ./original-dataset/265.jpg  
  inflating: ./original-dataset/264 (2).jpg  
  inflating: ./original-dataset/242.jpg  
  inflating: ./original-dataset/239.jpg  
  inflating: ./original-dataset/230.jpg  
  inflating: ./original-dataset/224.jpg  
  inflating: ./original-dataset/223.jpg  
  inflating: ./original-dataset/221.jpg  
  inflating: ./original-dataset/219.jpg  
  inflating: ./original-dataset/218 (2).jpg  
  inflating: ./original-dataset/217.jpg  
  inflati

In [4]:
# Make the extracted folder the working directory: the following cells open the
# annotation file and the images through paths relative to it.
%cd ./original-dataset

/content/original-dataset


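## Split data into train and validation sets

For every plate code, the first annotated image goes to the train set, the second one (if any) to the validation set, and all remaining ones to the train set. An image selected for both sets is kept in the validation set only.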

In [5]:
import json

# Load the annotation project shipped with the dataset. The encoding is given
# explicitly so the result does not depend on the platform default.
with open('signal-plates-identifier-vgg-project.json', 'r', encoding='utf-8') as file:
    vgg_project = json.load(file)

img_metadata = vgg_project['_via_img_metadata']
metadata_keys = list(img_metadata.keys())

# Index the annotations by plate code:
#   code -> {'count': number of annotated regions, 'files': one filename per region}
# A filename is appended once per region, so it can repeat within one code and
# can also be listed under several codes.
plates_score_board = {}

for key in metadata_keys:
    current_meta = img_metadata[key]

    for region in current_meta['regions']:
        plate = region['region_attributes']['code']
        entry = plates_score_board.setdefault(plate, {'count': 0, 'files': []})
        entry['count'] += 1
        entry['files'].append(current_meta['filename'])

plates_codes = list(plates_score_board.keys())

# Split rule, applied to each plate code in annotation order:
#   1st occurrence -> train, 2nd occurrence -> validation, the rest -> train.
train_candidates = set()
val_candidates = set()

for code in plates_codes:
    first, *rest = plates_score_board[code]['files']
    train_candidates.add(first)

    if rest:
        val_candidates.add(rest[0])
        train_candidates.update(rest[1:])

# An image can be selected for both sets (several codes in one image, or several
# regions with the same code); validation wins so the two sets never overlap.
# NOTE(review): because validation wins, a code whose only training image is also
# another code's validation pick ends up with no training sample - confirm this
# is acceptable.
# NOTE(review): images without any annotated region are assigned to neither set.
train_candidates -= val_candidates

# Sorted so the lists are identical across kernel restarts (plain set iteration
# order is not stable for strings).
train_filenames = sorted(train_candidates)
val_filenames = sorted(val_candidates)

# Every selected filename must match exactly one metadata entry; a mismatch means
# the project file lists the same filename more than once.
assert len(train_filenames) == sum(
    1 for item in img_metadata.values() if item['filename'] in train_candidates
), "train set: a filename matches more than one metadata entry"

assert len(val_filenames) == sum(
    1 for item in img_metadata.values() if item['filename'] in val_candidates
), "validation set: a filename matches more than one metadata entry"

### Generate metadata files for each set with annotations

In [6]:
# Build the lookups once: membership tests against the filename lists would
# rescan the whole list for every metadata entry.
train_lookup = set(train_filenames)
val_lookup = set(val_filenames)

# Keep, for each set, only the metadata entries whose image belongs to it.
train_meta = {key: item for key, item in vgg_project['_via_img_metadata'].items() if item['filename'] in train_lookup}
val_meta = {key: item for key, item in vgg_project['_via_img_metadata'].items() if item['filename'] in val_lookup}

# Each output file holds only the image-metadata mapping, not the whole project.
with open('signal-plates-identifier-vgg-project-train.json', 'w') as file:
    json.dump(train_meta, file)

with open('signal-plates-identifier-vgg-project-val.json', 'w') as file:
    json.dump(val_meta, file)

### Move images to each set folder

In [7]:
import os
import shutil

# Destination sub-folder name -> filenames that belong in it.
data = {'val': val_filenames, 'train': train_filenames}

source_directory = './'

for key, filenames in data.items():
    target_directory = os.path.join(source_directory, key)
    os.makedirs(target_directory, exist_ok=True)

    # Count the files actually moved so the summary stays accurate when some
    # of the listed files are missing.
    moved_count = 0

    for filename in filenames:
        source_path = os.path.join(source_directory, filename)
        target_path = os.path.join(target_directory, filename)

        if not os.path.exists(source_path):
            print(f"File {filename} not found in {source_directory}")
            continue

        shutil.move(source_path, target_path)
        moved_count += 1
        print(f"Moved {filename} to {target_path}")

    print(f"Moved total of {moved_count} images in '{key}' dataset to {target_directory}")

print("File moving completed.")

Moved 30 (3).jpg to ./val/30 (3).jpg
Moved 2 (4).jpg to ./val/2 (4).jpg
Moved 28 (2).jpg to ./val/28 (2).jpg
Moved 107.jpg to ./val/107.jpg
Moved 100 (2).jpg to ./val/100 (2).jpg
Moved 25 (5).jpg to ./val/25 (5).jpg
Moved 46 (4).jpg to ./val/46 (4).jpg
Moved 264 (2).jpg to ./val/264 (2).jpg
Moved 4 (3).jpg to ./val/4 (3).jpg
Moved 68 (2).jpg to ./val/68 (2).jpg
Moved 170 (2).jpg to ./val/170 (2).jpg
Moved 7 (3).jpg to ./val/7 (3).jpg
Moved 77.jpg to ./val/77.jpg
Moved 126.jpg to ./val/126.jpg
Moved 64 (3).jpg to ./val/64 (3).jpg
Moved 239.jpg to ./val/239.jpg
Moved 60 (6).jpg to ./val/60 (6).jpg
Moved 67 (2).jpg to ./val/67 (2).jpg
Moved 196 (3).jpg to ./val/196 (3).jpg
Moved 5 (7).jpg to ./val/5 (7).jpg
Moved 224.jpg to ./val/224.jpg
Moved 20 (5).jpg to ./val/20 (5).jpg
Moved 64 (2).jpg to ./val/64 (2).jpg
Moved 293 (2).jpg to ./val/293 (2).jpg
Moved 109 (4).jpg to ./val/109 (4).jpg
Moved 24 (3).jpg to ./val/24 (3).jpg
Moved 201 (3).jpg to ./val/201 (3).jpg
Moved 13 (6).jpg to ./val/1

### Move results and discard remaining files

In [8]:
# Put each split's annotation file next to its images, under the original project file name.
!mv ./signal-plates-identifier-vgg-project-train.json ./train/signal-plates-identifier-vgg-project.json
!mv ./signal-plates-identifier-vgg-project-val.json ./val/signal-plates-identifier-vgg-project.json
# Remove via.html from the dataset root.
!rm -rf ./via.html
# Remove every .jpg still in the dataset root, i.e. images not moved into train/ or val/.
!rm -rf ./*.jpg
# Remove the original, unsplit annotation file.
!rm -rf ./signal-plates-identifier-vgg-project.json

In [13]:
# Archive the current directory recursively (the train/ and val/ folders) into transformed-dataset.zip.
!zip -r transformed-dataset.zip ./

  adding: val/ (stored 0%)
  adding: val/264 (2).jpg (deflated 0%)
  adding: val/170 (2).jpg (deflated 0%)
  adding: val/46 (4).jpg (deflated 0%)
  adding: val/2 (4).jpg (deflated 4%)
  adding: val/39.jpg (deflated 1%)
  adding: val/196 (3).jpg (deflated 1%)
  adding: val/239.jpg (deflated 0%)
  adding: val/126.jpg (deflated 0%)
  adding: val/13 (6).jpg (deflated 3%)
  adding: val/28 (2).jpg (deflated 0%)
  adding: val/20 (5).jpg (deflated 2%)
  adding: val/67 (2).jpg (deflated 0%)
  adding: val/201 (3).jpg (deflated 0%)
  adding: val/224.jpg (deflated 1%)
  adding: val/109 (4).jpg (deflated 1%)
  adding: val/107.jpg (deflated 2%)
  adding: val/68 (2).jpg (deflated 1%)
  adding: val/60 (6).jpg (deflated 0%)
  adding: val/7 (3).jpg (deflated 1%)
  adding: val/signal-plates-identifier-vgg-project.json (deflated 80%)
  adding: val/64 (3).jpg (deflated 0%)
  adding: val/293 (2).jpg (deflated 0%)
  adding: val/100 (2).jpg (deflated 1%)
  adding: val/77.jpg (deflated 0%)
  adding: val/64 (2)