# Exploratory analysis of the data

In [1]:
import cv2
import json
import os
import glob
import numpy as np

## Explore image sizes

We don't want to downscale or upscale the input images too much, as this can hurt the model's performance. A reasonable choice is an input width and height close to the third quartile of the observed image sizes.

In [2]:
def analyse_image_sizes(image_fnames):
    """Print min, max and quartiles of the widths and heights of the given images.

    Args:
        image_fnames: iterable of image file paths, each loaded with ``cv2.imread``.

    Returns:
        None. The statistics are printed to stdout.

    Files that ``cv2.imread`` cannot load are skipped and their count is reported.
    If no image could be loaded at all, a notice is printed instead of statistics.
    """
    img_ws = []
    img_hs = []
    unreadable = 0

    for img_fname in image_fnames:
        img = cv2.imread(img_fname)
        # cv2.imread does not raise on a missing or corrupt file; it returns None,
        # which would otherwise fail below with an AttributeError on `.shape`.
        if img is None:
            unreadable += 1
            continue
        # shape is (rows, cols, ...): index 0 is the height, index 1 is the width.
        img_hs.append(img.shape[0])
        img_ws.append(img.shape[1])

    if unreadable:
        print(f"Skipped {unreadable} unreadable image(s)")

    if not img_ws:
        # np.min / np.max raise ValueError on an empty sequence, so stop here.
        print("No readable images found")
        return

    print(f"Image width: min {np.min(img_ws)}, max {np.max(img_ws)}, quartiles {np.quantile(img_ws, [0.25, 0.5, 0.75])}")
    print(f"Image height: min {np.min(img_hs)}, max {np.max(img_hs)}, quartiles {np.quantile(img_hs, [0.25, 0.5, 0.75])}")

In [3]:
# Size statistics over every PNG found in any `*/img/` sub-directory of the dataset folder.
analyse_image_sizes(glob.glob("autoriaNumberplateOcrRu-2020-10-12/*/img/*.png"))

Image width: min 59, max 882, quartiles [170. 228. 301.]
Image height: min 13, max 191, quartiles [36. 49. 65.]


## Analyze data fields

We are interested in the parsed plate texts. Two fields seem to contain this information: "description", and "predicted" under the "moderation" section.

First, we check whether they are always equal. That would be the easiest option, but it is not the case.

Second, we compare these fields against the file names in the validation and test sets, which also appear to encode the plate numbers. They do not always agree either.

Since we don't have a reliable source of true labels, we stick to the "description" field, which matches the file names slightly more often than "predicted" does (about 0.88 vs 0.88 on validation and 0.82 vs 0.81 on test). Keep in mind that noisy labels can hurt the model's performance.

In [2]:
def assert_labels_match(label_fnames):
    """Print whether "description" equals "moderation" -> "predicted" in every label file.

    Args:
        label_fnames: iterable of paths to JSON annotation files. Each file must
            contain a "description" key and a "moderation" object with a
            "predicted" key.

    Returns:
        None. A single "All match: ..." line is printed to stdout.
    """
    def _fields_agree(path):
        # Load one annotation and compare its two candidate label fields.
        with open(path, "rt") as fh:
            annotation = json.load(fh)
        return annotation["description"] == annotation["moderation"]["predicted"]

    agreement_flags = [_fields_agree(path) for path in label_fnames]
    all_match = all(agreement_flags)
    print(f"All match: {all_match}")

In [7]:
# Compare the two label fields across every JSON file in any `*/ann/` sub-directory of the dataset folder.
assert_labels_match(glob.glob("autoriaNumberplateOcrRu-2020-10-12/*/ann/*.json"))

All match: False


In [20]:
def assert_description_fnam_match(label_fnames):
    """Report how often the "description" label equals the text encoded in the file name.

    For every annotation file, the JSON "description" value is compared with the
    part of the file's base name that precedes the first ".".

    Args:
        label_fnames: iterable of paths to JSON annotation files, each containing
            a "description" key.

    Returns:
        None. Prints whether all files match and the fraction that do. When no
        files are given, a notice is printed instead.
    """
    matches = []
    for label_fname in label_fnames:
        # Everything before the first "." of the base name is the reference text.
        fname_text = os.path.basename(label_fname).split(".")[0]
        with open(label_fname, "rt") as f:
            label = json.load(f)
        matches.append(label["description"] == fname_text)

    if not matches:
        # An empty input (e.g. a glob that matched nothing) would otherwise print
        # a vacuous "All match: True" with a 0/0 ratio of nan.
        print("No label files given, nothing to compare")
        return

    n_matched = sum(matches)
    all_match = n_matched == len(matches)
    print(f"All match: {all_match}, match ratio {n_matched / len(matches)}")

In [21]:
# Check "description" against the file names for the JSON files under `val/ann/`.
assert_description_fnam_match(glob.glob("autoriaNumberplateOcrRu-2020-10-12/val/ann/*.json"))

All match: False, match ratio 0.8813080339119903


In [22]:
# Check "description" against the file names for the JSON files under `test/ann/`.
assert_description_fnam_match(glob.glob("autoriaNumberplateOcrRu-2020-10-12/test/ann/*.json"))

All match: False, match ratio 0.8168717047451669


In [23]:
def assert_moderation_fnam_match(label_fnames):
    """Report how often "moderation" -> "predicted" equals the text encoded in the file name.

    For every annotation file, the JSON value at ["moderation"]["predicted"] is
    compared with the part of the file's base name that precedes the first ".".

    Args:
        label_fnames: iterable of paths to JSON annotation files, each containing
            a "moderation" object with a "predicted" key.

    Returns:
        None. Prints whether all files match and the fraction that do. When no
        files are given, a notice is printed instead.
    """
    matches = []
    for label_fname in label_fnames:
        # Everything before the first "." of the base name is the reference text.
        fname_text = os.path.basename(label_fname).split(".")[0]
        with open(label_fname, "rt") as f:
            label = json.load(f)
        matches.append(label["moderation"]["predicted"] == fname_text)

    if not matches:
        # An empty input (e.g. a glob that matched nothing) would otherwise print
        # a vacuous "All match: True" with a 0/0 ratio of nan.
        print("No label files given, nothing to compare")
        return

    n_matched = sum(matches)
    all_match = n_matched == len(matches)
    print(f"All match: {all_match}, match ratio {n_matched / len(matches)}")

In [24]:
# Check "moderation" -> "predicted" against the file names for the JSON files under `val/ann/`.
assert_moderation_fnam_match(glob.glob("autoriaNumberplateOcrRu-2020-10-12/val/ann/*.json"))

All match: False, match ratio 0.8772708922083166


In [25]:
# Check "moderation" -> "predicted" against the file names for the JSON files under `test/ann/`.
assert_moderation_fnam_match(glob.glob("autoriaNumberplateOcrRu-2020-10-12/test/ann/*.json"))

All match: False, match ratio 0.81195079086116


## Checking label length

Finally, we're interested in the lengths of our labels, as they will determine the model's architecture. So we look at the maximum length of the "description" field.

In [12]:
def analyse_description_sizes(label_fnames):
    """Print min, max and quartiles of the "description" lengths in the label files.

    Args:
        label_fnames: iterable of paths to JSON annotation files, each containing
            a "description" key whose value supports ``len``.

    Returns:
        None. A single summary line is printed to stdout.
    """
    def _description_length(path):
        # Load one annotation and measure its "description" value.
        with open(path, "rt") as fh:
            return len(json.load(fh)["description"])

    desc_lengths = [_description_length(path) for path in label_fnames]

    shortest = np.min(desc_lengths)
    longest = np.max(desc_lengths)
    quartiles = np.quantile(desc_lengths, [0.25, 0.5, 0.75])
    print(f"Description length: min {shortest}, max {longest}, quartiles {quartiles}")

In [13]:
# Length statistics of "description" over every JSON file in any `*/ann/` sub-directory of the dataset folder.
analyse_description_sizes(glob.glob("autoriaNumberplateOcrRu-2020-10-12/*/ann/*.json"))

Description length: min 8, max 9, quartiles [8. 8. 9.]
