In [1]:
import numpy as np
import pandas
import matplotlib.pyplot as plt
import os
import json
import random
import cv2
from json import JSONEncoder
import skimage

# VizWiz

[Data](https://vizwiz.org/tasks-and-datasets/image-quality-issues/)

In [2]:
train_dir = '/media/arnau/SSD/VizWiz/captioning/train/'
val_dir = '/media/arnau/SSD/VizWiz/captioning/val/'
test_dir = '/media/arnau/SSD/VizWiz/captioning/test/'
annots_dir = '/media/arnau/SSD/VizWiz/captioning/annotations/'
annots_qi_dir = '/media/arnau/SSD/VizWiz/quality_issues/annotations/'

Train / Val / Test splits images **annotations**. Used to retrieve if there is **text present** in images

In [3]:
with open(annots_dir + "train.json", encoding='UTF-8') as tr_json_file:
    train_data = json.load(tr_json_file)
    train_annots = train_data['images']

with open(annots_dir + "val.json", encoding='UTF-8') as v_json_file:
    val_data = json.load(v_json_file)
    val_annots = val_data['images']

with open(annots_dir + "test.json", encoding='UTF-8') as ts_json_file:
    test_data = json.load(ts_json_file)
    test_annots = test_data['images']

In [4]:
# Load custom test data
f = '/media/arnau/PEN/TFG/hf_model_test_res.json'
#f = 'D://TFG//hf_model_test_res.json'
with open(f) as user_file:
    test_data = json.load(user_file)

model_test_imgs = list(test_data.keys())

Images containing text for each split

In [5]:
train_imgs_w_text = []
val_imgs_w_text = []
test_imgs_w_text = []

annots = {'train': train_annots, 
        'val' : val_annots,
        'test' : test_annots}

for split, data in annots.items():
    for d in data:
        if d["text_detected"] == True and split == 'train':
            train_imgs_w_text .append(d["file_name"])
        elif d["text_detected"] == True and split == 'val':
            val_imgs_w_text .append(d["file_name"])
        elif d["text_detected"] == True and split == 'test':
            test_imgs_w_text .append(d["file_name"])
        
print(f"{len(train_imgs_w_text)} training images containing text")
print(f"{len(val_imgs_w_text)} training images containing text")
print(f"{len(test_imgs_w_text)} training images containing text")

14701 training images containing text
5018 training images containing text
5093 training images containing text


Train / Val / Test splits images **quality annotations**. Used to retrieve quality flaws **(blur)** in images

In [6]:
with open(annots_qi_dir + "train.json", encoding='UTF-8') as tqif:
    train_qi_data = json.load(tqif)
    
with open(annots_qi_dir + "val.json", encoding='UTF-8') as vqif:
    val_qi_data = json.load(vqif)
    
with open(annots_qi_dir + "test.json", encoding='UTF-8') as tsqif:
    test_qi_data = json.load(tsqif)

In [7]:
blured_train_images_with_text = []
non_blured_train_images_with_text = []

blured_val_images_with_text = []
non_blured_val_images_with_text = []

blured_test_images_with_text = []
non_test_train_images_with_text = []

annots_qi = {'train': train_qi_data, 
            'val' : val_qi_data}

for split, data in annots_qi.items():
    for d in data:
        if d["image"] in train_imgs_w_text: # if image has text
            if d["flaws"]["BLR"] >= 3: # if image is blur
                blured_train_images_with_text.append(d["image"])
            elif d["flaws"]["NON"] >= 3: # if image is clean
                non_blured_train_images_with_text.append(d["image"])
        
        # Since we use a custom test set (extracted from val set) that also contains 
        # images from VizWiz, we have to check that the validation images are not in the test set. 
        # More on why we do this in vqa_hf notebook
        elif d["image"] in val_imgs_w_text and d["image"] not in model_test_imgs: 
            if d["flaws"]["BLR"] >= 3: 
                blured_val_images_with_text.append(d["image"])
            elif d["flaws"]["NON"] >= 3: 
                non_blured_val_images_with_text.append(d["image"])
            

print(f"{len(blured_train_images_with_text)} training images blured with text")
print(f"{len(non_blured_train_images_with_text)} training images non-blured with text")

print(f"{len(blured_val_images_with_text)} val images blured with text")
print(f"{len(non_blured_val_images_with_text)} val images non-blured with text")

4196 training images blured with text
4496 training images non-blured with text
1248 val images blured with text
1049 val images non-blured with text


In [8]:
def balance(l):  
    """
    Balance a given list to have the same proportion of elements
    for each class
    """
    
    n = len(l) // 2
    arr = np.array(l, dtype=object)
    zeros = arr[arr[:,1] == 0]
    ones = arr[arr[:,1] == 1]
    np.random.shuffle(zeros)
    np.random.shuffle(ones)
    final_arr = np.concatenate((zeros[:n], ones[:n]))
    final_list = list(map(tuple, final_arr))
    
    return final_list

## Train data

Train set containing blur and non-blur images

In [9]:
blured_train_dataset = np.asarray(list(map(lambda im : (im, 1), blured_train_images_with_text)), dtype=object)
non_blured_train_dataset = np.asarray(list(map(lambda im : (im, 0), non_blured_train_images_with_text)), dtype=object)

vw_train_set = np.asarray(balance(
                                np.vstack((blured_train_dataset, non_blured_train_dataset)
                                         )), 
                          dtype=object)

print(f"VizWiz TRAIN set size {vw_train_set.shape[0]}")

VizWiz TRAIN set size 8542


## Val + Test

In [10]:
blured_val_dataset = np.asarray(list(map(lambda im : (im, 1), blured_val_images_with_text)), dtype=object)
non_blured_val_dataset = np.asarray(list(map(lambda im : (im, 0), non_blured_val_images_with_text)), dtype=object)

test_n_val_set = np.asarray(balance(
                                np.vstack((blured_val_dataset, non_blured_val_dataset))
                        ), 
                        dtype=object)

In [11]:
np.random.shuffle(test_n_val_set)
# Half data for val set, half data for test set
test_set = test_n_val_set[: len(test_n_val_set) // 2]
val_set = test_n_val_set[len(test_n_val_set) // 2: ]

In [12]:
vw_test_set = np.asarray(balance(test_set), dtype=object)
vw_val_set = np.asarray(balance(val_set), dtype=object)

print(f"VizWiz TEST set size {vw_test_set.shape[0]}")
print(f"VizWiz VAL set size {vw_val_set.shape[0]}")

VizWiz TEST set size 1082
VizWiz VAL set size 1065


In [13]:
for img in list(vw_test_set[:, 0]):
    assert img not in list(vw_val_set[:, 0]), "ERROR"

In [14]:
#plt.imshow(plt.imread(train_dir + random.choice(blured_train_images_with_text)))
#plt.axis('off')
#plt.show()
#plt.imshow(plt.imread(train_dir + random.choice(non_blured_train_images_with_text)))
#plt.axis('off')
#plt.show()

## Save

In [15]:
total_size = len(vw_test_set) + len(vw_val_set) + len(vw_train_set)

np.random.shuffle(vw_test_set)
np.random.shuffle(vw_train_set)
np.random.shuffle(vw_val_set)

vw_test_set = vw_test_set[: int(total_size * 0.1)]
vw_val_set = vw_val_set[: int(total_size * 0.1)]
vw_train_set = vw_train_set[: int(total_size * 0.8)]

vw_data = {'train' : vw_train_set,
           'val' : vw_val_set,
           'test' : vw_test_set,
          }

class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

with open('/home/arnau/tfg/GED-TFG/data/vw_blur_dataset.json', 'w') as outfile:
    json.dump(vw_data, outfile, cls=NumpyArrayEncoder)
print("Files saved")

Files saved


# TextVQA

[Data](https://textvqa.org/dataset/)

In [16]:
tvqa_imgs_dir = './/TextVQA//train_images//'

In [17]:
imgs_list = os.listdir(tvqa_imgs_dir)

random.shuffle(imgs_list)

print(f"TextVQA Dataset size {len(imgs_list)}")

FileNotFoundError: [Errno 2] No such file or directory: './/TextVQA//train_images//'

In [None]:
# Remove dark images


## Train data

In [None]:
tvqa_train_imgs = imgs_list[: len(imgs_list) // 2]
print(f"TextVQA TRAIN size: {len(tvqa_train_imgs)}")

## Val data

In [None]:
tvqa_val_imgs = imgs_list[len(imgs_list) // 2 :]
print(f"TextVQA VAL size: {len(tvqa_val_imgs)}")

## Blur data augmentation

In [None]:
tvqa_train_to_add_blur = tvqa_train_imgs[: len(tvqa_train_imgs) // 2 :]
tvqa_train_non_blur = tvqa_train_imgs[len(tvqa_train_imgs) // 2 :]

tvqa_val_to_add_blur = tvqa_val_imgs[: len(tvqa_val_imgs) // 2 :]
tvqa_val_non_blur = tvqa_val_imgs[len(tvqa_val_imgs) // 2 :]

In [None]:
def add_blur(img_name, typ, orient="h"):

    if type(img_name) == str:
        img = cv2.imread(img_name, cv2.IMREAD_COLOR)

    if typ == "motion":
        kernel_size = 20

        kernel_v = np.zeros((kernel_size, kernel_size))
        kernel_h = np.copy(kernel_v)
        kernel_v[:, int((kernel_size - 1)/2)] = np.ones(kernel_size)
        kernel_h[int((kernel_size - 1)/2), :] = np.ones(kernel_size)
        kernel_v /= kernel_size
        kernel_h /= kernel_size

        vertical_mb = cv2.filter2D(img, -1, kernel_v)
        horizonal_mb = cv2.filter2D(img, -1, kernel_h)

        if orient == 'h':
            return horizonal_mb
        elif orient == "v":
            return vertical_mb
        
    elif typ == "gaussian":
        sigma = 4.0

        # apply Gaussian blur, creating a new image
        blurred = skimage.filters.gaussian(
            img, sigma=(sigma, sigma), truncate=3.5, channel_axis=2)
        return blurred

In [None]:
u, d, t = 0, 0, 0

for i, img in enumerate(tvqa_train_to_add_blur[:100]):
    if i >= 0 and i <= 33:
        img_v_mblur = add_blur(tvqa_imgs_dir + img, "motion", "v")
        cv2.imwrite(blur_dir + img, img_v_mblur)
        u += 1
    elif i >= 33 and i <= 66:
        img_h_mblur = add_blur(tvqa_imgs_dir + img, "motion", "h")
        cv2.imwrite(blur_dir + img, img_h_mblur)
        d += 1
    elif i >= 66:
        img_gsn_blur = add_blur(tvqa_imgs_dir + img, "gaussian")
        cv2.imwrite(blur_dir + img, img_gsn_blur)
        t += 1
print(u, d, t)

In [None]:
blur_dir = './/TextVQA//blur//'

if len(os.listdir(blur_dir)) == 0:
    
    for i, img in enumerate(tvqa_train_to_add_blur):
        if i>= 0 and i <= len(tvqa_train_to_add_blur) // 3:
            img_v_mblur = add_blur(tvqa_imgs_dir + img, "motion", "v")
            cv2.imwrite(blur_dir + img, img_v_mblur)
            i >= 33 and i <= 66:
        elif i >= len(tvqa_train_to_add_blur) // and i <= (len(tvqa_train_to_add_blur) // 3) * 2:
            img_h_mblur = add_blur(tvqa_imgs_dir + img, "motion", "h")
            cv2.imwrite(blur_dir + img, img_h_mblur)
        elif i >= (len(tvqa_train_to_add_blur) // 3) * 2:
            img_gsn_blur = add_blur(tvqa_imgs_dir + img, "gaussian")
            cv2.imwrite(blur_dir + img, img_gsn_blur)
         
    for i, img in enumerate(tvqa_val_to_add_blur):
        if i>= 0 and i <= len(tvqa_val_to_add_blur) // 3:
            img_v_mblur = add_blur(plt.imread(tvqa_imgs_dir + img), "motion", "v")
            cv2.imwrite(blur_dir + img, img_v_mblur)
        elif i >= len(tvqa_val_to_add_blur) // 3 and i <= (len(tvqa_val_to_add_blur) // 3) * 2:
            img_h_mblur = add_blur(plt.imread(tvqa_imgs_dir + img), "motion", "h")
            cv2.imwrite(blur_dir + img, img_h_mblur)
        elif i >= (len(tvqa_val_to_add_blur) // 3) * 2:
            img_gsn_blur = add_blur(plt.imread(tvqa_imgs_dir + img), "gaussian")
            cv2.imwrite(blur_dir + img, img_gsn_blur)

# VizWiz + TextVQA