# Preparing a blur dataset

## VizWiz dataset

### Training

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy  as np
import cv2
import random
import json
import pprint
from collections import Counter

In [None]:
# Root folder of the image files (presumably the VizWiz training images;
# not referenced again in the visible cells).
source = '/media/arnau/PEN/TFG/train/'

# Folder and file name of the CSV table loaded below.
direc = '../data/'
file = "vizwiz_skill_typ_train.csv"

# Read the CSV into a DataFrame.
# NOTE: `data` is rebound further down the notebook when the quality
# annotations are loaded, so this DataFrame is only available until then.
data = pd.read_csv(f"{direc}{file}")

Load images (these images are a combination of 2 VizWiz datasets)

In [None]:
# Parse the JSON file listing every image to consider.
# (presumably a collection of image identifiers that are also keys of the
# quality annotations -- verify against the file)
with open('../data/all_imgs.json', encoding='UTF-8') as json_file:
    all_imgs = json.loads(json_file.read())

Read, from the VizWiz quality-assessment dataset, the annotation that says whether each image is blurred.

In [None]:
# Directory that holds the quality-assessment annotation files.
train_json_qlty = '../data/'

# Load the quality-assessment annotations and keep only the "train" entry.
# NOTE: this rebinds `data`, replacing whatever it was bound to before.
with open(train_json_qlty + "final.json", encoding='UTF-8') as json_file:
    data = json.load(json_file)["train"]

# Report how many annotated entries were loaded.
data_size = len(data)
print(f"Dataset size: {data_size}")

Create two subsets: one with the blurred images and one with the non-blurred images.

In [None]:
# Positions inside each annotation's "flaws" list that this notebook reads.
# NOTE(review): the meaning of the indices (1 = blur, 6 = no flaws) is taken
# from the original inline comments -- confirm against the dataset docs.
BLUR_FLAW_IDX = 1
NO_FLAW_IDX = 6

blurred_imgs = []
non_blurred_imgs = []

for img in all_imgs:
    # Images without an annotation entry (or with an empty one) are skipped.
    # The lookup must be guarded *before* "flaws" is read, otherwise a missing
    # image aborts the whole loop with a KeyError.
    img_info = data.get(img)
    if not img_info:
        continue

    imgs_flaws = img_info["flaws"]
    # `== True` is kept on purpose: it preserves the original comparison
    # semantics whatever the stored value type is (bool or 0/1).
    if imgs_flaws[BLUR_FLAW_IDX] == True:
        # Image is flagged as blurred.
        blurred_imgs.append(img)
    elif imgs_flaws[NO_FLAW_IDX] == True:
        # Image is flagged as having no flaws at all.
        non_blurred_imgs.append(img)
    # Images with other flaws only (neither blurred nor flawless) are dropped.

# Balance the classes: keep at most as many non-blurred images as blurred.
non_blurred_imgs = non_blurred_imgs[: len(blurred_imgs)]
print(f"Number of non-blurred images {len(non_blurred_imgs)}")
print(f"Number of blurred images {len(blurred_imgs)}")

Known issues:

- *Only images flagged with the "no flaws" property are treated as non-blurred; images that have other (non-blur) flaws are left out.*
- *Some images are more blurred than others, which leaves the dataset unbalanced in terms of blur severity.*

### Validation & Test

**NOTE:** The test set is extracted from the validation dataset

In [None]:
# Load the validation quality annotations and keep only the "val" entry.
with open(train_json_qlty + "final_val.json", encoding='UTF-8') as json_file:
    val_annotations = json.load(json_file)

data_val = val_annotations["val"]

In [None]:
val_blurred_imgs = []
val_non_blurred_imgs = []
# Per-class size of the validation split and of the test split:
# ~10% of the (balanced) per-class training size.
size = int(0.1 * len(non_blurred_imgs))  # ~ 10%

for img, img_info in data_val.items():
    # Skip entries with no annotation payload *before* touching "flaws";
    # checking afterwards would never protect the lookup.
    if not img_info:
        continue

    imgs_flaws = img_info["flaws"]
    # Index 1 / index 6 follow the original notebook comments
    # (blurred / non-flawed) -- NOTE(review): confirm against the dataset docs.
    if imgs_flaws[1] == True:
        # Image is flagged as blurred.
        val_blurred_imgs.append(img)
    elif imgs_flaws[6] == True:
        # Image is flagged as having no flaws.
        val_non_blurred_imgs.append(img)

# Slicing never fails on short lists, it just returns fewer items, so make a
# shortage visible instead of silently producing small or unbalanced splits.
for _class_name, _pool in (("non-blurred", val_non_blurred_imgs),
                           ("blurred", val_blurred_imgs)):
    if len(_pool) < 2 * size:
        print(f"WARNING: only {len(_pool)} {_class_name} validation images "
              f"available, {2 * size} needed for the val + test splits")

# First `size` images of each class form the validation split ...
val_non_blurred_split = val_non_blurred_imgs[: size]
val_blurred_split = val_blurred_imgs[: size]

# ... and the next `size` images form the test split (no overlap with val).
test_non_blurred_split = val_non_blurred_imgs[size : 2*size]
test_blurred_split = val_blurred_imgs[size : 2*size]

print(f"Number of val non-blurred images: {len(val_non_blurred_split)}")
print(f"Number of val blurred images: {len(val_blurred_split)}")
print(f"\nNumber of test non-blurred images: {len(test_non_blurred_split)}")
print(f"Number of test blurred images: {len(test_blurred_split)}")

In [None]:
def _with_label(images, label):
    """Pair every image in *images* with the integer class *label*."""
    return [(image, label) for image in images]


# Class labels used throughout: 0 = non-blurred, 1 = blurred.
_NON_BLURRED, _BLURRED = 0, 1

train_nbi = _with_label(non_blurred_imgs, _NON_BLURRED)
train_bi = _with_label(blurred_imgs, _BLURRED)
val_nbi = _with_label(val_non_blurred_split, _NON_BLURRED)
val_bi = _with_label(val_blurred_split, _BLURRED)
test_nbi = _with_label(test_non_blurred_split, _NON_BLURRED)
test_bi = _with_label(test_blurred_split, _BLURRED)

# Each VizWiz split lists its non-blurred samples first, then the blurred ones.
vw_train_data = train_nbi + train_bi  # train set
vw_val_data = val_nbi + val_bi        # validation set
vw_test_data = test_nbi + test_bi     # test set

## Blur dataset (Kaggle)

[Dataset](https://www.kaggle.com/datasets/kwentar/blur-dataset)

The dataset contains three categories of images:

 - **S**: Sharp
 - **M**: Motion Blurred
 - **F**: Defocused Blurred

In [None]:
import os
from PIL import Image

In [None]:
types = ["sharp", "defocused_blurred", "motion_blurred"]

# Fraction of each category that goes to training; the rest is validation.
TRAIN_FRACTION = 0.8

# Collect the full path of every image, one list per category.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the train/validation split below reproducible across runs and machines.
images_by_type = {}
for t in types:
    dir_path = f'/media/arnau/PEN/TFG/blur_dataset/blur_dataset_scaled/{t}/'
    images_by_type[t] = [dir_path + img_name
                         for img_name in sorted(os.listdir(dir_path))]

sharp = images_by_type["sharp"]
defocused_blurred = images_by_type["defocused_blurred"]
motion_blurred = images_by_type["motion_blurred"]


def _split_train_val(paths):
    """Return (train, val): the first TRAIN_FRACTION of *paths* and the rest.

    Taking "everything after the cut" for validation guarantees that no image
    is dropped when the list length is not a multiple of 5.
    """
    cut = int(TRAIN_FRACTION * len(paths))
    return paths[:cut], paths[cut:]


train_sharp, val_sharp = _split_train_val(sharp)
train_defocused_blurred, val_defocused_blurred = _split_train_val(defocused_blurred)
train_motion_blurred, val_motion_blurred = _split_train_val(motion_blurred)

# NOTE(review): these lists hold bare file paths with no class label, unlike
# the VizWiz lists built earlier in the notebook, which hold (image, label)
# pairs -- confirm the consumer derives the label from the path.

# Training set
bk_train = train_sharp + train_defocused_blurred + train_motion_blurred
# Validation set
bk_val = val_sharp + val_defocused_blurred + val_motion_blurred

In [None]:
import os
import shutil

dest_train = '/media/arnau/PEN/TFG/blur_dataset/blur_dataset_scaled/train/'
dest_val = '/media/arnau/PEN/TFG/blur_dataset/blur_dataset_scaled/val/'


def _copy_missing(filenames, dest_dir):
    """Copy each file in *filenames* into *dest_dir* unless it is already there.

    The previous check (`filename not in dest_dir`) was a substring test on the
    directory *string*, which is always true; the presence of the file on disk
    is what has to be tested.
    """
    # Create the destination on first use so the copy cannot fail on a
    # missing directory.
    os.makedirs(dest_dir, exist_ok=True)
    for filename in filenames:
        target = os.path.join(dest_dir, os.path.basename(filename))
        if not os.path.exists(target):
            shutil.copy(filename, target)


_copy_missing(bk_train, dest_train)
_copy_missing(bk_val, dest_val)

# Join VizWiz + Blur Kaggle datasets

In [None]:
import random
from collections import Counter
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the random draws below are repeatable.
random.seed(2023)

# The VizWiz part of each split is drawn to the same size as the Kaggle part,
# so train and val each end up with 2 * len(Kaggle split) items; the test set
# is VizWiz-only and has 2 * len(bk_val) items.
# NOTE(review): the resulting proportions depend on the list lengths and are
# not necessarily 80/10/10 -- check the printed sizes.
size_vw_train = size_bk_train = len(bk_train)
size_vw_val = size_bk_val = len(bk_val)
size_vw_test = len(bk_val) * 2
size_bk_test = 0  # no Kaggle images are used for testing

# VizWiz draws use sampling WITH replacement (random.choices): this works
# whether the VizWiz split is larger or smaller than the requested size, at
# the cost of possible repeated entries.
vw_vw_train_data_amp = random.choices(vw_train_data, k=size_vw_train)
vw_vw_val_data_amp = random.choices(vw_val_data, k=size_vw_val)

# The Kaggle validation list is only shuffled. Sampling it with replacement at
# k == len(bk_val) would duplicate some images and silently drop others, so a
# draw WITHOUT replacement is used to keep every image exactly once.
bk_val = random.sample(bk_val, k=size_bk_val)

vw_vw_test_data_data = random.choices(vw_test_data, k=size_vw_test)

# NOTE(review): Kaggle entries are bare paths while VizWiz entries are
# (image, label) pairs, so the joined lists are heterogeneous -- confirm the
# consumer of the saved file handles both shapes.

# Blur dataset Kaggle + VizWiz dataset (train)
bvw_vw_train_data = bk_train + vw_vw_train_data_amp
# Blur dataset Kaggle + VizWiz dataset (val)
bvw_vw_val_data = bk_val + vw_vw_val_data_amp
# VizWiz test data.
# NOTE: this rebinds `vw_test_data`; re-running the cell resamples from the
# already-sampled list rather than from the original test split.
vw_test_data = vw_vw_test_data_data

print(f"Dataset size: {len(bvw_vw_train_data) + len(bvw_vw_val_data) + len(vw_test_data)}")

Save results

In [None]:
# Gather the three final splits under the keys the output file exposes.
datam = dict(
    train=bvw_vw_train_data,
    val=bvw_vw_val_data,
    test=vw_test_data,
)

# Serialize the mixed dataset to disk as JSON (tuples are written as arrays).
with open('../data/mixed_vwbk_dataset.json', 'w') as outfile:
    json.dump(datam, outfile)