# Imports

In [1]:
import os,zipfile
import tarfile,json
import imagesize
import numpy as np
import glob
from tqdm import tqdm
import pickle
from utils.args import *

# Raw data and processed data paths

In [None]:
# Raw RRC inputs: directory of the archives and directory of the json files
zip_path, json_path = 'rrc_data/zips/', 'rrc_data/json/'

# Directory the archives get extracted into
zip_dest = 'rrc_data/images/'

# Output directories that receive the txt transcriptions and image symlinks
train_path, val_path = 'data/train/', 'data/val/'

# Extracting data

In [None]:
# The archives below must be downloaded manually from the RRC website and
# placed in zip_path before this cell is run:
# https://rrc.cvc.uab.es/?ch=19&com=downloads

# Unpack the two zip archives into zip_dest
for zip_name in ('val.zip', 'test_croppedv2.zip'):
    with zipfile.ZipFile(os.path.join(zip_path, zip_name), 'r') as archive:
        archive.extractall(zip_dest)

# The training archive is a .tar.gz, so it goes through tarfile instead
with tarfile.open(os.path.join(zip_path, 'cropped_train.tar.gz')) as archive:
    archive.extractall(zip_dest)

# Loading Transcriptions from json

In [None]:
# These annotation files must be downloaded manually from the RRC website and
# placed in json_path before this cell is run:
# https://rrc.cvc.uab.es/?ch=19&com=downloads

# JSON is UTF-8 by specification. Decode it explicitly instead of relying on
# the platform's locale encoding, which can fail on (or silently corrupt)
# non-ASCII transcriptions on systems whose default encoding is not UTF-8.
with open(os.path.join(json_path, 'cropped_train_v1.json'), encoding='utf-8') as f:
    data_train = json.load(f)

with open(os.path.join(json_path, 'cropped_val_v1.json'), encoding='utf-8') as f:
    data_val = json.load(f)

# Creating dataset and writing transcriptions to txt files

In [None]:
def _build_split(entries, src_dir, dest_dir):
    """Create image symlinks and transcription txt files for one split.

    For every entry that passes the filters below, a symlink
    '<text_id>.jpg' pointing at the image in src_dir and a file
    '<text_id>.txt' holding the transcription are created in dest_dir.

    Args:
        entries: iterable of dicts with 'text_id' and 'transcription' keys.
        src_dir: directory that holds the extracted images of this split.
        dest_dir: existing directory that receives the symlinks and txt files.
    """
    for entry in tqdm(entries):
        text = entry['transcription']
        stem = str(entry['text_id'])
        img_file = stem + '.jpg'
        src_img = os.path.join(src_dir, img_file)

        # Keep the entry only if the transcription is pure ASCII, the image
        # file is not empty, and none of the excluded characters
        # (exclude_ascii: NULL, '\n', '\t') occur in the transcription
        keep = (text.isascii()
                and os.path.getsize(src_img)
                and not any(x in text for x in exclude_ascii))
        if not keep:
            continue

        # Symlink to the image to avoid an unnecessary data copy. A link left
        # over from a previous run is replaced, since os.symlink would
        # otherwise raise FileExistsError when the cell is re-run.
        link = os.path.join(dest_dir, img_file)
        if os.path.lexists(link):
            os.remove(link)
        os.symlink(os.path.realpath(src_img), link)

        # Store the transcription in a txt file next to the symlink
        with open(os.path.join(dest_dir, stem + '.txt'), 'w') as f:
            f.write(text)


os.makedirs(val_path, exist_ok=True)
os.makedirs(train_path, exist_ok=True)

_build_split(data_val, os.path.join(zip_dest, 'val'), val_path)
_build_split(data_train, os.path.join(zip_dest, 'train'), train_path)

# Delete one problematic file

In [None]:
# Entry 4985023 is problematic (no EOS character), so its image and its
# transcription are removed from the training directory by hand
for bad_name in ('4985023.jpg', '4985023.txt'):
    os.remove(os.path.join(train_path, bad_name))

# Read all txt files in val and train to further clean data

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the file ordering (and everything derived from it) is reproducible
txtl_val = sorted(glob.glob(os.path.join(val_path, '*.txt')))
txtl_train = sorted(glob.glob(os.path.join(train_path, '*.txt')))

# Convert to numpy arrays for easy manipulation (boolean-mask indexing).
# all_files holds the val files first, followed by the train files.
all_files = np.array(txtl_val + txtl_train)

txtl_val = np.array(txtl_val)
txtl_train = np.array(txtl_train)

# Query all the image dimensions

In [None]:
# Read (width, height) of the jpg that sits next to every txt file
dims = [imagesize.get(txt_file[:-3] + 'jpg') for txt_file in tqdm(all_files)]

# Per-file width, height and w/h ratio, in the same order as all_files
w_l = np.array([w for w, _ in dims])
h_l = np.array([h for _, h in dims])
rat_l = np.array([w / h for w, h in dims])

# Reciprocal of the w/h ratio, i.e. the h/w ratio
inv_l = 1 / rat_l

# Query all the transcription lengths

In [None]:
def _first_line_len(txt_file):
    """Return the character count of the first line of txt_file.

    The count is len() of what readline() returns, so a trailing newline,
    if present, is included.
    """
    with open(txt_file) as handle:
        return len(handle.readline())


# Transcription length of every file, in the same order as all_files
str_len = np.array([_first_line_len(txt_file) for txt_file in tqdm(all_files)])

# Filter the files based on cutoff criteria

In [None]:
# A file is rejected as soon as one criterion is violated: h/w or w/h ratio
# too large, image too tall / too wide, image too short / too narrow, or
# transcription too long
bad_ratio = (inv_l > max_HbyW) | (rat_l > max_WbyH)
too_big = (h_l > max_height) | (w_l > max_width)
too_small = (h_l < min_height) | (w_l < min_width)
too_long = str_len > max_str_len

remove_bool = bad_ratio | too_big | too_small | too_long
remove_files = all_files[remove_bool]

# Delete every rejected transcription together with its image
for txt_file in remove_files:
    os.remove(txt_file)
    os.remove(txt_file[:-3] + 'jpg')
    

# Separate dataset into Tall, Square, and Wide

In [None]:
# The per-file arrays list the val files first and the train files after
# them, so the removal mask is cut at the number of val files
val_len = len(txtl_val)
val_remove_bool, train_remove_bool = np.split(remove_bool, [val_len])

# w/h ratio bounds of the "square" category
sq_low = 1 / max_HbyW_for_sq
sq_high = max_WbyH_for_sq

# Below the lower bound is tall, above the upper bound is wide, and anything
# in between (bounds included) is square
tall = rat_l < sq_low
wide = rat_l > sq_high
square = (rat_l >= sq_low) & (rat_l <= sq_high)

# Filenames for each category extracted

In [None]:
# A file is listed under a category only when it matches that category's
# aspect-ratio mask and was not marked for removal by the filtering step
val_keep = ~val_remove_bool
train_keep = ~train_remove_bool

tall_val_files = txtl_val[tall[:val_len] & val_keep]
square_val_files = txtl_val[square[:val_len] & val_keep]
wide_val_files = txtl_val[wide[:val_len] & val_keep]

tall_train_files = txtl_train[tall[val_len:] & train_keep]
square_train_files = txtl_train[square[val_len:] & train_keep]
wide_train_files = txtl_train[wide[val_len:] & train_keep]

# Create dictionary which will be used at train time

In [None]:
# Map each aspect-ratio category to the bare txt file names (directory part
# stripped). os.path.basename is used rather than splitting on os.sep because
# the dataset directories are written with hard-coded '/' separators, which an
# os.sep split does not strip on platforms where os.sep is not '/'.
val_dict = {'tall': [os.path.basename(x) for x in tall_val_files],
            'square': [os.path.basename(x) for x in square_val_files],
            'wide': [os.path.basename(x) for x in wide_val_files]}

train_dict = {'tall': [os.path.basename(x) for x in tall_train_files],
              'square': [os.path.basename(x) for x in square_train_files],
              'wide': [os.path.basename(x) for x in wide_train_files]}


# Save dictionaries to pkl file for easy access

In [None]:
# Write each split dictionary to its own pickle file (paths are relative to
# the current working directory)
for pkl_name, split_dict in (('val_dict.pkl', val_dict),
                             ('train_dict.pkl', train_dict)):
    with open(pkl_name, 'wb') as out_file:
        pickle.dump(split_dict, out_file)