<a href="https://colab.research.google.com/github/balandongiv/mmocr_tutorial/blob/main/helper_download_det_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MMOCR Tutorial

Welcome to MMOCR! This is the **unofficial** colab tutorial for using MMOCR. In this tutorial, you will learn how to

- Automatically download public OCR datasets that are commonly used for training and validating text detection and recognition models.
- Prepare mmocr-compatible annotation format

Prepared by `Rodney Petrus Balandong`

# Setting

In [None]:
!pip install wget
import os
from os.path import exists,isfile
import requests
import shutil
import logging
import wget
from google.colab import drive
# Detach any Google Drive mount left over from a previous session.
# NOTE(review): no visible cell mounts Drive again, yet later cells write under
# /content/drive/MyDrive -- confirm whether a `drive.mount(...)` call is intended.
drive.flush_and_unmount()
# Show INFO-level records so the download/unpack progress messages are visible.
logging.basicConfig(level=logging.INFO)
# Smoke test for the logging configuration.
logging.info('test')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


INFO:root:test


Drive not mounted, so nothing to flush and unmount.


In [None]:
# Install mmcv-full so that the CUDA operators can be used
# (wheel index selected for CUDA 11.3 / torch 1.11.0).
!pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html

# Install mmdetection
!pip install mmdet

# Install mmocr from source in editable mode. The %cd leaves the repository as
# the working directory; presumably that is what lets later cells import
# `tools.data...` -- confirm.
!git clone https://github.com/open-mmlab/mmocr.git
%cd mmocr
!pip install -r requirements.txt
!pip install -v -e .

In [None]:
def check_dw(sfile,url,wget_dw=False):
  """Download ``url`` to ``sfile`` unless ``sfile`` already exists.

  Args:
    sfile (str): Destination path. An existing file is never downloaded again.
    url (str): Address to fetch.
    wget_dw (bool): Use ``wget.download`` instead of ``requests``.

  Raises:
    Exception: Whatever ``raise_for_status`` raises for an HTTP error status
      (``requests.HTTPError`` with the real library). No file is left behind
      in that case.
  """
  if isfile(sfile):
    return

  logging.info(f"Downloading {os.path.split(sfile)[-1]} from"
               f" {url}.")
  if wget_dw:
    wget.download(url, out=sfile)
    return

  # Stream into a temporary sibling and rename only on success: a failed or
  # interrupted transfer must not leave a truncated file that the `isfile`
  # check above would later mistake for a finished download.
  partial = f"{sfile}.part"
  try:
    # NOTE(review): verify=False (kept from the original) disables TLS
    # certificate validation -- confirm the hosts really require it.
    with requests.get(url, verify=False, stream=True) as r:
      # Refuse to store an HTML error page as if it were the dataset.
      r.raise_for_status()
      with open(partial, 'wb') as f:
        # Chunked writes keep memory flat even for multi-gigabyte archives.
        for chunk in r.iter_content(chunk_size=1 << 20):
          if chunk:
            f.write(chunk)
    os.replace(partial, sfile)
  finally:
    if isfile(partial):
      os.remove(partial)
def ch_make_folder(f):
  """Create directory ``f`` (with any missing parents) when nothing exists at that path."""
  if os.path.exists(f):
    return
  os.makedirs(f)

def move_files(source_dir,dest):
  """Move every entry found directly inside ``source_dir`` to ``dest``."""
  for entry in os.listdir(source_dir):
    origin = os.path.join(source_dir, entry)
    shutil.move(origin, dest)

def move_files_to_des(file_names,dest):
    """Move each path listed in ``file_names`` to ``dest``, in the given order."""
    for src_path in file_names:
        shutil.move(src_path, dest)

# ICDAR 2011 (Born-Digital Images)


## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#icdar-2011-born-digital-images)

In [None]:
def icdar2011(npath,cleanup=False):
    """Fetch the ICDAR 2011 (Born-Digital) recognition data and convert its labels.

    Everything is stored under ``<npath>/icdar2011``: the archives are unpacked
    to ``crops/train`` and ``crops/test``, both ground-truth files are gathered
    in ``annotations`` and then passed to the MMOCR ic11 converter.

    Args:
        npath (str): Parent directory of the dataset folder.
        cleanup (bool): Delete the two downloaded image archives at the end.
    """
    # Recognition
    root = os.path.join(npath, 'icdar2011')
    annot_dir = os.path.join(root, 'annotations')
    crops_dir = os.path.join(root, 'crops')

    remote = {
        'tr_img': "https://rrc.cvc.uab.es/downloads/Challenge1_Training_Task3_Images_GT.zip",
        'ts_img': 'https://rrc.cvc.uab.es/downloads/Challenge1_Test_Task3_Images.zip',
        'ts_lbl': 'https://rrc.cvc.uab.es/downloads/Challenge1_Test_Task3_GT.txt',
    }
    # Each download is kept in `root` under the last component of its URL.
    local = {key: os.path.join(root, url.rsplit('/', 1)[-1])
             for key, url in remote.items()}

    for folder in (root, crops_dir, annot_dir):
        ch_make_folder(folder)

    for key in ('tr_img', 'ts_img', 'ts_lbl'):
        check_dw(local[key], remote[key])

    train_dir = os.path.join(crops_dir, 'train')
    test_dir = os.path.join(crops_dir, 'test')

    logging.info('Unpacking file')
    for key, target in (('tr_img', train_dir), ('ts_img', test_dir)):
        logging.info(f"Unpacking {local[key]} to {target}. ")
        shutil.unpack_archive(local[key], target)

    logging.info('Move the annotation')
    shutil.move(local['ts_lbl'],
                os.path.join(annot_dir, 'Challenge1_Test_Task3_GT.txt'))
    # The training ground truth is the gt.txt moved out of the unpacked train folder.
    shutil.move(os.path.join(train_dir, 'gt.txt'),
                os.path.join(annot_dir, 'Challenge1_Train_Task3_GT.txt'))

    # Text Recognition
    from tools.data.textrecog.ic11_converter import convert_annotations
    label_format = 'jsonl'
    for split in ('Train', 'Test'):
        convert_annotations(root, split, label_format)
        logging.info(f'{split} split converted.')

    # Expected result:
    #     ├── icdar2011
    #     │   ├── crops
    #     │   ├── train_label.jsonl
    #     │   └── test_label.jsonl

    if cleanup:
        logging.info ('Cleaning up')
        for key in ('tr_img', 'ts_img'):
            os.remove(local[key])


icdar2011('/content/drive/MyDrive/dataset/recognition')

INFO:root:Downloading Challenge1_Training_Task3_Images_GT.zip from https://rrc.cvc.uab.es/downloads/Challenge1_Training_Task3_Images_GT.zip.
INFO:root:Downloading Challenge1_Test_Task3_Images.zip from https://rrc.cvc.uab.es/downloads/Challenge1_Test_Task3_Images.zip.
INFO:root:Downloading Challenge1_Test_Task3_GT.txt from https://rrc.cvc.uab.es/downloads/Challenge1_Test_Task3_GT.txt.
INFO:root:Unpacking file
INFO:root:Unpacking /content/drive/MyDrive/dataset/recognition/icdar2011/Challenge1_Training_Task3_Images_GT.zip to /content/drive/MyDrive/dataset/recognition/icdar2011/crops/train. 
INFO:root:Unpacking /content/drive/MyDrive/dataset/recognition/icdar2011/Challenge1_Test_Task3_Images.zip to /content/drive/MyDrive/dataset/recognition/icdar2011/crops/test. 
INFO:root:Move the annotation
INFO:root:Generating grammar tables from /usr/lib/python3.7/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.7/lib2to3/PatternGrammar.txt
INFO:root:Train split converted.


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#icdar-2011-born-digital-images)
ICDAR 2011 (Born-Digital Images)

In [None]:
def icdar2011(npath,cleanup=False):
    """Fetch the ICDAR 2011 (Born-Digital) detection data and build instance files.

    Images and ground truth are unpacked to ``imgs/<split>`` and
    ``annotations/<split>`` below ``<npath>/icdar2011``, then
    ``instances_training.json`` and ``instances_test.json`` are written with
    the MMOCR ic11 detection converter helpers.

    NOTE(review): this definition reuses the name of the recognition function
    from the earlier cell and replaces it in the kernel.

    Args:
        npath (str): Parent directory of the dataset folder.
        cleanup (bool): Delete the two downloaded image archives at the end.
    """
    # Detection
    root = os.path.join(npath, 'icdar2011')
    annot_dir = os.path.join(root, 'annotations')
    img_dir = os.path.join(root, 'imgs')

    remote = {
        'tr_img': "https://rrc.cvc.uab.es/downloads/Challenge1_Training_Task12_Images.zip",
        'tr_lbl': 'https://rrc.cvc.uab.es/downloads/Challenge1_Training_Task1_GT.zip',
        'ts_img': 'https://rrc.cvc.uab.es/downloads/Challenge1_Test_Task12_Images.zip',
        'ts_lbl': 'https://rrc.cvc.uab.es/downloads/Challenge1_Test_Task1_GT.zip',
    }
    # Each archive is kept in `root` under the last component of its URL.
    local = {key: os.path.join(root, url.rsplit('/', 1)[-1])
             for key, url in remote.items()}

    ch_make_folder(root)

    for key in ('tr_img', 'ts_img', 'ts_lbl', 'tr_lbl'):
        check_dw(local[key], remote[key])

    unpack_plan = (
        ('tr_img', os.path.join(img_dir, 'training')),
        ('ts_img', os.path.join(img_dir, 'test')),
        ('tr_lbl', os.path.join(annot_dir, 'training')),
        ('ts_lbl', os.path.join(annot_dir, 'test')),
    )

    logging.info('Unpacking file')
    for key, target in unpack_plan:
        logging.info(f"Unpacking {local[key]} to {target}. ")
        shutil.unpack_archive(local[key], target)

    # Equivalent of:
    #   python tools/data/textdet/ic11_converter.py PATH/TO/icdar2011 --nproc 4
    import mmcv
    from mmocr.utils import convert_annotations
    from tools.data.textdet.ic11_converter import collect_files,collect_annotations

    workers = 10
    for split in ('training', 'test'):
        print(f'Processing {split} set...')
        with mmcv.Timer(print_tmpl='It takes {}s to convert annotation'):
            pairs = collect_files(os.path.join(root, 'imgs', split),
                                  os.path.join(root, 'annotations', split))
            infos = collect_annotations(pairs, nproc=workers)
            convert_annotations(
                infos, os.path.join(root, f'instances_{split}.json'))

    # Expected result:
    #     │── icdar2011
    #     │   ├── imgs
    #     │   ├── instances_test.json
    #     │   └── instances_training.json

    if cleanup:
        logging.info ('Cleaning up')
        for key in ('tr_img', 'ts_img'):
            os.remove(local[key])


icdar2011('/content/drive/MyDrive/dataset/detection')

Step 2: Generate instances_training.json and instances_test.json with the following command:

`python tools/data/textdet/ic11_converter.py PATH/TO/icdar2011 --nproc 4`

# ICDAR 2013 (Focused Scene Text)

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#icdar-2013-deprecated)

In [None]:
def icdar2013(npath,cleanup=False):
    """Fetch the ICDAR 2013 (Focused Scene Text) recognition data and convert its labels.

    Everything is stored under ``<npath>/icdar2013``: the archives are unpacked
    to ``crops/train`` and ``crops/test``, both ground-truth files are gathered
    in ``annotations`` and then passed to the MMOCR ic13 converter.

    Args:
        npath (str): Parent directory of the dataset folder.
        cleanup (bool): Delete the two downloaded image archives at the end.
    """
    # Recognition
    root = os.path.join(npath, 'icdar2013')
    annot_dir = os.path.join(root, 'annotations')
    crops_dir = os.path.join(root, 'crops')

    remote = {
        'tr_img': "https://rrc.cvc.uab.es/downloads/Challenge2_Training_Task3_Images_GT.zip",
        'ts_img': 'https://rrc.cvc.uab.es/downloads/Challenge2_Test_Task3_Images.zip',
        'ts_lbl': 'https://rrc.cvc.uab.es/downloads/Challenge2_Test_Task3_GT.txt',
    }
    # Each download is kept in `root` under the last component of its URL.
    local = {key: os.path.join(root, url.rsplit('/', 1)[-1])
             for key, url in remote.items()}

    for folder in (root, crops_dir, annot_dir):
        ch_make_folder(folder)

    for key in ('tr_img', 'ts_img', 'ts_lbl'):
        check_dw(local[key], remote[key])

    train_dir = os.path.join(crops_dir, 'train')
    test_dir = os.path.join(crops_dir, 'test')

    logging.info('Unpacking file')
    for key, target in (('tr_img', train_dir), ('ts_img', test_dir)):
        logging.info(f"Unpacking {local[key]} to {target}. ")
        shutil.unpack_archive(local[key], target)

    logging.info('Move the annotation')
    shutil.move(local['ts_lbl'],
                os.path.join(annot_dir, 'Challenge2_Test_Task3_GT.txt'))
    # The training ground truth is the gt.txt moved out of the unpacked train folder.
    shutil.move(os.path.join(train_dir, 'gt.txt'),
                os.path.join(annot_dir, 'Challenge2_Train_Task3_GT.txt'))

    label_format = 'jsonl'
    from tools.data.textrecog.ic13_converter import convert_annotations
    for split in ('Train', 'Test'):
        convert_annotations(root, split, label_format)
        print(f'{split} split converted.')

    if cleanup:
        logging.info ('Cleaning up')
        for key in ('tr_img', 'ts_img'):
            os.remove(local[key])

    # Folders produced by this function:
    #   ├── icdar2013
    #   │   ├── annotations
    #   │   └── crops
    #   │       ├── train
    #   │       └── test
    # The converter adds its label files; their names are not visible from here.

icdar2013('/content/drive/MyDrive/dataset/recognition')

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#icdar-2013-focused-scene-text)

In [None]:
def icdar2013(npath,cleanup=False):
    """Fetch the ICDAR 2013 (Focused Scene Text) detection data and build instance files.

    Images and ground truth are unpacked to ``imgs/<split>`` and
    ``annotations/<split>`` below ``<npath>/icdar2013``, then
    ``instances_training.json`` and ``instances_test.json`` are written with
    the MMOCR ic13 detection converter helpers.

    NOTE(review): this definition reuses the name of the recognition function
    from the earlier cell and replaces it in the kernel.

    Args:
        npath (str): Parent directory of the dataset folder.
        cleanup (bool): Delete the two downloaded image archives at the end.
            The flag used to be ignored; it now removes the same archives as
            the ICDAR 2011 detection function.
    """
    # Detection
    root=os.path.join(npath,'icdar2013')
    dannot=os.path.join(root,'annotations')
    dcrops=os.path.join(root,'imgs')
    dpath=dict(tr_img=dict(URL = "https://rrc.cvc.uab.es/downloads/Challenge2_Training_Task12_Images.zip",
                           fpath=os.path.join(root,'Challenge2_Training_Task12_Images.zip')),
               ts_img=dict(URL='https://rrc.cvc.uab.es/downloads/Challenge2_Test_Task12_Images.zip',
                           fpath=os.path.join(root,'Challenge2_Test_Task12_Images.zip')),
               tr_lbl=dict(URL='https://rrc.cvc.uab.es/downloads/Challenge2_Training_Task1_GT.zip',
                           fpath=os.path.join(root,'Challenge2_Training_Task1_GT.zip')),
               ts_lbl=dict(URL='https://rrc.cvc.uab.es/downloads/Challenge2_Test_Task1_GT.zip',
                           fpath=os.path.join(root,'Challenge2_Test_Task1_GT.zip')),
               )

    ch_make_folder(root)

    for dp in (['tr_img','ts_img','ts_lbl','tr_lbl']):
        check_dw(dpath[dp]['fpath'],dpath[dp]['URL'])

    dimg_tr=os.path.join(dcrops,'training')
    dimg_ts=os.path.join(dcrops,'test')
    lbl_tr=os.path.join(dannot,'training')
    lbl_ts=os.path.join(dannot,'test')

    logging.info('Unpacking file')
    for dp, dirc_f in zip(['tr_img','ts_img','tr_lbl','ts_lbl'],
                          [dimg_tr,dimg_ts,lbl_tr,lbl_ts]):
        logging.info(f"Unpacking {dpath[dp]['fpath']} to {dirc_f}. ")
        shutil.unpack_archive(dpath[dp]['fpath'],dirc_f)

    # Equivalent of:
    #   python tools/data/textdet/ic13_converter.py PATH/TO/icdar2013 --nproc 4
    import mmcv

    import os.path as osp
    from mmocr.utils import convert_annotations
    from tools.data.textdet.ic13_converter import collect_files,collect_annotations
    nproc=10

    for split in ['training', 'test']:
        print(f'Processing {split} set...')
        with mmcv.Timer(print_tmpl='It takes {}s to convert IC13 annotation'):
            # Unlike the ic11 helper, this collect_files also receives the split name.
            files = collect_files(
                osp.join(root, 'imgs', split),
                osp.join(root, 'annotations', split), split)
            image_infos = collect_annotations(files, nproc=nproc)
            convert_annotations(
                image_infos, osp.join(root,
                                      'instances_' + split + '.json'))

    # Expected result:
    # │── icdar2013
    # │   ├── imgs
    # │   ├── instances_test.json
    # │   └── instances_training.json

    if cleanup:
        logging.info ('Cleaning up')
        for dp in ['tr_img','ts_img']:
            os.remove(dpath[dp]['fpath'])


icdar2013('/content/drive/MyDrive/dataset/detection')

# icdar2015

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#icdar-2015)

In [None]:
def icdar2015(npath,cleanup=False):
    """Fetch the ICDAR 2015 word-recognition images and the ready-made label files.

    No converter is run for this dataset: ``train_label.txt`` and
    ``test_label.txt`` are downloaded as they are from the OpenMMLab mirror.

    Args:
        npath (str): Parent directory; data is stored in ``<npath>/icdar2015``.
        cleanup (bool): Delete the two downloaded image archives at the end.
    """
    # Recognition
    root = os.path.join(npath, 'icdar2015')
    test_dir = os.path.join(root, 'ch4_test_word_images_gt')
    train_dir = os.path.join(root, 'ch4_training_word_images_gt')

    remote = {
        'tr_img': "https://rrc.cvc.uab.es/downloads/ch4_training_word_images_gt.zip",
        'ts_img': 'https://rrc.cvc.uab.es/downloads/ch4_test_word_images_gt.zip',
        # The RRC Challenge4_Test_Task3_GT.txt is deliberately not used; the
        # converted label files come from the OpenMMLab mirror instead.
        'ts_lbl': 'https://download.openmmlab.com/mmocr/data/mixture/icdar_2015/test_label.txt',
        'tr_lbl': 'https://download.openmmlab.com/mmocr/data/mixture/icdar_2015/train_label.txt',
    }
    # Each download is kept in `root` under the last component of its URL.
    local = {key: os.path.join(root, url.rsplit('/', 1)[-1])
             for key, url in remote.items()}

    for folder in (root, test_dir, train_dir):
        ch_make_folder(folder)

    for key in ('tr_img', 'ts_img', 'ts_lbl', 'tr_lbl'):
        check_dw(local[key], remote[key])

    logging.info('Unpacking file')
    for key, target in (('tr_img', train_dir), ('ts_img', test_dir)):
        logging.info(f"Unpacking {local[key]} to {target}. ")
        shutil.unpack_archive(local[key], target)

    if cleanup:
        logging.info ('Cleaning up')
        for key in ('tr_img', 'ts_img'):
            os.remove(local[key])

    # Resulting layout:
    # ├── icdar2015
    # │   ├── ch4_training_word_images_gt
    # │   ├── ch4_test_word_images_gt
    # │   ├── train_label.txt
    # │   └── test_label.txt
icdar2015('/content/drive/MyDrive/dataset/recognition')

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#icdar-2015)

In [None]:
def icdar2015(npath,cleanup=False):
    """Fetch the ICDAR 2015 detection data and build the instance json files.

    Images and ground truth are unpacked to ``imgs/<split>`` and
    ``annotations/<split>`` below ``<npath>/icdar2015``, then
    ``instances_training.json`` and ``instances_test.json`` are written with
    the MMOCR ``icdar_converter`` helpers.

    Open question from the original author: the conversion reports many
    "ignore text" instances and the reason is not yet understood.

    NOTE(review): this definition reuses the name of the recognition function
    from the earlier cell and replaces it in the kernel.

    Args:
        npath (str): Parent directory of the dataset folder.
        cleanup (bool): Delete the two downloaded image archives at the end
            (the flag used to be ignored).
    """
    # Detection
    root=os.path.join(npath,'icdar2015')
    dannot=os.path.join(root,'annotations')
    dcrops=os.path.join(root,'imgs')

    # Each archive is stored under its own name. The local names of `tr_lbl`
    # and `ts_img` used to be swapped, so a correctly named, pre-downloaded
    # archive would have been unpacked into the wrong folder.
    dpath=dict(tr_img=dict(URL = "https://rrc.cvc.uab.es/downloads/ch4_training_images.zip",
                           fpath=os.path.join(root,'ch4_training_images.zip')),
               tr_lbl=dict(URL='https://rrc.cvc.uab.es/downloads/ch4_training_localization_transcription_gt.zip',
                           fpath=os.path.join(root,'ch4_training_localization_transcription_gt.zip')),
               ts_img=dict(URL='https://rrc.cvc.uab.es/downloads/ch4_test_images.zip',
                           fpath=os.path.join(root,'ch4_test_images.zip')),
               ts_lbl=dict(URL='https://rrc.cvc.uab.es/downloads/Challenge4_Test_Task1_GT.zip',
                           fpath=os.path.join(root,'Challenge4_Test_Task1_GT.zip'))
               )

    ch_make_folder(root)

    for dp in (['tr_img','ts_img','ts_lbl','tr_lbl']):
        check_dw(dpath[dp]['fpath'],dpath[dp]['URL'])

    dimg_tr=os.path.join(dcrops,'training')
    dimg_ts=os.path.join(dcrops,'test')
    lbl_tr=os.path.join(dannot,'training')
    lbl_ts=os.path.join(dannot,'test')

    logging.info('Unpacking file')
    for dp, dirc_f in zip(['tr_img','ts_img','tr_lbl','ts_lbl'],
                          [dimg_tr,dimg_ts,lbl_tr,lbl_ts]):
        logging.info(f"Unpacking {dpath[dp]['fpath']} to {dirc_f}. ")
        shutil.unpack_archive(dpath[dp]['fpath'],dirc_f)

    # Equivalent of:
    #   python tools/data/textdet/icdar_converter.py /path/to/icdar2015 \
    #       -o /path/to/icdar2015 -d icdar2015 --split-list training test
    import os.path as osp
    import mmcv
    from tools.data.textdet.icdar_converter import collect_files,collect_annotations,convert_annotations

    dataset='icdar2015'
    nproc=10

    # The json files are written next to the data, directly in `root`.
    set_name = {split: 'instances_' + split + '.json'
                for split in ['training', 'test']}
    for split in set_name:
        assert osp.exists(osp.join(dcrops, split))

    for split, json_name in set_name.items():
        print(f'Converting {split} into {json_name}')
        with mmcv.Timer(print_tmpl='It takes {}s to convert icdar annotation'):
            files = collect_files(
                osp.join(dcrops, split), osp.join(dannot, split))
            image_infos = collect_annotations(
                files, dataset, nproc=nproc)
            convert_annotations(image_infos, osp.join(root, json_name))

    if cleanup:
        logging.info ('Cleaning up')
        for dp in ['tr_img','ts_img']:
            os.remove(dpath[dp]['fpath'])

icdar2015('/content/drive/MyDrive/dataset/detection')

# IIIT5K

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#iiit5k)

In [None]:
def IIIT5K(npath,cleanup=False):
    """Fetch IIIT5K together with the ready-made MMOCR label files.

    No converter is run: the archive is unpacked in ``<npath>/IIIT5K`` and the
    ``test`` and ``train`` folders it contains are moved up next to the labels.

    Args:
        npath (str): Parent directory of the dataset folder.
        cleanup (bool): Delete the unpacked ``IIIT5K`` sub-folder and the
            downloaded archive at the end.
    """
    # Recognition
    root = os.path.join(npath, 'IIIT5K')

    remote = {
        'dt_img': "http://cvit.iiit.ac.in/projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz",
        'ts_lbl': 'https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/test_label.txt',
        'tr_lbl': 'https://download.openmmlab.com/mmocr/data/mixture/IIIT5K/train_label.txt',
    }
    # Each download is kept in `root` under the last component of its URL.
    local = {key: os.path.join(root, url.rsplit('/', 1)[-1])
             for key, url in remote.items()}

    ch_make_folder(root)

    for key in ('dt_img', 'ts_lbl', 'tr_lbl'):
        check_dw(local[key], remote[key])

    logging.info('Unpacking file')
    shutil.unpack_archive(local['dt_img'], root)

    logging.info('Moving file')
    unpacked = os.path.join(root, 'IIIT5K')
    for split in ('test', 'train'):
        shutil.move(os.path.join(unpacked, split), root)

    if cleanup:
        logging.info ('Cleaning up')
        shutil.rmtree(unpacked)
        os.remove(local['dt_img'])

    # Resulting layout:
    # ├── IIIT5K
    # │   ├── train_label.txt
    # │   ├── test_label.txt
    # │   ├── train
    # │   └── test

IIIT5K('/content/drive/MyDrive/dataset/recognition')


# SVT

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#svt)

In [None]:

def list_to_file(filename, lines):
    """Write a list of strings to a text file, one item per line.

    Args:
        filename (str): The output filename. It will be created/overwritten;
            a missing parent directory is created first.
        lines (list(str)): Data to be written. A newline is appended to each item.
    """
    # The standard library is enough here; no need to import mmcv for a mkdir.
    parent = os.path.dirname(filename)
    if parent:  # a bare file name has no parent directory to create
        os.makedirs(parent, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as fw:
        fw.writelines(f'{line}\n' for line in lines)

def svt(npath,cleanup=False,height=32,width=100,resize=False):
    """Download SVT and crop its test words into a recognition dataset.

    Every tagged rectangle of ``svt1/test.xml`` is cut out of its source image,
    saved as ``<npath>/svt/image/img_XXXX.jpg`` and listed in
    ``<npath>/svt/test_label.txt`` as
    ``image/<file> <word> <lexicon size> <lexicon>``.

    Args:
        npath (str): Parent directory; data is stored in ``<npath>/svt``.
        cleanup (bool): Delete the unpacked ``svt1`` folder, the ``__MACOSX``
            folder (when present) and the downloaded archive at the end.
        height (int): Crop height used when ``resize`` is True.
        width (int): Crop width used when ``resize`` is True.
        resize (bool): Resize every crop to ``(width, height)``.

    Raises:
        FileNotFoundError: ``test.xml`` is missing after unpacking, or an
            image referenced by it cannot be read.
    """
    # Recognition
    import os.path as osp
    import xml.etree.ElementTree as ET
    import cv2
    root=os.path.join(npath,'svt')

    dimg=os.path.join(root,'image')
    dpath=dict(dt_img=dict(URL = "http://www.iapr-tc11.org/dataset/SVT/svt.zip",
                           fpath=os.path.join(root,'svt.zip')),
               ts_lbl=dict(URL='https://download.openmmlab.com/mmocr/data/mixture/svt/test_label.txt',
                           fpath=os.path.join(root,'test_label.txt'))
               )

    for dp in ([root,dimg]):
        ch_make_folder(dp)

    # NOTE(review): the downloaded test_label.txt is overwritten by the label
    # file generated below (same path) -- confirm the download is still wanted.
    for dp in (['dt_img','ts_lbl']):
        check_dw(dpath[dp]['fpath'],dpath[dp]['URL'])

    logging.info('Unpacking file')
    shutil.unpack_archive(dpath['dt_img']['fpath'],root)

    # inputs
    root_path=os.path.join(root,'svt1')
    src_label_file = osp.join(root_path, 'test.xml')
    if not osp.exists(src_label_file):
        # FileNotFoundError is still an Exception, as raised before.
        raise FileNotFoundError(
            f'{src_label_file} not exists, please check and try again.')
    src_image_root = root_path

    # outputs
    dst_label_file = osp.join(root, 'test_label.txt')
    dst_image_root = dimg

    rootxmls = ET.parse(src_label_file).getroot()

    index = 1  # running crop number across all images
    lines = []
    for image_node in rootxmls.findall('image'):
        image_name = image_node.find('imageName').text
        lexicon = image_node.find('lex').text.lower()
        lex_size = len(lexicon.split(','))
        src_img_path = osp.join(src_image_root, image_name)
        src_img = cv2.imread(src_img_path)
        if src_img is None:
            # cv2.imread signals failure by returning None; fail with the
            # offending path instead of an opaque subscript error further down.
            raise FileNotFoundError(f'Cannot read image {src_img_path}')
        for rectangle in image_node.find('taggedRectangles'):
            x = int(rectangle.get('x'))
            y = int(rectangle.get('y'))
            w = int(rectangle.get('width'))
            h = int(rectangle.get('height'))
            # Clamp negative coordinates to the image border.
            row_beg, row_end = max(0, y), max(0, y + h)
            col_beg, col_end = max(0, x), max(0, x + w)
            dst_img = src_img[row_beg:row_end, col_beg:col_end]
            text_label = rectangle.find('tag').text.lower()
            if resize:
                dst_img = cv2.resize(dst_img, (width, height))
            dst_img_name = f'img_{index:04}' + '.jpg'
            index += 1
            dst_img_path = osp.join(dst_image_root, dst_img_name)
            cv2.imwrite(dst_img_path, dst_img)
            lines.append(f'{osp.basename(dst_image_root)}/{dst_img_name} '
                         f'{text_label} {lex_size} {lexicon}')

    list_to_file(dst_label_file, lines)
    print(f'Finish to generate svt testset, '
          f'with label file {dst_label_file}')

    if cleanup:
        logging.info ('Cleaning up')
        # __MACOSX only exists when the archive carries macOS metadata.
        shutil.rmtree(os.path.join(root,'__MACOSX'), ignore_errors=True)
        shutil.rmtree(os.path.join(root,'svt1'))
        os.remove(dpath['dt_img']['fpath'])
    # Resulting layout:
    # ├── svt
    # │   ├── test_label.txt
    # │   └── image
svt('/content/drive/MyDrive/dataset/recognition')

INFO:root:Downloading svt.zip from http://www.iapr-tc11.org/dataset/SVT/svt.zip.
INFO:root:Downloading svt.zip from http://www.iapr-tc11.org/dataset/SVT/svt.zip.
INFO:root:Downloading test_label.txt from https://download.openmmlab.com/mmocr/data/mixture/svt/test_label.txt.
INFO:root:Downloading test_label.txt from https://download.openmmlab.com/mmocr/data/mixture/svt/test_label.txt.
INFO:root:Unpacking file


Finish to generate svt testset, with label file /content/drive/MyDrive/dataset/recognition/svt/test_label.txt


# COCO-Text

ICDAR2017 Robust Reading Challenge on COCO-Text

In [None]:
# Fetch the COCO train2014 image archive into the current working directory.
# --no-check-certificate makes wget skip TLS certificate validation.
!wget http://msvocds.blob.core.windows.net/coco2014/train2014.zip --no-check-certificate

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#coco-text)


In [None]:
def coco_text(npath,cleanup=False):
    """Fetch the COCO-Text word crops and the ready-made MMOCR training labels.

    No converter is run: the archive is unpacked directly in
    ``<npath>/coco_text`` next to ``train_label.txt``.

    Args:
        npath (str): Parent directory of the dataset folder.
        cleanup (bool): Delete the downloaded archive at the end.
    """
    # Recognition
    root = os.path.join(npath, 'coco_text')
    annot_dir = os.path.join(root, 'annotations')
    crops_dir = os.path.join(root, 'crops')

    remote = {
        'dt_img': "https://datasets.cvc.uab.es/rrc/COCO-Text-words-trainval.zip",
        'tr_lbl': 'https://download.openmmlab.com/mmocr/data/mixture/coco_text/train_label.txt',
    }
    # Each download is kept in `root` under the last component of its URL.
    local = {key: os.path.join(root, url.rsplit('/', 1)[-1])
             for key, url in remote.items()}

    # `crops` and `annotations` are created but nothing below writes into them.
    for folder in (root, crops_dir, annot_dir):
        ch_make_folder(folder)

    for key in ('dt_img', 'tr_lbl'):
        check_dw(local[key], remote[key])

    logging.info('Unpacking file')
    shutil.unpack_archive(local['dt_img'], root)

    if cleanup:
        logging.info ('Cleaning up')
        os.remove(local['dt_img'])


coco_text('/content/drive/MyDrive/dataset/recognition') 

INFO:root:Downloading COCO-Text-words-trainval.zip from https://datasets.cvc.uab.es/rrc/COCO-Text-words-trainval.zip.
INFO:root:Downloading COCO-Text-words-trainval.zip from https://datasets.cvc.uab.es/rrc/COCO-Text-words-trainval.zip.
INFO:root:Downloading train_label.txt from https://download.openmmlab.com/mmocr/data/mixture/coco_text/train_label.txt.
INFO:root:Downloading train_label.txt from https://download.openmmlab.com/mmocr/data/mixture/coco_text/train_label.txt.
INFO:root:Unpacking file


## [Text Detection (KIV)](https://mmocr.readthedocs.io/en/latest/datasets/det.html#icdar-2017)



I am still unsure where to download the images.zip file from. Is it from this [link](https://rrc.cvc.uab.es/?ch=5&com=downloads)?

Or, generate instances_training.json and instances_test.json with the following command:

`python tools/data/textdet/icdar_converter.py /path/to/icdar2015 -o /path/to/icdar2015 -d icdar2015 --split-list training test`


# __MJSynth (Syn90k)

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#mjsynth-syn90k)

KIV (kept in view) since the file is too large.

In [None]:
def Syn90k(npath,cleanup=False):
  """Prepare the MJSynth (Syn90k) recognition data -- work in progress.

  The image archive is too large to fetch automatically, so it has to be in
  place before the call. Only the two label files are downloaded here;
  unpacking is not implemented yet.

  Args:
    npath (str): Parent directory; data is stored in ``<npath>/Syn90k``.
    cleanup (bool): Currently unused.

  Raises:
    FileNotFoundError: ``mjsynth.tar.gz`` is not present in ``<npath>/Syn90k``.
  """
  # Recognition (TODO)
  root=os.path.join(npath,'Syn90k')
  dannot=os.path.join(root,'annotations')
  dcrops=os.path.join(root,'crops')
  dpath=dict(dt_img=dict(URL = "https://thor.robots.ox.ac.uk/~vgg/data/text/mjsynth.tar.gz",
                         fpath=os.path.join(root,'mjsynth.tar.gz')),
             shfl_lbl=dict(URL = "https://download.openmmlab.com/mmocr/data/mixture/Syn90k/shuffle_labels.txt",
                         fpath=os.path.join(root,'shuffle_labels.txt')),
             ts_lbl=dict(URL='https://download.openmmlab.com/mmocr/data/mixture/Syn90k/label.txt',
                         fpath=os.path.join(root,'label.txt'))
             )

  if not isfile(dpath['dt_img']['fpath']):
    # Raising a plain string is itself a TypeError; use a real exception and
    # tell the user where to download from, not only where to save.
    raise FileNotFoundError(
        f"Please download {dpath['dt_img']['URL']} and save it as "
        f"{dpath['dt_img']['fpath']}")

  for dp in ([root,dcrops,dannot]):
    ch_make_folder(dp)

  logging.info('This going to take a very long time to download. 9.98 Gb')
  # check_dw logs every download it actually performs, so no extra message
  # here; 'dt_img' is a no-op because the archive was verified above.
  for dp in (['dt_img','ts_lbl','shfl_lbl']):
    check_dw(dpath[dp]['fpath'],dpath[dp]['URL'])

  # TODO: the archive is not unpacked yet.
  logging.info('Unpacking file')

  # Optional lmdb conversion:
  # python tools/data/utils/txt2lmdb.py -i data/mixture/Syn90k/label.txt -o data/mixture/Syn90k/label.lmdb
# Target layout:
# ├── Syn90k
# │   ├── shuffle_labels.txt
# │   ├── label.txt
# │   ├── label.lmdb (optional)
# │   └── mnt
# Runs the preparation under /content; the call raises unless
# /content/Syn90k/mjsynth.tar.gz has been put in place beforehand.
Syn90k('/content')

# [SynthText](https://mmocr.readthedocs.io/en/latest/datasets/det.html#synthtext)

Not yet tested because the archive is very large.


Overview

This is a synthetically generated dataset, in which word instances are placed in natural scene images, while taking into account the scene layout.

The dataset consists of 800 thousand images with approximately 8 million synthetic word instances. Each text instance is annotated with its text-string, word-level and character-level bounding-boxes. 

# __SynthText (Synth800k)
KIV (kept in view) since the file is too large.


Overview

This is a synthetically generated dataset, in which word instances are placed in natural scene images, while taking into account the scene layout.

The dataset consists of 800 thousand images with approximately 8 million synthetic word instances. Each text instance is annotated with its text-string, word-level and character-level bounding-boxes. 

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#synthtext-synth800k)

In [None]:
def SynthText(npath,cleanup=False):
  """Download the SynthText recognition label files and check for the image archive.

  The label files are fetched automatically; the image archive (about 38 GB)
  has to be downloaded by hand and is only checked for.

  Args:
    npath (str): Parent directory; data is stored in ``<npath>/SynthText``.
    cleanup (bool): Currently unused.

  Raises:
    FileNotFoundError: ``SynthText.zip`` is not present in ``<npath>/SynthText``.
  """
  root=os.path.join(npath,'SynthText')
  dpath=dict(dt_img=dict(URL = "https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip",
                         fpath=os.path.join(root,'SynthText.zip')),
             lbl_txt=dict(URL='https://download.openmmlab.com/mmocr/data/mixture/SynthText/label.txt',
                         fpath=os.path.join(root,'label.txt')),
             lbl_shfl=dict(URL='https://download.openmmlab.com/mmocr/data/mixture/SynthText/shuffle_labels.txt',
                         fpath=os.path.join(root,'shuffle_labels.txt')),
             # Keep the .txt extension of the remote file (it used to be dropped).
             lbl_tr=dict(URL='https://download.openmmlab.com/mmocr/data/mixture/SynthText/instances_train.txt',
                         fpath=os.path.join(root,'instances_train.txt'))
             )

  ch_make_folder(root)

  for dp in (['lbl_shfl','lbl_txt','lbl_tr']):
    check_dw(dpath[dp]['fpath'],dpath[dp]['URL'])

  if not isfile(dpath['dt_img']['fpath']):
    # Raising a plain string is itself a TypeError; use a real exception and
    # tell the user where to download from, not only where to save.
    raise FileNotFoundError(
        f"This file is large (about 38 GB). Please download "
        f"{dpath['dt_img']['URL']} and save it as {dpath['dt_img']['fpath']}. "
        f"Further instructions on how to download the dataset can be found at "
        f"https://www.robots.ox.ac.uk/~vgg/data/scenetext/")


# Target layout:
# ├── SynthText
# │   ├── alphanumeric_labels.txt
# │   ├── shuffle_labels.txt
# │   ├── instances_train.txt
# │   ├── label.txt
# │   ├── label.lmdb (optional)
# │   └── synthtext
  

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#synthtext)

In [None]:
def synthtext(npath,cleanup=False):
  """Download and unpack the SynthText dataset for text detection.

  The SynthText dataset (74 GiB) is available for download via BitTorrent from
  Academic Torrents. This includes the pre-generated dataset as well as the
  pre-processed background images.

  We strongly recommend the use of BitTorrent protocol. For when that is not
  possible, the pre-generated dataset (38 GiB) is available for download over
  http. For instructions on how to download the pre-processed background
  images over http, see the SynthText project on github.

  Files are written to the following layout:

    <npath>/synthtext
    ├── SynthText.zip
    ├── imgs                        (archive is unpacked here)
    └── instances_training.lmdb
        ├── data.mdb
        └── lock.mdb

  Args:
    npath: Directory under which the ``synthtext`` folder is created.
    cleanup: Unused; kept so the signature matches the other dataset helpers.
  """
  # The folder is named after this dataset (it used to be a copy-pasted 'Syn90k').
  root=os.path.join(npath,'synthtext')
  dcrops=os.path.join(root,'imgs')
  dlmdb=os.path.join(root,'instances_training.lmdb')
  dpath=dict(dt_img=dict(URL = "https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip",
                         fpath=os.path.join(root,'SynthText.zip')),
             lbl_dta=dict(URL = "https://download.openmmlab.com/mmocr/data/synthtext/instances_training.lmdb/data.mdb",
                         fpath=os.path.join(dlmdb,'data.mdb')),
             lbl_loc=dict(URL='https://download.openmmlab.com/mmocr/data/synthtext/instances_training.lmdb/lock.mdb',
                         fpath=os.path.join(dlmdb,'lock.mdb'))
             )

  for dp in (root,dcrops,dlmdb):
    ch_make_folder(dp)

  logging.info('This going to take a very long time to download. 38 Gb')
  for dp in ('dt_img','lbl_loc','lbl_dta'):
    # check_dw logs each download itself. Its requests branch reads the whole
    # response into memory via r.content, so the 38 GB image archive is sent
    # through the wget branch, which writes straight to the output file.
    check_dw(dpath[dp]['fpath'],dpath[dp]['URL'],wget_dw=(dp=='dt_img'))

  logging.info('Unpacking file')
  shutil.unpack_archive(dpath['dt_img']['fpath'],dcrops)
synthtext('/content')

INFO:root:This going to take a very long time to download
INFO:root:The file /content/Syn90k/dds.txt is not availaible, downloading from https://download.openmmlab.com/mmocr/data/mixture/Syn90k/label.txt
INFO:root:Unpacking file


In [None]:
# Prepare the SynthText recognition folder under /content. The call raises
# unless /content/SynthText/SynthText.zip has already been downloaded manually.
SynthText('/content')

# __SynthAdd (KIV)

KIV SINCE THE WEBSITE IS IN CHINESE

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#synthadd)

In [None]:
def SynthAdd(npath):
  """Prepare the SynthAdd folder for text recognition.

  Creates ``<npath>/SynthAdd``, downloads the MMOCR ``label.txt`` into it
  (skipped by ``check_dw`` when already present) and then verifies that the
  image archive has been placed in that folder manually.

  Args:
    npath: Directory under which the ``SynthAdd`` folder is created.

  Raises:
    FileNotFoundError: If the image archive is missing from the dataset folder.
  """
  # TODO: REQUIRE LOGIN IN CHINESE
  # NOTE(review): the archive URL and file name below are the SynthText ones,
  # presumably a placeholder until the SynthAdd download is sorted out - confirm.
  root=os.path.join(npath,'SynthAdd')
  dpath=dict(dt_img=dict(URL = "https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip",
                         fpath=os.path.join(root,'SynthText.zip')),
             lbl_txt=dict(URL='https://download.openmmlab.com/mmocr/data/mixture/SynthAdd/label.txt',
                         fpath=os.path.join(root,'label.txt')),
             )

  ch_make_folder(root)

  check_dw(dpath['lbl_txt']['fpath'],dpath['lbl_txt']['URL'])

  if not isfile(dpath['dt_img']['fpath']):
    # A bare string cannot be raised (that is a TypeError in Python 3), so use
    # a real exception type and point the user at the URL, not the local path.
    raise FileNotFoundError(
        f"This file is large which is about 38Gb,Please download the file at {dpath['dt_img']['URL']} "
        f"and save the file in {root}. "
        "Further instruction on how to download the dataset can be found at https://www.robots.ox.ac.uk/~vgg/data/scenetext/")

SynthAdd('/content')

In [None]:
# Download train_val_images.zip (the image archive the TextOCR helpers expect)
# into the current working directory, skipping the TLS certificate check.
!wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip --no-check-certificate

--2022-06-13 02:17:44--  https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7072297970 (6.6G) [application/zip]
Saving to: ‘train_val_images.zip’


2022-06-13 02:19:22 (68.9 MB/s) - ‘train_val_images.zip’ saved [7072297970/7072297970]



# TextOCR

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#textocr)

In [None]:
!wget -P /content/drive/MyDrive/dataset/recognition/textocr https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip --no-check-certificate

--2022-06-17 01:09:22--  https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7072297970 (6.6G) [application/zip]
Saving to: ‘/content/drive/MyDrive/dataset/recognition/textocr/train_val_images.zip.1’


2022-06-17 01:12:32 (35.5 MB/s) - ‘/content/drive/MyDrive/dataset/recognition/textocr/train_val_images.zip.1’ saved [7072297970/7072297970]



In [None]:
def textocr(npath,cleanup=False):
  """Download TextOCR and build MMOCR text-recognition annotations.

  Creates ``<npath>/textocr``, fetches the image archive and the train/val
  JSON annotations (files already on disk are skipped by ``check_dw``),
  unpacks the archive, renames ``train_images`` to ``train`` and runs
  ``convert_textocr`` for the training and validation annotations.

  The converter is asked to write to ``image``, ``train_label.txt`` and
  ``val_label.txt`` (passed as ``dst_image_path`` / ``dst_label_filename``).

  Args:
    npath: Directory under which the ``textocr`` folder is created.
    cleanup: Unused; kept so the signature matches the other dataset helpers.

  Raises:
    ValueError: If the image archive is still missing after the download step.
  """
  # Recognition
  root=os.path.join(npath,'textocr')
  dpath=dict(dt_img=dict(URL = "https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip",
                         fpath=os.path.join(root,'train_val_images.zip')),
             lbl_tr=dict(URL='https://dl.fbaipublicfiles.com/textvqa/data/textocr/TextOCR_0.1_train.json',
                         fpath=os.path.join(root,'TextOCR_0.1_train.json')),
             lbl_val=dict(URL='https://dl.fbaipublicfiles.com/textvqa/data/textocr/TextOCR_0.1_val.json',
                         fpath=os.path.join(root,'TextOCR_0.1_val.json')),
             )

  ch_make_folder(root)

  # Downloading train_val_images.zip through the requests branch of check_dw
  # holds the whole 6.6 GB body in memory and can crash a Colab session, so the
  # archive goes through the wget branch instead. Alternatively pre-fetch it with
  # !wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip --no-check-certificate --continue
  # and move train_val_images.zip into root before calling this function.
  for dp in ('dt_img','lbl_tr','lbl_val'):
    check_dw(dpath[dp]['fpath'],dpath[dp]['URL'],wget_dw=(dp=='dt_img'))

  if not isfile(dpath['dt_img']['fpath']):
      raise ValueError('No file available. !wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip --no-check-certificate --continue ')

  logging.info('Unpacking file and it may take sometime')
  shutil.unpack_archive(dpath['dt_img']['fpath'],root)

  os.rename(os.path.join(root,'train_images'),
            os.path.join(root,'train'))
  print('Create annotation')
  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textrecog.textocr_converter import convert_textocr
  root_path=root
  n_proc=10 # Utilise parallel processing
  num_train_imgs = convert_textocr(
          root_path=root_path,
          dst_image_path='image',
          dst_label_filename='train_label.txt',
          annotation_filename='TextOCR_0.1_train.json',
          nproc=n_proc)

  print(f'Total number of the training images: {num_train_imgs}')
  print('Processing validation set...')
  # The validation call is given the training count as its starting image index.
  convert_textocr(
          root_path=root_path,
          dst_image_path='image',
          dst_label_filename='val_label.txt',
          annotation_filename='TextOCR_0.1_val.json',
          img_start_idx=num_train_imgs,
          nproc=n_proc)

textocr('/content/drive/MyDrive/dataset/recognition') 

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#textocr)

File preparation for the `text detection` is similar to `text recognition` except for creating the mmocr-compatible annotation file

In [None]:
!wget -P /content/drive/MyDrive/dataset/detection/textocr https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip --no-check-certificate

In [None]:
def textocr(npath,cleanup=False):
  """Download TextOCR and build MMOCR text-detection annotations.

  Creates ``<npath>/textocr``, fetches the image archive and both JSON
  annotation files through the wget branch of ``check_dw``, unpacks the
  archive, renames ``train_images`` to ``train`` and converts the train/val
  annotations to ``instances_training.json`` and ``instances_val.json``.

  Args:
    npath: Directory under which the ``textocr`` folder is created.
    cleanup: Unused in this function.

  Raises:
    ValueError: If the image archive is missing after the download step.
  """
  # Detection
  root=os.path.join(npath,'textocr')
  archive=os.path.join(root,'train_val_images.zip')

  # (URL, local path) pairs in download order: images, train labels, val labels.
  downloads=(
      ("https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip",
       archive),
      ('https://dl.fbaipublicfiles.com/textvqa/data/textocr/TextOCR_0.1_train.json',
       os.path.join(root,'TextOCR_0.1_train.json')),
      ('https://dl.fbaipublicfiles.com/textvqa/data/textocr/TextOCR_0.1_val.json',
       os.path.join(root,'TextOCR_0.1_val.json')),
  )

  ch_make_folder(root)

  # If the archive download crashes the Colab session (RAM exhaustion), fetch it
  # manually with
  # !wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip --no-check-certificate --continue
  # and move train_val_images.zip into root.
  for url,target in downloads:
    check_dw(target,url,wget_dw=True)

  if not isfile(archive):
    raise ValueError('No file available. !wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip --no-check-certificate --continue ')

  logging.info('Unpacking file')
  shutil.unpack_archive(archive,root)

  extracted_dir=os.path.join(root,'train_images')
  os.rename(extracted_dir,os.path.join(root,'train'))

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textdet.textocr_converter import collect_textocr_info,convert_annotations

  # (progress message, source annotation, converted annotation) per split.
  jobs=(
      ('Processing training set...','TextOCR_0.1_train.json','instances_training.json'),
      ('Processing validation set...','TextOCR_0.1_val.json','instances_val.json'),
  )
  for message,src_json,dst_json in jobs:
    print(message)
    infos=collect_textocr_info(root,src_json)
    convert_annotations(infos,os.path.join(root,dst_json))
  print('Finish')

  # The resulting directory structure looks like the following:

  # ├── textocr
  # │   ├── train
  # │   ├── instances_training.json
  # │   └── instances_val.json

textocr('/content/drive/MyDrive/dataset/detection')

# Totaltext

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#totaltext)

In [None]:
# gdown is imported by the totaltext helpers to fetch the Google Drive archives.
!pip install gdown

In [None]:
def totaltext(npath,cleanup=False):
  """Download Total-Text and generate MMOCR text-recognition labels.

  Fetches the image archive and the training ground-truth archive from Google
  Drive with ``gdown`` (each only when its zip is not already on disk), unpacks
  both into ``<npath>/totaltext``, renames the extracted folders to
  ``imgs/test``, ``imgs/training`` and ``annotations/training``, and runs the
  MMOCR ``totaltext_converter`` helpers on the ``training`` split.

  Args:
    npath: Directory under which the ``totaltext`` folder is created.
    cleanup: Unused in this function.
  """
  # Recognition
  import gdown
  root=os.path.join(npath,'totaltext')
  gt_dir=os.path.join(root,'annotations')
  dannot=os.path.join(gt_dir,'training')
  img_dir=os.path.join(root,'imgs')
  raw_img_dir=os.path.join(root,'Images')

  dpath={
      'dt_img': {'URL': 'https://drive.google.com/open?id=1bC68CzsSVTusZVvOkk7imSZSbgD1MqK2&authuser=0',
                 'fpath': os.path.join(root,'totaltext.zip')},
      'lbl_tr': {'URL': 'https://drive.google.com/open?id=1-XrQBoU9as1PXaB_0dUrDTJgvGFFOnDE',
                 'fpath': os.path.join(root,'TT_new_train_GT.zip')},
      'lbl_txt': {'URL': 'https://drive.google.com/file/d/1v-pd-74EkZ3dWe6k0qppRtetjdPQ3ms1/view',
                  'fpath': os.path.join(root,'groundtruth_text.zip')},
  }
  wanted=('dt_img','lbl_tr')  # lbl_txt is listed but not fetched

  ch_make_folder(root)
  ch_make_folder(dannot)

  for key in wanted:
    entry=dpath[key]
    if isfile(entry['fpath']):
      continue
    print('Start download')
    gdown.download(url=entry['URL'], output=entry['fpath'], quiet=True, fuzzy=True)

  logging.info('Unpacking file')
  for key in wanted:
    shutil.unpack_archive(dpath[key]['fpath'],root)

  # Normalise the extracted folder names: Images/{Test,Train} -> imgs/{test,training}
  for old_name,new_name in (('Test','test'),('Train','training')):
    os.rename(os.path.join(raw_img_dir,old_name),
              os.path.join(raw_img_dir,new_name))
  os.rename(raw_img_dir,img_dir)

  # The ground-truth archive unpacks to <root>/Train; its files go to annotations/training.
  move_files(os.path.join(root,'Train'),dannot)

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textrecog.totaltext_converter import collect_files,collect_annotations,generate_ann
  import mmcv
  nproc=10

  # Originally ['training', 'test'], but since we only have `training`, we drop the `test`
  splits=['training']
  set_name={}
  for split in splits:
    set_name[split]=split + '_label' + '.txt'
    assert os.path.exists(os.path.join(img_dir, split))

  for split, ann_name in set_name.items():
    print(f'Converting {split} into {ann_name}')
    with mmcv.Timer(print_tmpl='It takes {}s to convert totaltext annotation'):
      files=collect_files(os.path.join(img_dir, split), os.path.join(gt_dir, split))
      image_infos=collect_annotations(files, nproc=nproc)
      generate_ann(root, split, image_infos)

totaltext('/content/drive/MyDrive/dataset/recognition') 

Start download
Start download


INFO:root:Unpacking file


Converting training into training_label.txt
Loaded 1255 images from /content/drive/MyDrive/dataset/recognition/totaltext/imgs/training
[>>>>>>>>>>>>>>>>>>>>>>>>>>>] 1255/1255, 41.8 task/s, elapsed: 30s, ETA:     0s
It takes 83.5185935497284s to convert totaltext annotation


## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#totaltext)

In [None]:
def totaltext(npath,cleanup=False):
  """Download Total-Text and generate MMOCR text-detection annotations.

  Fetches the image archive and the training ground-truth archive from Google
  Drive with ``gdown`` (each only when its zip is not already on disk), unpacks
  both into ``<npath>/totaltext``, renames the extracted folders to
  ``imgs/test``, ``imgs/training`` and ``annotations/training``, and converts
  the ``training`` split to ``instances_training.json``.

  Args:
    npath: Directory under which the ``totaltext`` folder is created.
    cleanup: Unused in this function.
  """
  # Detection
  import gdown
  root=os.path.join(npath,'totaltext')
  gt_dir=os.path.join(root,'annotations')
  dannot=os.path.join(gt_dir,'training')
  img_dir=os.path.join(root,'imgs')
  raw_img_dir=os.path.join(root,'Images')

  dpath={
      'dt_img': {'URL': 'https://drive.google.com/open?id=1bC68CzsSVTusZVvOkk7imSZSbgD1MqK2&authuser=0',
                 'fpath': os.path.join(root,'totaltext.zip')},
      'lbl_tr': {'URL': 'https://drive.google.com/open?id=1-XrQBoU9as1PXaB_0dUrDTJgvGFFOnDE',
                 'fpath': os.path.join(root,'TT_new_train_GT.zip')},
      'lbl_txt': {'URL': 'https://drive.google.com/file/d/1v-pd-74EkZ3dWe6k0qppRtetjdPQ3ms1/view',
                  'fpath': os.path.join(root,'groundtruth_text.zip')},
  }
  wanted=('dt_img','lbl_tr')  # lbl_txt is listed but not fetched

  ch_make_folder(root)
  ch_make_folder(dannot)

  for key in wanted:
    entry=dpath[key]
    if isfile(entry['fpath']):
      continue
    print('Start download')
    gdown.download(url=entry['URL'], output=entry['fpath'], quiet=True, fuzzy=True)

  logging.info('Unpacking file')
  for key in wanted:
    shutil.unpack_archive(dpath[key]['fpath'],root)

  # Normalise the extracted folder names: Images/{Test,Train} -> imgs/{test,training}
  for old_name,new_name in (('Test','test'),('Train','training')):
    os.rename(os.path.join(raw_img_dir,old_name),
              os.path.join(raw_img_dir,new_name))
  os.rename(raw_img_dir,img_dir)

  # The ground-truth archive unpacks to <root>/Train; its files go to annotations/training.
  move_files(os.path.join(root,'Train'),dannot)

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textdet.totaltext_converter import collect_files,collect_annotations,convert_annotations
  import mmcv
  nproc=10

  # Originally ['training', 'test'], but since we only have `training`, we drop the `test`
  splits=['training']
  set_name={}
  for split in splits:
    set_name[split]='instances_' + split + '.json'
    assert os.path.exists(os.path.join(img_dir, split))

  for split, json_name in set_name.items():
    print(f'Converting {split} into {json_name}')
    with mmcv.Timer(print_tmpl='It takes {}s to convert totaltext annotation'):
      files=collect_files(os.path.join(img_dir, split), os.path.join(gt_dir, split))
      image_infos=collect_annotations(files, nproc=nproc)
      convert_annotations(image_infos, os.path.join(root, json_name))

totaltext('/content/drive/MyDrive/dataset/detection')

Start download


INFO:root:Unpacking file


Converting training into instances_training.json
Loaded 1255 images from /content/drive/MyDrive/dataset/detection/totaltext/imgs/training
[>>>>>>>>>>>>>>>>>>>>>>>>>>>] 1255/1255, 41.2 task/s, elapsed: 30s, ETA:     0s
It takes 32.14170575141907s to convert totaltext annotation


# DeText

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#detext)

In [None]:
def detext(npath,cleanup=False):
  """Download DeText and generate MMOCR text-recognition labels.

  Creates ``<npath>/detext`` with ``imgs/{training,val}`` and
  ``annotations/{training,val}``, downloads the four challenge archives
  (skipped by ``check_dw`` when already on disk), unpacks each into its
  folder and runs the MMOCR ``detext_converter`` helpers for both splits
  with ``preserve_vertical=True`` and the ``jsonl`` format.

  Args:
    npath: Directory under which the ``detext`` folder is created.
    cleanup: Unused in this function.
  """
  # Recognition
  root=os.path.join(npath,'detext')
  img_root=os.path.join(root,'imgs')
  ann_root=os.path.join(root,'annotations')
  dtrain=os.path.join(img_root,'training')
  dval=os.path.join(img_root,'val')
  dannot_tr=os.path.join(ann_root,'training')
  dannot_val=os.path.join(ann_root,'val')

  # (URL, local archive path, extraction folder), in download/unpack order.
  archives=(
      ('https://rrc.cvc.uab.es/downloads/ch9_training_images.zip',
       os.path.join(root,'ch9_training_images.zip'), dtrain),
      ('https://rrc.cvc.uab.es/downloads/ch9_training_localization_transcription_gt.zip',
       os.path.join(root,'ch9_training_localization_transcription_gt.zip'), dannot_tr),
      ('https://rrc.cvc.uab.es/downloads/ch9_validation_images.zip',
       os.path.join(root,'ch9_validation_images.zip'), dval),
      ('https://rrc.cvc.uab.es/downloads/ch9_validation_localization_transcription_gt.zip',
       os.path.join(root,'ch9_validation_localization_transcription_gt.zip'), dannot_val),
  )

  for folder in (root,dtrain,dval,dannot_tr,dannot_val):
    ch_make_folder(folder)

  for url,zip_path,_ in archives:
    check_dw(zip_path,url)

  logging.info('Unpacking file')
  for _,zip_path,target in archives:
    shutil.unpack_archive(zip_path,target)

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textrecog.detext_converter import collect_files,collect_annotations,generate_ann

  nproc=10
  preserve_vertical=True
  out_format='jsonl'
  for split in ('training','val'):
    print(f'Processing {split} set...')
    files=collect_files(os.path.join(img_root, split),
                        os.path.join(ann_root, split))
    image_infos=collect_annotations(files, nproc=nproc)
    generate_ann(root, split, image_infos, preserve_vertical, out_format)

detext('/content/drive/MyDrive/dataset/recognition')

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#detext)

Data preparation is similar to text recognition

In [None]:
def detext(npath,cleanup=False):
  """Download DeText and generate MMOCR text-detection annotations.

  Creates ``<npath>/detext`` with ``imgs/{training,val}`` and
  ``annotations/{training,val}``, downloads the four challenge archives
  (skipped by ``check_dw`` when already on disk), unpacks each into its
  folder and writes ``instances_training.json`` and ``instances_val.json``
  into the dataset root.

  Args:
    npath: Directory under which the ``detext`` folder is created.
    cleanup: Unused in this function.
  """
  # Detection
  root=os.path.join(npath,'detext')
  img_root=os.path.join(root,'imgs')
  ann_root=os.path.join(root,'annotations')
  dtrain=os.path.join(img_root,'training')
  dval=os.path.join(img_root,'val')
  dannot_tr=os.path.join(ann_root,'training')
  dannot_val=os.path.join(ann_root,'val')

  # (URL, local archive path, extraction folder), in download/unpack order.
  archives=(
      ('https://rrc.cvc.uab.es/downloads/ch9_training_images.zip',
       os.path.join(root,'ch9_training_images.zip'), dtrain),
      ('https://rrc.cvc.uab.es/downloads/ch9_training_localization_transcription_gt.zip',
       os.path.join(root,'ch9_training_localization_transcription_gt.zip'), dannot_tr),
      ('https://rrc.cvc.uab.es/downloads/ch9_validation_images.zip',
       os.path.join(root,'ch9_validation_images.zip'), dval),
      ('https://rrc.cvc.uab.es/downloads/ch9_validation_localization_transcription_gt.zip',
       os.path.join(root,'ch9_validation_localization_transcription_gt.zip'), dannot_val),
  )

  for folder in (root,dtrain,dval,dannot_tr,dannot_val):
    ch_make_folder(folder)

  for url,zip_path,_ in archives:
    check_dw(zip_path,url)

  logging.info('Unpacking file')
  for _,zip_path,target in archives:
    shutil.unpack_archive(zip_path,target)

  # Step2: Generate instances_training.json and instances_val.json

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textdet.detext_converter import collect_files,collect_annotations,convert_annotations
  import mmcv
  nproc=10
  for split in ('training','val'):
    print(f'Processing {split} set...')
    out_json=os.path.join(root, 'instances_' + split + '.json')
    with mmcv.Timer(print_tmpl='It takes {}s to convert DeText annotation'):
      files=collect_files(os.path.join(img_root, split),
                          os.path.join(ann_root, split))
      image_infos=collect_annotations(files, nproc=nproc)
      convert_annotations(image_infos, out_json)

  # After running the above codes, the directory structure should be as follows:

  # │── detext
  # │   ├── annotations
  # │   ├── imgs
  # │   ├── instances_val.json
  # │   └── instances_training.json

detext('/content/drive/MyDrive/dataset/detection/')

# NAF

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#naf)

In [None]:
# GitPython provides `git.Repo`, which the naf helpers use to clone NAF_dataset.
!pip install GitPython

In [None]:
def naf(npath,cleanup=False):
  """Download the NAF dataset and generate MMOCR text-recognition labels.

  Creates ``<npath>/naf``, downloads ``labeled_images.tar.gz``, clones the
  NAF_dataset repository when its split file is not already present, unpacks
  the images to ``imgs``, moves the split file and ``groups`` folder into
  ``annotations`` and runs the MMOCR ``naf_converter`` helpers for the
  training, val and test splits.

  Args:
    npath: Directory under which the ``naf`` folder is created.
    cleanup: When true, delete the cloned repository folder and the
      downloaded tarball after conversion.
  """
  # Recognition
  from git import Repo
  # !pip install GitPython
  root=os.path.join(npath,'naf')
  dannot=os.path.join(root,'annotations')
  repo_dir=os.path.join(root,'NAF_dataset')
  split_file='train_valid_test_split.json'
  tar_url='https://github.com/herobd/NAF_dataset/releases/download/v1.0/labeled_images.tar.gz'
  tar_path=os.path.join(root,'labeled_images.tar.gz')

  ch_make_folder(root)
  ch_make_folder(dannot)

  logging.info('This may take sometime to download ~800 Mb tar file')
  check_dw(tar_path,tar_url)

  if not isfile(os.path.join(repo_dir,split_file)):
    logging.info('Downloading annotation from github')
    Repo.clone_from('https://github.com/herobd/NAF_dataset.git', repo_dir)

  logging.info('Unpacking file')
  shutil.unpack_archive(tar_path,root)
  os.rename(os.path.join(root,'labeled_images'), os.path.join(root,'imgs'))

  logging.info('Reorganise file and folder')
  for item in (split_file,'groups'):
    shutil.move(os.path.join(repo_dir,item), os.path.join(dannot,item))

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textrecog.naf_converter import collect_files,collect_annotations,generate_ann
  import mmcv

  preserve_vertical=True
  out_format='jsonl'
  nproc=4
  img_dir=os.path.join(root, 'imgs')
  split_info=mmcv.load(os.path.join(dannot, split_file))
  # Re-key the split file's 'train'/'valid' entries to the names used below.
  split_info['training']=split_info.pop('train')
  split_info['val']=split_info.pop('valid')
  for split in ('training','val','test'):
    print(f'Processing {split} set...')
    with mmcv.Timer(print_tmpl='It takes {}s to convert NAF annotation'):
      files=collect_files(img_dir, dannot, split_info[split])
      image_infos=collect_annotations(files, nproc=nproc)
      generate_ann(root, split, image_infos, preserve_vertical, out_format)

  if cleanup:
    logging.info('Cleaning up')
    shutil.rmtree(repo_dir)
    os.remove(tar_path)

naf('/content/drive/MyDrive/dataset/recognition',cleanup=True)

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#naf)

In [None]:
def naf(npath,cleanup=False):
  """Download the NAF dataset and generate MMOCR text-detection annotations.

  Creates ``<npath>/naf``, downloads ``labeled_images.tar.gz``, clones the
  NAF_dataset repository when its split file is not already present, unpacks
  the images to ``imgs``, moves the split file and ``groups`` folder into
  ``annotations`` and writes ``instances_training.json``,
  ``instances_val.json`` and ``instances_test.json`` into the dataset root.

  Args:
    npath: Directory under which the ``naf`` folder is created.
    cleanup: Unused in this function.
  """
  # Detection
  from git import Repo
  # !pip install GitPython
  root=os.path.join(npath,'naf')
  dannot=os.path.join(root,'annotations')
  repo_dir=os.path.join(root,'NAF_dataset')
  split_file='train_valid_test_split.json'
  tar_url='https://github.com/herobd/NAF_dataset/releases/download/v1.0/labeled_images.tar.gz'
  tar_path=os.path.join(root,'labeled_images.tar.gz')

  ch_make_folder(root)
  ch_make_folder(dannot)

  logging.info('This may take sometime to download ~800 Mb tar file')
  check_dw(tar_path,tar_url)

  if not isfile(os.path.join(repo_dir,split_file)):
    logging.info('Downloading annotation from github')
    Repo.clone_from('https://github.com/herobd/NAF_dataset.git', repo_dir)

  logging.info('Unpacking file')
  shutil.unpack_archive(tar_path,root)
  os.rename(os.path.join(root,'labeled_images'), os.path.join(root,'imgs'))

  logging.info('Reorganise file and folder')
  for item in (split_file,'groups'):
    shutil.move(os.path.join(repo_dir,item), os.path.join(dannot,item))

  # Step2: Generate instances_training.json, instances_val.json, and instances_test.json.
  # Command-line equivalent: python tools/data/textdet/naf_converter.py PATH/TO/naf --nproc 4

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textdet.naf_converter import collect_files,collect_annotations,convert_annotations
  import mmcv

  nproc=10
  img_dir=os.path.join(root, 'imgs')
  split_info=mmcv.load(os.path.join(dannot, split_file))
  # Re-key the split file's 'train'/'valid' entries to the names used below.
  split_info['training']=split_info.pop('train')
  split_info['val']=split_info.pop('valid')
  for split in ('training','val','test'):
    print(f'Processing {split} set...')
    out_json=os.path.join(root, 'instances_' + split + '.json')
    with mmcv.Timer(print_tmpl='It takes {}s to convert NAF annotation'):
      files=collect_files(img_dir, dannot, split_info[split])
      image_infos=collect_annotations(files, nproc=nproc)
      convert_annotations(image_infos, out_json)

  # │── naf
  # │   ├── annotations
  # │   ├── imgs
  # │   ├── instances_test.json
  # │   ├── instances_val.json
  # │   └── instances_training.json

naf('/content/drive/MyDrive/dataset/detection',cleanup=False)

# Lecture Video DB

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#lecture-video-db)

In [None]:
def lv(npath,cleanup=False):
  """Download Lecture Video DB (IIIT-CVid) and build recognition labels.

  Creates ``<npath>/lv``, downloads and unpacks ``IIIT-CVid.zip``, moves
  ``Crops`` and the ``train/val/test`` text files out of the extracted
  ``IIIT-CVid`` folder into the dataset root (renamed to ``*_label.txt``) and
  runs the MMOCR ``lv_converter`` on each split with the ``jsonl`` format.

  Layout after the moves:

    lv
    ├── Crops
    ├── train_label.txt
    ├── val_label.txt
    └── test_label.txt

  Args:
    npath: Directory under which the ``lv`` folder is created.
    cleanup: When true, delete what is left of the extracted ``IIIT-CVid``
      folder and the downloaded zip after conversion.
  """
  # Recognition
  root=os.path.join(npath,'lv')
  extracted=os.path.join(root,'IIIT-CVid')
  dpath=dict(dt_tr=dict(URL = 'http://cdn.iiit.ac.in/cdn/preon.iiit.ac.in/~kartik/IIIT-CVid.zip',
                         fpath=os.path.join(root,'IIIT-CVid.zip')))

  ch_make_folder(root)

  logging.info ('This may take sometime to download ~2.26 Gb zip file (~2 m)')
  check_dw(dpath['dt_tr']['fpath'],dpath['dt_tr']['URL'])

  logging.info('Unpacking file')
  shutil.unpack_archive(dpath['dt_tr']['fpath'],root)

  logging.info ('Reorganise file and folder')
  shutil.move(os.path.join(extracted,'Crops'),
              os.path.join(root,'Crops'))
  for split in ('train','val','test'):
    shutil.move(os.path.join(extracted,f'{split}.txt'),
                os.path.join(root,f'{split}_label.txt'))

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textrecog.lv_converter import convert_annotations

  out_format='jsonl'
  for split in ('train','val','test'):
    convert_annotations(root, split, out_format)
    print(f'{split} split converted.')

  if cleanup:
    logging.info ('Cleaning up')
    # The leftover folder here is IIIT-CVid; 'NAF_dataset' belongs to the naf
    # helper and never exists under lv, so removing it always failed.
    shutil.rmtree(extracted)
    os.remove(dpath['dt_tr']['fpath'])

lv('/content/drive/MyDrive/dataset/recognition') # Done verification

INFO:root:This may take sometime to download ~2.26 Gb zip file (~2 m)
INFO:root:Downloading IIIT-CVid.zip from http://cdn.iiit.ac.in/cdn/preon.iiit.ac.in/~kartik/IIIT-CVid.zip.
INFO:root:Unpacking file
INFO:root:Reorganise file and folder


train split converted.
val split converted.
test split converted.


## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#lecture-video-db)

This example requires the user to modify

the line

 `img_file = img_file.split('data/lv/')[1]`

 to 

`img_file =os.path.split(img_file)[-1]`

at

`mmocr/tools/data/textdet/lv_converter.py`

The issue has been raised at [Issue 1078](https://github.com/open-mmlab/mmocr/issues/1078) for  `File out of list!`



In [None]:
def lv(npath,cleanup=False):
  """Download Lecture Video DB (IIIT-CVid) and build detection annotations.

  Creates ``<npath>/lv``, downloads (through the wget branch of ``check_dw``)
  and unpacks ``IIIT-CVid.zip``, moves the extracted ``Frames`` folder to
  ``imgs`` and writes ``instances_train.json``, ``instances_val.json`` and
  ``instances_test.json`` into the dataset root.

  Only working with tools.data.textdet.lv_converter.

  Args:
    npath: Directory under which the ``lv`` folder is created.
    cleanup: When true, delete what is left of the extracted ``IIIT-CVid``
      folder and the downloaded zip after conversion.
  """
  # Detection
  root=os.path.join(npath,'lv')
  extracted=os.path.join(root,'IIIT-CVid')
  dpath=dict(dt_tr=dict(URL = 'http://cdn.iiit.ac.in/cdn/preon.iiit.ac.in/~kartik/IIIT-CVid.zip',
                         fpath=os.path.join(root,'IIIT-CVid.zip')))

  ch_make_folder(root)

  logging.info ('This may take sometime to download ~2.26 Gb zip file (download time about ~5 mins with Colab)')
  check_dw(dpath['dt_tr']['fpath'],dpath['dt_tr']['URL'],wget_dw=True)

  logging.info('Unpacking file')
  shutil.unpack_archive(dpath['dt_tr']['fpath'],root)

  logging.info ('Reorganise file and folder')
  # Detection works on the full frames, recognition on the crops.
  shutil.move(os.path.join(extracted,'Frames'),
              os.path.join(root,'imgs'))

  # Step2: Generate the instances_*.json files.
  # Command-line equivalent: python tools/data/textdet/lv_converter.py PATH/TO/lv --nproc 4

  # Imported here because the module lives in the cloned mmocr repository.
  from tools.data.textdet.lv_converter import collect_files,collect_annotations,convert_annotations
  import os.path as osp
  import mmcv
  nproc=4
  # Use the folder derived from npath instead of a hard-coded Colab path, so the
  # conversion reads the files this call just unpacked.
  root_path = root

  for split in ['train', 'val', 'test']:
      print(f'Processing {split} set...')
      with mmcv.Timer(print_tmpl='It takes {}s to convert LV annotation'):
          files = collect_files(osp.join(root_path, 'imgs', split))
          image_infos = collect_annotations(files, nproc=nproc)
          convert_annotations(
              image_infos, osp.join(root_path,
                                    'instances_' + split + '.json'))

  if cleanup:
    logging.info ('Cleaning up')
    # The leftover folder here is IIIT-CVid; 'NAF_dataset' belongs to the naf
    # helper and never exists under lv, so removing it always failed.
    shutil.rmtree(extracted)
    os.remove(dpath['dt_tr']['fpath'])

lv('/content/detection/')

INFO:root:This may take sometime to download ~2.26 Gb zip file (download time about ~5 mins with Colab)
INFO:root:Unpacking file
INFO:root:Reorganise file and folder


In [None]:
# Stand-alone copy of the conversion loop found in the detection `lv` helper;
# presumably kept so the annotations can be regenerated without repeating the
# download/unpack steps - confirm before removing.
# NOTE(review): root_path is hard-coded and matches the lv('/content/detection/') call.
from tools.data.textdet.lv_converter import collect_files,collect_annotations,convert_annotations
import os.path as osp
import mmcv
nproc=4
root_path = '/content/detection/lv'

# One instances_<split>.json is written into root_path per split.
for split in ['train', 'val', 'test']:
    print(f'Processing {split} set...')
    with mmcv.Timer(print_tmpl='It takes {}s to convert LV annotation'):
        files = collect_files(osp.join(root_path, 'imgs', split))
        # Debug output: prints whatever collect_files returned for this split.
        print(files)
        image_infos = collect_annotations(files, nproc=nproc)
        convert_annotations(
            image_infos, osp.join(root_path,
                                  'instances_' + split + '.json'))

#  LSVT

Han Character

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#lsvt)

In [None]:
def lsvt(npath,cleanup=False):
  """Download LSVT and generate MMOCR text-recognition annotations.

  The two image archives and the label file are downloaded into
  ``<npath>/lsvt``. Both image sets are merged into ``imgs``, the label file
  is moved into ``annotations`` and MMOCR's ``convert_lsvt`` is run for the
  training split and (because ``val_ratio`` is 0.2) for a validation split.

  Args:
    npath: Parent directory in which the ``lsvt`` folder is created.
    cleanup: When True, remove the emptied ``train_full_images_1`` folder and
      both downloaded image archives after the conversion.

  Relies on the notebook helpers ``ch_make_folder``, ``check_dw`` and
  ``move_files``; ``tools.data.textrecog.lsvt_converter`` must be importable.
  """
  # Recognition
  root=os.path.join(npath,'lsvt')
  dannot=os.path.join(root,'annotations')
  dimg=os.path.join(root,'imgs')
  dpath=dict(dt_img1=dict(URL = 'https://dataset-bj.cdn.bcebos.com/lsvt/train_full_images_0.tar.gz',
                         fpath=os.path.join(root,'train_full_images_0.tar.gz')),
             dt_img2=dict(URL = 'https://dataset-bj.cdn.bcebos.com/lsvt/train_full_images_1.tar.gz',
                         fpath=os.path.join(root,'train_full_images_1.tar.gz')),
             lbl=dict(URL = 'http://dataset-bj.cdn.bcebos.com/lsvt/train_full_labels.json',
                         fpath=os.path.join(root,'train_full_labels.json')),
            )

  for dp in (root,dimg,dannot):
    ch_make_folder(dp)

  logging.info ('This may take sometime to download ~8 Gb zip file (20 m)')
  for dp in ('dt_img1','dt_img2','lbl'):
    check_dw(dpath[dp]['fpath'],dpath[dp]['URL'])

  logging.info('Unpacking file')
  # First archive: its extracted folder is renamed onto the imgs folder.
  shutil.unpack_archive(dpath['dt_img1']['fpath'],root)
  os.rename(os.path.join(root,'train_full_images_0'), os.path.join(root,'imgs'))

  # Second archive: its files are merged into the same imgs folder.
  shutil.unpack_archive(dpath['dt_img2']['fpath'],root)
  source_dir=os.path.join(root,'train_full_images_1')
  move_files(source_dir,dimg)

  logging.info ('Reorganise file and folder')
  shutil.move(os.path.join(root,'train_full_labels.json'),dannot)

  from tools.data.textrecog.lsvt_converter import convert_lsvt
  root_path = root

  # The annotation format has to be an explicit string. Without a local
  # definition the bare name ``format`` resolves to the Python builtin
  # function, which is what used to be handed to the converter.
  ann_format='jsonl'
  preserve_vertical=True
  val_ratio=0.2
  nproc=10
  print('Processing training set...')
  num_train_imgs = convert_lsvt(
      root_path=root_path,
      split='train',
      ratio=val_ratio,
      preserve_vertical=preserve_vertical,
      format=ann_format,
      nproc=nproc)

  if val_ratio > 0:
      print('Processing validation set...')
      # Continue numbering after the value returned for the training split.
      convert_lsvt(
          root_path=root_path,
          split='val',
          ratio=val_ratio,
          preserve_vertical=preserve_vertical,
          format=ann_format,
          nproc=nproc,
          img_start_idx=num_train_imgs)
  print('Finish')

  if cleanup:
    logging.info ('Cleaning up')
    shutil.rmtree(os.path.join(root,'train_full_images_1'))
    os.remove(dpath['dt_img1']['fpath'])
    os.remove(dpath['dt_img2']['fpath'])


  # ├── lsvt
  # │   ├── crops
  # │   ├── ignores
  # │   ├── train_label.jsonl
  # │   └── val_label.jsonl (optional)

  
lsvt('/content/drive/MyDrive/dataset/recognition') # Got problem, to debug in local

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#lsvt)

In [None]:
def lsvt(npath,cleanup=False):
  """Download LSVT and generate MMOCR text-detection annotations.

  Fetches both image archives and the label file into ``<npath>/lsvt``, merges
  the images into ``imgs``, moves the labels into ``annotations`` and runs the
  MMOCR detection converter for a training and a validation split
  (validation ratio 0.2).

  Args:
    npath: Parent directory in which the ``lsvt`` folder is created.
    cleanup: When True, delete the emptied ``train_full_images_1`` folder and
      the two image archives once the conversion has finished.
  """
  # Detection
  lsvt_dir=os.path.join(npath,'lsvt')
  ann_dir=os.path.join(lsvt_dir,'annotations')
  img_dir=os.path.join(lsvt_dir,'imgs')

  first_tar=os.path.join(lsvt_dir,'train_full_images_0.tar.gz')
  second_tar=os.path.join(lsvt_dir,'train_full_images_1.tar.gz')
  labels_json=os.path.join(lsvt_dir,'train_full_labels.json')
  # (local target, remote source) pairs, downloaded in this order
  downloads=(
      (first_tar,'https://dataset-bj.cdn.bcebos.com/lsvt/train_full_images_0.tar.gz'),
      (second_tar,'https://dataset-bj.cdn.bcebos.com/lsvt/train_full_images_1.tar.gz'),
      (labels_json,'http://dataset-bj.cdn.bcebos.com/lsvt/train_full_labels.json'),
  )

  for folder in (lsvt_dir,img_dir,ann_dir):
    ch_make_folder(folder)

  logging.info('This may take sometime to download ~8 Gb zip file (20 m)')
  for target,url in downloads:
    check_dw(target,url)

  logging.info('Unpacking file')
  shutil.unpack_archive(first_tar,lsvt_dir)
  # The first extracted folder takes the place of the imgs folder.
  os.rename(os.path.join(lsvt_dir,'train_full_images_0'), os.path.join(lsvt_dir,'imgs'))

  shutil.unpack_archive(second_tar,lsvt_dir)
  second_extracted=os.path.join(lsvt_dir,'train_full_images_1')
  move_files(second_extracted,img_dir)

  logging.info('Reorganise file and folder')
  shutil.move(labels_json,ann_dir)

  from tools.data.textdet.lsvt_converter import collect_lsvt_info,convert_annotations

  # Annotations of the LSVT test split are not publicly available, so a
  # validation set is split off (CLI equivalent: --val-ratio 0.2).
  val_ratio=0.2
  print('Processing training set...')
  convert_annotations(
      collect_lsvt_info(lsvt_dir, 'train', val_ratio),
      os.path.join(lsvt_dir, 'instances_training.json'))
  if val_ratio > 0:
      print('Processing validation set...')
      convert_annotations(
          collect_lsvt_info(lsvt_dir, 'val', val_ratio),
          os.path.join(lsvt_dir, 'instances_val.json'))
  print('Finish')

  # Output paths passed to the converter:
  # |── lsvt
  # │   ├── imgs
  # │   ├── instances_training.json
  # │   └── instances_val.json (optional)

  if cleanup:
    logging.info('Cleaning up')
    shutil.rmtree(second_extracted)
    for archive in (first_tar,second_tar):
      os.remove(archive)


  
lsvt('/content/drive/MyDrive/dataset/detection') # Works like a charm

INFO:root:This may take sometime to download ~8 Gb zip file (20 m)
INFO:root:Downloading train_full_images_0.tar.gz from https://dataset-bj.cdn.bcebos.com/lsvt/train_full_images_0.tar.gz.
INFO:root:Downloading train_full_images_1.tar.gz from https://dataset-bj.cdn.bcebos.com/lsvt/train_full_images_1.tar.gz.


# FUNSD

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#funsd)

In [None]:
def funsd(npath,cleanup=False):
  """Download FUNSD and generate MMOCR text-recognition annotations.

  The dataset zip is downloaded and unpacked under ``<npath>/funsd``. Training
  and testing images are merged into ``imgs``, the annotation files are sorted
  into ``annotations/training`` and ``annotations/test``, and the MMOCR
  recognition converter is run for both splits with the ``jsonl`` format.

  Args:
    npath: Parent directory in which the ``funsd`` folder is created.
    cleanup: When True, delete the ``__MACOSX`` and ``dataset`` folders and
      the downloaded zip before the conversion starts.
  """
  #Recognition
  funsd_dir=os.path.join(npath,'funsd')
  test_ann_dir=os.path.join(funsd_dir,'annotations','test')
  train_ann_dir=os.path.join(funsd_dir,'annotations','training')
  img_dir=os.path.join(funsd_dir,'imgs')
  zip_url='https://guillaumejaume.github.io/FUNSD/dataset.zip'
  zip_path=os.path.join(funsd_dir,'dataset.zip')

  for folder in (funsd_dir,test_ann_dir,img_dir,train_ann_dir):
    ch_make_folder(folder)

  logging.info('This may take sometime to download ~8 Gb zip file')
  check_dw(zip_path,zip_url)

  logging.info('Unpacking file')
  shutil.unpack_archive(zip_path,funsd_dir)

  # (sub-path inside the extracted "dataset" folder, destination), in order
  relocations=(
      (('training_data','images'),img_dir),
      (('testing_data','images'),img_dir),
      (('testing_data','annotations'),test_ann_dir),
      (('training_data','annotations'),train_ann_dir),
  )
  for parts,destination in relocations:
    move_files(os.path.join(funsd_dir,'dataset',*parts),destination)

  if cleanup:
    logging.info('Cleaning up')
    for leftover in ('__MACOSX','dataset'):
      shutil.rmtree(os.path.join(funsd_dir,leftover))
    os.remove(zip_path)

  from tools.data.textrecog.funsd_converter import collect_files,collect_annotations,generate_ann
  import mmcv

  for split in ('training', 'test'):
      print(f'Processing {split} set...')
      with mmcv.Timer(print_tmpl='It takes {}s to convert FUNSD annotation'):
          matched = collect_files(
              os.path.join(funsd_dir, 'imgs'),
              os.path.join(funsd_dir, 'annotations', split))
          infos = collect_annotations(matched, nproc=10)
          # positional arguments: root, split, infos, preserve_vertical, format
          generate_ann(funsd_dir, split, infos, True, 'jsonl')


  # ├── funsd
  # │   ├── imgs
  # │   ├── dst_imgs
  # │   ├── annotations
  # │   ├── train_label.txt
  # │   └── test_label.txt

  
funsd('/content/drive/MyDrive/dataset/recognition') # Check

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#funsd)

In [None]:
def funsd(npath,cleanup=False):
  """Download FUNSD and generate MMOCR text-detection annotations.

  The dataset zip is downloaded (through the ``wget`` path of ``check_dw``)
  and unpacked under ``<npath>/funsd``. Images of both splits are merged into
  ``imgs``, annotation files go to ``annotations/training`` and
  ``annotations/test``, and the MMOCR detection converter is run per split.

  Args:
    npath: Parent directory in which the ``funsd`` folder is created.
    cleanup: When True, delete the ``__MACOSX`` and ``dataset`` folders and
      the downloaded zip after the conversion.
  """
  # Detection
  funsd_dir=os.path.join(npath,'funsd')
  test_ann_dir=os.path.join(funsd_dir,'annotations','test')
  train_ann_dir=os.path.join(funsd_dir,'annotations','training')
  img_dir=os.path.join(funsd_dir,'imgs')
  zip_url='https://guillaumejaume.github.io/FUNSD/dataset.zip'
  zip_path=os.path.join(funsd_dir,'dataset.zip')

  for folder in (funsd_dir,test_ann_dir,img_dir,train_ann_dir):
    ch_make_folder(folder)

  logging.info('This may take sometime to download ~8 Gb zip file')
  check_dw(zip_path,zip_url,wget_dw=True)

  logging.info('Unpacking file')
  shutil.unpack_archive(zip_path,funsd_dir)

  # (sub-path inside the extracted "dataset" folder, destination), in order
  relocations=(
      (('training_data','images'),img_dir),
      (('testing_data','images'),img_dir),
      (('testing_data','annotations'),test_ann_dir),
      (('training_data','annotations'),train_ann_dir),
  )
  for parts,destination in relocations:
    move_files(os.path.join(funsd_dir,'dataset',*parts),destination)

  from tools.data.textdet.funsd_converter import collect_files,collect_annotations,convert_annotations
  import mmcv

  for split in ('training', 'test'):
      print(f'Processing {split} set...')
      with mmcv.Timer(print_tmpl='It takes {}s to convert FUNSD annotation'):
          matched = collect_files(
              os.path.join(funsd_dir, 'imgs'),
              os.path.join(funsd_dir, 'annotations', split))
          infos = collect_annotations(matched, nproc=10)
          convert_annotations(
              infos,
              os.path.join(funsd_dir, 'instances_' + split + '.json'))

  # Output paths passed to the converter:
  # │── funsd
  # │   ├── annotations
  # │   ├── imgs
  # │   ├── instances_test.json
  # │   └── instances_training.json

  if cleanup:
    logging.info('Cleaning up')
    for leftover in ('__MACOSX','dataset'):
      shutil.rmtree(os.path.join(funsd_dir,leftover))
    os.remove(zip_path)


  
funsd('/content/drive/MyDrive/dataset/detection') # Work like a charm

# COCO Text v2

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#coco-text-v2)

In [None]:
!pip install wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def coco_textv2(npath,cleanup=False):
  """Download COCO Text v2 and generate MMOCR text-recognition annotations.

  Downloads the COCO ``train2014`` images and the COCO-Text v2 annotation zip
  into ``<npath>/coco_textv2``, renames the extracted image folder to ``imgs``,
  moves ``cocotext.v2.json`` into ``annotations`` and runs MMOCR's
  ``convert_cocotext`` for the train and val splits in ``jsonl`` format.

  Args:
    npath: Parent directory in which the ``coco_textv2`` folder is created.
    cleanup: When True, delete both downloaded zip archives after conversion.
  """
  # Recognition
  root=os.path.join(npath,'coco_textv2')
  dannot=os.path.join(root,'annotations')

  dpath=dict(dt_tr=dict(URL = 'http://images.cocodataset.org/zips/train2014.zip',
                         fpath=os.path.join(root,'train2014.zip')),
             dt_trx=dict(URL = 'https://github.com/bgshih/cocotext/releases/download/dl/cocotext.v2.zip',
                         fpath=os.path.join(root,'cocotext.v2.zip'))
            )

  for dp in (root,dannot):
    ch_make_folder(dp)

  logging.info ('This may take sometime to download ~12.58 Gb zip file')

  # Use the wget path of check_dw: its requests path reads the whole response
  # body into memory before writing it, which exhausts Colab RAM for an
  # archive of this size. The detection variant downloads the same way.
  for dp in ('dt_tr','dt_trx'):
    check_dw(dpath[dp]['fpath'],dpath[dp]['URL'],wget_dw=True)

  logging.info('Unpacking file')
  shutil.unpack_archive(dpath['dt_tr']['fpath'],root)
  shutil.unpack_archive(dpath['dt_trx']['fpath'],root)

  os.rename(os.path.join(root,'train2014'),
            os.path.join(root,'imgs'))

  shutil.move(os.path.join(root, 'cocotext.v2.json'),
              dannot)

  from tools.data.textrecog.cocotext_converter import convert_cocotext
  root_path = root
  preserve_vertical=True
  nproc=10
  ann_format='jsonl'  # local name chosen so the builtin format() is not shadowed
  print('Processing training set...')
  num_train_imgs = convert_cocotext(
      root_path=root_path,
      split='train',
      preserve_vertical=preserve_vertical,
      format=ann_format,
      nproc=nproc)
  print('Processing validation set...')
  # Continue numbering after the value returned for the training split.
  convert_cocotext(
      root_path=root_path,
      split='val',
      preserve_vertical=preserve_vertical,
      format=ann_format,
      nproc=nproc,
      img_start_idx=num_train_imgs)
  print('Finish')

  if cleanup:
    logging.info ('Cleaning up')
    # Only the archives are left over: the extracted folder was renamed to
    # imgs and the json was moved, so there is no "dataset" folder to delete.
    os.remove(dpath['dt_tr']['fpath'])
    os.remove(dpath['dt_trx']['fpath'])

  #   ├── coco_textv2
  # │   ├── crops
  # │   ├── ignores
  # │   ├── train_label.jsonl
  # │   └── val_label.jsonl
    
coco_textv2('/content/drive/MyDrive/dataset/recognition') # Working locally

INFO:root:This may take sometime to download ~12.58 Gb zip file
INFO:root:Downloading train2014.zip from http://images.cocodataset.org/zips/train2014.zip.


## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#coco-text-v2)

In [None]:
def coco_textv2(npath,cleanup=False):
  """Download COCO Text v2 and generate MMOCR text-detection annotations.

  Downloads the COCO ``train2014`` images and the COCO-Text v2 annotation zip
  into ``<npath>/coco_textv2``, renames the extracted image folder to ``imgs``,
  moves ``cocotext.v2.json`` into ``annotations`` and runs the MMOCR detection
  converter for the train and val splits.

  Args:
    npath: Parent directory in which the ``coco_textv2`` folder is created.
    cleanup: When True, delete both downloaded zip archives after conversion.
  """
  root=os.path.join(npath,'coco_textv2')
  dannot=os.path.join(root,'annotations')

  dpath=dict(dt_tr=dict(URL = 'http://images.cocodataset.org/zips/train2014.zip',
                         fpath=os.path.join(root,'train2014.zip')),
             dt_trx=dict(URL = 'https://github.com/bgshih/cocotext/releases/download/dl/cocotext.v2.zip',
                         fpath=os.path.join(root,'cocotext.v2.zip'))
            )

  for dp in (root,dannot):
    ch_make_folder(dp)

  # Potential error with Google Colab whereby the session crashes after using
  # all available RAM, hence the wget download path.
  logging.info ('This may take sometime to download ~12.58 Gb zip file')

  for dp in ('dt_tr','dt_trx'):
    check_dw(dpath[dp]['fpath'],dpath[dp]['URL'],wget_dw=True)

  logging.info('Unpacking file')
  shutil.unpack_archive(dpath['dt_tr']['fpath'],root)
  shutil.unpack_archive(dpath['dt_trx']['fpath'],root)

  os.rename(os.path.join(root,'train2014'),
            os.path.join(root,'imgs'))

  shutil.move(os.path.join(root, 'cocotext.v2.json'),
              dannot)

  from tools.data.textdet.cocotext_converter import collect_cocotext_info,convert_annotations
  import os.path as osp

  root_path =root
  print('Processing training set...')
  training_infos = collect_cocotext_info(root_path, 'train')
  convert_annotations(training_infos,
                      osp.join(root_path, 'instances_training.json'))
  print('Processing validation set...')
  val_infos = collect_cocotext_info(root_path, 'val')
  convert_annotations(val_infos, osp.join(root_path, 'instances_val.json'))
  print('Finish')

  if cleanup:
    logging.info ('Cleaning up')
    # Only the archives are left over: the extracted folder was renamed to
    # imgs and the json was moved, so there is no "dataset" folder to delete.
    os.remove(dpath['dt_tr']['fpath'])
    os.remove(dpath['dt_trx']['fpath'])

  # Step2: Generate instances_training.json and instances_val.json with the following command:

  # python tools/data/textdet/cocotext_converter.py PATH/TO/coco_textv2

  # After running the above codes, the directory structure should be as follows:

  # │── coco_textv2
  # │   ├── annotations
  # │   ├── imgs
  # │   ├── instances_training.json
  # │   └── instances_val.json
    
coco_textv2('/content/drive/MyDrive/dataset/detection') # Work like a charm at local

INFO:root:This may take sometime to download ~12.58 Gb zip file
INFO:root:Downloading train2014.zip from http://images.cocodataset.org/zips/train2014.zip.
INFO:root:Downloading cocotext.v2.zip from https://github.com/bgshih/cocotext/releases/download/dl/cocotext.v2.zip.
INFO:root:Unpacking file


# Vintext

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#vintext)

In [None]:
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UUQhNvzgpZy7zXBFQp0Qox-BBjunZ0ml' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1UUQhNvzgpZy7zXBFQp0Qox-BBjunZ0ml" -O vintext.zip && rm -rf /tmp/cookies.txt

In [None]:
def vintext(npath,cleanup=False):
  """Prepare VinText for MMOCR text recognition from a pre-downloaded zip.

  Expects ``vintext.zip`` to be present in ``<npath>/vintext`` (the archive is
  not downloaded here). The zip is unpacked, labels and the three image sets
  are moved into ``annotations`` and ``imgs/{training,test,unseen_test}``, and
  the MMOCR recognition converter is run per split in ``jsonl`` format.

  Args:
    npath: Parent directory in which the ``vintext`` folder is created.
    cleanup: When True, delete the extracted ``vietnamese`` folder and the
      zip after conversion.

  Raises:
    FileNotFoundError: If the archive is missing from the dataset folder.
  """
  # Recognition

  root=os.path.join(npath,'vintext')
  dannot=os.path.join(root,'annotations')

  dimg=os.path.join(root,'imgs')
  dimg_tr=os.path.join(dimg,'training')
  dimg_ts=os.path.join(dimg,'test')
  dimg_ur=os.path.join(dimg,'unseen_test')
  dpath=dict(dt_img=dict(URL = 'https://docs.google.com/uc?export=download&id=1UUQhNvzgpZy7zXBFQp0Qox-BBjunZ0ml',
                         fpath=os.path.join(root,'vintext.zip'))
            )

  # Folders are created first so the expected location exists for a manual
  # download of the archive.
  for dp in (root,dannot,dimg,
             dimg_tr,
             dimg_ur,
             dimg_ts):
    ch_make_folder(dp)

  archive=dpath['dt_img']['fpath']
  if not isfile(archive):
      # Stop here with clear instructions instead of letting the unpack step
      # fail on a file that does not exist.
      fd,fname=os.path.split(archive)
      source_url=dpath['dt_img']['URL']
      raise FileNotFoundError(
          f'Please download the {fname} and stored under {fd}. '
          f'Download source: {source_url}')

  logging.info('Unpacking file')
  shutil.unpack_archive(archive,root)

  logging.info('Create annotation')
  for dfrom,dto in zip(['labels','train_images','test_image','unseen_test_images'],
                       [dannot,dimg_tr,dimg_ts,dimg_ur]):
    move_files(os.path.join(root,'vietnamese',dfrom),dto)

  preserve_vertical=True
  nproc=10
  ann_format='jsonl'  # local name chosen so the builtin format() is not shadowed
  import os.path as osp
  import mmcv
  from tools.data.textrecog.vintext_converter import collect_files,collect_annotations,generate_ann
  root_path = root
  for split in ['training', 'test', 'unseen_test']:
      print(f'Processing {split} set...')
      with mmcv.Timer(
              print_tmpl='It takes {}s to convert VinText annotation'):
          files = collect_files(
              osp.join(root_path, 'imgs', split),
              osp.join(root_path, 'annotations'))
          image_infos = collect_annotations(files, nproc=nproc)
          generate_ann(root_path, split, image_infos, preserve_vertical,
                      ann_format)

  if cleanup:
    logging.info ('Cleaning up')
    shutil.rmtree(os.path.join(root,'vietnamese'))
    os.remove(archive)




    
vintext('/content/drive/MyDrive/dataset/recognition') # Working locally

In [None]:
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UUQhNvzgpZy7zXBFQp0Qox-BBjunZ0ml' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1UUQhNvzgpZy7zXBFQp0Qox-BBjunZ0ml" -O vintext.zip && rm -rf /tmp/cookies.txt

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#vintext)

In [None]:

def vintext(npath,cleanup=False):
    """Prepare VinText for MMOCR text detection from a pre-downloaded zip.

    Expects ``vietnamese_original.zip`` to be present in ``<npath>/vintext``
    (the archive is not downloaded here). The zip is unpacked, labels and the
    three image sets are moved into ``annotations`` and
    ``imgs/{training,test,unseen_test}``, and the MMOCR detection converter is
    run per split, with ``instances_<split>.json`` as the output path.

    Args:
        npath: Parent directory in which the ``vintext`` folder is created.
        cleanup: When True, delete the extracted ``vietnamese`` folder and the
            zip after conversion.

    Raises:
        FileNotFoundError: If the archive is missing from the dataset folder.
    """
    # Detection

    root=os.path.join(npath,'vintext')
    dannot=os.path.join(root,'annotations')

    dimg=os.path.join(root,'imgs')
    dimg_tr=os.path.join(dimg,'training')
    dimg_ts=os.path.join(dimg,'test')
    dimg_ur=os.path.join(dimg,'unseen_test')
    dpath=dict(dt_img=dict(URL = 'https://docs.google.com/uc?export=download&id=1UUQhNvzgpZy7zXBFQp0Qox-BBjunZ0ml',
                           fpath=os.path.join(root,'vietnamese_original.zip'))
               )

    # Folders are created first so the expected location exists for a manual
    # download of the archive.
    for dp in (root,dannot,dimg,
               dimg_tr,
               dimg_ur,
               dimg_ts):
        ch_make_folder(dp)

    archive=dpath['dt_img']['fpath']
    if not isfile(archive):
        # Stop here with clear instructions instead of letting the unpack step
        # fail on a file that does not exist.
        fd,fname=os.path.split(archive)
        source_url=dpath['dt_img']['URL']
        raise FileNotFoundError(
            f'Please download the {fname} and stored under {fd}. '
            f'Download source: {source_url}')

    logging.info('Unpacking file')
    shutil.unpack_archive(archive,root)

    logging.info('Move files')
    for dfrom,dto in zip(['labels','train_images','test_image','unseen_test_images'],
                         [dannot,dimg_tr,dimg_ts,dimg_ur]):
        move_files(os.path.join(root,'vietnamese',dfrom),dto)

    nproc=10

    # Imported locally so the function does not depend on an earlier cell
    # having put mmcv / osp into the notebook namespace.
    import os.path as osp
    import mmcv
    from tools.data.textdet.vintext_converter import collect_files,collect_annotations,convert_annotations
    root_path = root
    print('Prepare annotation')
    for split in ['training', 'test', 'unseen_test']:
        print(f'Processing {split} set...')
        with mmcv.Timer(
                print_tmpl='It takes {}s to convert VinText annotation'):
            files = collect_files(
                osp.join(root_path, 'imgs', split),
                osp.join(root_path, 'annotations'))
            image_infos = collect_annotations(files, nproc=nproc)
            convert_annotations(
                image_infos, osp.join(root_path,
                                      'instances_' + split + '.json'))

    if cleanup:
        logging.info ('Cleaning up')
        shutil.rmtree(os.path.join(root,'vietnamese'))
        os.remove(archive)

# Step2: Generate instances_training.json, instances_test.json and instances_unseen_test.json

# python tools/data/textdet/vintext_converter.py PATH/TO/vintext --nproc 4

# After running the above codes, the directory structure should be as follows:

# │── vintext
# │   ├── annotations
# │   ├── imgs
# │   ├── instances_test.json
# │   ├── instances_unseen_test.json
# │   └── instances_training.json

vintext('/content/drive/MyDrive/dataset/detection') # Work like a charm at local


# BID

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#bid)

In [None]:
!pip install gdown

In [None]:

def bid(npath,cleanup=False):
    """Download BID and generate MMOCR text-recognition annotations.

    Downloads ``BID Dataset.zip`` from Google Drive with ``gdown`` (skipped
    when the file already exists), unpacks it under ``<npath>/bid``, moves the
    ``.jpg`` files to ``imgs`` and the ``.txt`` files to ``annotations`` and
    runs the MMOCR recognition converter with a validation ratio of 0.2 in
    ``jsonl`` format.

    Args:
        npath: Parent directory in which the ``bid`` folder is created.
        cleanup: When True, delete the extracted ``BID Dataset`` folder and
            the downloaded zip after conversion.
    """
    # Recognition
    import gdown

    from glob import glob

    root=os.path.join(npath,'bid')

    dannot=os.path.join(root,'annotations')

    dimg=os.path.join(root,'imgs')

    dpath=dict(dt_img=dict(URL = 'https://drive.google.com/uc?id=1Oi88TRcpdjZmJ79WDLb9qFlBNG8q2De6&export=download',
                           fpath=os.path.join(root,'BID Dataset.zip'))
               )

    for dp in (root,dannot,dimg):
        ch_make_folder(dp)

    if not isfile(dpath['dt_img']['fpath']):
        logging.info ('It may take sometime to download 6.81 Gb zip file from Google Drive')
        gdown.download(dpath['dt_img']['URL'], dpath['dt_img']['fpath'], quiet=False)

    logging.info ('It may take sometime to extract 6.81 Gb zip')
    shutil.unpack_archive(dpath['dt_img']['fpath'],root)

    # Images and label files sit one folder level below "BID Dataset".
    extracted=os.path.join(root,'BID Dataset')
    ls_jpg=glob(os.path.join(extracted,'*','*.jpg'))
    ls_txt=glob(os.path.join(extracted,'*','*.txt'))

    for ls_ext,ls_dest in zip([ls_jpg,ls_txt],
                      [dimg,dannot]):
        move_files_to_des(ls_ext,ls_dest)

    root_path = root
    preserve_vertical=True
    nproc=10
    ann_format='jsonl'  # local name chosen so the builtin format() is not shadowed
    val_ratio=0.2
    import os.path as osp
    import mmcv
    from tools.data.textrecog.bid_converter import collect_files,collect_annotations,generate_ann
    with mmcv.Timer(print_tmpl='It takes {}s to convert BID annotation'):
        files = collect_files(
            osp.join(root_path, 'imgs'), osp.join(root_path, 'annotations'))
        print('Start Collect annotation')
        image_infos = collect_annotations(files, nproc=nproc)
        print('Start Generating  annotation')
        generate_ann(root_path, image_infos, preserve_vertical,
                     val_ratio, ann_format)

    if cleanup:
        logging.info ('Cleaning up')
        # Remove what this function itself left behind. The previous cleanup
        # tried to move files out of a "vietnamese/labels" folder, which BID
        # never creates.
        shutil.rmtree(extracted)
        os.remove(dpath['dt_img']['fpath'])
  
#     ├── BID
# │   ├── crops
# │   ├── ignores
# │   ├── train_label.jsonl
# │   └── val_label.jsonl (optional)
bid('/content/drive/MyDrive/dataset/recognition') # Work like a charm locally

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#bid)

In [None]:
def bid(npath,cleanup=False):
    """Download BID and generate MMOCR text-detection annotations.

    Downloads ``BID Dataset.zip`` from Google Drive with ``gdown`` (skipped
    when the file already exists), unpacks it under ``<npath>/bid``, moves the
    ``.jpg`` files to ``imgs`` and the ``.txt`` files to ``annotations``, then
    splits the collected annotations with a validation ratio of 0.2 and passes
    ``instances_training.json`` / ``instances_val.json`` as output paths to
    the MMOCR detection converter.

    Args:
        npath: Parent directory in which the ``bid`` folder is created.
        cleanup: When True, delete the extracted ``BID Dataset`` folder and
            the downloaded zip after conversion.
    """
    import gdown

    from glob import glob

    root=os.path.join(npath,'bid')

    dannot=os.path.join(root,'annotations')

    dimg=os.path.join(root,'imgs')

    dpath=dict(dt_img=dict(URL = 'https://drive.google.com/uc?id=1Oi88TRcpdjZmJ79WDLb9qFlBNG8q2De6&export=download',
                           fpath=os.path.join(root,'BID Dataset.zip'))
               )

    for dp in (root,dannot,dimg):
        ch_make_folder(dp)

    if not isfile(dpath['dt_img']['fpath']):
        logging.info ('It may take sometime to download 6.81 Gb zip file from Google Drive')
        gdown.download(dpath['dt_img']['URL'], dpath['dt_img']['fpath'], quiet=False)

    logging.info ('It may take sometime to extract 6.81 Gb zip')
    shutil.unpack_archive(dpath['dt_img']['fpath'],root)

    # Images and label files sit one folder level below "BID Dataset".
    extracted=os.path.join(root,'BID Dataset')
    ls_jpg=glob(os.path.join(extracted,'*','*.jpg'))
    ls_txt=glob(os.path.join(extracted,'*','*.txt'))

    for ls_ext,ls_dest in zip([ls_jpg,ls_txt],
                      [dimg,dannot]):
        move_files_to_des(ls_ext,ls_dest)

    import os.path as osp
    import mmcv
    from tools.data.textdet.bid_converter import collect_files,collect_annotations,split_train_val_list,convert_annotations

    nproc=10
    val_ratio=0.2
    root_path =root
    with mmcv.Timer(print_tmpl='It takes {}s to convert BID annotation'):
        files = collect_files(
            osp.join(root_path, 'imgs'), osp.join(root_path, 'annotations'))
        image_infos = collect_annotations(files, nproc=nproc)
        # The original dataset has no validation set; a non-zero val_ratio
        # splits one off, otherwise everything stays in the training split.
        if val_ratio:
            image_infos = split_train_val_list(image_infos, val_ratio)
            splits = ['training', 'val']
        else:
            image_infos = [image_infos]
            splits = ['training']
        for i, split in enumerate(splits):
            convert_annotations(
                image_infos[i],
                osp.join(root_path, 'instances_' + split + '.json'))

    # CLI equivalent: python tools/data/textdet/bid_converter.py PATH/TO/BID --nproc 4
    #
    # Output paths passed to the converter:
    # │── BID
    # │   ├── annotations
    # │   ├── imgs
    # │   ├── instances_training.json
    # │   └── instances_val.json (optional)

    if cleanup:
        logging.info ('Cleaning up')
        # Remove what this function itself left behind. The previous cleanup
        # tried to move files out of a "vietnamese/labels" folder, which BID
        # never creates.
        shutil.rmtree(extracted)
        os.remove(dpath['dt_img']['fpath'])

bid('/content/detection') # Yet to check

INFO:root:It may take sometime to extract 6.81 Gb zip


# ArT

## [Text Recognition](https://mmocr.readthedocs.io/en/latest/datasets/recog.html#art)

In [None]:
def art(npath,cleanup=False):
  """Download ArT (task 2) and generate MMOCR text-recognition annotations.

  Fetches the task-2 image archive and label file into ``<npath>/art``,
  renames the extracted ``train_task2_images`` folder to ``imgs``, moves the
  labels into ``annotations`` and runs MMOCR's ``convert_art`` for the train
  split and, since the validation ratio is 0.2, for a val split as well.

  Args:
    npath: Parent directory in which the ``art`` folder is created.
    cleanup: When True, delete the downloaded image archive afterwards.
  """
  # Recognition
  art_dir=os.path.join(npath,'art')
  ann_dir=os.path.join(art_dir,'annotations')
  crop_dir=os.path.join(art_dir,'crops')

  images_tar=os.path.join(art_dir,'train_task2_images.tar.gz')
  labels_json=os.path.join(art_dir,'train_task2_labels.json')
  # (local target, remote source) pairs, downloaded in this order
  downloads=(
      (images_tar,'https://dataset-bj.cdn.bcebos.com/art/train_task2_images.tar.gz'),
      (labels_json,'https://dataset-bj.cdn.bcebos.com/art/train_task2_labels.json'),
  )

  for folder in (art_dir,ann_dir,crop_dir):
    ch_make_folder(folder)

  logging.info('It may take sometime to extract 439 Mb tar.gz')
  for target,url in downloads:
    check_dw(target,url)

  logging.info('It may take sometime to extract 439 Mb tar.gzp')
  shutil.unpack_archive(images_tar,art_dir)
  os.rename(os.path.join(art_dir,'train_task2_images'), os.path.join(art_dir,'imgs'))

  shutil.move(labels_json,
              os.path.join(ann_dir,'train_task2_labels.json'))

  val_ratio=0.2

  from tools.data.textrecog.art_converter import convert_art

  # Splits to convert, each with the banner printed before it.
  stages=[('train','Processing training set...')]
  if val_ratio > 0:
    stages.append(('val','Processing validation set...'))
  for split,banner in stages:
    print(banner)
    convert_art(
        root_path=art_dir,
        split=split,
        ratio=val_ratio,
        format='jsonl')
  print('Finish')

  if cleanup:
    logging.info('Cleaning up')
    os.remove(images_tar)
# │── art
# │   ├── crops
# │   ├── train_label.jsonl
# │   └── val_label.jsonl (optional)
art('/content')

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#art)

In [None]:
def art(npath,cleanup=False):
  """Download ArT and generate MMOCR text-detection annotations.

  Fetches the training image archive and label file into ``<npath>/art``,
  renames the extracted ``train_images`` folder to ``imgs``, moves the labels
  into ``annotations`` and runs the MMOCR detection converter for a training
  and a validation split (validation ratio 0.2).

  Args:
    npath: Parent directory in which the ``art`` folder is created.
    cleanup: When True, delete the downloaded image archive afterwards.
  """
  root=os.path.join(npath,'art')
  dannot=os.path.join(root,'annotations')
  dpath=dict(dt_img=dict(URL = 'https://dataset-bj.cdn.bcebos.com/art/train_images.tar.gz',
                         fpath=os.path.join(root,'train_images.tar.gz')),
             lbl=dict(URL = 'https://dataset-bj.cdn.bcebos.com/art/train_labels.json',
                         fpath=os.path.join(root,'train_labels.json'))
            )
  # Only the root and annotation folders are needed for detection; the
  # "crops" folder belongs to the recognition variant and is not defined here.
  for dp in (root,dannot):
    ch_make_folder(dp)

  logging.info ('It may take sometime to extract 439 Mb tar.gz')

  for dp in ('dt_img','lbl'):
    check_dw(dpath[dp]['fpath'],dpath[dp]['URL'])

  logging.info ('It may take sometime to extract 6.81 Gb zip')
  shutil.unpack_archive(dpath['dt_img']['fpath'],root)
  # Extracted folder name differs from the recognition variant.
  os.rename(os.path.join(root,'train_images'), os.path.join(root,'imgs'))

  shutil.move(dpath['lbl']['fpath'],dannot)

  import os.path as osp

  from tools.data.textdet.art_converter import collect_art_info, convert_annotations

  # Annotations of the ArT test split are not publicly available, so a
  # validation set is split off (CLI equivalent: --val-ratio 0.2).
  root_path = root
  val_ratio=0.2
  print('Processing training set...')
  training_infos = collect_art_info(root_path, 'train', val_ratio)
  convert_annotations(training_infos,
                      osp.join(root_path, 'instances_training.json'))
  if val_ratio > 0:
      print('Processing validation set...')
      val_infos = collect_art_info(root_path, 'val', val_ratio)
      convert_annotations(val_infos, osp.join(root_path,
                                              'instances_val.json'))
  print('Finish')

  # Output paths passed to the converter:
  # │── art
  # │   ├── annotations
  # │   ├── imgs
  # │   ├── instances_training.json
  # │   └── instances_val.json (optional)

  # Honour the cleanup flag; it was accepted but ignored before.
  if cleanup:
    logging.info ('Cleaning up')
    os.remove(dpath['dt_img']['fpath'])

art('/content/detection/')

INFO:root:It may take sometime to extract 439 Mb tar.gz
INFO:root:It may take sometime to extract 6.81 Gb zip


Processing training set...
training #4482, val #1121
1000/4482
2000/4482
3000/4482
4000/4482
Processing validation set...
training #4482, val #1121
1000/1121
Finish


# CTW1500


## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#ctw1500)

In [None]:
def ctw1500(npath,cleanup=False):
  """Download and extract the CTW1500 text-detection archives under ``npath``.

  Creates ``<npath>/ctw1500`` with ``training`` and ``test`` sub-folders,
  stores the four zip archives (train/test labels and images) directly in
  ``ctw1500`` via ``check_dw``, then unpacks the training archives into
  ``training`` and the test archives into ``test``.

  Args:
    npath: Parent directory in which the ``ctw1500`` folder is created.
    cleanup: When True, delete the downloaded zip archives after extraction.
  """
  root=os.path.join(npath,'ctw1500')
  dtr=os.path.join(root,'training')
  dts=os.path.join(root,'test')

  # Each entry records where to download from, where the archive is stored
  # and which folder it is unpacked into. Insertion order is the processing
  # order: labels first, then images.
  dpath=dict(lbl_tr=dict(URL = 'https://universityofadelaide.box.com/shared/static/jikuazluzyj4lq6umzei7m2ppmt3afyw.zip',
                         fpath=os.path.join(root,'train_labels.zip'),
                         dest=dtr),
             lbl_ts=dict(URL = 'https://cloudstor.aarnet.edu.au/plus/s/uoeFl0pCN9BOCN5/download',
                         fpath=os.path.join(root,'test_labels.zip'),
                         dest=dts),
             dt_tr=dict(URL = 'https://universityofadelaide.box.com/shared/static/py5uwlfyyytbb2pxzq9czvu6fuqbjdh8.zip',
                         fpath=os.path.join(root,'train_images.zip'),
                         dest=dtr),
             dt_ts=dict(URL = 'https://universityofadelaide.box.com/shared/static/t4w48ofnqkdw7jyc4t11nsukoeqk9c3d.zip',
                         fpath=os.path.join(root,'test_images.zip'),
                         dest=dts)
            )
  for dp in (root,dtr,dts):
    ch_make_folder(dp)

  # The previous messages were copied from the ArT cell and described the
  # wrong step and the wrong archives.
  logging.info('Downloading the CTW1500 label and image archives')
  for item in dpath.values():
    check_dw(item['fpath'],item['URL'])

  logging.info('Extracting the CTW1500 archives, this may take some time')
  for item in dpath.values():
    shutil.unpack_archive(item['fpath'],item['dest'])

  if cleanup:
    logging.info('Cleaning up')
    for item in dpath.values():
      # Report through logging, like every other progress message here.
      logging.info(f"Removing {item['fpath']}")
      os.remove(item['fpath'])

# Target layout for CTW1500 (NOTE(review): ctw1500() as written only extracts
# the archives into ctw1500/training and ctw1500/test; producing the tree
# below presumably needs a further conversion step - TODO confirm):
# ├── ctw1500
# │   ├── imgs
# │   ├── annotations
# │   ├── instances_training.json
# │   └── instances_val.json
# Download and extract CTW1500 under /content.
ctw1500('/content')

# CurvedSynText150k (KIV)

Since the files are too large (about 32 GB), it is advisable to download them first and store them in the target folder.

## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#curvedsyntext150k)

In [None]:



def curvedsyntext(npath,cleanup=False):
  """Download and extract the two CurvedSynText150k archives from Google Drive.

  Creates ``<npath>/curvedsyntext`` with ``training`` and ``test``
  sub-folders, downloads ``syntext1.zip`` and ``syntext2.zip`` into
  ``curvedsyntext`` with ``gdown`` (skipping files that already exist), then
  unpacks the first archive into ``training`` and the second into ``test``.

  Args:
    npath: Parent directory in which the ``curvedsyntext`` folder is created.
    cleanup: When True, delete the downloaded zip archives after extraction.
  """
  # Function-scope imports, like the ``import gdown`` this function already had.
  import warnings
  import gdown

  # ``Warning`` is the built-in exception class and has no ``warn``
  # attribute; the warning has to be issued through the ``warnings`` module.
  warnings.warn('WIP: Still find an alternative on automatically download from Google Drive')
  root=os.path.join(npath,'curvedsyntext')
  dtr=os.path.join(root,'training')
  dts=os.path.join(root,'test')

  # URLs use the direct-download ``uc?id=<file id>`` form that gdown expects
  # (the ``open?id=`` viewer links are not download links). File names must
  # end exactly in ``.zip`` so ``shutil.unpack_archive`` can detect the
  # format; the earlier names carried a trailing space.
  dpath=dict(dt_1=dict(URL = 'https://drive.google.com/uc?id=1OSJ-zId2h3t_-I7g_wUkrK-VqQy153Kj',
                         fpath=os.path.join(root,'syntext1.zip'),
                         dest=dtr),
             dt_2=dict(URL = 'https://drive.google.com/uc?id=1EzkcOlIgEp5wmEubvHb7-J5EImHExYgY',
                         fpath=os.path.join(root,'syntext2.zip'),
                         dest=dts),
            )
  for dp in (root,dtr,dts):
    ch_make_folder(dp)

  for item in dpath.values():
    logging.info ('It may take sometime to download 32 Gb zip file from Google Drive')
    # Archives placed in the folder beforehand are reused instead of re-downloaded.
    if not isfile(item['fpath']):
        gdown.download(item['URL'], item['fpath'], quiet=False)

  logging.info('Extracting the CurvedSynText150k archives, this may take some time')
  for item in dpath.values():
    shutil.unpack_archive(item['fpath'],item['dest'])

  if cleanup:
    logging.info ('Cleaning up')
    # Iterate over the archives actually declared above; the previous key
    # list belonged to another dataset and raised KeyError.
    for item in dpath.values():
      logging.info(f"Removing {item['fpath']}")
      os.remove(item['fpath'])

# Target layout for CurvedSynText150k (NOTE(review): curvedsyntext() creates
# curvedsyntext/training and curvedsyntext/test, so this tree presumably
# describes the final arrangement after manual steps - TODO confirm):
# ├── CurvedSynText150k
# │   ├── syntext_word_eng
# │   ├── emcs_imgs
# │   └── instances_training.json
# Download and extract the archives under /content.
curvedsyntext('/content')

# SROIE

warning.warn('WIP since need to download from Google Drive')

Links

Main page
https://rrc.cvc.uab.es/?ch=13&com=downloads


https://rrc.cvc.uab.es/?com=downloads&action=download&ch=13&f=aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL29wZW4/aWQ9MVNoSXROV1h5aVkxdEZETTVXMDJiY2VIdUpqeWVlSmwy





https://rrc.cvc.uab.es/?com=downloads&action=download&ch=13&f=aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL29wZW4/aWQ9MVNoSXROV1h5aVkxdEZETTVXMDJiY2VIdUpqeWVlSmwy


https://rrc.cvc.uab.es/?com=downloads&action=download&ch=13&f=aHR0cHM6Ly9ycmMuY3ZjLnVhYi5lcy9kb3dubG9hZHMvU1JPSUVfdGVzdF9pbWFnZXNfdGFza18zLnppcA==







## [Text Detection](https://mmocr.readthedocs.io/en/latest/datasets/det.html#sroie)

In [None]:
import warnings

# SROIE download is not automated yet. The module is ``warnings`` (plural);
# the notes below are kept as comments so that this cell is valid Python.
warnings.warn('WIP since need to download from Google Drive')

# Links
#
# Main page
# https://rrc.cvc.uab.es/?ch=13&com=downloads
#
#
# https://rrc.cvc.uab.es/?com=downloads&action=download&ch=13&f=aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL29wZW4/aWQ9MVNoSXROV1h5aVkxdEZETTVXMDJiY2VIdUpqeWVlSmwy

# KAIST (KOREAN) NA

# MTWI (Chinese) NA

# ReCTS

Robust Reading Challenge on Reading Chinese Text on Signboard

# IIIT-ILST


Devanagari ,Malayalam, Telugu

In [None]:
# IIIT-ILST download link: https://iiitaphyd-my.sharepoint.com/personal/minesh_mathew_research_iiit_ac_in/_layouts/15/download.aspx?UniqueId=dffd4198%2Dbcdc%2D4994%2D990d%2D682525da47dd

In [None]:
# Direct download attempt for the IIIT-ILST archive.
# NOTE(review): the recorded run of this cell ended with "ERROR 403: Forbidden",
# so the link presumably cannot be fetched without a browser session - TODO confirm.
!wget https://iiitaphyd-my.sharepoint.com/personal/minesh_mathew_research_iiit_ac_in/_layouts/15/download.aspx?UniqueId=dffd4198%2Dbcdc%2D4994%2D990d%2D682525da47dd

--2022-06-12 06:50:28--  https://iiitaphyd-my.sharepoint.com/personal/minesh_mathew_research_iiit_ac_in/_layouts/15/download.aspx?UniqueId=dffd4198%2Dbcdc%2D4994%2D990d%2D682525da47dd
Resolving iiitaphyd-my.sharepoint.com (iiitaphyd-my.sharepoint.com)... 13.107.136.9, 13.107.138.9
Connecting to iiitaphyd-my.sharepoint.com (iiitaphyd-my.sharepoint.com)|13.107.136.9|:443... connected.
HTTP request sent, awaiting response... 403 Forbidden
2022-06-12 06:50:29 ERROR 403: Forbidden.



# RCTW

ICDAR2017 Competition on Reading Chinese Text in the Wild

# HierText

HierText: a dataset with hierarchical (word, line and paragraph level) text annotations for natural-scene and document images.


Step2: Clone HierText repo to get annotations

# IMGUR (TBD)

In [None]:
def imgur(npath):
  """Prepare the folder skeleton for the IMGUR dataset under ``npath``.

  Creates ``<npath>/imgur`` together with its ``annotations`` and ``imgs``
  sub-folders and logs a notice about the download time. Nothing is
  downloaded: the URL / archive entry below still holds placeholder values.

  Args:
    npath: Parent directory in which the ``imgur`` folder is created.
  """
  base_dir=os.path.join(npath,'imgur')
  annotation_dir=os.path.join(base_dir,'annotations')
  image_dir=os.path.join(base_dir,'imgs')

  # Placeholder download descriptor, not consumed anywhere yet.
  placeholder={'dt_img': {'URL': 'XXX',
                          'fpath': os.path.join(base_dir,'XX')}}

  # Parent first, then the two children.
  ch_make_folder(base_dir)
  ch_make_folder(annotation_dir)
  ch_make_folder(image_dir)

  logging.info('Download images from imgur.com. This may take SEVERAL HOURS!')
 
# Create the imgur folder skeleton under /content (no data is downloaded by imgur()).
imgur('/content')

# Other

In [None]:
# !wget https://rrc.cvc.uab.es/downloads/ch9_training_images.zip --no-check-certificate


def check_dw(sfile,url,wget_dw=True):
  """Download ``url`` to ``sfile`` unless ``sfile`` already exists.

  This cell re-defines the ``check_dw`` helper declared near the top of the
  notebook. The ``wget_dw`` keyword is accepted here as well, so calls
  written for the earlier three-argument version keep working after this
  cell has been run.

  Args:
    sfile: Destination file path; nothing happens when it is already a file.
    url: Address to download from.
    wget_dw: Use ``wget.download`` when True (the default, which is what this
      cell always did); otherwise fetch the file with ``requests``.
  """
  if isfile(sfile):
    return
  logging.info(f'The file {sfile} is not availaible, downloading from {url}')
  if wget_dw:
    wget.download(url, out=sfile)
  else:
    # Same requests-based path as the earlier definition, which also
    # disables certificate verification.
    r = requests.get(url, verify=False,stream=True)
    with open(sfile, 'wb') as f:
      f.write(r.content)
# Fetch ch9_training_images.zip into the current working directory unless it is already there.
check_dw('ch9_training_images.zip','https://rrc.cvc.uab.es/downloads/ch9_training_images.zip')

In [None]:
import sys
import requests
def download(url, filename):
    """Stream ``url`` into ``filename`` while drawing a 50-character progress bar.

    The request is made, and its status checked, before the output file is
    opened, so a failed request neither leaves an empty file behind nor
    stores an HTML error page as if it were the data.

    Args:
        url: Address to download.
        filename: Path of the file to write (overwritten if it exists).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        header = response.headers.get('content-length')
        total = int(header) if header is not None else 0

        with open(filename, 'wb') as f:
            if not total:
                # Size unknown (or reported as zero): no bar can be drawn.
                f.write(response.content)
            else:
                downloaded = 0
                # About 1000 updates for large files, never below 1 MiB per chunk.
                chunk_size = max(total // 1000, 1024*1024)
                for data in response.iter_content(chunk_size=chunk_size):
                    downloaded += len(data)
                    f.write(data)
                    # Decoded bytes may exceed content-length for compressed
                    # transfers; clamp so the bar never overflows.
                    done = min(50, int(50*downloaded/total))
                    sys.stdout.write('\r[{}{}]'.format('█' * done, '.' * (50-done)))
                    sys.stdout.flush()
    sys.stdout.write('\n')

In [None]:

# Download the Syn90k label file into the current working directory as label.txt.
url='https://download.openmmlab.com/mmocr/data/mixture/Syn90k/label.txt'
download(url, 'label.txt')