# Install requirements

In [3]:
pip install -r drive/MyDrive/requirements.txt

Collecting PyMuPDF==1.18.10
[?25l  Downloading https://files.pythonhosted.org/packages/68/40/a6c75add19eebdc0aa17e54d05d5a52554291e75c6291d5a6be9cbdd457e/PyMuPDF-1.18.10-cp37-cp37m-manylinux2010_x86_64.whl (6.4MB)
[K     |████████████████████████████████| 6.4MB 6.0MB/s 
[?25hCollecting imageai==2.1.6
[?25l  Downloading https://files.pythonhosted.org/packages/73/44/3d5d8ef572888025666eec284e85f9243faf06ca8c12085dcff1ca9754ed/imageai-2.1.6-py3-none-any.whl (160kB)
[K     |████████████████████████████████| 163kB 41.0MB/s 
[?25hCollecting Pillow==7.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/f5/79/b2d5695d1a931474fa68b68ec93bdf08ba9acbc4d6b3b628eb6aac81d11c/Pillow-7.0.0-cp37-cp37m-manylinux1_x86_64.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 38.8MB/s 
[?25hCollecting tensorflow==2.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/94/0a/012cc33c643d844433d13001dd1db179e7020b05ddbbd0a9dc86c38a8efa/tensorflow-2.4.0-cp37-cp37m-manylinux

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Prepare dataset for training the model

In [25]:
import os
import shutil

import fitz
from PIL import Image
from imageai.Detection.Custom import CustomObjectDetection


def calc_percent(current, total, ndigits=2):
    return round(current / max(total, 1) * 100, ndigits)


def copy_label_images_data(src_dir='drive/MyDrive/pdf', dst_dir='drive/MyDrive/train_data/circle', train_data_ratio=0.8):
    """
    Copy the annotated pictures and marked files to the training data set folder
    :param src_dir: Annotated picture folder
    :param dst_dir: Training data folder
    :param train_data_ratio: Proportion of training data, 80% of the total data is used as training data by default
    """
    print('copy label images data: {} -> {}'.format(src_dir, dst_dir))

    # List of all picture folders
    pdf_dirs = []
    for dir_name in os.listdir(src_dir):
        if dir_name.startswith('.') or dir_name.endswith('.pdf'):
            continue

        dir_path = os.path.join(src_dir, dir_name)
        pdf_dirs.append(dir_path)

    # Training data set path
    train_data_dir = os.path.join(dst_dir, 'train')
    images_dir = os.path.join(train_data_dir, 'images')
    annotations_dir = os.path.join(train_data_dir, 'annotations')

    # Verify the data set path
    validation_dir = os.path.join(dst_dir, 'validation')
    images_valid_dir = os.path.join(validation_dir, 'images')
    annotations_valid_dir = os.path.join(validation_dir, 'annotations')

    # Find all the annotation data files
    annotations_files = []
    for pdf_dir in pdf_dirs:
        filenames = os.listdir(pdf_dir)
        filenames = list(filter(lambda name: name.endswith('.xml'), filenames))
        for filename in filenames:
            filename = filename.replace('.xml', '')
            filepath = pdf_dir
            annotations_files.append({
                'filename': filename,
                'path': filepath
            })

    # Calculate the number of training sets, and copy the data to the corresponding folders of the training set and the verification set
    train_data_cnt = round(len(annotations_files) * train_data_ratio)
    for item in annotations_files[:train_data_cnt]:
        filename = item['filename']
        filepath = item['path']

        dst_filename = '{}_{}'.format(os.path.basename(item['path']), filename)

        shutil.copyfile(
            os.path.join(filepath, '{}.jpg'.format(filename)),
            os.path.join(images_dir, '{}.jpg'.format(dst_filename))
        )

        shutil.copyfile(
            os.path.join(filepath, '{}.xml'.format(filename)),
            os.path.join(annotations_dir, '{}.xml'.format(dst_filename))
        )

    for item in annotations_files[train_data_cnt:]:
        filename = item['filename']
        filepath = item['path']

        dst_filename = '{}_{}'.format(os.path.basename(item['path']), filename)

        shutil.copyfile(
            os.path.join(filepath, '{}.jpg'.format(filename)),
            os.path.join(images_valid_dir, '{}.jpg'.format(dst_filename))
        )

        shutil.copyfile(
            os.path.join(filepath, '{}.xml'.format(filename)),
            os.path.join(annotations_valid_dir, '{}.xml'.format(dst_filename))
        )


def extract_images_from_pdf(pdf_dir):
    """
    Extract the pictures in the PDF file and store them in a folder named after the PDF file
    :param pdf_dir: The path of the folder where the PDF is stored
    """
    print('extract all pdf to images(jpg): {}'.format(pdf_dir))

    if not os.path.exists(pdf_dir):
        print('pdf dir not exists: {}'.format(pdf_dir))
        return

    # Get a list of all PDF file names
    pdf_names = os.listdir(pdf_dir)
    pdf_names = list(filter(lambda name: name.endswith('.pdf'), pdf_names))

    if not pdf_names:
        print('pdf file not found')
        return

    cnt = 1
    total = len(pdf_names)
    percent = calc_percent(cnt, total)
    print('{}% -> {}/{}'.format(percent, cnt, total))

    # Extract one by one
    for pdf_name in pdf_names:
        # PDF file name
        pdf_filename = os.path.join(pdf_dir, pdf_name)

        # Image output folder name
        img_output_dir = pdf_name.replace('.pdf', '')
        img_output_dir = os.path.join(pdf_dir, img_output_dir)

        # Delete the old picture output folder
        if os.path.exists(img_output_dir):
            shutil.rmtree(img_output_dir)

        # Create a new image output folder
        os.makedirs(img_output_dir)

        # Open PDF files
        with fitz.open(pdf_filename) as doc:
            page_cnt = 0
            page_total = doc.page_count

            # Traverse all pages
            for page in doc:
                img_filename = os.path.join(img_output_dir, 'page-{}.jpg'.format(page.number))

                # Get the bitmap of the page
                pix = page.get_pixmap()

                # Save the bitmap as a file
                img = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
                img.save(img_filename, 'PNG')

                page_cnt += 1
                page_percent = calc_percent(page_cnt, page_total)
                print('\rpdf: {}% -> {}/{}, page: {}% -> {}/{}'.format(percent, cnt, total, page_percent, page_cnt, page_total), end='')

        # Print the progress
        percent = calc_percent(cnt, total)
        cnt += 1
        print('{}% -> {}/{}'.format(percent, cnt, total))

In [26]:
pdf_dir = 'drive/MyDrive/pdf'
extract_images_from_pdf(pdf_dir)

extract all pdf to images(jpg): drive/MyDrive/pdf
5.26% -> 1/19
pdf: 5.26% -> 1/19, page: 100.0% -> 120/1205.26% -> 2/19
pdf: 5.26% -> 2/19, page: 100.0% -> 120/12010.53% -> 3/19
pdf: 10.53% -> 3/19, page: 100.0% -> 148/14815.79% -> 4/19
pdf: 15.79% -> 4/19, page: 100.0% -> 138/13821.05% -> 5/19
pdf: 21.05% -> 5/19, page: 100.0% -> 118/11826.32% -> 6/19
pdf: 26.32% -> 6/19, page: 100.0% -> 120/12031.58% -> 7/19
pdf: 31.58% -> 7/19, page: 100.0% -> 116/11636.84% -> 8/19
pdf: 36.84% -> 8/19, page: 100.0% -> 196/19642.11% -> 9/19
pdf: 42.11% -> 9/19, page: 100.0% -> 92/9247.37% -> 10/19
pdf: 47.37% -> 10/19, page: 100.0% -> 132/13252.63% -> 11/19
pdf: 52.63% -> 11/19, page: 100.0% -> 102/10257.89% -> 12/19
pdf: 57.89% -> 12/19, page: 100.0% -> 68/6863.16% -> 13/19
pdf: 63.16% -> 13/19, page: 100.0% -> 164/16468.42% -> 14/19
pdf: 68.42% -> 14/19, page: 100.0% -> 559/55973.68% -> 15/19
pdf: 73.68% -> 15/19, page: 100.0% -> 158/15878.95% -> 16/19
pdf: 78.95% -> 16/19, page: 100.0% -> 165/165

mupdf: cannot find object in xref (1896 0 R)
mupdf: cannot find object in xref (1891 0 R)
mupdf: cannot find object in xref (1886 0 R)
mupdf: cannot find object in xref (1881 0 R)
mupdf: cannot find object in xref (1876 0 R)
mupdf: cannot find object in xref (1871 0 R)
mupdf: cannot find object in xref (1866 0 R)
mupdf: cannot find object in xref (1861 0 R)
mupdf: cannot find object in xref (1856 0 R)
mupdf: cannot find object in xref (1851 0 R)
mupdf: cannot find object in xref (1846 0 R)
mupdf: cannot find object in xref (1841 0 R)
mupdf: cannot find object in xref (1836 0 R)
mupdf: cannot find object in xref (1831 0 R)
mupdf: cannot find object in xref (1826 0 R)
mupdf: cannot find object in xref (1821 0 R)
mupdf: cannot find object in xref (1816 0 R)
mupdf: cannot find object in xref (1811 0 R)
mupdf: cannot find object in xref (1806 0 R)
mupdf: cannot find object in xref (1801 0 R)
mupdf: cannot find object in xref (1796 0 R)
mupdf: cannot find object in xref (1791 0 R)
mupdf: can

pdf: 94.74% -> 19/19, page: 100.0% -> 4/4100.0% -> 20/19


# Copy the training data set

In [None]:
import os
import shutil

def copy_label_images_data(src_dir='drive/MyDrive/pdf', dst_dir='drive/MyDrive/train_data/circle', train_data_ratio=0.8):
    """
    Copy the annotated pictures and marked files to the training data set folder
    :param src_dir: Annotated picture folder
    :param dst_dir: Training data folder
    :param train_data_ratio: Proportion of training data, 80% of the total data is used as training data by default
    """
    print('copy label images data: {} -> {}'.format(src_dir, dst_dir))

    # List of all picture folders
    pdf_dirs = []
    for dir_name in os.listdir(src_dir):
        if dir_name.startswith('.') or dir_name.endswith('.pdf'):
            continue

        dir_path = os.path.join(src_dir, dir_name)
        pdf_dirs.append(dir_path)

    # Training data set path
    train_data_dir = os.path.join(dst_dir, 'train')
    images_dir = os.path.join(train_data_dir, 'images')
    annotations_dir = os.path.join(train_data_dir, 'annotations')

    # Verify the data set path
    validation_dir = os.path.join(dst_dir, 'validation')
    images_valid_dir = os.path.join(validation_dir, 'images')
    annotations_valid_dir = os.path.join(validation_dir, 'annotations')

    # Find all the annotation data files
    annotations_files = []
    for pdf_dir in pdf_dirs:
        filenames = os.listdir(pdf_dir)
        filenames = list(filter(lambda name: name.endswith('.xml'), filenames))
        for filename in filenames:
            filename = filename.replace('.xml', '')
            filepath = pdf_dir
            annotations_files.append({
                'filename': filename,
                'path': filepath
            })

    # Calculate the number of training sets, and copy the data to the corresponding folders of the training set and the verification set
    train_data_cnt = round(len(annotations_files) * train_data_ratio)
    for item in annotations_files[:train_data_cnt]:
        filename = item['filename']
        filepath = item['path']

        dst_filename = '{}_{}'.format(os.path.basename(item['path']), filename)

        shutil.copyfile(
            os.path.join(filepath, '{}.jpg'.format(filename)),
            os.path.join(images_dir, '{}.jpg'.format(dst_filename))
        )

        shutil.copyfile(
            os.path.join(filepath, '{}.xml'.format(filename)),
            os.path.join(annotations_dir, '{}.xml'.format(dst_filename))
        )

    for item in annotations_files[train_data_cnt:]:
        filename = item['filename']
        filepath = item['path']

        dst_filename = '{}_{}'.format(os.path.basename(item['path']), filename)

        shutil.copyfile(
            os.path.join(filepath, '{}.jpg'.format(filename)),
            os.path.join(images_valid_dir, '{}.jpg'.format(dst_filename))
        )

        shutil.copyfile(
            os.path.join(filepath, '{}.xml'.format(filename)),
            os.path.join(annotations_valid_dir, '{}.xml'.format(dst_filename))
        )

In [None]:
copy_label_images_data()

copy label images data: drive/MyDrive/pdf -> drive/MyDrive/train_data/circle


# Annotate the data set

# Correct the annotated data

In [None]:
import os
import re

def correct_train_data():
    xml_dirs = (
        os.path.join('drive','MyDrive','train_data', 'circle', 'train', 'annotations'),
        os.path.join('drive','MyDrive','train_data', 'circle', 'validation', 'annotations')
    )

    for xml_dir in xml_dirs:
        xml_file_paths = os.listdir(xml_dir)
        xml_file_paths = list(filter(lambda name: name.endswith('.xml'), xml_file_paths))

        for xml_file_path in xml_file_paths:
            print('correct train data: {}'.format(xml_file_path))
            img_filename = xml_file_path.replace('.xml', '.jpg')
            xml_file_path = os.path.join(xml_dir, xml_file_path)
            with open(xml_file_path, 'r+') as fp:
                xml_text = fp.read()
                xml_text = re.sub('<filename>.*</filename>', '<filename>{}</filename>'.format(img_filename), xml_text)
                fp.seek(0)
                fp.write(xml_text)

In [None]:
correct_train_data()

correct train data: EE10_ROUGE_page-102.xml
correct train data: 590787567_page-71.xml
correct train data: 590787567_page-65.xml
correct train data: 590787567_page-63.xml
correct train data: 590787567_page-69.xml
correct train data: 590787567_page-49.xml
correct train data: 590787567_page-53.xml
correct train data: 590787567_page-51.xml
correct train data: 590787567_page-47.xml
correct train data: 590787567_page-45.xml
correct train data: 590787567_page-61.xml
correct train data: 590787567_page-403.xml
correct train data: 590787567_page-397.xml
correct train data: 590787567_page-399.xml
correct train data: 590787567_page-401.xml
correct train data: 590787567_page-391.xml
correct train data: 590787567_page-385.xml
correct train data: 590787567_page-389.xml
correct train data: 590787567_page-393.xml
correct train data: 590787567_page-387.xml
correct train data: 590787567_page-395.xml
correct train data: 590787567_page-383.xml
correct train data: 590787567_page-381.xml
correct train data: 

# Start training the model

In [None]:
import tensorflow as tf
from imageai.Detection.Custom import DetectionModelTrainer


def train():
    trainer = DetectionModelTrainer()
    trainer.setModelTypeAsYOLOv3()
    trainer.setDataDirectory(data_directory=os.path.join('drive','MyDrive','train_data', 'circle'))
    trainer.setTrainConfig(
        object_names_array=['circle'],
        batch_size=4,
        num_experiments=200,
        train_from_pretrained_model='pretrained-yolov3.h5'
    )
    trainer.trainModel()


def init_tf():
    # Set up GPU video memory to apply on demand to prevent insufficient video memory
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)

    print_is_gpu_available()


def print_is_gpu_available():
    print('gpu available: {}'.format(tf.test.is_gpu_available()))

In [None]:
init_tf()
train()

1 Physical GPUs, 1 Logical GPUs
gpu available: True
Generating anchor boxes for training images and annotation...
junk after document element: line 87, column 0
Ignore this bad annotation: drive/MyDrive/train_data/circle/train/annotations/590787567_page-135.xml
junk after document element: line 99, column 0
Ignore this bad annotation: drive/MyDrive/train_data/circle/train/annotations/590787567_page-137.xml
not well-formed (invalid token): line 51, column 1
Ignore this bad annotation: drive/MyDrive/train_data/circle/train/annotations/590787567_page-139.xml
not well-formed (invalid token): line 39, column 1
Ignore this bad annotation: drive/MyDrive/train_data/circle/train/annotations/590787567_page-149.xml
not well-formed (invalid token): line 51, column 1
Ignore this bad annotation: drive/MyDrive/train_data/circle/train/annotations/590787567_page-15.xml
not well-formed (invalid token): line 27, column 1
Ignore this bad annotation: drive/MyDrive/train_data/circle/train/annotations/590787





  "Even though the tf.config.experimental_run_functions_eagerly "


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

# After training, test the recognition effect

In [None]:
from imageai.Detection.Custom import CustomObjectDetection

def test(img_path, img_out, minimum_percentage_probability=10):
    """
    Recognition test
    :param img_path \\Image file path
    :param img_out \\Result image path
    :param minimum_percentage_probability \\Minimum similarity of graphics
    """
    print('test: {} -> {}'.format(img_path, img_out))

    # Load the model
    detector = CustomObjectDetection()
    detector.setModelTypeAsYOLOv3()
    detector.setModelPath('drive/MyDrive/train_data/circle/models/detection_model-ex-138--loss-0014.575.h5')
    detector.setJsonPath('drive/MyDrive/train_data/circle/models/detection_config.json')
    detector.loadModel()

    # Detect
    detections = detector.detectObjectsFromImage(
        input_image=img_path,
        output_image_path=img_out,
        minimum_percentage_probability=minimum_percentage_probability
    )

    # Print the result
    for detection in detections:
        print("{}: {} -> {}".format(detection['name'], detection['percentage_probability'], detection['box_points']))

In [None]:
# Recognition effect test
img_test = 'test2.jpg'
img_out = 'test_out2.jpg'
test(img_test, img_out)

test: test2.jpg -> test_out2.jpg
circle: 60.18859148025513 -> [185, 91, 205, 136]
circle: 54.27982211112976 -> [286, 210, 310, 255]
circle: 31.01051151752472 -> [446, 263, 471, 310]
circle: 47.973573207855225 -> [337, 289, 362, 332]
circle: 34.994131326675415 -> [180, 375, 203, 422]
circle: 48.67052137851715 -> [221, 533, 244, 568]
circle: 29.37847375869751 -> [326, 567, 354, 612]
circle: 39.46417272090912 -> [379, 583, 405, 617]


  "Even though the tf.config.experimental_run_functions_eagerly "
