In [11]:
import pandas as pd
import glob
import os
from icecream import ic
import sqlite3
import random
from datetime import datetime
import shutil
import subprocess

In [12]:
# Directory containing the raw *.jpg images that form the unlabeled pool.
IMAGE_DIR_PATH = '/home/aubrey/Desktop/Guam07-training-set/rawdata'
# SQLite database file (relative to the working directory) recording each image's subset.
DB_PATH = 'active_learning.sqlite3'
# Root folder of the generated dataset (data.yaml, classes.txt and one folder per subset).
DATASET_DIR = '/home/aubrey/Desktop/Guam07-training-set/datasets/active_learning'
# Path of the labelImg entry-point script.
# Must be a plain string: a trailing comma after the literal would make it a 1-tuple.
LABELIMG_PATH = '/home/aubrey/labelImg/labelImg.py'

In [13]:
def time_format():
    """
    Returns the current local time formatted as 'HH:MM:SS --> ',
    ready to be used as a prefix for log messages.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    return stamp + ' --> '
  
# Have icecream call time_format() to build the prefix of every ic() message,
# so each debug line starts with the time it was emitted.
ic.configureOutput(prefix=time_format) 

In [14]:
def initialize_db(image_dir_path: str, db_path: str):
    """
    Creates (or re-creates) the 'images' table in the SQLite database at 'db_path'.

    The table has 3 fields: 'imagefilename', 'randy' and 'subset'.
    One record is inserted for every '*.jpg' file found directly in 'image_dir_path'.
    The default value for 'subset' is 'pool'.
    'randy' is populated with 'random.randint(0, 1000000)'.
    Set a seed in the main code to enable reproducible random sequences (eg: 'random.seed(42)').

    Any existing 'images' table is dropped first, so previous subset assignments are lost.

    Parameters:
        image_dir_path: folder which is scanned (non-recursively) for '*.jpg' files.
        db_path: path of the SQLite database file; it is created if it does not exist.
    """
    ic()
    # glob returns paths in arbitrary order; sorting them makes the pairing of
    # file names with random numbers depend only on the seed and the file names.
    imagepaths = sorted(glob.iglob(os.path.join(image_dir_path, '*.jpg')))
    records = [
        (os.path.basename(imagepath), random.randint(0, 1000000))
        for imagepath in imagepaths
    ]

    conn = sqlite3.connect(db_path)
    try:
        conn.execute('DROP TABLE IF EXISTS images;')
        # Single quotes: in SQL, double quotes denote identifiers, not strings.
        conn.execute("CREATE TABLE images (imagefilename TEXT, randy INTEGER, subset TEXT DEFAULT 'pool');")
        # Placeholders keep file names containing quote characters from breaking the statement.
        conn.executemany('INSERT INTO images (imagefilename, randy) VALUES (?, ?);', records)
        conn.commit()
    finally:
        # Release the database file even if one of the statements above fails.
        conn.close()

In [15]:
def pool2subset(subset: str, n: int):
    '''
    Moves up to "n" records from the "pool" subset to the subset named by "subset".

    The records chosen are the "pool" records with the smallest "randy" values
    (ties broken by rowid), so the selection is random but repeatable for a
    given database. If fewer than "n" records remain in the pool, all of them are moved.

    Parameters:
        subset: value written to the images.subset field (eg: "train000").
        n: maximum number of records to move.
    '''
    ic()
    # The rows are selected in a sub-query because "UPDATE ... ORDER BY ... LIMIT"
    # is only available in SQLite builds compiled with SQLITE_ENABLE_UPDATE_DELETE_LIMIT.
    sql = '''UPDATE images SET subset = ?
        WHERE rowid IN (
            SELECT rowid FROM images
            WHERE subset = 'pool'
            ORDER BY randy, rowid
            LIMIT ?
        );'''
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(sql, (subset, n))
        conn.commit()
    finally:
        # Release the database file even if the update fails.
        conn.close()

# # Adds 50 records to the 'train000' subset
# pool2subset('train000', 50)
# # Adds an additional 100 records to the 'train000' subset
# pool2subset('train000', 100)

In [16]:
def runquery(sql: str):
    """ 
    This function facilitates ad hoc queries.

    Executes 'sql' against the database at DB_PATH and returns all result
    rows as a list of tuples. No commit is issued, so it is meant for
    read-only statements such as SELECT.

    Raises whatever sqlite3 raises for an invalid statement; the connection
    is closed in that case too.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        return conn.execute(sql).fetchall()
    finally:
        conn.close()

# runquery('select count(*) from images GROUP BY subset;')

In [17]:
def create_dataset():
    """
    Rebuilds the dataset folder at DATASET_DIR from the current database contents.

    1. Deletes DATASET_DIR and everything in it, if it exists.
    2. Creates the 'train000' and 'validate000' folders.
    3. Writes 'data.yaml' (train/val folders, class count and class names)
       and 'classes.txt' (one class name per line) in DATASET_DIR.
    4. For every record in the 'images' table whose subset is not 'pool',
       creates a symlink DATASET_DIR/<subset>/<imagefilename> pointing to
       IMAGE_DIR_PATH/<imagefilename>.

    Reads the globals DATASET_DIR, IMAGE_DIR_PATH and DB_PATH.
    """
    class_names = ['undamaged', 'damaged', 'dead', 'vcut']

    # delete folders and files if they exist
    if os.path.isdir(DATASET_DIR):
        shutil.rmtree(DATASET_DIR)

    # create folder structure
    for subset_name in ['train000', 'validate000']:
        os.makedirs(os.path.join(DATASET_DIR, subset_name))

    # save data.yaml in top level folder
    # The lines are joined explicitly so none of them picks up the indentation
    # of this function; indented keys would make the file invalid YAML.
    yaml_lines = [
        f'train: {DATASET_DIR}/train000',
        f'val: {DATASET_DIR}/validate000',
        f'nc: {len(class_names)}',
        f'names: [{", ".join(class_names)}]',
    ]
    with open(os.path.join(DATASET_DIR, 'data.yaml'), 'w') as f:
        f.write('\n'.join(yaml_lines) + '\n')

    # save classes.txt in top level folder (one class name per line)
    with open(os.path.join(DATASET_DIR, 'classes.txt'), 'w') as f:
        f.write('\n'.join(class_names) + '\n')

    # Read the (image, subset) pairs, closing the connection even on failure.
    conn = sqlite3.connect(DB_PATH)
    try:
        pairs = conn.execute(
            "SELECT DISTINCT imagefilename, subset FROM images WHERE subset != 'pool';"
        ).fetchall()
    finally:
        conn.close()

    # Create symlinks to images (*.jpg)
    for imagefilename, subset in pairs:
        subset_dir = os.path.join(DATASET_DIR, subset)
        # Subsets other than the two created above get their folder on demand.
        os.makedirs(subset_dir, exist_ok=True)
        src = os.path.join(IMAGE_DIR_PATH, imagefilename)
        dst = os.path.join(subset_dir, imagefilename)
        # os.symlink raises on failure instead of silently ignoring it.
        os.symlink(src, dst)

In [18]:
def annotate(subset: str):
    """
    Executes labelImg to annotate specified subset

    labelImg is started with the folder DATASET_DIR/<subset> and the class
    file DATASET_DIR/classes.txt as arguments. The call waits until the
    labelImg process exits.

    Parameters:
        subset: name of the subset folder inside DATASET_DIR (eg: 'train000').
    """
    subprocess.run([
        'python3',
        '/home/aubrey/labelImg/labelImg.py',
        # Built from DATASET_DIR so this always points at the folder that
        # create_dataset() populated.
        os.path.join(DATASET_DIR, subset),
        os.path.join(DATASET_DIR, 'classes.txt')])
    
# annotate('validate000')

In [19]:
# MAIN
# Rebuilds the database and the dataset folder from scratch:
# every run drops the 'images' table and deletes DATASET_DIR.

random.seed(42) # set seed to ensure reproducible random number sequences

# Create database
initialize_db(IMAGE_DIR_PATH, DB_PATH)
ic(runquery('SELECT subset, COUNT(*) FROM images GROUP BY subset;'))
pool2subset('train000', 50)
pool2subset('validate000', 50)
ic(runquery('SELECT subset, COUNT(*) FROM images GROUP BY subset;'))

# Create dataset

create_dataset()

# Annotate train000

# annotate('train000')

ic('FINISHED')
# Trailing ';' keeps the notebook from echoing ic()'s return value as cell output.
ic("Run annotate('train000') to annotate train000 using labelImg.");

19:52:44 --> 3644326808.py:8 in initialize_db() at 19:52:44.921


19:52:45 --> runquery('SELECT subset, COUNT(*) FROM images GROUP BY subset;'): [('pool', 26263)]
19:52:45 --> 3236093874.py:5 in pool2subset() at 19:52:45.175
19:52:45 --> 3236093874.py:5 in pool2subset() at 19:52:45.198
19:52:45 --> runquery('SELECT subset, COUNT(*) FROM images GROUP BY subset;'): [('pool', 26163), ('train000', 50), ('validate000', 50)]
19:52:45 --> 'FINISHED'
19:52:45 --> 'Run "annotate(train000) to annotate train000 using labelImg,': 'Run "annotate(train000) to annotate train000 using labelImg,'


'Run "annotate(train000) to annotate train000 using labelImg,'

In [21]:
# annotate('train000')

Cancel creation.
