# Installation

Clone the repo
```
git clone https://github.com/aubreymoore/CRB-Damage-Dataset-Improvement
```

Move to the new folder
```
cd CRB-Damage-Dataset-Improvement
```

Create a virtual environment
```
python3 -m venv .venv
```

Activate the new virtual environment
```
source .venv/bin/activate
```

Install required python modules
```
pip install -r code/requirements.txt
```

Create a .gitignore file and add .venv to the list of files and folders to be ignored.
Adding a virtual environment to a repository is bad practice.
```
echo ".venv" >> .gitignore
```

# References

https://pybit.es/articles/a-better-place-to-put-your-python-virtual-environments/

In [1]:
import os
import shutil
import glob
import fiftyone as fo
import logging
import sys

In [48]:
def update_requirements_file():
    """Write the installed packages to ``requirements.txt`` in freeze format.

    Runs ``pip list --format=freeze`` through the interpreter that is
    executing this code, so the listing reflects the active environment
    rather than whichever ``pip`` executable happens to be first on ``PATH``.
    The file is written to the current working directory, and only after pip
    has succeeded, so a failed run does not truncate an existing file.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Local imports keep this helper self-contained.
    import subprocess
    import sys

    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'list', '--format=freeze'],
        check=True,
        stdout=subprocess.PIPE,  # stderr is left attached so pip warnings stay visible
        text=True,
    )
    with open('requirements.txt', 'w') as f:
        f.write(result.stdout)

# update_requirements_file()

In [49]:
def create_new_dataset(original_ds_path, new_ds_path):
    """Copy a flat train/val dataset into an images/labels directory layout.

    For each of the ``train`` and ``val`` splits, every ``*.jpg`` file found
    directly in ``{original_ds_path}/<split>`` is copied to
    ``{new_ds_path}/images/<split>`` and every ``*.txt`` file to
    ``{new_ds_path}/labels/<split>``. A ``dataset.yaml`` describing the
    layout and the three class names (live, dead, vcut) is then written to
    ``new_ds_path``.

    Args:
        original_ds_path: Directory containing the ``train`` and ``val``
            folders to read from.
        new_ds_path: Directory to create. Missing parent directories are
            created as needed.

    Raises:
        FileExistsError: If ``new_ds_path`` already exists.
    """
    splits = ('train', 'val')
    # glob pattern -> sub-folder of the new dataset that receives the matches
    destinations = {'*.jpg': 'images', '*.txt': 'labels'}

    # makedirs (without exist_ok) creates missing parents but still refuses
    # to reuse an existing dataset directory.
    os.makedirs(new_ds_path)
    for subdir in destinations.values():
        for split in splits:
            os.makedirs(os.path.join(new_ds_path, subdir, split))

    for split in splits:
        for pattern, subdir in destinations.items():
            target_dir = os.path.join(new_ds_path, subdir, split)
            for filepath in glob.glob(os.path.join(original_ds_path, split, pattern)):
                # copy2 also preserves file metadata such as timestamps
                shutil.copy2(filepath, target_dir)

    yaml_lines = (
        f'path: {new_ds_path} \n',
        'train: ./images/train/ \n',
        'val: ./images/val/ \n',
        'names: \n',
        '  0: live \n',
        '  1: dead \n',
        '  2: vcut \n',
    )
    with open(os.path.join(new_ds_path, 'dataset.yaml'), 'w', encoding='utf-8') as f:
        f.write(''.join(yaml_lines))

In [50]:
def yolo2fiftyone(fo_dataset_name, dataset_dir, splits=("train", "val")):
    """Import a YOLOv5-format dataset into a new persistent FiftyOne dataset.

    Each split is added with ``Dataset.add_dir`` and the split name is passed
    as the tag, so samples can later be selected by split.

    Args:
        fo_dataset_name: Name given to the new FiftyOne dataset.
        dataset_dir: Directory of the YOLOv5-format dataset (presumably the
            folder that holds ``dataset.yaml`` -- confirm against the
            FiftyOne importer documentation).
        splits: Names of the splits to import. A tuple is used as the
            default so the default value cannot be mutated between calls.

    Returns:
        The ``fo.Dataset`` that was created and populated.
    """
    # The dataset name comes from the `fo_dataset_name` parameter; the
    # previous code referenced an undefined variable `name` here.
    dataset = fo.Dataset(fo_dataset_name, persistent=True)
    for split in splits:
        dataset.add_dir(
            dataset_dir=dataset_dir,
            dataset_type=fo.types.YOLOv5Dataset,
            split=split,
            tags=split,
        )
    return dataset

In [51]:
def create_datetime_tag():
    """Placeholder: not implemented yet; does nothing and returns ``None``."""
    return None

In [52]:
def remove_autocorrelated_images(dataset):
    """Placeholder: not implemented yet.

    The ``dataset`` argument is currently ignored and ``None`` is returned.
    """
    return None

# import numpy as np
# from numpy.linalg import norm

# def cosine_similarity(a, b):
#     return np.dot(a,b)/(norm(a)*norm(b))
 
# # a = np.array([2,1,2,3,2,9])
# # b = np.array([3,4,2,4,5,5])
# # cosine_similarity(a, b)


# sorted_by_datetime_view = dataset.load_saved_view('sorted_by_datatime_view')

# thresh = 0.92

# first_sample = True
# for sample in sorted_by_datetime_view:
#     if first_sample:
#         current_embeddings = sample.embeddings
#         similarity = 0.0
#         first_sample = False
#     else:
#         previous_embeddings = current_embeddings
#         current_embeddings = sample.embeddings
#         similarity = cosine_similarity(previous_embeddings, current_embeddings)
#         sample['similarity_with_prev_img'] = similarity
#     if similarity > thresh:
#         sample.tags.append(f'similarity>{thresh}')
#     else:
#         sample.tags.append('similarity OK') 
#     sample.save()

In [None]:
# MAIN
# 1. Build a YOLOv5-format copy of the original dataset unless it already exists.
# 2. Register that copy as a FiftyOne dataset unless one with that name exists.
# 3. Load the FiftyOne dataset and launch the FiftyOne app.
original_ds_path = '/home/aubrey/Desktop/Guam07-training-set/datasets/3class-no-symlinks'
new_ds_path = '/home/aubrey/datasets/Guam07v3'
fo_dataset_name = 'Guam07v3'
logfile = 'create_new_dataset.log'

# Set up logger to log to notebook and logfile

logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] {%(pathname)s:%(lineno)d} %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    handlers=[
        logging.FileHandler(filename=logfile),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger()

# wrangle dataset into YOLOv5 format

if os.path.exists(new_ds_path):
    logger.info(f'{new_ds_path} already exists in YOLOv5 format')
else:
    logger.info(f'creating dataset "{new_ds_path}" in YOLOv5 format')
    create_new_dataset(original_ds_path, new_ds_path)

# Create new FiftyOne dataset

if fo_dataset_name in fo.list_datasets():
    logger.info(f'FiftyOne dataset "{fo_dataset_name}" already exists')
else:
    logger.info(f'Creating FiftyOne dataset "{fo_dataset_name}"')
    # yolo2fiftyone's first parameter is `fo_dataset_name`; passing it as
    # `name=` raises TypeError (unexpected keyword argument).
    dataset = yolo2fiftyone(fo_dataset_name=fo_dataset_name, dataset_dir=new_ds_path)

# Load FiftyOne dataset and launch FiftyOne app in browser

logger.info(f'Loading FiftyOne dataset "{fo_dataset_name}"')
dataset = fo.load_dataset(fo_dataset_name)
logger.info(dataset)
logger.info('Launching FifyOne app in browser')
# Left as the final expression so the notebook displays the returned session.
fo.launch_app(dataset, auto=False)

[{3365230751.py:22} INFO - /home/aubrey/datasets/Guam07v3 already exists in YOLOv5 format
[{3365230751.py:30} INFO - FiftyOne dataset "Guam07v3" already exists
[{3365230751.py:36} INFO - Loading FiftyOne dataset "Guam07v3"
[{3365230751.py:38} INFO - Name:        Guam07v3
Media type:  image
Num samples: 10414
Persistent:  True
Tags:        []
Sample fields:
    id:               fiftyone.core.fields.ObjectIdField
    filepath:         fiftyone.core.fields.StringField
    tags:             fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:         fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    created_at:       fiftyone.core.fields.DateTimeField
    last_modified_at: fiftyone.core.fields.DateTimeField
    ground_truth:     fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)
[{3365230751.py:39} INFO - Launching FifyOne app in browser
Session launched. Run `session.show()` to open the App in a cell ou

Dataset:          Guam07v3
Media type:       image
Num samples:      10414
Selected samples: 0
Selected labels:  0
Session URL:      http://localhost:5151/