# Tag images from a file system
This notebook demonstrates how to use Splash-ML to scan a file system for files and then introduce them to splash-ml, how to save tag sets using the TagService in the tagging packages, and how to query on those tags.

The notebook uses mongomock to mimic a mongo database instance in memory. This could easily be replaced with a MongoClient from pymongo.

This notebook downloads and imports a pre-labelled data set that was generously provided by Carolin Sutter-Fella.

<cite>
    This data was acquired at beamline 12.3.2 at the Advanced Light
Source, which is a DOE Office of Science User Facility under
contract no. DE-AC02-05CH11231.
</cite>

The notebook downloads the dataset, which is a set of .tiff files and a csv that contains labels and the human tagger's confidence for each file. 
Labels include 'peaks', 'rings', 'rods' and 'arcs'. This notebook does the following:
- Download sample files locally into a folder called 'data/labelled'
- "Anonymize" the files, which copies them into a folder 'data/anonymous'
- Sets up a TagService instance and introduces those files from 'data/anonymous' and their labels from the labels.csv file
- Demonstrates querying for those tags



In [None]:
import sys; sys.path.insert(0, '../..')
from datetime import datetime
import glob
import os
from pathlib import Path
import pandas as pd

from mongomock import MongoClient
from tagging.model import FileDataset, TagSource, TaggingEvent, DatasetType, Tag
from tagging.tag_service import TagService
from tagging.util.files import anonymize_copy

# Choose which MongoClient implementation the rest of the notebook uses.
# By default no mongo server is required: the mongomock library provides an
# in-memory simulation of mongo, and its contents are lost when the kernel
# is shut down. Set in_memory_db to False to talk to a real server instead.
in_memory_db = True
if in_memory_db:
    from mongomock import MongoClient
else:
    # For now this expects mongo to be running on localhost:27017, which is
    # where a MongoClient() created without arguments connects.
    from pymongo import MongoClient

# Folder that holds the downloaded, labelled sample files.
src_root_path = os.path.join("data", "labelled")
# Passed to anonymize_copy together with dest_root when the files are copied.
src_relative_path = "labelled"
# Destination for the anonymized copies: data/anonymous/labelled.
dest_root = os.path.join("data", "anonymous", src_relative_path)



In [None]:
def download_zip():
    """Fetch the labelled sample dataset and unpack it under ``data/``.

    The archive is downloaded to ``data/labelled.zip`` and extracted into
    ``data``. Work that is already done is not repeated: when
    ``data/labelled`` is present nothing happens, and an archive that is
    already on disk is extracted without being downloaded again.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    from tqdm import tqdm
    import requests
    from zipfile import ZipFile
    link = "https://portal.nersc.gov/cfs/als/splash_ml/labelled.zip"
    file_name = "data/labelled.zip"
    extracted_dir = "data/labelled"

    # Test for the folder the message talks about, not just for 'data';
    # otherwise an empty 'data' folder would skip the download and then
    # fail while opening a zip that was never fetched.
    if os.path.isdir(extracted_dir):
        print("Directory data/labelled exists...skipping")
        return

    os.makedirs('data', exist_ok=True)

    if not os.path.exists(file_name):
        print(f"downloading {link} to {file_name}")
        # Download under a temporary name so an interrupted transfer never
        # leaves a truncated labelled.zip behind for a later run to unzip.
        partial_name = file_name + ".part"
        chunk_size = 4096  # bytes per chunk (4 KiB)
        with requests.get(link, stream=True, timeout=60) as response:
            # Fail loudly instead of saving an HTML error page as the zip.
            response.raise_for_status()
            # The content-length header is optional; without it the progress
            # bar simply has no known total.
            content_length = response.headers.get('content-length')
            total_bytes = None if content_length is None else int(content_length)
            with open(partial_name, "wb") as file, tqdm(
                    total=total_bytes, unit='B', unit_scale=True,
                    desc="labelled.zip", leave=True, file=sys.stdout) as progress:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
                    progress.update(len(chunk))
        os.replace(partial_name, file_name)

    with ZipFile(file_name, 'r') as zipObj:
        # Extract all the contents of zip file in different directory
        zipObj.extractall('data')
        print('File is unzipped into  "data/labelled" folder')

# Run the download now; the cells below read their input from data/labelled.
download_zip()

In [None]:
# Ingest cell: copy every labelled image to an anonymized location, register
# it with the tag service and attach the confidences found in labels.csv.
db = MongoClient()

# use glob to find all the files to ingest
paths = glob.glob('data/labelled/**/*.tif*', recursive=True)

# tag_svc instance to be used throughout creating and querying tags
tag_svc = TagService(db, db_name='tagging')

# tagger represents the entity that creates tags on the assets...in this case, it's us!
tagger = tag_svc.create_tag_source(TagSource(type="human", name="build_tag notebook"))

# event is recorded with each tag on each asset so we know when and what created the tags, in this case, it's us and now!
tagging_event = tag_svc.create_tagging_event(TaggingEvent(tagger_id=tagger.uid, run_time=datetime.now()))

# read the csv for tags
labels = pd.read_csv("data/labelled/labels.csv", header=[0])

# One tag is created per label; each name is also the csv column that holds
# the tagger's confidence for that label.
tag_names = ("peaks", "rings", "rods", "arcs")

num_files = 0
# The loop variable is deliberately not called src_root_path, so the
# module-level source folder defined earlier is not overwritten.
for src_file in paths:
    anonymous_file = anonymize_copy(src_file, src_relative_path, dest_root)
    # We'll take advantage of the fact that the anonymous file name is unique (hash of the file)
    anonymous_file_name = os.path.splitext(os.path.basename(anonymous_file))[0]
    # get label row from csv using file name
    file_name = os.path.splitext(os.path.basename(src_file))[0]

    # A Dataset is a reference in the tagging database that stores information about something being tagged as well
    # as its tags. In this case, we have a file, so we're creating a FileDataset. Databroker assets will be available soon!
    # We pass the anonymous file's name, which is a hash of the file itself, as the uid. This is optional, a uid will be created
    # if none is passed.
    asset = FileDataset(uid=anonymous_file_name, uri=anonymous_file)
    # Associate this file with the name listed in the csv; the 'image name'
    # column is compared against the file name converted to an int.
    row = labels.loc[labels['image name'] == int(file_name)]
    tags = [
        Tag(name=tag_name, confidence=row[tag_name].values[0], event_id=tagging_event.uid)
        for tag_name in tag_names
    ]
    asset.tags = tags
    tag_svc.create_dataset(asset)
    num_files += 1
print(f"Anonymized, imported and tagged {num_files} files")

Now that we have loaded the tagging database, we can do some queries on what we have. First, find random tagging events.

In [None]:
# Query the tag service with tags=["peaks"] and report each returned dataset.
assets_with_peaks = tag_svc.find_datasets(tags=["peaks"])
for asset in assets_with_peaks:
    # Keep only the entries of this dataset's tag list that are named "peaks".
    tag = list(filter(lambda candidate: candidate.name == "peaks", asset.tags))
    print(f"uid: {asset.uid} at {asset.uri} has peak tag: {tag}")

We query based on tags. (note that this signature will be enhanced to make confidence parameters a range and optional)