# Aculei notebook

This notebook collects all the steps used to build a dataset from hunter-camera images.

The dataset feeds the database that serves the [aculei](http://aculei.xyz) archive.

In [1]:
import pandas as pd

from PIL import Image

from tqdm import tqdm

from transformers import pipeline

import utils.hasher as hasher
import utils.moonphase as moonphase
import utils.ocr as ocr

import exiftool

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Zero-shot image classification pipeline loaded from the checkpoint below.
checkpoint = "openai/clip-vit-large-patch14"
detector = pipeline(task="zero-shot-image-classification", model=checkpoint)

# Labels the classifier is asked to choose between for every image.
candidate_labels = [
    "porcupine",
    "wild boar",
    "fox",
    "hare",
    "deer",
    "badger",
    "wolf",
    "horse",
    "dog",
    "cat",
    "buzzard",
    "heron",
    "mallard",
    "squirrel",
]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use mps:0


In [3]:
# Output directory for the generated CSV files and root directory of the images.
data_folder = 'data/'
image_folder = 'images/'

# Images in this sub-folder are processed by their own cell (the camera is read
# from the picture itself), so it must not be treated as a camera folder.
experience_folder = os.path.join(image_folder, 'experience')

# One sub-folder per camera. Sorted because os.listdir() returns entries in
# arbitrary order, which would make the row order of the CSV vary between runs.
folders = sorted(
    name
    for name in os.listdir(image_folder)
    if name != 'experience' and os.path.isdir(os.path.join(image_folder, name))
)

In [None]:
# Build the archive dataset: one row per .jpg image found in each camera folder,
# written to <data_folder>/archive.csv.
archive_columns = ['id', 'image_name', 'predicted_animal', 'moon_phase', 'temperature', 'date', 'cam']

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop copies it on every row.
records = []

for folder in folders:
    folder_path = os.path.join(image_folder, folder)
    files = os.listdir(folder_path)
    images = [f for f in files if f.endswith('.jpg')]
    if not images:
        # Nothing to process; also avoids handing exiftool an empty file list.
        continue
    image_paths = [os.path.join(folder_path, name) for name in images]

    # Map each file path to its EXIF capture timestamp. Files without the
    # DateTimeOriginal tag are left out and fall back to OCR below.
    with exiftool.ExifToolHelper() as et:
        metadata = et.get_metadata(image_paths)
    metadata_dict = {
        d["SourceFile"]: d["EXIF:DateTimeOriginal"]
        for d in metadata
        if {"SourceFile", "EXIF:DateTimeOriginal"} <= d.keys()
    }

    for path in tqdm(image_paths, desc=f"Processing images from {folder}"):
        # The context manager closes the underlying file once the row is built.
        with Image.open(path) as image:
            # The first prediction is used as the label
            # (results are expected to be ordered by score -- TODO confirm).
            predictions = detector(image, candidate_labels=candidate_labels)
            animal_label = predictions[0]["label"]

            image_id = hasher.generate_md5_image_id(image=image)

            exif_date = metadata_dict.get(path)
            if exif_date is not None:
                # Replace the first two colons, i.e. the date separators of an
                # EXIF-style "YYYY:MM:DD HH:MM:SS" timestamp, with dashes.
                date = exif_date.replace(':', '-', 2)
                moon_phase = moonphase.phase(date)
            else:
                # No EXIF timestamp: try to read the date from the image itself.
                date = ocr.extract_date(image)
                moon_phase = moonphase.phase(date) if date else None

            temperature = ocr.extract_temperature(image)

        records.append({
            'id': image_id,
            'image_name': os.path.basename(path),
            'predicted_animal': animal_label,
            'moon_phase': moon_phase,
            'temperature': temperature,
            'date': date,
            'cam': folder,
        })

# dtype=object stores the values exactly as the helpers returned them instead of
# letting pandas infer a dtype per column (a missing value would otherwise turn
# an integer column into floats).
df = pd.DataFrame(records, columns=archive_columns, dtype=object)

# Make sure the output directory exists so a long run cannot fail at the very end.
os.makedirs(data_folder, exist_ok=True)
df.to_csv(os.path.join(data_folder, 'archive.csv'), index=False)

Processing images from CAM2: 100%|██████████| 49/49 [00:22<00:00,  2.14it/s]
Processing images from CAM5: 100%|██████████| 75/75 [00:27<00:00,  2.73it/s]
Processing images from CAM4: 100%|██████████| 110/110 [00:39<00:00,  2.80it/s]
Processing images from CAM3: 100%|██████████| 180/180 [01:07<00:00,  2.67it/s]
Processing images from CAM6: 100%|██████████| 146/146 [01:13<00:00,  1.98it/s]
Processing images from CAM1: 100%|██████████| 49/49 [00:20<00:00,  2.44it/s]
Processing images from CAM7: 100%|██████████| 57/57 [00:27<00:00,  2.04it/s]


In [5]:
# Build the experience dataset: one row per .jpg image in the experience folder,
# written to <data_folder>/experience.csv. Unlike the archive, the camera is
# read from the image itself with OCR.
experience_columns = ['id', 'image_name', 'predicted_animal', 'moon_phase', 'temperature', 'date', 'cam']

files = os.listdir(experience_folder)
images = [f for f in files if f.endswith('.jpg')]
image_paths = [os.path.join(experience_folder, name) for name in images]

# Map each file path to its EXIF capture timestamp. Files without the
# DateTimeOriginal tag are left out and fall back to OCR below.
metadata_dict = {}
if image_paths:
    # Only query exiftool when there is at least one file to inspect.
    with exiftool.ExifToolHelper() as et:
        metadata = et.get_metadata(image_paths)
    metadata_dict = {
        d["SourceFile"]: d["EXIF:DateTimeOriginal"]
        for d in metadata
        if {"SourceFile", "EXIF:DateTimeOriginal"} <= d.keys()
    }

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop copies it on every row.
records = []

for path in tqdm(image_paths, desc=f"Processing images from {experience_folder}"):
    # The context manager closes the underlying file once the row is built.
    with Image.open(path) as image:
        # The first prediction is used as the label
        # (results are expected to be ordered by score -- TODO confirm).
        predictions = detector(image, candidate_labels=candidate_labels)
        animal_label = predictions[0]["label"]

        image_id = hasher.generate_md5_image_id(image=image)

        exif_date = metadata_dict.get(path)
        if exif_date is not None:
            # Replace the first two colons, i.e. the date separators of an
            # EXIF-style "YYYY:MM:DD HH:MM:SS" timestamp, with dashes.
            date = exif_date.replace(':', '-', 2)
            moon_phase = moonphase.phase(date)
        else:
            # No EXIF timestamp: try to read the date from the image itself.
            date = ocr.extract_date(image)
            moon_phase = moonphase.phase(date) if date else None

        temperature = ocr.extract_temperature(image)
        camera = ocr.extract_camera(image)

    records.append({
        'id': image_id,
        'image_name': os.path.basename(path),
        'predicted_animal': animal_label,
        'moon_phase': moon_phase,
        'temperature': temperature,
        'date': date,
        'cam': camera,
    })

# dtype=object stores the values exactly as the helpers returned them instead of
# letting pandas infer a dtype per column (a missing value would otherwise turn
# an integer column into floats).
df = pd.DataFrame(records, columns=experience_columns, dtype=object)

# Make sure the output directory exists so a long run cannot fail at the very end.
os.makedirs(data_folder, exist_ok=True)
df.to_csv(os.path.join(data_folder, 'experience.csv'), index=False)

Processing images from images/experience: 100%|██████████| 230/230 [01:54<00:00,  2.00it/s]
