# Aculei notebook

This notebook collects all the steps used to create a dataset from hunter-camera images.

The dataset feeds the database behind the [aculei](http://aculei.xyz) archive.

In [19]:
import sys
sys.path.append('../')

import pandas as pd

from PIL import Image

from tqdm import tqdm

from transformers import pipeline

import utils.hasher as hasher
import utils.moonphase as moonphase
import utils.ocr as ocr

import exiftool

import os
# Set TOKENIZERS_PARALLELISM to "false" for this kernel before the pipeline is built.
# NOTE(review): presumably this silences the Hugging Face tokenizers parallelism
# warning — confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [20]:
# Zero-shot image classifier loaded from the CLIP checkpoint below.
checkpoint = "openai/clip-vit-large-patch14"
detector = pipeline(task="zero-shot-image-classification", model=checkpoint)

# Labels the classifier chooses between: animals first, then the
# human-related labels.
candidate_labels = [
    "porcupine", "wild boar", "fox", "hare", "deer",
    "badger", "wolf", "horse", "dog", "cat",
    "buzzard", "heron", "mallard", "squirrel", "crow",
    "human", "girl", "man", "woman", "old woman",
    "boy", "old man", "person", "people",
]

Device set to use mps:0


In [25]:
# Locations of the CSV files and of the camera images.
data_folder = '../data/'
image_folder = '../images/'
experience_folder = os.path.join(image_folder, 'experience')

# Every sub-directory of image_folder is treated as one camera folder, except
# the experience directory: it also lives under image_folder but is processed
# on its own, so listing it here would make the camera loop ingest its images
# a second time (with the folder name as the camera).
folders = [
    f for f in os.listdir(image_folder)
    if os.path.isdir(os.path.join(image_folder, f))
    and f != os.path.basename(experience_folder)
]

In [26]:
def ishuman(label):
    """Tell whether `label` is one of the human-related labels.

    Returns True for an exact (case-sensitive) match against the labels
    listed below, False for anything else.
    """
    human_labels = (
        "human", "girl", "man", "woman", "old woman",
        "boy", "old man", "person", "people",
    )
    return label in human_labels

In [27]:
# Build one row per camera image (predicted label, image id, date, moon phase,
# temperature, camera folder, top-3 predictions) and append the new rows to
# the archive CSV.
# NOTE(review): the rows are appended to archive.csv unconditionally, so
# re-running this cell on the same images adds them again.
archive_columns = ['id', 'image_name', 'predicted_animal', 'moon_phase',
                   'temperature', 'date', 'cam', 'top_predictions']
rows = []

for folder in folders:
    files = os.listdir(os.path.join(image_folder, folder))
    images = [f for f in files if f.endswith('.jpg')]
    image_paths = [os.path.join(image_folder, folder, path) for path in images]

    if not image_paths:
        # Nothing to process; also avoids handing exiftool an empty file list,
        # which pyexiftool rejects.
        continue

    # Map each file path to its EXIF capture date. Files without the tag are
    # left out on purpose; their date is read from the image by OCR instead.
    metadata_dict = {}
    with exiftool.ExifToolHelper() as et:
        for d in et.get_metadata(image_paths):
            try:
                metadata_dict[d["SourceFile"]] = d["EXIF:DateTimeOriginal"]
            except KeyError:
                pass

    for path in tqdm(image_paths, desc=f"Processing images from {folder}"):
        # The context manager closes the file once the image has been processed.
        with Image.open(path) as image:
            predictions = detector(image, candidate_labels=candidate_labels)
            image_id = hasher.generate_md5_image_id(image=image)

            if path in metadata_dict:
                # Turn the first two ':' of the EXIF value into '-' (the date part).
                date = metadata_dict[path].replace(':', '-', 2)
            else:
                date = ocr.extract_date(image)

            temperature = ocr.extract_temperature(image)

        # Collapse every human-related label into the single label "human".
        animal_label = predictions[0]["label"]
        if ishuman(animal_label):
            animal_label = "human"

        top_predictions = predictions[:3]
        for pred in top_predictions:
            if ishuman(pred['label']):
                pred['label'] = 'human'

        # No date means no moon phase can be computed.
        moon_phase = moonphase.phase(date) if date else None

        rows.append({
            'id': image_id,
            'image_name': os.path.basename(path),
            'predicted_animal': animal_label,
            'moon_phase': moon_phase,
            'temperature': temperature,
            'date': date,
            'cam': folder,
            'top_predictions': top_predictions,
        })

# Build the frame once instead of concatenating row by row. dtype=object keeps
# every value exactly as produced (no numeric upcast when a column also holds None).
df = pd.DataFrame(rows, columns=archive_columns, dtype=object)

# Save the new rows first so they are not lost if reading the archive fails.
df.to_csv(os.path.join(data_folder, 'archive-update.csv'), index=False)

aculei_df = pd.read_csv(os.path.join(data_folder, 'archive.csv'))
archive_df = pd.concat([aculei_df, df], ignore_index=True)
archive_df.to_csv(os.path.join(data_folder, 'archive.csv'), index=False)

Processing images from CAM 7: 100%|██████████| 24/24 [00:10<00:00,  2.38it/s]
Processing images from CAM 6: 100%|██████████| 8/8 [00:06<00:00,  1.16it/s]
Processing images from CAM 4: 100%|██████████| 2/2 [00:01<00:00,  1.66it/s]


In [28]:
# Build one row per image in the experience folder (predicted label, image id,
# date, moon phase, temperature, camera read from the image, top-3 predictions)
# and append the new rows to the experience CSV.
# NOTE(review): the rows are appended to experience.csv unconditionally, so
# re-running this cell on the same images adds them again.
experience_columns = ['id', 'image_name', 'predicted_animal', 'moon_phase',
                      'temperature', 'date', 'cam', 'top_predictions']
experience_rows = []

files = os.listdir(experience_folder)
images = [f for f in files if f.endswith('.jpg')]
image_paths = [os.path.join(experience_folder, path) for path in images]

# Map each file path to its EXIF capture date. Files without the tag are left
# out on purpose; their date is read from the image by OCR instead.
metadata_dict = {}
if image_paths:
    # Guarded so exiftool is never handed an empty file list, which pyexiftool rejects.
    with exiftool.ExifToolHelper() as et:
        for d in et.get_metadata(image_paths):
            try:
                metadata_dict[d["SourceFile"]] = d["EXIF:DateTimeOriginal"]
            except KeyError:
                pass

for path in tqdm(image_paths, desc=f"Processing images from {experience_folder}"):
    # The context manager closes the file once the image has been processed.
    with Image.open(path) as image:
        predictions = detector(image, candidate_labels=candidate_labels)
        image_id = hasher.generate_md5_image_id(image=image)

        if path in metadata_dict:
            # Turn the first two ':' of the EXIF value into '-' (the date part).
            date = metadata_dict[path].replace(':', '-', 2)
        else:
            date = ocr.extract_date(image)

        temperature = ocr.extract_temperature(image)
        camera = ocr.extract_camera(image)

    # Collapse every human-related label into the single label "human".
    animal_label = predictions[0]["label"]
    if ishuman(animal_label):
        animal_label = "human"

    top_predictions = predictions[:3]
    for pred in top_predictions:
        if ishuman(pred['label']):
            pred['label'] = 'human'

    # No date means no moon phase can be computed.
    moon_phase = moonphase.phase(date) if date else None

    experience_rows.append({
        'id': image_id,
        'image_name': os.path.basename(path),
        'predicted_animal': animal_label,
        'moon_phase': moon_phase,
        'temperature': temperature,
        'date': date,
        'cam': camera,
        'top_predictions': top_predictions,
    })

# Build the frame once instead of concatenating row by row. dtype=object keeps
# every value exactly as produced (no numeric upcast when a column also holds None).
experience_df = pd.DataFrame(experience_rows, columns=experience_columns, dtype=object)

experience_df.to_csv(os.path.join(data_folder, 'experience-update.csv'), index=False)

exp_df = pd.read_csv(os.path.join(data_folder, 'experience.csv'))
exp_df = pd.concat([exp_df, experience_df], ignore_index=True)
exp_df.to_csv(os.path.join(data_folder, 'experience.csv'), index=False)

Processing images from ../images/experience: 100%|██████████| 8/8 [00:07<00:00,  1.13it/s]
