# Exploring the image metadata

## Load the dataset

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import geopandas
from scivision.io import load_dataset

In [None]:
# Open the data catalogue published alongside the challenge repository.
cat = load_dataset('https://github.com/alan-turing-institute/plankton-dsg-challenge')

# Image stack (as a dask-backed dataset) and the table of labels.
ds_all = cat.plankton_multiple().to_dask()
labels = cat.labels().read()

# Keep a single row per filename, index the table by filename in sorted
# order, then convert it to an xarray Dataset so it can be merged below.
labels_dedup = labels.drop_duplicates(subset=["filename"])
labels_dedup = labels_dedup.set_index("filename").sort_index()
labels_dedup = xr.Dataset.from_dataframe(labels_dedup)

# Align on filename for the merge (inner join: only filenames present on
# both sides survive), then switch back to the original dimension.
ds_labelled = ds_all.swap_dims({"concat_dim": "filename"})
ds_labelled = ds_labelled.merge(labels_dedup, join="inner")
ds_labelled = ds_labelled.swap_dims({"filename": "concat_dim"})

## Extracting EXIF data

We work with a small sample of the data below, since loading the full dataset can take some time.

The EXIF data variables have types from the [ExifRead](https://pypi.org/project/ExifRead/) library.

In [None]:
# Add a `datetime` variable parsed from the EXIF timestamp of each image.
ds_labelled = ds_labelled.assign(
    datetime=pd.to_datetime(
        ds_labelled['EXIF Image DateTime']
            .to_pandas()
            # Each entry is an EXIF tag object; its `.values` holds the
            # timestamp string.
            .apply(lambda x: x.values)
            # The format used in the timestamp uses ':' and '-' inconsistently.
            # Replace both with '_' before we extract the value as a datetime.
            # `regex=True` must be explicit: since pandas 2.0 the pattern is
            # treated as a literal string by default, so "[:-]" would match
            # nothing and the parse below would fail.
            .str.replace("[:-]", "_", regex=True),
        format="%Y_%m_%d %H_%M_%S"
    )
)

### GPS coordinates

The GPS coordinates are stored as a list of three rational numbers, `[degrees, minutes, seconds]`, in `EXIF GPS GPSLatitude` and `EXIF GPS GPSLongitude`. Longitudes are given as east or west of the prime meridian depending on `EXIF GPS GPSLongitudeRef`, and latitudes are given as north or south of the equator depending on `EXIF GPS GPSLatitudeRef`. Below we convert them to signed decimal degrees, with north and east positive and south and west negative.

In [None]:
def GPS_to_float(v, ref):
    """Convert an EXIF GPS coordinate to signed decimal degrees.

    The function is wrapped with ``np.vectorize`` below, so it can be called
    on single tags or element-wise on arrays of tags.

    Parameters
    ----------
    v : object
        Tag whose ``.values`` is a sequence ``[degrees, minutes, seconds]``;
        each entry must provide a ``decimal()`` method returning a number.
    ref : object
        Tag whose ``.values`` is the hemisphere reference. ``'N'`` and ``'E'``
        give a positive result; any other value gives a negative one.

    Returns
    -------
    float
        The coordinate in decimal degrees.
    """
    dms = [r.decimal() for r in v.values]
    result_abs = dms[0] + dms[1] / 60.0 + dms[2] / 3600.0
    sign = 1.0 if ref.values in ('E', 'N') else -1.0
    return sign * result_abs


# Declare the output type up front: NumPy then does not need to evaluate the
# function on the first element just to infer the dtype, and empty inputs
# yield an empty float array instead of raising ValueError.
GPS_to_float = np.vectorize(GPS_to_float, otypes=[float])

In [None]:
# Attach decimal-degree coordinates, one value per entry along `concat_dim`.
ds_labelled = ds_labelled.assign(
    {
        "latitude": (
            "concat_dim",
            GPS_to_float(
                ds_labelled['EXIF GPS GPSLatitude'],
                ds_labelled['EXIF GPS GPSLatitudeRef'],
            ),
        ),
        "longitude": (
            "concat_dim",
            GPS_to_float(
                ds_labelled['EXIF GPS GPSLongitude'],
                ds_labelled['EXIF GPS GPSLongitudeRef'],
            ),
        ),
    }
)

In [None]:
# Flatten the derived metadata and the three label columns into a table,
# then preview the first rows.
df = (
    ds_labelled[
        ['datetime', 'latitude', 'longitude', 'label1', 'label2', 'label3']
    ]
    .to_dataframe()
)
df.head()

In [None]:
# Number of rows in the metadata table.
df.shape[0]