<a href="https://colab.research.google.com/github/brian-ho/mde-preterm-2024/blob/main/day_1/notebooks/Images_to_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Images to CSV: extracting GPS and color information from photos

```
2024 MDE Preterm
Brian Ho - brian@brian-ho.io
Jenny Fan - me@jennyfan.com
```

A short notebook you can use to get [EXIF metadata](https://en.wikipedia.org/wiki/Exif) and color frequency from photos.

- When run in Colab, inputs can be images saved in your Google Drive account
- When run locally, inputs are images in your local environment
- Saves a CSV file with the information

For each image, the following values are written as columns in the CSV file:

- GPS location information: `latitude`, `longitude`, and `altitude`
- The `date` the image was taken
- The top 10 most common `color`s in each image, as hex values
- The `color_count` is a corresponding array with the number of pixels for each color
- The `red`, `green`, `blue` values for each color as corresponding arrays
- The `image_width` in pixels
- The `image_height` in pixels


In [36]:
# Let's import our dependencies.
import os
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS, IFD
import pandas as pd
from pathlib import Path
from datetime import datetime

In [35]:
# Collect (in lower case) every file extension registered for a format that
# Pillow can open, so image files can be told apart from everything else.
# ".heic" is added on top of whatever Pillow reports.
# NOTE(review): opening .heic files presumably needs an extra Pillow plugin
# to be installed and registered — confirm.
IMAGE_FILE_EXTENSIONS = {
    extension.lower()
    for extension, image_format in Image.registered_extensions().items()
    if image_format in Image.OPEN
} | {".HEIC".lower()}

In [None]:
# When running on Colab, we can connect to a Google Drive folder.
# Mounting makes the Drive contents reachable under "/content/drive".
# NOTE(review): the google.colab import is presumably only available inside
# Colab, so skip this cell when running locally — confirm.
from google.colab import drive

drive.mount("/content/drive")

In [29]:
# Update the text in quotes with the path to the folder holding your images.
# With Google Drive mounted on Colab, this will probably be something like:
# "/content/drive/MyDrive/path/to/folder"
# When running locally, point it at a folder on your machine instead.
# Every later cell reads images from this folder, so don't forget to run
# this cell!
directory = "example_data"

In [30]:
# Names of the EXIF tags this notebook reads. These are the human-readable
# names that numeric tag ids are translated to via PIL's GPSTAGS / TAGS
# lookups, and they are used as dictionary keys when parsing the metadata.
# Each "...Ref" tag is passed along with its value tag so the value can be
# given the right sign.
GPS_LAT_REF = "GPSLatitudeRef"
GPS_LAT = "GPSLatitude"
GPS_LNG_REF = "GPSLongitudeRef"
GPS_LNG = "GPSLongitude"
GPS_ALT_REF = "GPSAltitudeRef"
GPS_ALT = "GPSAltitude"
DATE_TIME = "DateTime"


def get_exif_info(img):
    """Collects the EXIF tags of an image into one dictionary keyed by tag name.

    GPS tags and top-level EXIF tags are merged into a single dictionary
    (a top-level tag wins if both share a name). The tags this notebook
    relies on are always present in the result, set to None when the image
    does not carry them.

    Returns None if a KeyError is raised while reading the metadata.
    """
    wanted_tags = (
        GPS_LAT_REF,
        GPS_LAT,
        GPS_LNG_REF,
        GPS_LNG,
        GPS_ALT_REF,
        GPS_ALT,
        DATE_TIME,
    )

    # Get the EXIF metadata
    metadata = img.getexif()

    try:
        # Start with the GPS tags, translating numeric ids to names where known
        named_tags = {
            GPSTAGS.get(tag_id, tag_id): tag_value
            for tag_id, tag_value in metadata.get_ifd(IFD.GPSInfo).items()
        }
        # Then layer the top-level tags on top
        named_tags.update(
            (TAGS.get(tag_id, tag_id), tag_value)
            for tag_id, tag_value in metadata.items()
        )

        # Guarantee the keys that later parsing looks up
        for tag_name in wanted_tags:
            if tag_name not in named_tags:
                named_tags[tag_name] = None
    except KeyError:
        return None

    return named_tags


def parse_degrees_to_decimals(degrees, reference):
    """Transforms an EXIF GPS value into a signed decimal number.

    Two kinds of input are handled, told apart by the type of `reference`:

    - `reference` is bytes (used here for altitude): `degrees` is a single
      number. A reference byte value of 1 makes the result negative.
    - otherwise (used here for latitude / longitude): `degrees` is a
      (degrees, minutes, seconds) sequence and `reference` is a hemisphere
      letter. "S" and "W" make the result negative.

    Returns None when the value or its reference is missing.
    """
    if not reference:
        return None

    if isinstance(reference, bytes):
        # A plain 0 is a real measurement (e.g. an altitude at sea level), so
        # only None / empty containers are treated as missing here rather
        # than anything falsy.
        if not degrees and degrees != 0:
            return None

        ref_from_bytes = int.from_bytes(reference, byteorder="big")
        sign = -1.0 if ref_from_bytes == 1 else 1.0
        return degrees * sign

    if not degrees:
        return None

    sign = -1.0 if reference in ("S", "W") else 1.0
    # degrees + minutes / 60 + seconds / 3600
    decimals = degrees[0] + degrees[1] / 60 + (degrees[2] / (60.0 * 60.0))
    return decimals * sign


def parse_date(exif_date):
    """Turns an EXIF date string into a datetime object.

    The string is expected to look like "YYYY:MM:DD HH:MM:SS".
    Returns None when the value is missing or empty.
    """
    exif_date_format = "%Y:%m:%d %H:%M:%S"
    return datetime.strptime(exif_date, exif_date_format) if exif_date else None


def parse_exif_info(exif_info):
    """Builds the location and date fields for one image from its EXIF tags.

    Returns a dictionary with "latitude", "longitude", "altitude" and "date".
    Every field is None when no EXIF information is available.
    """
    if not exif_info:
        return dict.fromkeys(("latitude", "longitude", "altitude", "date"))

    def signed_decimal(value_tag, reference_tag):
        # Each GPS value is stored next to a reference tag that decides its sign
        return parse_degrees_to_decimals(
            exif_info[value_tag], exif_info[reference_tag]
        )

    latitude = signed_decimal(GPS_LAT, GPS_LAT_REF)
    longitude = signed_decimal(GPS_LNG, GPS_LNG_REF)
    altitude = signed_decimal(GPS_ALT, GPS_ALT_REF)
    date = parse_date(exif_info[DATE_TIME])

    return {
        "latitude": latitude,
        "longitude": longitude,
        "altitude": altitude,
        "date": date,
    }


def palette_color_to_rgb(palette, color):
    """Looks up a palette index and returns its red, green, and blue values.

    The palette is a flat list in which every color takes up three
    consecutive entries, so color `n` starts at position `n * 3`.
    """
    start = color * 3
    end = start + 3
    return palette[start:end]


def rgb_to_hex(r, g, b):
    """Converts red, green, and blue values to a hex code like "#ff0010".

    Each value is written as two lower-case hexadecimal digits.
    """
    return f"#{r:02x}{g:02x}{b:02x}"


def get_color_data(img, num_colors=10):
    """Gets the most common colors in an image.

    The image is converted to palette ("P") mode and reduced to a thumbnail
    of at most 100x100 before counting, so the counts are pixels of the
    thumbnail rather than of the full-size image.

    Args:
        img: The PIL image to analyze. It is not modified; the work happens
            on a converted copy.
        num_colors: How many of the most common colors to report.

    Returns:
        A dictionary with:
        - "color_count", "red", "green", "blue": parallel lists with one
          entry per color, ordered from most to least common.
        - "color_0", "color_1", ...: the hex code of each color, where
          "color_<i>" describes the same color as position i in the lists.
        Fewer than `num_colors` colors are reported if the image does not
        contain that many.
    """
    converted_img = img.convert("P")
    palette = converted_img.getpalette()
    converted_img.thumbnail((100, 100))

    # getcolors() gives (pixel count, palette index) pairs; put the most
    # frequent colors first
    color_counts = sorted(
        converted_img.getcolors(), key=lambda pair: pair[0], reverse=True
    )

    color_data = {
        "color_count": [],
        "red": [],
        "green": [],
        "blue": [],
    }

    # A single pass keeps every "color_<i>" key aligned with index i of the
    # lists (one entry per reported color, no key written twice)
    for i, (count, palette_index) in enumerate(color_counts[:num_colors]):
        r, g, b = palette_color_to_rgb(palette, palette_index)
        color_data["color_count"].append(count)
        color_data["red"].append(r)
        color_data["green"].append(g)
        color_data["blue"].append(b)
        color_data[f"color_{i}"] = rgb_to_hex(r, g, b)

    return color_data


def get_image_data(img_file):
    """Collects everything this notebook records about one image file.

    Args:
        img_file: Path object pointing at the image; its `.name` is stored
            as the image name.

    Returns:
        A dictionary with "image_name", "image_width" and "image_height",
        followed by the fields from parse_exif_info() and the fields from
        get_color_data().
    """
    # Opening the image in a `with` block makes sure the underlying file is
    # closed again once its data has been read
    with Image.open(img_file) as img:
        exif_info = get_exif_info(img)
        color_data = get_color_data(img)
        return {
            "image_name": img_file.name,
            "image_width": img.width,
            "image_height": img.height,
            **parse_exif_info(exif_info),
            **color_data,
        }

In [31]:
# Gather the data for every supported image in the folder, in file-name order
images_data = []
image_suffixes = tuple(IMAGE_FILE_EXTENSIONS)

for filename in sorted(os.listdir(directory)):
    # Anything that doesn't end in a known image extension is skipped
    if not filename.lower().endswith(image_suffixes):
        continue
    images_data.append(get_image_data(Path(directory) / filename))

In [34]:
# We are constructing a DataFrame from a list of dictionaries (one row per
# image), ordered by image name.
# This is just one of many different ways to create a DataFrame
df = (
    pd.DataFrame(images_data)
    .sort_values("image_name")
    # This removes rows with empty values! Use with caution.
    .dropna(axis=0)
)

# Save to a file. Feel free to update the path as needed
df.to_csv("../p5/end_to_end_demo/data/data.csv", index=False)