# extract_gps_data.ipynb

* 2023-04-09 Timestamp extracted from image filename instead of EXIF. An attempt to fix infrequent errors (2 out of 20000) in timestamp provided by EXIF
* 2022-12-03 Added error handling for when no image files are found
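* 2022-09-04 Added error handling code which deletes images from which GPS coordinates cannot be extracted (the deletion was later disabled as too risky; such images are now only reported in the log and left out of the results)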
* 2021-11-20 Fixed problems in get_gps_coordinates()
* 2021-05-06 Added code to adjust coordinates using rolling averages
* 2021-05-02 First version by Aubrey Moore

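Extracts geotagging data (longitude and latitude) from EXIF tags stored in one or more image files. Since 2023-04-09 the timestamp of each image is parsed from its file name (`IMG_YYYYMMDD_HHMMSS.jpg`) rather than read from EXIF.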

Example usage:

    papermill extract_gps_data.ipynb \
    '../output/extract_gps_data_output.ipynb' \
    -p IMAGE_FILE_PATH '../rawdata/*.jpg' \
    -p CSV_OUTPUT_FILE '../rawdata/gps-data.csv'
    
When the above command line is executed in the directory containing **extract_gps_data.ipynb**,
GPS data will be extracted from all **jpg** files matching **IMAGE_FILE_PATH** and the results will be saved in
**CSV_OUTPUT_FILE**.


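Images from which GPS coordinates cannot be extracted are left out of the results and reported in the log, for example:

    2022-09-03T05:46:57+1000 [ERROR] create_dataframe Could not get gps coordinates from ../rawdata/IMG_20220221_112311.jpg; image ignored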

## References

https://developer.here.com/blog/getting-started-with-geocoding-exif-image-metadata-in-python3

http://www.50northspatial.org/using-open-camera-geotagging-photos/

In [1]:
from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
import glob
import pandas as pd
import os
import plotly.express as px
import numpy as np
import logging

In [2]:
# parameters for papermill
# These defaults can be overridden from the command line with `papermill -p NAME VALUE`.

IMAGE_FILE_PATH = '../rawdata/*.jpg'         # Path to one or more image files. Can include wildcards. See https://pymotw.com/2/glob/ for pattern matching details.
CSV_OUTPUT_FILE = '../rawdata/gps-data.csv'  # Path to a CSV file where the GPS data will be stored. 
# When True, adjust_gps_coordinates() is run to add the rolling-average columns
# longitude_adjusted and latitude_adjusted before the CSV file is written.
ADJUST_COORDINATES = True
# When True, the plotly map cells at the end of the notebook are rendered.
MAKE_MAPS = False

In [3]:
def get_exif(filename):
    """Return the EXIF metadata of an image file.

    The image is opened with Pillow, checked with ``verify()`` and its EXIF
    data is read with ``_getexif()``.

    Parameters
    ----------
    filename : str or path-like
        Path of the image file to read.

    Returns
    -------
    The value returned by ``image._getexif()``. ``get_geotagging()`` treats a
    falsy result as "no EXIF metadata".

    Raises
    ------
    Whatever ``Image.open()`` or ``verify()`` raise for a missing or broken file.
    """
    # The context manager guarantees that the file handle is released even when
    # verify() raises; this matters when tens of thousands of files are scanned.
    with Image.open(filename) as image:
        image.verify()
        return image._getexif()


def get_geotagging(exif):
    """Collect the GPS entries of an EXIF dictionary under readable names.

    Parameters
    ----------
    exif : dict
        EXIF data keyed by numeric tag id, as returned by ``get_exif()``.

    Returns
    -------
    dict
        Maps each GPS tag name found in ``GPSTAGS`` (for example
        ``'GPSLatitude'``) to the value stored in the image's GPSInfo entry.

    Raises
    ------
    ValueError
        If ``exif`` is empty/None, or if it has no GPSInfo entry.
    """
    if not exif:
        raise ValueError("No EXIF metadata found")

    gps_fields = {}
    for tag_id, tag_name in TAGS.items():
        # Only the tag named GPSInfo is of interest; skip everything else.
        if tag_name != 'GPSInfo':
            continue
        if tag_id not in exif:
            raise ValueError("No EXIF geotagging found")
        gps_ifd = exif[tag_id]
        # Translate the numeric GPS keys that are present into their names.
        gps_fields.update({
            gps_name: gps_ifd[gps_id]
            for gps_id, gps_name in GPSTAGS.items()
            if gps_id in gps_ifd
        })
    return gps_fields


def _dms_to_decimal(dms, ref, negative_ref):
    """Convert a (degrees, minutes, seconds) triple to signed decimal degrees rounded to 6 places."""
    degrees, minutes, seconds = dms
    decimal = degrees + minutes/60.0 + seconds/3600.0
    if ref == negative_ref:
        decimal = -decimal
    return round(decimal, 6)


def get_gps_coordinates(image_file_name):
    """Return the position and timestamp of one image.

    Longitude and latitude come from the EXIF GPS tags of the file. The
    timestamp is parsed from the file name, which must look like
    ``IMG_YYYYMMDD_HHMMSS.<ext>``; the EXIF GPSDateStamp/GPSTimeStamp tags are
    deliberately not used because they were occasionally wrong.

    Parameters
    ----------
    image_file_name : str
        Path of the image file.

    Returns
    -------
    tuple
        ``(longitude, latitude, timestamp)``: decimal degrees rounded to six
        places (negative for W and S) and a pandas ``Timestamp``.

    Raises
    ------
    ValueError
        If the file has no EXIF/GPS metadata or the file name does not match
        the expected timestamp format.
    KeyError
        If a required GPS tag is missing.
    """
    exif = get_exif(image_file_name)
    gpsdata = get_geotagging(exif)

    latitude = _dms_to_decimal(gpsdata['GPSLatitude'], gpsdata['GPSLatitudeRef'], 'S')
    longitude = _dms_to_decimal(gpsdata['GPSLongitude'], gpsdata['GPSLongitudeRef'], 'W')

    # Strip the extension with splitext rather than a literal '.jpg' so that
    # files named *.JPG or *.jpeg are parsed as well.
    stem, _extension = os.path.splitext(os.path.basename(image_file_name))
    timestamp = pd.to_datetime(stem.replace('IMG_', ''), format='%Y%m%d_%H%M%S')

    return longitude, latitude, timestamp

def create_dataframe(image_file_path=None):
    """Build a table with the position and timestamp of every matching image.

    Parameters
    ----------
    image_file_path : str, optional
        Glob pattern selecting the image files. Defaults to the notebook
        parameter ``IMAGE_FILE_PATH``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed image with the columns
        ``imagefile`` (base name), ``longitude``, ``latitude`` and
        ``timestamp``, in sorted file-name order.

    Raises
    ------
    Exception
        If the pattern matches no files.

    Notes
    -----
    Images whose coordinates or timestamp cannot be extracted are reported
    with ``logging.error`` and left out of the result; the files themselves
    are never modified or deleted.
    """
    if image_file_path is None:
        image_file_path = IMAGE_FILE_PATH

    # Get a sorted list of image files
    image_files = sorted(glob.glob(image_file_path))
    n = len(image_files)

    if n == 0:
        raise Exception(f"No image files were found in {image_file_path}")

    # Collect the rows in a list and build the frame once at the end:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas 2.0.
    columns = ['imagefile', 'longitude', 'latitude', 'timestamp']
    records = []
    for i, image_file in enumerate(image_files):
        try:
            longitude, latitude, timestamp = get_gps_coordinates(image_file)
        except Exception as exc:
            # Catch Exception (not a bare except) so that interrupting the
            # kernel still stops the loop.
            logging.error(f'Could not get gps coordinates from {image_file}; image ignored ({exc!r})')
        else:
            records.append({'imagefile': os.path.basename(image_file),
                            'longitude': longitude,
                            'latitude': latitude,
                            'timestamp': pd.to_datetime(timestamp)})

        if (i+1) % 100 == 0:
            logging.info(f'{i+1} of {n} images processed')

    return pd.DataFrame(records, columns=columns)


def adjust_gps_coordinates(frame=None, max_gap_seconds=60, window=5):
    '''
    Adds smoothed camera positions to a table of image coordinates.

    This is a workaround for the low-precision GPS EXIF data saved by the
    OpenCamera app, which stores only degrees, minutes and whole seconds.
    Better position estimates are obtained from centered rolling averages of
    longitude and latitude. Averages are calculated separately within each
    segment, where a new segment begins whenever an image was taken more than
    max_gap_seconds after the previous image.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table with the columns timestamp, longitude and latitude, ordered by
        time. Defaults to the notebook-level DataFrame df.
    max_gap_seconds : int, optional
        Time gap in seconds that starts a new segment (default 60).
    window : int, optional
        Number of points in the rolling average (default 5).

    Returns
    -------
    None. The frame is modified in place; the columns time_diff (seconds since
    the previous image), longitude_adjusted and latitude_adjusted are added.
    '''
    if frame is None:
        frame = df  # notebook-level DataFrame created in the MAIN cell

    # Seconds elapsed since the previous image (0 for the first image).
    timestamps = pd.to_datetime(frame['timestamp'])
    frame['time_diff'] = timestamps.diff().dt.total_seconds().fillna(0).astype('int64')

    # Find the segment boundaries as row positions rather than index labels,
    # so the result does not depend on the frame having a default RangeIndex.
    gap_positions = np.flatnonzero(frame['time_diff'].to_numpy() > max_gap_seconds).tolist()
    starts = [0] + gap_positions
    stops = gap_positions + [len(frame)]  # exclusive end of each segment
    segments = [{'first_index': start, 'last_index': stop - 1}
                for start, stop in zip(starts, stops)]
    logging.info(f'segments: {segments}')

    # Calculate rolling averages of the locations within each segment.
    # The segments are contiguous and cover every row, so concatenating the
    # per-segment results yields exactly one value per row, in row order.
    for source, target in (('longitude', 'longitude_adjusted'),
                           ('latitude', 'latitude_adjusted')):
        values = frame[source].astype('float64')
        smoothed = [values.iloc[start:stop].rolling(window, center=True, min_periods=1).mean().to_numpy()
                    for start, stop in zip(starts, stops)]
        frame[target] = np.concatenate(smoothed)

    return

In [4]:
%%time

# MAIN

# Send log records to the notebook output with an ISO-8601 timestamp and the
# name of the function that emitted them.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(funcName)s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S%z",
    handlers=[logging.StreamHandler()])
# The message previously named a different program (georef.py).
logging.info('Starting extract_gps_data.ipynb')

# Read position and timestamp from every image matching IMAGE_FILE_PATH.
df = create_dataframe()

# Optionally add the rolling-average columns (modifies df in place).
if ADJUST_COORDINATES:
    logging.info('Adjusting coordinates')
    adjust_gps_coordinates()

# Save the table without the row index.
df.to_csv(CSV_OUTPUT_FILE, index=False)
logging.info(f'FINISHED: Data saved in {CSV_OUTPUT_FILE}')

2023-04-09T17:28:01+1000 [INFO] <module> Starting georef.py
2023-04-09T17:28:02+1000 [INFO] create_dataframe 100 of 22273 images processed
2023-04-09T17:28:02+1000 [INFO] create_dataframe 200 of 22273 images processed
2023-04-09T17:28:02+1000 [INFO] create_dataframe 300 of 22273 images processed
2023-04-09T17:28:03+1000 [INFO] create_dataframe 400 of 22273 images processed
2023-04-09T17:28:03+1000 [INFO] create_dataframe 500 of 22273 images processed
2023-04-09T17:28:03+1000 [INFO] create_dataframe 600 of 22273 images processed
2023-04-09T17:28:04+1000 [INFO] create_dataframe 700 of 22273 images processed
2023-04-09T17:28:04+1000 [INFO] create_dataframe 800 of 22273 images processed
2023-04-09T17:28:04+1000 [INFO] create_dataframe 900 of 22273 images processed
2023-04-09T17:28:04+1000 [INFO] create_dataframe 1000 of 22273 images processed
2023-04-09T17:28:05+1000 [INFO] create_dataframe 1100 of 22273 images processed
2023-04-09T17:28:05+1000 [INFO] create_dataframe 1200 of 22273 images

CPU times: user 1min 9s, sys: 670 ms, total: 1min 10s
Wall time: 1min 10s


In [5]:
# Display the resulting table; for a large table only the first and last rows are shown.
df

Unnamed: 0,imagefile,longitude,latitude,timestamp,time_diff,longitude_adjusted,latitude_adjusted
0,IMG_20230403_102010.jpg,144.800278,13.431111,2023-04-03 10:20:10,0,144.800278,13.431111
1,IMG_20230403_102011.jpg,144.800278,13.431111,2023-04-03 10:20:11,1,144.800278,13.431111
2,IMG_20230403_102012.jpg,144.800278,13.431111,2023-04-03 10:20:12,1,144.800278,13.431111
3,IMG_20230403_102013.jpg,144.800278,13.431111,2023-04-03 10:20:13,1,144.800278,13.431111
4,IMG_20230403_102015.jpg,144.800278,13.431111,2023-04-03 10:20:15,2,144.800278,13.431111
...,...,...,...,...,...,...,...
22268,IMG_20230407_170443.jpg,144.851667,13.486944,2023-04-07 17:04:43,1,144.851667,13.486944
22269,IMG_20230407_170444.jpg,144.851667,13.486944,2023-04-07 17:04:44,1,144.851667,13.486944
22270,IMG_20230407_170446.jpg,144.851667,13.486944,2023-04-07 17:04:46,2,144.851667,13.486944
22271,IMG_20230407_170447.jpg,144.851667,13.486944,2023-04-07 17:04:47,1,144.851667,13.486944


In [6]:
# Optional map of the coordinates exactly as read from the EXIF tags.
if MAKE_MAPS:
    fig = px.scatter_mapbox(
        df,
        lat="latitude",
        lon="longitude",
        title='Original coordinates',
        zoom=9,
    ).update_layout(
        mapbox_style="open-street-map",
        margin=dict(r=0, t=30, l=0, b=0),
    )
    fig.show()

In [7]:
# Optional map of the rolling-average coordinates; these columns exist only
# when ADJUST_COORDINATES is enabled.
if MAKE_MAPS and ADJUST_COORDINATES:
    fig = px.scatter_mapbox(
        df,
        lat="latitude_adjusted",
        lon="longitude_adjusted",
        title='Adjusted coordinates',
        zoom=9,
    ).update_layout(
        mapbox_style="open-street-map",
        margin=dict(r=0, t=30, l=0, b=0),
    )
    fig.show()

In [8]:
# Earliest image timestamp in the table.
df['timestamp'].min()

Timestamp('2023-04-03 10:20:10')

In [9]:
# Latest image timestamp in the table.
df['timestamp'].max()

Timestamp('2023-04-07 17:04:48')