# Processing Hazelnut Yield

The raw data provided contains data points with timestamps, GPS coordinates, and collection cart weight. This information was collected as a cart moved along rows of trees and harvested nuts.

The goal is to create a heatmap of productivity. We measure change in weight with respect to distance traveled, and pool data from multiple harvests into a grid to display the total productivity spatially.

In [1]:
import os
import json
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point,Polygon, LineString
import spectra
import re

Our method relies on configuring the boundaries of the fields. In the [configuration file](files/config.json), the coordinates of the corners of multiple fields are specified.

In [2]:
def create_field_grids(cfg, cell_width=2e-4, cell_height=2e-4):
    """Build a grid of rectangular cells clipped to each configured field.

    Coordinates follow the convention used by the field polygon below:
    x is the configured latitude and y is the configured longitude.

    Args:
        cfg: Parsed configuration. ``cfg['field_boundaries']`` is iterated;
            each entry must provide ``name`` plus ``NW_corner``,
            ``NE_corner``, ``SE_corner`` and ``SW_corner`` mappings with
            ``latitude`` and ``longitude`` values.
        cell_width: Cell size along x, in the same units as the corner
            coordinates.
        cell_height: Cell size along y, in the same units as the corner
            coordinates.

    Returns:
        dict mapping each field name to a GeoDataFrame holding the grid
        cells intersected with that field's boundary polygon.
    """
    corner_keys = ('NW_corner', 'NE_corner', 'SE_corner', 'SW_corner')
    grids = {}
    for field in cfg['field_boundaries']:
        field_polygon = Polygon([
            (field[key]['latitude'], field[key]['longitude'])
            for key in corner_keys
        ])
        xmin, ymin, xmax, ymax = field_polygon.bounds

        cols = np.arange(xmin, xmax, cell_width)
        # Rows are visited from the largest y origin down to the smallest.
        rows = np.arange(ymin, ymax, cell_height)[::-1]

        # Every cell extends from its origin (x, y) towards larger x and
        # larger y, so the cells span the bounding box from ymin up to at
        # least ymax. Extending towards smaller y instead would shift the
        # grid one row below the field and leave the band next to ymax
        # without any cell.
        polygons = [
            Polygon([
                (x, y),
                (x + cell_width, y),
                (x + cell_width, y + cell_height),
                (x, y + cell_height),
            ])
            for x in cols
            for y in rows
        ]

        grid = gpd.GeoDataFrame(geometry=polygons)
        field_df = gpd.GeoDataFrame(geometry=gpd.GeoSeries(field_polygon))
        # Clip the rectangular grid to the actual field outline.
        grid = gpd.overlay(grid, field_df, how='intersection')
        grids[field['name']] = grid

    return grids

When we load the raw CSV lists, we ignore labels and null points:

In [3]:
def load_file(filepath):
    """Load harvest records from a CSV file, skipping unusable rows.

    The file is read with ISO-8859-1 encoding. A row is kept only when its
    ``lat`` and ``long`` values both convert to non-NaN floats and it has a
    ``weight`` value; this drops repeated label rows and rows with missing
    coordinates.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        list of dicts with keys ``'lat'``, ``'long'`` and ``'weight'``
        holding the values as read from the file (not converted), in file
        order.
    """
    df = pd.read_csv(filepath, encoding="ISO-8859-1")

    data = []
    for _, row in df.iterrows():
        try:
            lat = float(row['lat'])
            lon = float(row['long'])
            weight = row['weight']
        except (KeyError, TypeError, ValueError):
            # Not a usable record, e.g. a label row or a missing column.
            continue
        # Missing coordinates are read as NaN; skip those rows.
        if np.isnan(lat) or np.isnan(lon):
            continue
        data.append({'lat': row['lat'], 'long': row['long'], 'weight': weight})
    return data

For each pair of consecutive data points, the intensity is calculated as the difference in weight divided by the distance traveled. The geometry is a line from the first position to the second. Everywhere on this line is considered to have the calculated intensity.

Since all CSVs have been merged into one, pairs with large distances are ignored as they represent moving to a separate harvest.

In [4]:
def _parse_weight(raw_weight):
    """Convert one recorded weight to a float.

    Strings are first stripped of every character that is not an ASCII
    letter, a digit or a decimal point. Letters are kept by the pattern, so
    a reading with a textual suffix still fails to parse. Values that are
    not strings are passed to ``float`` directly.

    Raises:
        ValueError: if the value cannot be parsed or is NaN.
        TypeError: if the value is of a type ``float`` does not accept.
    """
    if isinstance(raw_weight, str):
        raw_weight = re.sub('[^A-Za-z0-9.]+', '', raw_weight)
    weight = float(raw_weight)
    if np.isnan(weight):
        raise ValueError("missing weight")
    return weight


def quantize_yield_intensity(data, max_gap=1e-4):
    """Compute a yield intensity for each pair of consecutive data points.

    The intensity of a pair is the change in weight divided by the distance
    between the two positions; its geometry is the straight line joining
    them (x is ``lat``, y is ``long``).

    Args:
        data: Sequence of dicts with ``'lat'``, ``'long'`` and ``'weight'``
            entries, in recording order.
        max_gap: Pairs whose positions are this far apart or farther are
            skipped, as are pairs that did not move at all. Distances are in
            coordinate units.

    Returns:
        Tuple ``(intensities, geometries)`` of two parallel lists: a float
        per accepted pair and the matching ``LineString``. Pairs whose
        weights cannot be parsed are skipped.
    """
    intensities = []
    geometries = []
    for initial_point, final_point in zip(data, data[1:]):
        initial_location = Point(float(initial_point['lat']), float(initial_point['long']))
        final_location = Point(float(final_point['lat']), float(final_point['long']))
        distance = initial_location.distance(final_location)
        # A zero distance would divide by zero below; a large one is a jump
        # between separate harvests rather than travel along a row.
        if not 0.0 < distance < max_gap:
            continue
        try:
            initial_weight = _parse_weight(initial_point['weight'])
            final_weight = _parse_weight(final_point['weight'])
        except (KeyError, TypeError, ValueError):
            # Unusable weight reading: drop this pair, keep processing.
            continue
        geometries.append(LineString([
            (initial_location.x, initial_location.y),
            (final_location.x, final_location.y),
        ]))
        intensities.append((final_weight - initial_weight) / distance)
    return intensities, geometries

Using predefined functions to generate field grids and load data:

In [5]:
# Read the configuration, then build the field grids and load the harvest
# data it points to. The context manager closes the config file even when
# parsing fails, and the file is not held open during the later steps.
with open("config.json", 'r') as config_file:
    cfg = json.load(config_file)
field_grids = create_field_grids(cfg)
data = load_file(cfg['csv_filepath'])

Using predefined function to process data for intensities and geometries:

In [6]:
# Turn consecutive readings into yield intensities and their line segments;
# the two returned lists are parallel (one entry per accepted pair).
intensities, geometries = quantize_yield_intensity(data)

For each field grid, the yield in each cell is summed. For every line segment that passes through the cell, the length of the portion within the cell is found and then scaled by the associated intensity. This produces the change in weight attributable to that cell. These changes in weight are summed to produce the total yield for the cell.

In [7]:
# One row per line segment: its yield intensity and its geometry.
yield_df = pd.DataFrame(intensities, columns=['intensity'])
yield_df = gpd.GeoDataFrame(yield_df, geometry=geometries)

# filtering negative and unreasonably large intensities; the filter does not
# depend on the cell, so it is applied once up front
usable_segments = yield_df[(yield_df['intensity'] > 0) & (yield_df['intensity'] < 4000000)]

# recording maximum yield for calibrating color range
max_yield = 0

# cell_yields[field] holds one total per grid cell, in grid order.
cell_yields = {}
for field in field_grids:
    cell_yields[field] = []
    for cell in field_grids[field]['geometry']:
        segments_in_cell = usable_segments[usable_segments.geometry.intersects(cell)]

        # Length of each segment's portion inside the cell. Measuring the
        # clipped geometry also covers segments lying wholly inside the cell,
        # which never touch the cell boundary.
        lengths_in_cell = segments_in_cell.geometry.intersection(cell).length

        # length * intensity is the change in weight attributable to the cell
        yield_in_cell_magnitude = float((lengths_in_cell * segments_in_cell['intensity']).sum())

        if yield_in_cell_magnitude > max_yield:
            max_yield = yield_in_cell_magnitude
        cell_yields[field].append(yield_in_cell_magnitude)

Data frames for the grids, with a color corresponding to each cell's yield, are constructed:

In [8]:
# Map yields onto a pink-to-maroon ramp whose upper end is the largest cell
# yield; cells with no yield at all are drawn gray instead.
field_dfs = {}
color_scale = spectra.scale([ "LightPink", "Maroon" ]).domain([0 , max_yield])

for field, grid in field_grids.items():
    # One [yield, color] pair per grid cell, in grid order.
    styled_cells = [
        [amount, "gray" if amount == 0 else color_scale(amount).hexcode]
        for amount in cell_yields[field]
    ]

    attributes = pd.DataFrame(styled_cells, columns = ['yield', 'fill'])
    field_dfs[field] = gpd.GeoDataFrame(attributes, geometry=grid['geometry'])

Exporting field data frames for geojson:

In [9]:
for field, frame in field_dfs.items():
    raw_path = f'{field}.json'
    frame.to_file(raw_path, driver="GeoJSON")

    with open(raw_path, 'r') as json_data_file:
        field_json = json.load(json_data_file)

    # GeoJSON positions are ordered longitude, latitude, while the geometries
    # exported above carry latitude first, so the first two entries of every
    # position are swapped before the viewer-ready copy is written.
    for feature in field_json['features']:
        for ring in feature['geometry']['coordinates']:
            for position in ring:
                position[0], position[1] = position[1], position[0]

    with open(f'{field}_geo.json', 'w') as outfile:
        json.dump(field_json, outfile)

## Visualizing field data frames in geojson.io:
[Field 1](http://geojson.io/#id=gist:wilsonia/9a1a86b680fa12a27341a517db5850cd&map=16/45.1060/-123.2836)

[Field 2](http://geojson.io/#id=gist:wilsonia/44526a48e8d33b2de307ef0524625f36&map=17/45.10945/-123.28099)