In [2]:
import os
import requests

import pandas as pd
import geopandas as gpd
from fuzzywuzzy import fuzz, process

EPICOLLECT_SECRET = os.environ["EPICOLLECT_SECRET"]
EPICOLLECT_CLIENT_ID = os.environ["EPICOLLECT_CLIENT_ID"]

EPICOLLECT_AUTH_ENDPOINT = "https://five.epicollect.net/api/oauth/token"
EPICOLLECT_DATA_ENDPOINT = "https://five.epicollect.net/api/export/entries/chicago-tree-identification"

https://medium.com/@tjt28165/collecting-real-world-data-from-the-field-with-epicollect5-and-python-f488b8f554e0

In [3]:
auth_response = requests.post(
    EPICOLLECT_AUTH_ENDPOINT,
    data={
        "grant_type": "client_credentials",
        "client_id": EPICOLLECT_CLIENT_ID,
        "client_secret": EPICOLLECT_SECRET
    }
)
access_token = auth_response.json()["access_token"]

data_response = requests.get(
    EPICOLLECT_DATA_ENDPOINT, 
    headers={'Authorization': access_token}
)

data_df = pd.DataFrame(data_response.json()["data"]["entries"])
data_df = pd.concat(
    [
        data_df,
        data_df["2_Location"].apply(pd.Series)
    ],
    axis=1
)
data_gdf = gpd.GeoDataFrame(
    data_df, 
    geometry=gpd.points_from_xy(
        data_df.longitude, 
        data_df.latitude
    ), 
    crs="EPSG:4326"
)

In [4]:
data_gdf

Unnamed: 0,ec5_uuid,created_at,uploaded_at,created_by,title,1_Species,2_Location,3_Photo,4_Notes,5_Date,6_Time,latitude,longitude,accuracy,UTM_Northing,UTM_Easting,UTM_Zone,geometry
0,721d8c15-d94d-4698-a668-e8995c0ed347,2021-02-28T00:30:29.416Z,2021-02-28T18:38:08.000Z,camen.piho.r@gmail.com,American basswood 27/02/2021 18:30:27,American basswood,"{'latitude': 42.007242, 'longitude': -87.65798...",,,27/02/2021,18:30:27,42.007242,-87.657984,7,4650789,445513,16T,POINT (-87.65798 42.00724)
1,1098777b-b1a8-4e29-b03d-889e03087e96,2021-02-27T21:32:42.415Z,2021-02-27T22:58:14.000Z,camen.piho.r@gmail.com,eastern white pine 27/02/2021 15:32:40,eastern white pine,"{'latitude': 42.008059, 'longitude': -87.65956...",1098777b-b1a8-4e29-b03d-889e03087e96_161446155...,,27/02/2021,15:32:40,42.008059,-87.659569,4,4650881,445382,16T,POINT (-87.65957 42.00806)


In [5]:
data_gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [6]:
def fuzzy_match(search_string, collection):
    return process.extract(
        search_string, 
        collection, 
        limit=10, 
        scorer=fuzz.token_sort_ratio
    )

In [7]:
trees = pd.read_csv("resources/plantlst.txt")
trees = trees[~trees["Common Name"].isnull()]
trees = trees.drop_duplicates(subset=["Common Name"], keep="first")
trees = trees.set_index("Common Name")

In [8]:
corrected = []
for species in data_gdf["1_Species"].unique():
    matches = fuzzy_match(species, trees.index)
    
    best_score = matches[0][1]
    matched_idx = 0
    
    if best_score < 95:
        matched_idx = int(input(f"0-indexed ({species}): "))
        print()
        
    matched = trees.loc[matches[matched_idx][0]]
    matched = matched.to_frame().T.reset_index()
    matched["original_common_name"] = species
    matched = matched.rename(
        columns={
            "index": "common_name",
            "Symbol": "symbol",
            "Synonym Symbol": "synonym_symbol",
            "Scientific Name with Author": "scientific_name",
            "Family": "family"
        }
    )
    corrected.append(matched)

    
corrected = pd.concat(corrected)

corrected

Unnamed: 0,common_name,symbol,synonym_symbol,scientific_name,family,original_common_name
0,American basswood,TIAM,,Tilia americana L.,Tiliaceae,American basswood
0,eastern white pine,PIST,,Pinus strobus L.,Pinaceae,eastern white pine


In [9]:
gdf = pd.merge(
    left=data_gdf,
    right=corrected,
    how="inner",
    left_on="1_Species",
    right_on="original_common_name",
)[["common_name", "symbol", "scientific_name", "family", "accuracy", "geometry"]]

gdf["common_name"] = gdf["common_name"].str.title()
gdf["scientific_name"] = gdf["scientific_name"].str.title()
gdf["family"] = gdf["family"].str.title()
gdf

Unnamed: 0,common_name,symbol,scientific_name,family,accuracy,geometry
0,American Basswood,TIAM,Tilia Americana L.,Tiliaceae,7,POINT (-87.65798 42.00724)
1,Eastern White Pine,PIST,Pinus Strobus L.,Pinaceae,4,POINT (-87.65957 42.00806)


In [10]:
gdf.to_file("resources/tree_locations.gpkg", driver="GPKG")