In [24]:
# Import dependencies

# The star of today
import geopandas as gpd

# Other actors
import pyogrio
import pandas as pd
import shapely
import pyproj
import folium
#from shapely.geometry import Point, box
#from folium import GeoJson

# BASIC AND COMPARISONS

In [18]:
# Build, by hand, the three ingredients that make up a GeoDataFrame:
# an attribute table, a geometry and a coordinate reference system (CRS).

# 1. Attribute (non-spatial) table: one row, one column
my_nonspatial_df = pd.DataFrame(data=[{"some_variable": 38}])

# 2. Geometry: a single Shapely point given as (x, y)
my_geometry = shapely.geometry.Point(-62.2159, -3.4653)

# 3. CRS: looked up from its EPSG code
my_crs = pyproj.CRS.from_epsg(4326)

# Print each ingredient followed by its Python type
for heading, piece in (
    ("Nonspatial data:", my_nonspatial_df),
    ("\nGeometry:", my_geometry),
    ("\nCRS:", my_crs),
):
    print(heading)
    print(piece)
    print(type(piece))

Nonspatial data:
   some_variable
0             38
<class 'pandas.core.frame.DataFrame'>

Geometry:
POINT (-62.2159 -3.4653)
<class 'shapely.geometry.point.Point'>

CRS:
EPSG:4326
<class 'pyproj.crs.crs.CRS'>


In [19]:
# Assemble the three ingredients into a GeoDataFrame.
# `geometry` has to be list-like (one entry per row), so the single Point
# is wrapped in a list.
my_gdf = gpd.GeoDataFrame(my_nonspatial_df, geometry=[my_geometry], crs=my_crs)

# Show the result and confirm its type
print("A complete GeoDataFrame:", my_gdf, type(my_gdf), sep="\n")

A complete GeoDataFrame:
   some_variable                  geometry
0             38  POINT (-62.2159 -3.4653)
<class 'geopandas.geodataframe.GeoDataFrame'>


In [None]:
# Use the .explore() method of the GeoDataFrame to display it on an
# interactive map; zoom_start=5 is forwarded as the map's initial zoom level.
my_gdf.explore(zoom_start=5)

In [14]:
# Keep the return value of .explore() to inspect what kind of object it is
my_explore = my_gdf.explore()

# Print its repr and its class
print("The my_explore object:", my_explore, type(my_explore), sep="\n")

The my_explore object:
<folium.folium.Map object at 0x000001F0A3DB39B0>
<class 'folium.folium.Map'>


In [None]:
# Write the GeoDataFrame to disk as GeoJSON (path is relative to the notebook).
# The output driver is named explicitly here.
my_gdf.to_file(filename="../data/my_gdf.geojson", driver="GeoJSON")
# Variant without `driver`, leaving GeoPandas to pick one from the file extension:
#my_gdf.to_file(filename="../data/my_gdf.geojson")

In [25]:
# List the vector formats (drivers) available for reading and writing.
# The result maps each driver name to its capability:
# "r" = read only, "rw" = read and write.
pyogrio.list_drivers()

{'PCIDSK': 'rw',
 'PDS4': 'rw',
 'VICAR': 'rw',
 'PDF': 'rw',
 'MBTiles': 'rw',
 'EEDA': 'r',
 'OGCAPI': 'r',
 'ESRI Shapefile': 'rw',
 'MapInfo File': 'rw',
 'UK .NTF': 'r',
 'LVBAG': 'r',
 'OGR_SDTS': 'r',
 'S57': 'rw',
 'DGN': 'rw',
 'OGR_VRT': 'r',
 'Memory': 'rw',
 'CSV': 'rw',
 'GML': 'rw',
 'GPX': 'rw',
 'KML': 'rw',
 'GeoJSON': 'rw',
 'GeoJSONSeq': 'rw',
 'ESRIJSON': 'r',
 'TopoJSON': 'r',
 'OGR_GMT': 'rw',
 'GPKG': 'rw',
 'SQLite': 'rw',
 'WAsP': 'rw',
 'OpenFileGDB': 'rw',
 'DXF': 'rw',
 'FlatGeobuf': 'rw',
 'Geoconcept': 'rw',
 'GeoRSS': 'rw',
 'VFK': 'r',
 'PGDUMP': 'rw',
 'OSM': 'r',
 'GPSBabel': 'rw',
 'OGR_PDS': 'r',
 'WFS': 'r',
 'OAPIF': 'r',
 'EDIGEO': 'r',
 'SVG': 'r',
 'Idrisi': 'r',
 'ODS': 'rw',
 'XLSX': 'rw',
 'Elasticsearch': 'rw',
 'Carto': 'rw',
 'AmigoCloud': 'rw',
 'SXF': 'r',
 'Selafin': 'rw',
 'JML': 'rw',
 'PLSCENES': 'r',
 'CSW': 'r',
 'VDV': 'rw',
 'MVT': 'rw',
 'NGW': 'rw',
 'MapML': 'rw',
 'GTFS': 'r',
 'PMTiles': 'rw',
 'JSONFG': 'rw',
 'MiraMonVecto

# Loading spatial data

In [26]:
# Amphibian distribution areas published by the Quebec ministry.
# The SQLite file is read straight from its download URL (network access needed).
amph_url = "https://diffusion.mffp.gouv.qc.ca/Diffusion/DonneeGratuite/Faune/Aires_repartition/Amphibien/SQLite/Aires_repartition_amphibiens.sqlite"
amph_gdf = gpd.read_file(filename=amph_url)

In [29]:
# Confirm the object type, then preview the first rows.
# NOTE: accented characters in the text columns come out mis-decoded
# (e.g. "rÃ©partition" instead of "répartition") — TODO fix the encoding when reading.
print(type(amph_gdf))
amph_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>


Unnamed: 0,desc_entit,producteur,nom_franca,nom_angla,nom_scient,date_maj,grand_groupe,famille,shape_length,shape_area,geometry
0,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Crapaud d'AmÃ©rique,American Toad,Anaxyrus americanus,2021,Amphibiens,Bufonidae,14983450.0,1184087000000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500..."
1,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille des bois,Wood Frog,Lithobates sylvaticus,2021,Amphibiens,Ranidae,13323450.0,1198792000000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500..."
2,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille des marais,Pickerel Frog,Lithobates palustris,2021,Amphibiens,Ranidae,5387379.0,157320000000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500..."
3,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille du Nord,Mink Frog,Lithobates septentrionalis,2021,Amphibiens,Ranidae,11062840.0,974839300000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500..."
4,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille lÃ©opard du Nord,Northern leopard Frog,Lithobates pipiens,2021,Amphibiens,Ranidae,9757220.0,633866000000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500..."


In [16]:
# See the distribution of the first species (disabled; uncomment the line below to render the map)
#amph_gdf.iloc[0:1].explore()

In [33]:
# Add a new column holding the area of each geometry.
# (The file already ships a `shape_area` column; it is recomputed here only to
# show how a geometry attribute can feed a regular pandas column assignment.)
amph_gdf["area_now"] = amph_gdf.area

# Preview the result: `area_now` is appended as the last column
amph_gdf.head()

Unnamed: 0,desc_entit,producteur,nom_franca,nom_angla,nom_scient,date_maj,grand_groupe,famille,shape_length,shape_area,geometry,area_now
0,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Crapaud d'AmÃ©rique,American Toad,Anaxyrus americanus,2021,Amphibiens,Bufonidae,14983450.0,1184087000000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500...",1184087000000.0
1,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille des bois,Wood Frog,Lithobates sylvaticus,2021,Amphibiens,Ranidae,13323450.0,1198792000000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500...",1198792000000.0
2,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille des marais,Pickerel Frog,Lithobates palustris,2021,Amphibiens,Ranidae,5387379.0,157320000000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500...",157320000000.0
3,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille du Nord,Mink Frog,Lithobates septentrionalis,2021,Amphibiens,Ranidae,11062840.0,974839300000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500...",974839300000.0
4,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille lÃ©opard du Nord,Northern leopard Frog,Lithobates pipiens,2021,Amphibiens,Ranidae,9757220.0,633866000000.0,"MULTIPOLYGON (((-485026.718 132619.355, -48500...",633866000000.0


In [36]:
# A GeoDataFrame extends the pandas DataFrame, so the usual pandas tools still
# apply: summarise the recomputed area per taxonomic family.
(
    amph_gdf
    .groupby("famille")["area_now"]
    .agg(["min", "max", "std", "count"])
)

Unnamed: 0_level_0,min,max,std,count
famille,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ambystomatidae,443057200000.0,696179200000.0,178984300000.0,2
Bufonidae,1184087000000.0,1184087000000.0,,1
Hylidae,2483088000.0,679869900000.0,329904600000.0,4
Plethodontidae,311481300.0,826555600000.0,329351500000.0,6
Proteidae,21505820000.0,21505820000.0,,1
Ranidae,157320000000.0,1198792000000.0,400928400000.0,6
Salamandridae,264668600000.0,264668600000.0,,1


# Clipping

In [None]:
# Bounding box over northern Quebec, in degrees: (minx, miny, maxx, maxy)
my_bbox = shapely.geometry.box(-80.18, 51.24, -70.32, 61.14)

# Wrap the box in a one-row GeoDataFrame with an explicit CRS
boxy = gpd.GeoDataFrame({"name": ["boxy"]}, geometry=[my_bbox], crs="EPSG:4326")
boxy

Unnamed: 0,name,geometry
0,boxy,"POLYGON ((-70.32 51.24, -70.32 61.14, -80.18 6..."


In [None]:
# Look at the bounding box on an interactive map
boxy.explore()

In [45]:
# Clip the amphibian ranges with the box.
# The two layers do not share a CRS here (ranges: EPSG:32198, box: EPSG:4326),
# so GeoPandas emits a CRS-mismatch warning — presumably the intended
# demonstration; the clip is redone after reprojection in the next cell.
amph_clipped = amph_gdf.clip(mask=boxy)

# Note: Equivalent to
# amph_clipped = gpd.clip(gdf=amph_gdf, mask=boxy)
# (`sort` is left at its default here; it is not set to True)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:32198
Right CRS: EPSG:4326

  return geopandas.clip(self, mask=mask, keep_geom_type=keep_geom_type, sort=sort)


In [49]:
# Fix for the CRS mismatch reported by the previous clip:
# reproject the left geometries to the CRS of the box (EPSG:4326).
# Do it in the same line to avoid creating additional objects
amph_clipped = amph_gdf.to_crs(epsg=4326).clip(mask=boxy, sort=True)

# Note: Equivalent to
# amph_clipped = gpd.clip(gdf=amph_gdf.to_crs(epsg=4326), mask=boxy, sort=True)

In [50]:
# The clip now runs on matching CRSs; look at the first few lines.
# Geometries are in degrees after the reprojection, and rows with nothing
# inside the box (e.g. index 2) are absent from the result.
amph_clipped.head()

Unnamed: 0,desc_entit,producteur,nom_franca,nom_angla,nom_scient,date_maj,grand_groupe,famille,shape_length,shape_area,geometry,area_now
0,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Crapaud d'AmÃ©rique,American Toad,Anaxyrus americanus,2021,Amphibiens,Bufonidae,14983450.0,1184087000000.0,"POLYGON ((-79.51779 51.25005, -79.51784 51.316...",1184087000000.0
1,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille des bois,Wood Frog,Lithobates sylvaticus,2021,Amphibiens,Ranidae,13323450.0,1198792000000.0,"POLYGON ((-79.51779 51.25005, -79.51784 51.316...",1198792000000.0
3,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille du Nord,Mink Frog,Lithobates septentrionalis,2021,Amphibiens,Ranidae,11062840.0,974839300000.0,"POLYGON ((-79.51779 51.25005, -79.51784 51.316...",974839300000.0
4,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille lÃ©opard du Nord,Northern leopard Frog,Lithobates pipiens,2021,Amphibiens,Ranidae,9757220.0,633866000000.0,"POLYGON ((-78.80668 53.76057, -78.66113 53.766...",633866000000.0
5,Aire de rÃ©partition,"MinistÃ¨re des ForÃªts, de la Faune et des Parcs",Grenouille verte,Green Frog,Lithobates clamitans,2021,Amphibiens,Ranidae,9041373.0,540889600000.0,"POLYGON ((-74.73656 51.25987, -74.49898 51.337...",540889600000.0


In [None]:
# Review of folium concepts: start from a base map showing the box in red.
# NOTE(review): Leaflet path options are `opacity` / `fillOpacity`; the `alpha`
# keyword used below may be silently ignored — confirm.
my_map = boxy.explore(color="red", alpha=0.75)

# Overlay the first species as it was before clipping...
unclipped_layer = folium.GeoJson(
    data=amph_gdf.iloc[0:1].to_crs(epsg=4326).geometry,
    tooltip="I was not clipped!",
    color="purple",
    alpha=0.75,
)
unclipped_layer.add_to(parent=my_map)

# ...and the same species after clipping
clipped_layer = folium.GeoJson(
    data=amph_clipped.iloc[0:1].geometry,
    tooltip="But I was!",
    color="blue",
    alpha=0.75,
)
clipped_layer.add_to(parent=my_map)

# Show the map
my_map

In [None]:
# Review of folium concepts, part 2: a species whose range misses the box.
# NOTE(review): Leaflet path options are `opacity` / `fillOpacity`; the `alpha`
# keyword used below may be silently ignored — confirm.
my_map = boxy.explore(color="red", alpha=0.75)

# Overlay the third species (row position 2), reprojected to EPSG:4326
outside_layer = folium.GeoJson(
    data=amph_gdf.iloc[2:3].to_crs(epsg=4326).geometry,
    tooltip="I was outside the box!",
    color="purple",
    alpha=0.75,
)
outside_layer.add_to(parent=my_map)

# Show the map
my_map

# SPATIAL JOINS

In [53]:
# Australian state and territory boundaries (zipped shapefile from the ABS),
# read straight from the download URL (network access needed).
aus_states_url = "https://www.abs.gov.au/statistics/standards/australian-statistical-geography-standard-asgs-edition-3/jul2021-jun2026/access-and-downloads/digital-boundary-files/STE_2021_AUST_SHP_GDA2020.zip"
gdf_aus = gpd.read_file(filename=aus_states_url)

# Preview the first rows
print(gdf_aus.head())

# Dimensions of the GeoDataFrame (rows, columns).
# NOTE: the last row is a catch-all "Outside Australia" entry, not a territory.
print(gdf_aus.shape)

  STE_CODE21         STE_NAME21 CHG_FLAG21  CHG_LBL21 AUS_CODE21 AUS_NAME21  \
0          1    New South Wales          0  No change        AUS  Australia   
1          2           Victoria          0  No change        AUS  Australia   
2          3         Queensland          0  No change        AUS  Australia   
3          4    South Australia          0  No change        AUS  Australia   
4          5  Western Australia          0  No change        AUS  Australia   

     AREASQKM21                                       LOCI_URI21  \
0  8.007977e+05  http://linked.data.gov.au/dataset/asgsed3/STE/1   
1  2.274962e+05  http://linked.data.gov.au/dataset/asgsed3/STE/2   
2  1.730171e+06  http://linked.data.gov.au/dataset/asgsed3/STE/3   
3  9.842314e+05  http://linked.data.gov.au/dataset/asgsed3/STE/4   
4  2.526632e+06  http://linked.data.gov.au/dataset/asgsed3/STE/5   

                                            geometry  
0  MULTIPOLYGON (((159.0623 -31.50886, 159.06218 ...  
1  MUL

In [None]:
# Have a look at the state/territory geometries on an interactive map
gdf_aus.explore()

In [55]:
# Inspect the CRS of the boundaries: its identifier, whether it is projected,
# its area of use, and the Python type that stores it.
aus_crs = gdf_aus.crs
print(aus_crs, aus_crs.is_projected, aus_crs.area_of_use, type(aus_crs), sep="\n")

EPSG:7844
False
- name: Australia including Lord Howe Island, Macquarie Island, Ashmore and Cartier Islands, Christmas Island, Cocos (Keeling) Islands, Norfolk Island. All onshore and offshore.
- bounds: (93.41, -60.55, 173.34, -8.47)
<class 'pyproj.crs.crs.CRS'>


In [57]:
# Read in the GBIF occurrence records from a local CSV (path relative to the
# notebook). This is a plain pandas DataFrame: coordinates are still ordinary
# numeric columns (decimalLatitude / decimalLongitude), not geometries.
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns look like row indexes
# written out by earlier saves — confirm, and drop them at export time.
desert_pea = pd.read_csv(filepath_or_buffer="../data/desert_pea.csv")

# Look at the first 5 lines
desert_pea.head()

Unnamed: 0.1,Unnamed: 0,eventId,decimalLatitude,decimalLongitude
0,0,5087759453,-20.585195,116.787842
1,1,5104528592,-22.843242,122.580642
2,2,4510177813,-32.524942,136.163344
3,3,4512283168,-30.397483,136.874328
4,4,4516234716,-32.268875,135.993515


In [None]:
# Convert the longitude/latitude columns to Shapely Point geometries.
# Point takes (x, y), so longitude is passed first.
# The two columns are zipped together rather than walking the frame with
# DataFrame.iterrows(), which builds a full Series for every row just to read
# two floats.
pea_geom = [
    shapely.geometry.Point(lon, lat)
    for lon, lat in zip(desert_pea["decimalLongitude"], desert_pea["decimalLatitude"])
]

# Look at the first 5 Points
pea_geom[0:5]

[<POINT (116.788 -20.585)>,
 <POINT (122.581 -22.843)>,
 <POINT (136.163 -32.525)>,
 <POINT (136.874 -30.397)>,
 <POINT (135.994 -32.269)>]

In [66]:
# Create a GeoDataFrame from the attribute data and the list of Points.
# Note: For the nonspatial part, we can keep just the eventId value.
# No `crs` is passed here, so the result has no CRS yet.
gdf_pea = gpd.GeoDataFrame(data=desert_pea["eventId"], geometry=pea_geom)

# Observe the first 5 values
gdf_pea.head()

Unnamed: 0,eventId,geometry
0,5087759453,POINT (116.78784 -20.5852)
1,5104528592,POINT (122.58064 -22.84324)
2,4510177813,POINT (136.16334 -32.52494)
3,4512283168,POINT (136.87433 -30.39748)
4,4516234716,POINT (135.99352 -32.26888)


In [67]:
# BUT! Look at the CRS: it is None, because none was given at construction
print(gdf_pea.crs)

None


In [68]:
# The CRS has to be declared by hand.
# GBIF coordinates are WGS84 (i.e. EPSG:4326), so that is what is assigned.
# set_crs() only labels the data; it does not transform any coordinates.
gdf_pea = gdf_pea.set_crs(crs="EPSG:4326")

# The CRS is now defined
print(gdf_pea.crs)

EPSG:4326


In [69]:
# NOTICE: shortcut.
# gpd.points_from_xy() builds the point geometries and attaches the CRS in one
# call, replacing the manual Point list + set_crs steps shown before.
# (The step-by-step version is kept above because it is easier to follow
# in a tutorial.)
pea_points = gpd.points_from_xy(
    x=desert_pea["decimalLongitude"],
    y=desert_pea["decimalLatitude"],
    crs=4326,
)
gdf_pea = gpd.GeoDataFrame(data=desert_pea["eventId"], geometry=pea_points)
print(gdf_pea)

        eventId                     geometry
0    5087759453   POINT (116.78784 -20.5852)
1    5104528592  POINT (122.58064 -22.84324)
2    4510177813  POINT (136.16334 -32.52494)
3    4512283168  POINT (136.87433 -30.39748)
4    4516234716  POINT (135.99352 -32.26888)
..          ...                          ...
295  3985803395  POINT (136.88933 -30.57152)
296  3986318782  POINT (136.92632 -30.42622)
297  3985793950  POINT (136.55266 -31.27423)
298  3985875946  POINT (136.86227 -33.25164)
299  3985925954  POINT (136.54971 -31.25694)

[300 rows x 2 columns]


In [71]:
# Reproject the sightings to the CRS of the Australian boundaries (EPSG:7844)
# so both layers share one CRS before they are joined.
gdf_pea = gdf_pea.to_crs(crs=7844)

# Confirm the change
print(gdf_pea.crs)

EPSG:7844


In [72]:
# Spatial join between the sightings (`gdf_pea`, left) and the states and
# territories (`gdf_aus`, right): each point receives the attributes of the
# polygon it falls within. how="left" keeps every point, matched or not.
joined_gdf = gdf_pea.sjoin(gdf_aus, how="left", predicate="within")

# Display the resulting GeoDataFrame
print(joined_gdf)

        eventId                     geometry  index_right STE_CODE21  \
0    5087759453   POINT (116.78784 -20.5852)          4.0          5   
1    5104528592  POINT (122.58064 -22.84324)          4.0          5   
2    4510177813  POINT (136.16334 -32.52494)          3.0          4   
3    4512283168  POINT (136.87433 -30.39748)          3.0          4   
4    4516234716  POINT (135.99352 -32.26888)          3.0          4   
..          ...                          ...          ...        ...   
295  3985803395  POINT (136.88933 -30.57152)          3.0          4   
296  3986318782  POINT (136.92632 -30.42622)          3.0          4   
297  3985793950  POINT (136.55266 -31.27423)          3.0          4   
298  3985875946  POINT (136.86227 -33.25164)          3.0          4   
299  3985925954  POINT (136.54971 -31.25694)          3.0          4   

            STE_NAME21 CHG_FLAG21  CHG_LBL21 AUS_CODE21 AUS_NAME21  \
0    Western Australia          0  No change        AUS  Australi

In [None]:
# What are the dimensions of joined_gdf?
# Rows: n_left here (300), since no point matched more than one polygon.
# Columns: p_left + (p_right - 1) + 1 = 2 + 8 + 1 = 11
#   - the right geometry column is not carried over (-1)
#   - an `index_right` column is added (+1)
# TODO: review this explanation.
# Shapes printed in order: joined result, left input, right input.
for frame in (joined_gdf, gdf_pea, gdf_aus):
    print(frame.shape)

(300, 11)
(300, 2)
(10, 9)


In [49]:
# Very important column: index_right.
# For each left row it holds the index label of the right geometry that
# satisfied the predicate you mentioned (NaN when nothing matched, which is
# why the column is stored as float).
right_idx = joined_gdf["index_right"]
print(right_idx)
print(type(right_idx))

0      4.0
1      4.0
2      3.0
3      3.0
4      3.0
      ... 
295    3.0
296    3.0
297    3.0
298    3.0
299    3.0
Name: index_right, Length: 300, dtype: float64
<class 'pandas.core.series.Series'>


In [75]:
# Tally how many sightings matched each right-hand geometry, using the
# index_right column described above.
matches_per_polygon = joined_gdf["index_right"].value_counts()
print(matches_per_polygon)

# Total number of matched sightings
print(matches_per_polygon.sum())

# The total is 298, not 300: value_counts() skips NaN, so the two sightings
# that matched no polygon are not counted (more on them later).

index_right
4.0    106
3.0     94
0.0     74
6.0     24
Name: count, dtype: int64
298


In [None]:
# Map every sighting on top of the state/territory polygons.
# gdf_aus.explore() provides the base map.
mapy = gdf_aus.explore()

# One green marker per joined row; folium wants [latitude, longitude],
# i.e. [y, x] of the point.
for geom, event_id in zip(joined_gdf.geometry, joined_gdf["eventId"]):
    marker = folium.Marker(
        location=[geom.y, geom.x],
        # The field name is wrapped in bold HTML tags in the popup
        popup=f"<b>eventId</b>: {event_id}",
        icon=folium.Icon(color="green", prefix="fa", icon="seedling"),
    )
    marker.add_to(mapy)

# Display the map
mapy

In [79]:
# Isolate the sightings that matched no polygon: their index_right is NaN,
# and so is every attribute that would have come from the right-hand side.
sad_points = joined_gdf.loc[joined_gdf["index_right"].isna()]

print(sad_points)

        eventId                     geometry  index_right STE_CODE21  \
147  4413908515  POINT (118.63845 -20.30648)          NaN        NaN   
284  3947437828   POINT (116.59086 -20.4657)          NaN        NaN   

    STE_NAME21 CHG_FLAG21 CHG_LBL21 AUS_CODE21 AUS_NAME21  AREASQKM21  \
147        NaN        NaN       NaN        NaN        NaN         NaN   
284        NaN        NaN       NaN        NaN        NaN         NaN   

    LOCI_URI21  
147        NaN  
284        NaN  


In [None]:
# Second map: only the unmatched sightings, to see where they sit relative
# to the polygons.
mapo = gdf_aus.explore()

# One red marker per unmatched point; folium wants [latitude, longitude],
# i.e. [y, x] of the point.
for geom, event_id in zip(sad_points.geometry, sad_points["eventId"]):
    marker = folium.Marker(
        location=[geom.y, geom.x],
        # The field name is wrapped in bold HTML tags in the popup
        popup=f"<b>eventId</b>: {event_id}",
        icon=folium.Icon(color="red", prefix="fa", icon="plant-wilt"),
    )
    marker.add_to(mapo)

# Display the map
mapo

In [None]:
# Count the sightings per state/territory with the pandas .groupby() method
# seen earlier. Rows whose STE_NAME21 is NaN (unmatched points) form no group.
territory_counts = joined_gdf.groupby("STE_NAME21").size()

# Counts in group (alphabetical) order
print("\nUnsorted values:", territory_counts, sep="\n")

# Same counts, largest first
print("\nSorted values:", territory_counts.sort_values(ascending=False), sep="\n")


Unsorted values:
STE_NAME21
New South Wales        74
Northern Territory     24
South Australia        94
Western Australia     106
dtype: int64

Sorted values:
STE_NAME21
Western Australia     106
South Australia        94
New South Wales        74
Northern Territory     24
dtype: int64


In [83]:
# Just for kicks: swap the roles of the two layers.
# States/territories are now on the left and sightings on the right, so the
# predicate is mirrored as well ("contains" instead of "within").
joined_gdf_v2 = gdf_aus.sjoin(gdf_pea, how="left", predicate="contains")

# Display the resulting GeoDataFrame
print(joined_gdf_v2)

   STE_CODE21                    STE_NAME21 CHG_FLAG21  CHG_LBL21 AUS_CODE21  \
0           1               New South Wales          0  No change        AUS   
0           1               New South Wales          0  No change        AUS   
0           1               New South Wales          0  No change        AUS   
0           1               New South Wales          0  No change        AUS   
0           1               New South Wales          0  No change        AUS   
..        ...                           ...        ...        ...        ...   
6           7            Northern Territory          0  No change        AUS   
6           7            Northern Territory          0  No change        AUS   
7           8  Australian Capital Territory          0  No change        AUS   
8           9             Other Territories          0  No change        AUS   
9           Z             Outside Australia          1        New        ZZZ   

           AUS_NAME21    AREASQKM21  \


In [92]:
# index_right is less informative in this direction: it now holds the index of
# each matched sighting, and every sighting appears exactly once.
print(joined_gdf_v2.index_right.value_counts())

index_right
82.0     1
160.0    1
77.0     1
78.0     1
76.0     1
        ..
288.0    1
244.0    1
151.0    1
67.0     1
271.0    1
Name: count, Length: 298, dtype: int64


In [94]:
# BE CAREFUL OF ONE THING
# With how="left", a territory that contains no sighting still keeps one row,
# so a count of 1 here can mean "no observation at all".
print("Number of times each territory appears:")
print(joined_gdf_v2["STE_NAME21"].value_counts())

Number of times each territory appears:
STE_NAME21
Western Australia               106
South Australia                  94
New South Wales                  74
Northern Territory               24
Victoria                          1
Queensland                        1
Tasmania                          1
Australian Capital Territory      1
Other Territories                 1
Outside Australia                 1
Name: count, dtype: int64


In [98]:
# Don't forget to look at the index_right column!
# A territory listed once in the tally did not necessarily receive one
# observation: rows whose index_right is NaN are territories with no match.
no_match = joined_gdf_v2["index_right"].isna()

print("Territories that did not get any observations:")
print(joined_gdf_v2.loc[no_match, "STE_NAME21"])

Territories that did not get any observations:
1                        Victoria
2                      Queensland
5                        Tasmania
7    Australian Capital Territory
8               Other Territories
9               Outside Australia
Name: STE_NAME21, dtype: object


In [None]:
# If you count on the left side of a left join, drop the unmatched rows
# (NaN index_right) first so that territories without sightings are not
# counted as having one.
print("Number of times each territory appears with NA values removed:")
print(joined_gdf_v2.dropna(subset="index_right")["STE_NAME21"].value_counts())

Number of times each territory appears:
STE_NAME21
Western Australia     106
South Australia        94
New South Wales        74
Northern Territory     24
Name: count, dtype: int64


# Case 2: Dugongs

In [106]:
# Same workflow as before, in condensed form: read the sightings CSV and turn
# the coordinate columns into point geometries declared as EPSG:4326.
df_dugong = pd.read_csv(filepath_or_buffer="../data/sea_cow.csv")
dugong_points = gpd.points_from_xy(df_dugong["decimalLongitude"], df_dugong["decimalLatitude"], crs=4326)
gdf_dugong = gpd.GeoDataFrame(data=df_dugong["eventId"], geometry=dugong_points)

# Australian marine parks, downloaded as a shapefile (network access needed)
gdf_marineparks = gpd.read_file(filename="https://hub.arcgis.com/api/v3/datasets/2b3eb1d42b8d4319900cf4777f0a83b9_0/downloads/data?format=shp&spatialRefId=4283&where=1%3D1")

# Safety check: print the CRS details of both layers
# (the first one was set by hand just above)
for heading, layer in (
    ("Dugong sightings CRS info:", gdf_dugong),
    ("\nAustralian marine parks CRS info:", gdf_marineparks),
):
    print(heading)
    print(layer.crs)
    print(layer.crs.is_projected)
    print(layer.crs.area_of_use)

Dugong sightings CRS info:
EPSG:4326
False
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)

Australian marine parks CRS info:
EPSG:4283
False
- name: Australia including Lord Howe Island, Macquarie Island, Ashmore and Cartier Islands, Christmas Island, Cocos (Keeling) Islands, Norfolk Island. All onshore and offshore.
- bounds: (93.41, -60.55, 173.34, -8.47)


In [107]:
# Bring both layers to the same projected CRS (EPSG:3577), in place
for layer in (gdf_dugong, gdf_marineparks):
    layer.to_crs(epsg=3577, inplace=True)

# Safety check: both layers should now report EPSG:3577 and is_projected=True
for heading, layer in (
    ("Dugong sightings CRS info:", gdf_dugong),
    ("\nAustralian marine parks CRS info:", gdf_marineparks),
):
    print(heading)
    print(layer.crs)
    print(layer.crs.is_projected)
    print(layer.crs.area_of_use)

Dugong sightings CRS info:
EPSG:3577
True
- name: Australia - Australian Capital Territory; New South Wales; Northern Territory; Queensland; South Australia; Tasmania; Western Australia; Victoria.
- bounds: (112.85, -43.7, 153.69, -9.86)

Australian marine parks CRS info:
EPSG:3577
True
- name: Australia - Australian Capital Territory; New South Wales; Northern Territory; Queensland; South Australia; Tasmania; Western Australia; Victoria.
- bounds: (112.85, -43.7, 153.69, -9.86)


In [108]:
# Create a 40 km buffer around every marine park.
# The layer was reprojected to EPSG:3577 beforehand, so the distance is
# expressed in metres (40 km = 40_000 m).
# 40 km is a rough guess at a dugong-relevant distance — TODO verify.
#
# set_geometry() returns a new GeoDataFrame with the same attribute columns and
# the buffered polygons as geometry. The un-buffered `gdf_marineparks` is left
# as it is, and the CRS travels with the buffered GeoSeries, so it does not
# have to be restated.
gdf_mpbuff = gdf_marineparks.set_geometry(gdf_marineparks.buffer(distance=40_000))

# Have a look at the data
gdf_mpbuff.head()

Unnamed: 0,OBJECTID,NETNAME,RESNAME,ZONENAME,ZONEIUCN,POLYGONID,NATLEGEND,AREA_KM2,SHAPEAREA,SHAPELEN,geometry
0,1,South-west,South-west Corner,Special Purpose Zone (Mining Exclusion),VI,swswcspm03,Special Purpose Zone (Mining Exclusion) (IUCN VI),5897.166603,0.574243,3.845637,"POLYGON ((-1616240.278 -3726962.412, -1614840...."
1,2,South-west,Southern Kangaroo Island,Special Purpose Zone (Mining Exclusion),VI,swskispm01,Special Purpose Zone (Mining Exclusion) (IUCN VI),629.938756,0.063069,1.749836,"POLYGON ((483704.268 -3908783.148, 486016.344 ..."
2,3,Temperate East,Lord Howe,National Park Zone,II,telhinpz06,National Park Zone (IUCN II),6302.226441,0.601856,3.360809,"POLYGON ((2547264.478 -3696281.302, 2550631.98..."
3,4,Temperate East,Lord Howe,National Park Zone,II,telhinpz05,National Park Zone (IUCN II),203.064013,0.019288,0.670249,"POLYGON ((2495939.867 -3700975.953, 2496208.50..."
4,5,Temperate East,Lord Howe,National Park Zone,II,telhinpz02,National Park Zone (IUCN II),2767.488053,0.257778,2.033333,"POLYGON ((2636033.534 -3538313.004, 2635010.92..."


In [None]:
# Have a look at the buffered marine parks on an interactive map
gdf_mpbuff.explore()

In [111]:
# Spatial join: attach to every dugong sighting the buffered park(s) it falls
# within. how="left" keeps sightings that are in no buffer.
join_dugong = gdf_dugong.sjoin(gdf_mpbuff, how="left", predicate="within")

# Dimensions of the joined GeoDataFrame.
# Careful: a sighting lying inside several overlapping buffers yields one row
# per matching buffer, so the row count is not a count of sightings.
print(join_dugong.shape)

(309, 13)


In [None]:
# Be careful!
# Joins are done per combination: a sighting inside several buffers appears
# once per (sighting, buffer) pair, so the same eventId can occur several times.
join_dugong.dropna(subset="index_right")["eventId"].value_counts()

# Here, unmatched rows were dropped first, so a count of 1 really means
# exactly one matching buffer.

eventId
4138745637    4
4405405111    3
4405391134    3
4954727789    2
3391347177    2
             ..
4155461500    1
4155552348    1
4155568245    1
4155457594    1
4155570396    1
Name: count, Length: 164, dtype: int64

In [None]:
# Split the joined sightings into "happy" (inside at least one buffer) and
# "sad" (inside none), and bring both back to EPSG:4326 so that point.x / point.y
# are longitude / latitude for folium.
matched = join_dugong["index_right"].notna()
happy_dugongs = join_dugong[matched].to_crs(4326)
sad_dugongs = join_dugong[~matched].to_crs(4326)

# Map them both on the same map, on top of the buffered parks
mapa = gdf_mpbuff.explore()

# Green markers for matched sightings, red for unmatched ones.
# folium wants [latitude, longitude], i.e. [y, x] of the point.
for subset, marker_color in ((happy_dugongs, "green"), (sad_dugongs, "red")):
    for geom, event_id in zip(subset.geometry, subset["eventId"]):
        folium.Marker(
            location=[geom.y, geom.x],
            popup=f"<b>eventId</b>: {event_id}",
            icon=folium.Icon(color=marker_color, prefix="fa", icon="hippo"),
        ).add_to(mapa)

# Display the map
mapa

In [118]:
# Again with the groupby: first list the columns available to group on
join_dugong.columns

Index(['eventId', 'geometry', 'index_right', 'OBJECTID', 'NETNAME', 'RESNAME',
       'ZONENAME', 'ZONEIUCN', 'POLYGONID', 'NATLEGEND', 'AREA_KM2',
       'SHAPEAREA', 'SHAPELEN'],
      dtype='object')

In [119]:
# Matched (sighting, buffer) pairs per zone type, largest first
(
    join_dugong
    .dropna(subset="index_right")
    .groupby(by="ZONENAME")
    .size()
    .sort_values(ascending=False)
)

ZONENAME
Habitat Protection Zone         150
Multiple Use Zone                13
Recreational Use Zone             5
National Park Zone                3
Special Purpose Zone              1
Special Purpose Zone (Trawl)      1
dtype: int64

In [120]:
# Matched (sighting, buffer) pairs per marine park (reserve name), largest first
(
    join_dugong
    .dropna(subset="index_right")
    .groupby(by="RESNAME")
    .size()
    .sort_values(ascending=False)
)

RESNAME
Limmen                   148
Roebuck                    8
Dampier                    6
Ningaloo                   6
Gascoyne                   3
Gulf of Carpentaria        1
Joseph Bonaparte Gulf      1
dtype: int64

In [121]:
# Matched (sighting, buffer) pairs per individual polygon, largest first
(
    join_dugong
    .dropna(subset="index_right")
    .groupby(by="POLYGONID")
    .size()
    .sort_values(ascending=False)
)

POLYGONID
nolimhpz01    148
nwroemuz01      8
nwgasmuz03      3
nwninruz01      3
nwdammuz03      2
nwdamnpz01      2
nwdamhpz02      2
nwninruz03      2
nogocspt02      1
nojbgspz01      1
nwninnpz02      1
dtype: int64

In [122]:
# Matched (sighting, buffer) pairs per IUCN category and zone type.
# Grouping on two columns gives a two-level index, kept in group order.
(
    join_dugong
    .dropna(subset="index_right")
    .groupby(by=["ZONEIUCN", "ZONENAME"])
    .size()
)

# Sorting by count is possible, but it breaks the grouped order of the index:
#join_dugong.dropna(subset="index_right").groupby(by=["ZONEIUCN", "ZONENAME"]).size().sort_values(ascending=False)

ZONEIUCN  ZONENAME                    
II        National Park Zone                3
IV        Habitat Protection Zone         150
          Recreational Use Zone             5
VI        Multiple Use Zone                13
          Special Purpose Zone              1
          Special Purpose Zone (Trawl)      1
dtype: int64

In [83]:
# Summary of the text (object) columns of the marine parks layer:
# count, number of unique values, most frequent value and its frequency
gdf_marineparks.describe(include="object")

Unnamed: 0,NETNAME,RESNAME,ZONENAME,ZONEIUCN,POLYGONID,NATLEGEND
count,216,216,216,216,216,216
unique,8,61,12,4,214,12
top,South-west,Coral Sea,National Park Zone,VI,tesolspt03,National Park Zone (IUCN II)
freq,65,28,76,82,3,76


In [87]:
# Number of marine park polygons per IUCN category (all parks, not only
# those matched to a sighting)
gdf_marineparks["ZONEIUCN"].value_counts()

ZONEIUCN
VI    82
II    76
IV    51
Ia     7
Name: count, dtype: int64