# GBIF Data Cleaning

## import libraries

In [1]:
import pandas as pd
import geopandas as gpd
from keplergl import KeplerGl

# load city boundary

In [2]:
# Shapefile export of the "facet-grid-boundaries" dataset from the
# opendata.vancouver.ca portal; read straight from the download URL.
url = (
    'https://opendata.vancouver.ca/explore/dataset/facet-grid-boundaries/download/'
    '?format=shp&timezone=America/Los_Angeles&lang=en'
)
city_boundary = gpd.read_file(url)

# load GBIF occurrence data

In [3]:
# The file is tab-delimited despite its .csv extension.
# low_memory=False makes pandas read the file in one pass instead of chunks.
gbif = pd.read_csv(
    'data/misc/gbif.csv',
    delimiter='\t',
    low_memory=False,
)

In [4]:
# Preview the first five raw records.
gbif.head(n=5)

Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,...,identifiedBy,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue
0,1065573484,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/264412,Plantae,Tracheophyta,Polypodiopsida,Polypodiales,Dryopteridaceae,Polystichum,Polystichum munitum,...,,2013-05-12T03:23:20Z,CC_BY_NC_4_0,vjh,vjh,,,2019-03-19T21:43:51.212Z,,GEODETIC_DATUM_ASSUMED_WGS84
1,1065573958,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/366049,Animalia,Chordata,Amphibia,Anura,Bufonidae,Anaxyrus,Anaxyrus boreas,...,,2013-08-15T12:38:58Z,CC_BY_NC_4_0,James Maughn,James Maughn,,,2019-03-19T22:01:25.296Z,STILLIMAGE,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS84
2,1065578258,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/710315,Plantae,Tracheophyta,Polypodiopsida,Polypodiales,Onocleaceae,Onoclea,Onoclea sensibilis,...,,2014-06-01T18:10:11Z,CC_BY_NC_4_0,Killarney Provincial Park,Killarney Provincial Park,,,2019-03-19T23:02:20.517Z,STILLIMAGE,GEODETIC_DATUM_ASSUMED_WGS84
3,1065578318,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/715125,Plantae,Tracheophyta,Polypodiopsida,Osmundales,Osmundaceae,Claytosmunda,Claytosmunda claytoniana,...,,2014-06-03T19:55:29Z,CC_BY_NC_4_0,Killarney Provincial Park,Killarney Provincial Park,,,2019-03-19T23:03:05.983Z,STILLIMAGE,GEODETIC_DATUM_ASSUMED_WGS84
4,1065578594,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/735247,Plantae,Tracheophyta,Polypodiopsida,Osmundales,Osmundaceae,Osmunda,Osmunda regalis,...,,2014-06-13T18:48:27Z,CC_BY_NC_4_0,Killarney Provincial Park,Killarney Provincial Park,,,2019-03-19T23:05:25.989Z,STILLIMAGE,GEODETIC_DATUM_ASSUMED_WGS84


### convert gbif data to geodataframe 

In [5]:
# Turn each record into a point (x = longitude, y = latitude) and tag the
# result with EPSG:4326.
gbif_gdf = gpd.GeoDataFrame(
    gbif,
    geometry=gpd.points_from_xy(
        x=gbif['decimalLongitude'],
        y=gbif['decimalLatitude'],
    ),
    crs='epsg:4326',
)

### clip gbif data to the city boundary

In [6]:
# gpd.clip expects both layers to be in the same CRS. If the downloaded
# boundary declares a CRS that differs from the occurrence points, reproject
# it first; a boundary with no CRS metadata is passed through unchanged.
clip_mask = city_boundary
if city_boundary.crs is not None and city_boundary.crs != gbif_gdf.crs:
    clip_mask = city_boundary.to_crs(gbif_gdf.crs)

# Keep only the occurrence points that fall inside the boundary polygons.
vancouver_gbif = gpd.clip(gbif_gdf, mask=clip_mask)

### save clipped species data 

In [15]:
# Persist the clipped occurrences as an ESRI Shapefile in the working directory.
# NOTE(review): the Shapefile format limits field names to 10 characters, so
# long columns such as 'decimalLatitude' will presumably be truncated on disk;
# confirm downstream readers expect that.
vancouver_gbif.to_file(
    'vancouver_gbif.shp',
    driver='ESRI Shapefile',
)

# remove unnecessary columns

In [16]:
# List every column of the clipped frame so we can decide which ones to keep.
vancouver_gbif.columns

Index(['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species', 'infraspecificEpithet',
       'taxonRank', 'scientificName', 'countryCode', 'locality',
       'publishingOrgKey', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation',
       'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day',
       'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord',
       'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
       'identifiedBy', 'dateIdentified', 'license', 'rightsHolder',
       'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted',
       'mediaType', 'issue', 'geometry'],
      dtype='object')

In [None]:
# Trim the frame to the taxonomy, coordinate and date columns.
# 'geometry' is kept on purpose: selecting columns without it would turn the
# GeoDataFrame into a plain pandas DataFrame and drop the point geometries.
# .copy() detaches the result from the clipped frame so later assignments
# do not operate on a view.
# NOTE(review): 'day' is kept without 'month'/'year' — confirm that is intended.
vancouver_gbif = vancouver_gbif[[
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
    'decimalLatitude', 'decimalLongitude', 'eventDate', 'day',
    'geometry',
]].copy()

In [25]:
# Inspect the species names of the clipped records.
vancouver_gbif['species']

81        Dacrymyces chrysospermus
82               Branta canadensis
83              Anas platyrhynchos
84           Lophodytes cucullatus
85               Melospiza melodia
                    ...           
553631          Trifolium pratense
553634         Anthidium manicatum
553637               Silene dioica
553638       Hypholoma fasciculare
553641              Rachiplusia ou
Name: species, Length: 5110, dtype: object

In [20]:
# Show a single record to sanity-check the frame's columns.
vancouver_gbif.head(n=1)

Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,...,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue,geometry
81,1065607488,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/1220290,Fungi,Basidiomycota,Dacrymycetes,Dacrymycetales,Dacrymycetaceae,Dacrymyces,Dacrymyces chrysospermus,...,2018-12-15T03:01:47Z,CC_BY_NC_4_0,Riley Pollom,Riley Pollom,,,2019-03-19T19:27:43.497Z,,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS84,POINT (-123.13854 49.30549)
