# GBIF Data Cleaning

## Import libraries

In [31]:
import os

import geopandas as gpd
import pandas as pd
from mealprep.mealprep import find_missing_ingredients

## Load city boundary

In [37]:
# Vancouver open-data portal: shapefile export of the "facet-grid-boundaries" dataset.
# The address is split across adjacent literals for readability; Python joins them
# into the same single string.
url = (
    'https://opendata.vancouver.ca/explore/dataset/facet-grid-boundaries/download/'
    '?format=shp&timezone=America/Los_Angeles&lang=en'
)

# Read the remote shapefile; the result is used later as the clip mask.
city_boundary = gpd.read_file(url)

## Load GBIF occurrence data

In [38]:
# GBIF occurrence export: the file carries a .csv extension but is parsed as tab-separated.
# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): GBIF exports are usually unquoted; if rows ever look merged,
# consider quoting=csv.QUOTE_NONE — confirm against the file.
gbif = pd.read_csv(
    'data/misc/gbif.csv',
    sep='\t',
    low_memory=False,
)

In [39]:
# Preview the first rows of the raw GBIF table to check that it parsed into the expected columns.
gbif.head()

Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,...,identifiedBy,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue
0,1065573484,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/264412,Plantae,Tracheophyta,Polypodiopsida,Polypodiales,Dryopteridaceae,Polystichum,Polystichum munitum,...,,2013-05-12T03:23:20Z,CC_BY_NC_4_0,vjh,vjh,,,2019-03-19T21:43:51.212Z,,GEODETIC_DATUM_ASSUMED_WGS84
1,1065573958,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/366049,Animalia,Chordata,Amphibia,Anura,Bufonidae,Anaxyrus,Anaxyrus boreas,...,,2013-08-15T12:38:58Z,CC_BY_NC_4_0,James Maughn,James Maughn,,,2019-03-19T22:01:25.296Z,STILLIMAGE,COORDINATE_ROUNDED;GEODETIC_DATUM_ASSUMED_WGS84
2,1065578258,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/710315,Plantae,Tracheophyta,Polypodiopsida,Polypodiales,Onocleaceae,Onoclea,Onoclea sensibilis,...,,2014-06-01T18:10:11Z,CC_BY_NC_4_0,Killarney Provincial Park,Killarney Provincial Park,,,2019-03-19T23:02:20.517Z,STILLIMAGE,GEODETIC_DATUM_ASSUMED_WGS84
3,1065578318,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/715125,Plantae,Tracheophyta,Polypodiopsida,Osmundales,Osmundaceae,Claytosmunda,Claytosmunda claytoniana,...,,2014-06-03T19:55:29Z,CC_BY_NC_4_0,Killarney Provincial Park,Killarney Provincial Park,,,2019-03-19T23:03:05.983Z,STILLIMAGE,GEODETIC_DATUM_ASSUMED_WGS84
4,1065578594,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/735247,Plantae,Tracheophyta,Polypodiopsida,Osmundales,Osmundaceae,Osmunda,Osmunda regalis,...,,2014-06-13T18:48:27Z,CC_BY_NC_4_0,Killarney Provincial Park,Killarney Provincial Park,,,2019-03-19T23:05:25.989Z,STILLIMAGE,GEODETIC_DATUM_ASSUMED_WGS84


### Convert GBIF data to a GeoDataFrame

In [40]:
# Build one point per occurrence record from its longitude (x) and latitude (y),
# and tag the result as EPSG:4326.
gbif_gdf = gpd.GeoDataFrame(
    gbif,
    geometry=gpd.points_from_xy(x=gbif['decimalLongitude'], y=gbif['decimalLatitude']),
    crs='epsg:4326',
)

### Clip GBIF data to the city boundary

In [41]:
# gpd.clip expects both layers in the same CRS. Reproject the boundary to the
# CRS of the points when the two differ (a boundary with no CRS is left as-is,
# since it cannot be reprojected), then keep only the records that fall inside it.
clip_mask = city_boundary
if clip_mask.crs is not None and clip_mask.crs != gbif_gdf.crs:
    clip_mask = clip_mask.to_crs(gbif_gdf.crs)

vancouver_gbif = gpd.clip(gbif_gdf, mask=clip_mask)

In [42]:
# Persist the clipped (still uncleaned) records for the city.
# NOTE: the ESRI Shapefile format limits field names to 10 characters, so the
# longer column names in this table are shortened in the written file.
raw_output_path = 'data/raw_data/vancouver_gbif.shp'

# Create the target folder first so the write also works on a fresh checkout.
os.makedirs(os.path.dirname(raw_output_path), exist_ok=True)
vancouver_gbif.to_file(raw_output_path)

## Data cleaning


### Remove unnecessary columns

In [16]:
# List every column carried over from the GBIF export (plus the added geometry)
# so the ones worth keeping can be picked in the next step.
vancouver_gbif.columns

Index(['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species', 'infraspecificEpithet',
       'taxonRank', 'scientificName', 'countryCode', 'locality',
       'publishingOrgKey', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation',
       'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day',
       'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord',
       'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
       'identifiedBy', 'dateIdentified', 'license', 'rightsHolder',
       'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted',
       'mediaType', 'issue', 'geometry'],
      dtype='object')

In [43]:
# Keep only the taxonomy, location, date and record-type columns plus the geometry;
# everything else from the GBIF export is dropped.
columns_to_keep = [
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
    'decimalLatitude', 'decimalLongitude',
    'eventDate', 'day', 'month', 'year',
    'basisOfRecord',
    'geometry',
]
vancouver_gbif = vancouver_gbif[columns_to_keep]

In [44]:
# Preview the table after trimming it down to the selected columns.
vancouver_gbif.head()

Unnamed: 0,kingdom,phylum,class,order,family,genus,species,decimalLatitude,decimalLongitude,eventDate,day,month,year,basisOfRecord,geometry
81,Fungi,Basidiomycota,Dacrymycetes,Dacrymycetales,Dacrymycetaceae,Dacrymyces,Dacrymyces chrysospermus,49.305487,-123.138538,2015-02-08T13:15:50Z,8,2,2015,HUMAN_OBSERVATION,POINT (-123.13854 49.30549)
82,Animalia,Chordata,Aves,Anseriformes,Anatidae,Branta,Branta canadensis,49.279785,-123.138956,2015-02-08T16:28:30Z,8,2,2015,HUMAN_OBSERVATION,POINT (-123.13896 49.27978)
83,Animalia,Chordata,Aves,Anseriformes,Anatidae,Anas,Anas platyrhynchos,49.2764,-123.145512,2015-02-09T08:50:46Z,9,2,2015,HUMAN_OBSERVATION,POINT (-123.14551 49.27640)
84,Animalia,Chordata,Aves,Anseriformes,Anatidae,Lophodytes,Lophodytes cucullatus,49.279637,-123.139337,2015-02-09T08:42:00Z,9,2,2015,HUMAN_OBSERVATION,POINT (-123.13934 49.27964)
85,Animalia,Chordata,Aves,Passeriformes,Emberizidae,Melospiza,Melospiza melodia,49.285406,-123.14334,2015-02-09T09:51:57Z,9,2,2015,HUMAN_OBSERVATION,POINT (-123.14334 49.28541)


In [45]:
# Map the GBIF camelCase field names to the shorter snake_case names used from here on.
column_renames = {
    'decimalLatitude': 'latitude',
    'decimalLongitude': 'longitude',
    'eventDate': 'timestamp',
    'basisOfRecord': 'basis_of_record',
}
vancouver_gbif = vancouver_gbif.rename(columns=column_renames)

In [46]:
# Preview the table again to confirm the new column names took effect.
vancouver_gbif.head()

Unnamed: 0,kingdom,phylum,class,order,family,genus,species,latitude,longitude,timestamp,day,month,year,basis_of_record,geometry
81,Fungi,Basidiomycota,Dacrymycetes,Dacrymycetales,Dacrymycetaceae,Dacrymyces,Dacrymyces chrysospermus,49.305487,-123.138538,2015-02-08T13:15:50Z,8,2,2015,HUMAN_OBSERVATION,POINT (-123.13854 49.30549)
82,Animalia,Chordata,Aves,Anseriformes,Anatidae,Branta,Branta canadensis,49.279785,-123.138956,2015-02-08T16:28:30Z,8,2,2015,HUMAN_OBSERVATION,POINT (-123.13896 49.27978)
83,Animalia,Chordata,Aves,Anseriformes,Anatidae,Anas,Anas platyrhynchos,49.2764,-123.145512,2015-02-09T08:50:46Z,9,2,2015,HUMAN_OBSERVATION,POINT (-123.14551 49.27640)
84,Animalia,Chordata,Aves,Anseriformes,Anatidae,Lophodytes,Lophodytes cucullatus,49.279637,-123.139337,2015-02-09T08:42:00Z,9,2,2015,HUMAN_OBSERVATION,POINT (-123.13934 49.27964)
85,Animalia,Chordata,Aves,Passeriformes,Emberizidae,Melospiza,Melospiza melodia,49.285406,-123.14334,2015-02-09T09:51:57Z,9,2,2015,HUMAN_OBSERVATION,POINT (-123.14334 49.28541)


In [47]:
# Summarise missing values per column with mealprep's find_missing_ingredients.
# The resulting table reports the NaN count, proportion and row indices for each affected column.
# NOTE(review): rows with missing values are not dropped or filled before the
# table is saved — confirm this is intended.
find_missing_ingredients(vancouver_gbif)

Unnamed: 0,Column name,NaN count,NaN proportion,NaN indices
0,order,1,0.0%,[482461]
1,genus,2,0.0%,"[250867, 254074]"
2,species,40,0.8%,"[5244, 10031, 14375, 37664, 41215, 42796, 4438..."


## Save clean GBIF data

In [48]:
# Persist the trimmed and renamed table.
# NOTE: the ESRI Shapefile format limits field names to 10 characters, so
# 'basis_of_record' is shortened in the written file.
clean_output_path = 'data/clean_data/vancouver_gbif.shp'

# Create the target folder first so the write also works on a fresh checkout.
os.makedirs(os.path.dirname(clean_output_path), exist_ok=True)
vancouver_gbif.to_file(clean_output_path)