Extract birding data from eBird API
====

## Setup

In [1]:
# import packages
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
import numpy as np
from ebird.api import Client
from functools import partial
from multiprocessing import Pool
from collections import defaultdict
import datetime
import time
import os
import gc

# setup client instance
api_key = '6qmfvb8pg9dk'
locale = 'en_US'
client = Client(api_key, locale)
CORES = 4


## Read entries from eBird database

In [87]:
def loadEbird(filename=None, rows=1e5, chunk=True, start_year=2017):
   # columns of interest
    target_columns = ['SAMPLING EVENT IDENTIFIER', 'COMMON NAME', 'OBSERVATION COUNT', 
                      'LOCALITY TYPE', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 
                      'TIME OBSERVATIONS STARTED', 'PROTOCOL TYPE', 'DURATION MINUTES', 
                      'EFFORT DISTANCE KM', 'EFFORT AREA HA', 'NUMBER OBSERVERS', 
                      'ALL SPECIES REPORTED', 'HAS MEDIA', 'OBSERVER ID']
    
    chunks = []


    if chunk:
        
        # load the big file in smaller chunks
        for df_chunk in pd.read_csv(filename,
                                    sep='\t',
                                    chunksize=rows,
                                    usecols=target_columns):
            
            valid = [int(ele.split('-')[0]) >= start_year for ele in df_chunk['OBSERVATION DATE']]
            df_chunk = df_chunk.loc[valid]
            
            # 

            if len(df_chunk) > 0:
                chunks.append(df_chunk)
    
    
    return pd.concat(chunks).sort_values(by=['OBSERVER ID'])

tic = time.time()

# load database
database = loadEbird('Datasets/ebd_US-ME_201701_202002_relDec-2019.txt')
print(set(database['PROTOCOL TYPE']))
print(database)

print(time.time() - tic)


{'eBird Pelagic Protocol', 'Banding', 'Nocturnal Flight Call Count', 'Audubon Coastal Bird Survey', 'Incidental', 'Area', 'Traveling', 'International Shorebird Survey (ISS)', 'Stationary', 'Historical', 'Random'}
                     COMMON NAME OBSERVATION COUNT LOCALITY TYPE   LATITUDE  \
445152          Great Black Hawk                 1             H  43.658764   
2627215                Snowy Owl                 1             P  43.398289   
2923050  White-breasted Nuthatch                 1             P  43.407312   
481486       American Black Duck                 2             T  43.578180   
483273         Belted Kingfisher                 1             P  43.620732   
...                          ...               ...           ...        ...   
2877341  White-breasted Nuthatch                 4             P  43.113307   
2872310        Northern Cardinal                10             P  43.113307   
2865358         Chipping Sparrow                 2             P  43.113307 

## Function to read in data

In [88]:

# helper function to handle missing data on numeric columns
def handle_numerical(nums):
    if type(nums[0]) == str:
        return [float(ele) if len(ele) > 0 else float('NaN') for ele in nums]
    else:
        return [float(ele) for ele in nums]


# processing function
def get_user_data(user_data, latest=2015):
    
    # store user name
    try:
        user = client.get_checklist(user_data['SAMPLING EVENT IDENTIFIER'].iloc[0])['userDisplayName']
    except:
        return False
    
    # get user features
    users_df = gpd.GeoDataFrame()

    # event data
    event_data = pd.DataFrame()

    for event in set(user_data['SAMPLING EVENT IDENTIFIER']):
        event_data = event_data.append(user_data.loc[user_data['SAMPLING EVENT IDENTIFIER'] == event].iloc[0],
                                      ignore_index=True)
        
        
    # add event statistics
    n_checklists = len(event_data)
    n_observations = len(user_data)
        
    # time variables

    # find median interval between observations
    dates = [datetime.date(*[int(ele) for ele in date.split('-')]) for date in event_data['OBSERVATION DATE']]
    
    # get year of first checklist
    since = int(dates[0].year)
    
    # check if user has been active for the last year
    if int(dates[-1].year) < latest:
        return False
    
    if n_checklists > 1:
        dates = sorted(dates)
        intervals = [(dates[idx + 1] - dates[idx]).total_seconds() for idx in range(len(dates) - 1)]
        median_interval = np.median(intervals)
    else:
        median_interval = 0


    # effort variables

    # deal with missing data on numerical columns
    
    distances = handle_numerical([ele for ele in event_data['EFFORT DISTANCE KM']])
    median_distance = np.nanmedian(distances)

    durations = handle_numerical([ele for ele in event_data['DURATION MINUTES']])
    median_duration = np.nanmedian(durations)
    
    percents = handle_numerical([ele for ele in event_data['ALL SPECIES REPORTED']])
    percent_all = np.mean(percents)
    
    observers = handle_numerical([ele for ele in event_data['NUMBER OBSERVERS']])
    mean_observers = np.mean(observers)

    # location
    percent_hotspot = sum(event_data['LOCALITY TYPE'] == 'H') / n_checklists
    locations = [Point([float(ele[0]), float(ele[1])]) for ele in zip(event_data['LATITUDE'], event_data['LONGITUDE'])]
    distances = [sum([ele1.distance(ele2) for ele1 in locations]) for ele2 in locations]
    centroid = locations[np.argmin(distances)]
    
    # trip type
    percent_travel = sum(event_data['PROTOCOL TYPE'] == 'Traveling') / n_checklists
    if percent_travel > 0:
        travels = [locations[idx] for idx, ele in enumerate(event_data['PROTOCOL TYPE']) if ele == 'Traveling']
        median_travel_distance = np.nanmedian([centroid.distance(ele) for ele in travels])
    else:
        median_travel_distance = 0

    # median starting time
    times = [int(tm[:2]) * 3600 + int(tm[3:5]) * 60 + int(tm[6:]) if type(tm) == str else float('NaN') for tm in event_data['TIME OBSERVATIONS STARTED']]
    median_start = np.nanmedian(times)
 
    # observation data
    percent_media = np.mean([int(ele) for ele in user_data['HAS MEDIA']])
    n_species = len(set(user_data['COMMON NAME']))
    
    # get a sample checklist
    if percent_media > 0:
        checklist = np.random.choice(user_data.loc[user_data['HAS MEDIA'] == 1]['SAMPLING EVENT IDENTIFIER'])
    else:
        checklist = np.random.choice(user_data['SAMPLING EVENT IDENTIFIER'])
        
    sample_checklist = f"https://ebird.org/checklist/{checklist}"

    # add user    
    users_df = users_df.append(pd.Series({'n_checklists': n_checklists,
                                'n_species': n_species,
                                'n_observations': n_observations,
                                'since': since,
                                'geometry': centroid,
                                'median_distance': median_distance,
                                'median_duration': median_duration,
                                'median_interval': median_interval,
                                'median_travel_distance': median_travel_distance,
                                'median_start': median_start,
                                'percent_all': percent_all, 
                                'percent_media': percent_media,
                                'percent_travel': percent_travel,
                                'percent_hotspot': percent_hotspot,
                                'sample_checklist': sample_checklist,
                                'mean_group_size': mean_observers}, name=user))

    # return user data
    return users_df

## Extract user information

In [89]:
#user_df = gpd.GeoDataFrame()
breaks = []
observers = list(database['OBSERVER ID'])
prev = 0
for idx in range(len(observers) - 1):
    if observers[idx] != observers[idx + 1]:
        breaks.append([prev, idx + 1])
        prev = idx + 1

tic = time.time()
pool = Pool(CORES)
chunks = (database.iloc[range(chunk_idx[0], chunk_idx[1])] for chunk_idx in breaks)

out = pool.map(get_user_data, chunks)
print(out)
out = [ele for ele in out if type(ele) != bool]
user_df = pd.concat(out)
print(time.time() - tic)

user_df = user_df.to_file('users_data.shp')

  overwrite_input=overwrite_input)
  overwrite_input=overwrite_input)
  overwrite_input=overwrite_input)
  overwrite_input=overwrite_input)
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



1087.4322302341461


In [90]:
user_df = pd.concat(out)
print(user_df.head())

                                  geometry  mean_group_size  median_distance  \
Jennifer Jones  POINT (43.65876 -70.27101)             5.00              NaN   
Ben Tettlebaum  POINT (43.39829 -70.41247)             1.00              NaN   
Kerry Daly      POINT (43.57818 -70.32169)              NaN            0.805   
Ian Minter      POINT (43.59140 -70.37876)             2.75            3.219   
Alec Jolman     POINT (43.65563 -70.19971)             1.50            0.517   

                median_duration  median_interval  median_start  \
Jennifer Jones             15.0              0.0       48240.0   
Ben Tettlebaum              NaN              0.0       27660.0   
Kerry Daly                 30.0         259200.0       39600.0   
Ian Minter                 62.5          86400.0       47850.0   
Alec Jolman                30.0              0.0       53310.0   

                median_travel_distance  n_checklists  n_observations  \
Jennifer Jones                0.000000           1

## Get distance between users

In [24]:
# select columns
user_df = pd.read_csv('user_data_US_2019/user_data_US.csv', index_col=0)
print(len(user_df))
unused = ['sample_checklist', 'geometry', 'since', 'mean_interval']
to_process = user_df.iloc[:, [idx for idx, ele in enumerate(user_df.columns) if ele not in unused]]
print(to_process.columns)
to_process = to_process.fillna(0)
users = list(to_process.index)

# normalize columns
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity

    
x = to_process.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

from bokeh.transform import jitter, factor_cmap
from bokeh.plotting import figure, output_notebook, show, ColumnDataSource, output_file
from bokeh.models import Legend, Range1d, LabelSet, BoxAnnotation
from bokeh.io import export_png
from bokeh.layouts import column

# We import sklearn.
import sklearn
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
RS = 20150101

# take a random sample
plot_df = to_process.copy()
#indices = np.random.choice(len(x_scaled), 5000)
#plot_df = plot_df.iloc[indices]
#x_scaled = x_scaled[indices, :]

users_proj = TSNE(random_state=RS, perplexity=10, n_iter=5000).fit_transform(x_scaled)
#model = PCA(n_components=2)
#users_proj = model.fit_transform(x_scaled)
plot_df['x'], plot_df['y'] = users_proj[:, 0], users_proj[:, 1]


source = ColumnDataSource(plot_df)

TOOLTIPS = [
    ("#species", "@n_species"),
    ("distance", "@median_distance"),
    ("percent_complete", "@percent_complete"),
    ("#checklists", "@n_checklists"),
    ("group size", "@mean_group_size")
]
                    
plotting_comments = figure(title='Birders', tooltips=TOOLTIPS)

plotting_comments.circle('x', 'y', color = 'coral', size=6, alpha=0.2, source=source)

plotting_comments.toolbar.logo = None
plotting_comments.toolbar_location = None




output_file('all_comments_pds.html')
show(plotting_comments)

#for idx in range(len(x_scaled)):
#    to_process.iloc[idx] = x_scaled[idx]
#
#encoded = pd.DataFrame()
#for _, row in to_process.iterrows():
#    encoded = encoded.append(encode_user(row))
#print(encoded)
#similarity = cosine_similarity(encoded)
#matches = {}
#for i in range(len(similarity)):
#    row = [ele for idx, ele in enumerate(similarity[i]) if idx != i]
#    matches[users[i]] = users[np.argmax(row)]
    
#print(matches)

108433
Index(['banding', 'mean_comment_size', 'mean_group_size', 'median_distance',
       'median_duration', 'median_interval', 'median_start',
       'median_travel_distance', 'n_checklists', 'n_observations', 'n_species',
       'percent_comments', 'percent_complete', 'percent_hotspot',
       'percent_incidental', 'percent_media', 'percent_survey',
       'percent_travel'],
      dtype='object')


In [23]:
# take a random sample
plot_df = to_process.copy()
indices = np.random.choice(len(x_scaled), 5000)
plot_df = plot_df.iloc[indices]
x_scaled = x_scaled[indices, :]

users_proj = TSNE(random_state=RS, perplexity=100, n_iter=10000, learning_rate=1000).fit_transform(x_scaled)
#model = PCA(n_components=2)
#users_proj = model.fit_transform(x_scaled)
plot_df['x'], plot_df['y'] = users_proj[:, 0], users_proj[:, 1]


source = ColumnDataSource(plot_df)

TOOLTIPS = [
    ("#species", "@n_species"),
    ("distance", "@median_distance"),
    ("percent_complete", "@percent_complete"),
    ("#checklists", "@n_checklists"),
    ("group size", "@mean_group_size")
]
                    
plotting_comments = figure(title='Birders', tooltips=TOOLTIPS)

plotting_comments.circle('x', 'y', color = 'coral', size=6, alpha=0.2, source=source)

plotting_comments.toolbar.logo = None
plotting_comments.toolbar_location = None




output_file('all_comments_pds.html')
show(plotting_comments)

In [9]:
user_df.head()

Unnamed: 0.1,Unnamed: 0,banding,geometry,mean_comment_size,mean_group_size,median_distance,median_duration,median_interval,median_start,median_travel_distance,...,n_species,percent_comments,percent_complete,percent_hotspot,percent_incidental,percent_media,percent_survey,percent_travel,sample_checklist,since
0,Pam Howland,1.0,POINT (-89.72984 45.47107000000001),2.5,1.25,4.828,510.0,86400.0,26850.0,0.0,...,12.0,0.064516,1.0,0.0,0.0,0.032258,0.0,0.25,https://ebird.org/checklist/S52994220,2019.0
1,Travis DeNeal,1.0,POINT (-88.4800125 37.6364028),17.691057,1.398747,2.897,18.0,86400.0,53700.0,0.091406,...,245.0,0.024236,0.995825,0.031315,0.004175,0.016552,0.0,0.557411,https://ebird.org/checklist/S62439785,2019.0
2,Micah Nims,1.0,POINT (-122.7119207 47.0780133),0.0,10.0,4.828,240.0,0.0,28800.0,0.0,...,47.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,https://ebird.org/checklist/S54457891,2019.0
3,Jennifer Jones,1.0,POINT (-70.0253749 41.6700588),0.0,2.0,1.609,64.0,0.0,63960.0,0.0,...,12.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,https://ebird.org/checklist/S57676658,2019.0
4,Annette Saubon Sole,1.0,POINT (-155.0687814 19.725559),21.333333,4.0,1.416,68.0,86400.0,47340.0,42.969664,...,82.0,0.167702,0.826087,0.695652,0.173913,0.055901,0.0,0.565217,https://ebird.org/checklist/S57382888,2019.0


In [68]:
a = user_df.iloc[0]
print(a.name)

Kirby  Pitchford


   banding  mean_comme  mean_group  median_dis  median_dur  median_int  \
0      1.0    2.500000    1.250000       4.828       510.0     86400.0   
1      1.0   17.691057    1.398747       2.897        18.0     86400.0   
2      1.0    0.000000   10.000000       4.828       240.0         0.0   
3      1.0    0.000000    2.000000       1.609        64.0         0.0   
4      1.0   21.333333    4.000000       1.416        68.0     86400.0   

   median_sta  median_tra  n_checklis  n_observat  ...  percent_co  \
0     26850.0    0.000000         4.0        31.0  ...    0.064516   
1     53700.0    0.091406       479.0      5075.0  ...    0.024236   
2     28800.0    0.000000         1.0        47.0  ...    0.000000   
3     63960.0    0.000000         1.0        12.0  ...    0.000000   
4     47340.0   42.969664        23.0       161.0  ...    0.167702   

   percent__1  percent_ho  percent_in  percent_me  percent_su  percent_tr  \
0    1.000000    0.000000    0.000000    0.032258        