Extract birding data from eBird API
====

## Setup

In [1]:
# import packages
import geopandas as gpd
from shapely.geometry import Point
from utils import read_ebird, process_ebird
import pandas as pd
import numpy as np
from ebird.api import Client
from functools import partial
from multiprocessing import Pool
from collections import defaultdict
import datetime
import time
import os



## Read entries from eBird database

In [15]:
# Show the command-line options accepted by the eBird database reader script.
!python utils/read_ebird.py -h

# Read the eBird database text file (-i), restricted to the 2018-2019 period (-p),
# and write the result to a CSV file (-o).
# The recorded run below reports 68,637,867 lines read in about 43 minutes.
!python utils/read_ebird.py -i Datasets/ebd_US_relDec-2019.txt -o obs_US_2018.csv -p 2018-2019

usage: eBird database .txt file muncher [-h] [--input_txt INPUT_TXT]
                                        [--period PERIOD] [--output OUTPUT]

optional arguments:
  -h, --help            show this help message and exit
  --input_txt INPUT_TXT, -i INPUT_TXT
                        path to eBird database file
  --period PERIOD, -p PERIOD
                        start year to end year separated by a dash
  --output OUTPUT, -o OUTPUT
                        path to output csv file
Finished reading 68637867 lines in 00:42:53


## Condense observation data into user data

In [4]:
# Show the command-line options accepted by the observation-processing script.
!python utils/process_ebird.py -h

# Condense the observations CSV (-i) into per-user data using 10 cores (-c),
# written to a CSV file (-o).
# NOTE(review): the input here is observation_data.csv, while the reading step
# earlier in this notebook writes obs_US_2018.csv -- confirm which file is intended.
!python utils/process_ebird.py -i observation_data.csv -c 10 -o user_data.csv

usage: Script to process eBird observations into user data [-h]
                                                           [--input_csv INPUT_CSV]
                                                           [--cores CORES]
                                                           [--output OUTPUT]

optional arguments:
  -h, --help            show this help message and exit
  --input_csv INPUT_CSV, -i INPUT_CSV
                        path to observations .csv file
  --cores CORES, -c CORES
                        number of cores for parallel processing
  --output OUTPUT, -o OUTPUT
                        path to output csv file
  overwrite_input=overwrite_input)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  overwrite_input=overwrite_input)
  overwrite_input=overwrite_input)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  overwrite_input=overwrite_input)
  overwrite_input=overwrite_input)
  overwrite_input=overwrite_input)
  overwrite_input=overwrite_input)
Fin

## Extract user information

In [None]:
# Split the observation table into one contiguous chunk per observer, summarise
# every chunk in parallel with get_user_data, and save the combined result.
#
# NOTE(review): `database`, `CORES` and `get_user_data` are not defined in any cell
# of this notebook; they must already exist in the kernel -- confirm their origin.
# Assumes `database` is ordered so that each observer's rows are contiguous -- TODO confirm.
observers = list(database['OBSERVER ID'])

# [start, stop) row ranges, one per run of identical consecutive observer IDs.
breaks = []
prev = 0
for idx in range(1, len(observers)):
    if observers[idx] != observers[idx - 1]:
        breaks.append([prev, idx])
        prev = idx
# Close the final run. Without this the last observer's rows are never processed
# (and a table holding a single observer produces no chunks at all).
if observers:
    breaks.append([prev, len(observers)])

tic = time.time()
chunks = (database.iloc[start:stop] for start, stop in breaks)

# The context manager shuts the worker pool down even if a worker raises.
with Pool(CORES) as pool:
    out = pool.map(get_user_data, chunks)

# Entries that come back as a plain bool are discarded (presumably get_user_data's
# "skip this observer" marker -- verify against its implementation).
out = [ele for ele in out if not isinstance(ele, bool)]
# Print a one-line summary instead of dumping every per-user result.
print(f'{len(out)} of {len(breaks)} observers produced user data')
user_df = pd.concat(out)
print(time.time() - tic)

# to_file() writes to disk and returns None, so its result must not be
# assigned back to user_df.
user_df.to_file('users_data.shp')

In [None]:
# Rebuild the combined per-user table from the filtered worker results and
# print its first five rows.
user_df = pd.concat(objs=out)
print(user_df.head(n=5))

## Get distance between users

In [None]:
import seaborn as sns
# The similarity and t-SNE plotting cells further down call these names. They were
# previously only present as commented-out code, so those cells raised NameError
# on a fresh kernel.
from bokeh.plotting import ColumnDataSource, figure, output_file, show
from sklearn import preprocessing
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Random seed passed to t-SNE so the projection is repeatable between runs.
RS = 20150101

# Load the per-user table; the first CSV column becomes the index.
user_df = pd.read_csv('Datasets/user_data_US.csv', index_col=0)
print(len(user_df))

# Columns left out of the feature matrix.
unused = ['sample_checklist', 'geometry', 'since', 'mean_interval']
feature_cols = [col for col in user_df.columns if col not in unused]
to_process = user_df.loc[:, feature_cols]
print(to_process.columns)

# Missing values are treated as 0 before scaling.
to_process = to_process.fillna(0)
users = list(to_process.index)

# Min-max scale every feature column so no single feature dominates the
# distance computations downstream.
# Assumes all remaining columns are numeric -- TODO confirm.
x = to_process.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)



In [None]:
# Pairwise cosine similarity between the projected user coordinates, then the
# most similar *other* user for every user.
# NOTE(review): `users_proj` is only assigned in the t-SNE cell that follows this
# one, so that cell has to be run first.
# NOTE(review): cosine similarity of 2-D t-SNE coordinates compares angles around
# the origin -- confirm this is the intended notion of "similar users".
similarity = cosine_similarity(users_proj)

# Spot-check a single column. The last column is used rather than a hard-coded
# index of 4999, which raised IndexError for fewer than 5000 users.
print(similarity[:, -1])

matches = {}
for i, row in enumerate(similarity):
    max_sim = -1
    idx = None  # stays None only if no other user scores above -1
    for j, ele in enumerate(row):
        # Skip the diagonal: a user is always maximally similar to themselves.
        if j != i and ele > max_sim:
            max_sim = ele
            idx = j
    # Store the best index found, not the loop variable `j`, which is always
    # the last column once the inner loop has finished.
    matches[i] = idx

print(matches)

In [None]:
# Work on a copy of the feature table so the plotting columns added below do not
# leak into `to_process`. No subsampling happens in this cell: every row is projected.
plot_df = to_process.copy()

# 2-D t-SNE embedding of the scaled features (a PCA(n_components=2) projection
# is the alternative that was tried here).
tsne_model = TSNE(random_state=RS, perplexity=10, n_iter=10000)
users_proj = tsne_model.fit_transform(x_scaled)
plot_df['x'] = users_proj[:, 0]
plot_df['y'] = users_proj[:, 1]

source = ColumnDataSource(plot_df)

# Hover fields: (label shown, "@column" looked up in the data source).
TOOLTIPS = [
    ("#species", "@n_species"),
    ("distance", "@median_distance"),
    ("percent_complete", "@percent_complete"),
    ("#checklists", "@n_checklists"),
    ("group size", "@mean_group_size"),
]

# One translucent marker per user; the toolbar and logo are hidden.
birders_fig = figure(title='Birders', tooltips=TOOLTIPS)
birders_fig.circle(x='x', y='y', source=source, size=6, alpha=0.2, color='coral')
birders_fig.toolbar.logo = None
birders_fig.toolbar_location = None

# Write the interactive plot to an HTML file and open it.
output_file('all_comments_pds.html')
show(birders_fig)

In [None]:
# Display the first five rows of the per-user table.
user_df.head(n=5)

In [None]:
# Take the first row as a Series; its `.name` is that row's index label.
a = user_df.iloc[0, :]
print(a.name)