In [3]:
import sys
import pandas as pd
import requests
from dotenv import load_dotenv  
import os
from tqdm import tqdm
from time import sleep
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))

from utils import search_bird

In [9]:
# Load the processed field notes (the table carries a `corrected_name` column)
# and preview five random rows.
field_notes_path = '../data/processed/field_notes_with_standardized_names.csv'
df = pd.read_csv(field_notes_path)
df.sample(5)

Unnamed: 0,Date,Time,Location,Weather,Bird Species,Notes,Count,Sex,Comments,corrected_name
729,,,,,grey wagtail,,,,,grey wagtail
154,,,,,indian cuckoo,,,,,Indian Cuckoo
208,,,,,spotted munia,,,,,Spotted Munia
765,,,,,great horned owl,seen,,,,great horned owl
971,,,,,little green beeeater,,,,,Little Green Bee-eater


#### Match against eBird


In [55]:

# Query eBird once per distinct corrected name and keep every candidate that
# comes back, so the candidates can be pruned by hand afterwards.
corrected_names = df.corrected_name.unique()

search_results = []
for position, query in enumerate(tqdm(corrected_names)):
    # Pause 3 seconds before every fifth request (including the very first one);
    # presumably this is to go easy on the eBird API -- confirm.
    if not position % 5:
        sleep(3)
    search_results.append((query, search_bird(query)))

# Flatten to one (corrected name, eBird name, eBird code) row per candidate.
candidate_rows = []
for query, candidates in search_results:
    for candidate in candidates:
        candidate_rows.append((query, candidate['name'], candidate['code']))

ddf = pd.DataFrame(candidate_rows, columns=['corrected_name', 'standardized_name', 'id'])
ddf.to_csv('../data/reference/disambiguation.csv', index=False)

100%|██████████| 220/220 [04:37<00:00,  1.26s/it]


In [41]:
# Spot-check five randomly chosen rows of the eBird candidate table.
ddf.sample(5)

Unnamed: 0,name_in_notes,current_name,id
464,Purple-rumped Sunbird,Purple-rumped Sunbird - Leptocoma zeylonica,pursun3
1568,Swift,Horus Swift - Apus horus,horswi1
576,Warbler,Southern Marquesan Reed Warbler - Acrocephalus...,marwar2
853,Flycatcher,Least Flycatcher - Empidonax minimus,leafly
1078,dove,Thick-billed Ground Dove - Pampusana salamonis,tbgdov1


#### NOTES:
1. Manually correct the names: delete the rows that do not belong to the bird in question from the ../data/reference/disambiguation.csv sheet.

2. Save the resulting CSV file as ../data/reference/eBird_matched_names.csv

In [65]:
# Read the result of the manual pruning and index it by its first column:
# first column -> (second column, third column).
# Presumably these are corrected name -> (eBird name, eBird code), mirroring
# the disambiguation sheet -- verify against the CSV.
matched_rows = pd.read_csv('../data/reference/eBird_matched_names.csv').values.tolist()

d = {}
for row in matched_rows:
    d[row[0]] = (row[1], row[2])

In [69]:
# Attach the matched name and id to the original dataframe. Each value in `d`
# is a 2-tuple, so position 0 fills `name` and position 1 fills `id`; corrected
# names that are missing from `d` get None in both columns.
for column, position in (('name', 0), ('id', 1)):
    df[column] = df.corrected_name.map(
        lambda corrected, position=position: d.get(corrected, (None, None))[position]
    )

In [70]:
# Preview five random rows to check the `name` and `id` columns.
df.sample(5)

Unnamed: 0,Date,Time,Location,Weather,Bird Species,Notes,Count,Sex,Comments,corrected_name,name,id
694,,,,,loten sunbird,,,,,Loten Sunbird,Loten's Sunbird (Long-billed Sunbird) - Cinnyr...,lobsun2
307,,,,,tree pie,,,,,Treepie,Rufous Treepie - Dendrocitta vagabunda,ruftre2
229,,,,,white throated munia,,,,,White-throated Munia,Indian Silverbill (White-throated Munia) - Euo...,indsil
204,,,,,grey drongo,,,,,grey drongo,,
966,,,,,red rumped swallow,,,,,Red-rumped Swallow,Red-rumped Swallow - Cecropis daurica,rerswa1


##### Manually match remaining unclassified names

In [99]:
# Collect, in sorted order, every distinct corrected name whose `name` is still missing.
is_unmatched = df['name'].isna()
non_classified_names = sorted(df.loc[is_unmatched, 'corrected_name'].unique())

# Write the unclassified names to file; the file is then edited by hand to add
# the names from eBird or other sources.
unmatched_frame = pd.DataFrame(non_classified_names, columns=['name'])
unmatched_frame.to_csv('../data/reference/disambiguation_manual.csv', index=False)


#### NOTES:
1. Make manual modifications to a copy of the above file. Find appropriate matches in eBird manually, since the current name doesn't show any matches in their search.

2. Issues are of the following kind: searching for Brahminy Myna yields no response from the eBird API, because eBird records it as Brahminy Starling. These types of matches need to be made manually.

In [124]:
# Load the hand-edited CSV of manual matches and preview five random rows.
manual_modifications_path = '../data/reference/disambiguation_manual_modified.csv'
df_manual_modifications = pd.read_csv(manual_modifications_path)
df_manual_modifications.sample(5)

Unnamed: 0,raw_name,matched_name
73,dabchick,Little Grebe - Tachybaptus ruficollis
2,BW Stilts,
32,Pariah Kite,Black Kite - Milvus migrans
18,Grey Drongo,Ashy Drongo - Dicrurus leucophaeus
70,Yellow-headed Babbler,Yellow-billed Babbler - Argya affinis


In [139]:
# Retrieve the eBird code/id of each bird using the name in the matched_name column.
new_names = df_manual_modifications[df_manual_modifications.matched_name.notna()].matched_name.unique()

# matched_name looks like "<common name> - <scientific name>"; only the part
# before " - " is sent to the search.
manual_search_results = []
for name in tqdm(new_names):
    manual_search_results.extend(search_bird(name.split(' - ')[0]))

# Build the lookup once, after all searches have finished. Building it inside
# the loop re-created it from the whole accumulated list on every iteration and
# left it undefined (or stale from an earlier run) when there were no names.
name_2_id = {result['name']: result['code'] for result in manual_search_results}

# Rows whose matched_name is missing or not among the search results get None.
df_manual_modifications['id'] = df_manual_modifications.matched_name.map(lambda x: name_2_id.get(x))


100%|██████████| 50/50 [00:32<00:00,  1.53it/s]


In [164]:
# Merge the manual matches into the original dataframe.
# d1 maps the first column of the manual sheet to (second column, third column),
# i.e. raw_name -> (matched_name, id) given the columns shown for that frame.
d1 = {}
for manual_row in df_manual_modifications.values.tolist():
    d1[manual_row[0]] = (manual_row[1], manual_row[2])


def _manual_or_existing(row, position, fallback_column):
    """Return the manual value at `position` when the row's corrected_name is in d1, else the row's current value."""
    corrected = row['corrected_name']
    if corrected in d1:
        return d1.get(corrected)[position]
    return row[fallback_column]


df['name'] = df.apply(lambda row: _manual_or_existing(row, 0, 'name'), axis=1)
df['id'] = df.apply(lambda row: _manual_or_existing(row, 1, 'id'), axis=1)

In [165]:
# Preview five random rows of the dataframe, including its `name` and `id` columns.
df.sample(5)

Unnamed: 0,Date,Time,Location,Weather,Bird Species,Notes,Count,Sex,Comments,corrected_name,name,id
701,,,,,red rumped swallow,,,,,Red-rumped Swallow,Red-rumped Swallow - Cecropis daurica,rerswa1
459,,,,,purple rumped sunbird,,,,,Purple-rumped Sunbird,Purple-rumped Sunbird - Leptocoma zeylonica,pursun3
826,,,,,dusky craig martin,,,,,dusky crag martin,Dusky Crag-Martin - Ptyonoprogne concolor,duscrm1
953,,,,,tailor bird,,,,,Tailorbird,Common Tailorbird - Orthotomus sutorius,comtai1
617,,,,,Egret,,,,,Egret,Intermediate Egret (Medium Egret) - Ardea inte...,integr1


In [166]:
# Names that remain unclassified: manual-sheet rows with no matched_name.
still_unmatched = df_manual_modifications['matched_name'].isna()
df_manual_modifications.loc[still_unmatched]

Unnamed: 0,raw_name,matched_name,id
2,BW Stilts,,
31,Merlin,,
36,Red-crested Cuckoo,,
37,Red-flowered Sunbird,,
45,Rufous-winged Bush Lark,,
46,Rufous-winged Bushlark,,
54,Spotted Bulbul,,
55,Streaked Fantail,,
56,Streaked Fantail Flycatcher,,
57,Streaked Fantail Warbler,,


In [167]:
# Persist the dataframe without its index.
# NOTE(review): this is the same path the notebook reads `df` from, so the
# input file is overwritten in place -- confirm that this is intended.
df.to_csv('../data/processed/field_notes_with_standardized_names.csv', index=False)