#### 與iNaturalist 資料比對的Ebird驗證資料集
- 按照iNaturalist 物種名錄(num_sp=1486)
    - 每種最多選10張


In [192]:
import sys 
import os
import pandas as pd
import numpy as np
from pathlib import Path 
from PIL import Image
import shutil
import time
import datetime


print(pd.__version__)
print(np.__version__)
!python -V
!pwd

dir_meta = Path('meta')
dir_meta.mkdir(exist_ok=True, parents=True)


1.4.1
1.22.2
Python 3.9.7
/home/esslab/AI_projects/shared/eBird/download_ebird


## work pipiline
1. 根據iNaturalist 2021 dayaset取得鳥類物種名錄與流水號
2. 產出`ebird_1486_selected/val` 目錄 
3. 根據物種名，至ebird meta 資料取得對應得物種名
4. 依據該物種名索引，參照`Average Community Rating`與`Number of Ratings`取top10
5. 照檔案路徑取得資料後複製到所屬的資料夾

## 1. 根據iNaturalist 2021 dayaset取得鳥類物種名錄與流水號


In [None]:
!ls downloaded/ebird_1486_sel/val -l


In [89]:
dir_iNat = Path('../../../shared/iNaturalist_2021') 
iNat_val = dir_iNat/'val'
dir_name_iNatVal  = [dir_.name for dir_ in iNat_val.iterdir()] 
print(len(dir_name_iNatVal))

# get Specie & Family list 
sp_list = [file.split('_')[-2] + ' ' + file.split('_')[-1]  
           for file in dir_name_iNatVal]

family_list = [file.split('_')[-3] for file in dir_name_iNatVal]

df_iNat_val =  pd.concat([pd.DataFrame(dir_name_iNatVal),
                          pd.DataFrame(family_list),
                          pd.DataFrame(sp_list)],
              axis='columns').set_axis(['Dir','Family','Specie'],axis='columns')
df_iNat_val

1486


Unnamed: 0,Dir,Family,Specie
0,03908_Animalia_Chordata_Aves_Passeriformes_Lan...,Laniidae,Lanius collurio
1,04326_Animalia_Chordata_Aves_Passeriformes_Tyr...,Tyrannidae,Todirostrum cinereum
2,04026_Animalia_Chordata_Aves_Passeriformes_Par...,Paridae,Baeolophus atricristatus
3,03284_Animalia_Chordata_Aves_Bucerotiformes_Ph...,Phoeniculidae,Phoeniculus purpureus
4,03111_Animalia_Chordata_Aves_Accipitriformes_A...,Accipitridae,Accipiter badius
...,...,...,...
1481,03388_Animalia_Chordata_Aves_Charadriiformes_J...,Jacanidae,Hydrophasianus chirurgus
1482,03859_Animalia_Chordata_Aves_Passeriformes_Hir...,Hirundinidae,Riparia riparia
1483,03362_Animalia_Chordata_Aves_Charadriiformes_C...,Charadriidae,Charadrius tricollaris
1484,04556_Animalia_Chordata_Aves_Strigiformes_Stri...,Strigidae,Ninox novaeseelandiae


## 2. 產出ebird_1486_selected/val 目錄
- 物種目錄參照iNaturalist 

In [None]:
# mkdir ebird_1486_selected/val
ebird_val = dir_iNat/'val_ebird'

for dir_ in  dir_name_iNatVal:
    dir_ = ebird_val/dir_
    dir_.mkdir(exist_ok=True, parents=True)
    print(f'{dir_} maked')

## 3. 根據物種名，至ebird meta 資料取得對應得物種名

### load ebird meta

In [111]:
%%time
meta_file = dir_meta/'jpg_top100_meta3.csv'
df_jpg100_meta = pd.read_csv(meta_file, index_col=0, 
                            #  dtype=column_dtypes
                             )
df_jpg100_meta.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 896555 entries, 0 to 896554
Data columns (total 62 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   ML Catalog Number            896555 non-null  int64  
 1   Dir_FName                    896555 non-null  object 
 2   Parent_Dir                   896555 non-null  object 
 3   Size                         896555 non-null  float64
 4   Sci_N                        896555 non-null  object 
 5   Macaulay_public              37 non-null      float64
 6   Format                       896521 non-null  object 
 7   Scientific Name              896535 non-null  object 
 8   Common Name                  896521 non-null  object 
 9   Background Species           1807 non-null    object 
 10  Recordist                    896478 non-null  object 
 11  Date                         884239 non-null  object 
 12  Year                         884239 non-null  float64
 13 

In [123]:
col_basic =  ['ML Catalog Number','Dir_FName', 'Parent_Dir', 'Sci_N',]
# col_geo = ['County', 'Locality', 'Latitude','Longitude']
col_taxon = ['Scientific Name',  'Parent Species', 'Taxon Category']
col_rating = ['Number of Ratings', 'Average Community Rating',]

In [127]:
df_jpg100_meta_sel = df_jpg100_meta[col_basic + col_taxon + col_rating]
df_jpg100_meta_sel

Unnamed: 0,ML Catalog Number,Dir_FName,Parent_Dir,Sci_N,Scientific Name,Parent Species,Taxon Category,Number of Ratings,Average Community Rating
0,114289801,Acanthisittidae/Acanthisitta_chloris_0_114289801,Acanthisittidae,Acanthisitta_chloris_0,Acanthisitta chloris,Acanthisitta chloris,Species,2.0,4.000
1,114289821,Acanthisittidae/Acanthisitta_chloris_0_114289821,Acanthisittidae,Acanthisitta_chloris_0,Acanthisitta chloris,Acanthisitta chloris,Species,2.0,4.000
2,114404941,Acanthisittidae/Acanthisitta_chloris_0_114404941,Acanthisittidae,Acanthisitta_chloris_0,Acanthisitta chloris,Acanthisitta chloris,Species,2.0,4.500
3,114404951,Acanthisittidae/Acanthisitta_chloris_0_114404951,Acanthisittidae,Acanthisitta_chloris_0,Acanthisitta chloris,Acanthisitta chloris,Species,2.0,4.000
4,115495451,Acanthisittidae/Acanthisitta_chloris_0_115495451,Acanthisittidae,Acanthisitta_chloris_0,Acanthisitta chloris,Acanthisitta chloris,Species,21.0,4.906
...,...,...,...,...,...,...,...,...,...
896550,59630711,Zosteropidae/Zosterornis_whiteheadi_0_59630711,Zosteropidae,Zosterornis_whiteheadi_0,Zosterornis whiteheadi,Zosterornis whiteheadi,Species,1.0,3.000
896551,59630721,Zosteropidae/Zosterornis_whiteheadi_0_59630721,Zosteropidae,Zosterornis_whiteheadi_0,Zosterornis whiteheadi,Zosterornis whiteheadi,Species,0.0,0.000
896552,63554951,Zosteropidae/Zosterornis_whiteheadi_0_63554951,Zosteropidae,Zosterornis_whiteheadi_0,Zosterornis whiteheadi,Zosterornis whiteheadi,Species,1.0,3.000
896553,79738531,Zosteropidae/Zosterornis_whiteheadi_0_79738531,Zosteropidae,Zosterornis_whiteheadi_0,Zosterornis whiteheadi,Zosterornis whiteheadi,Species,4.0,3.000


#### checking sp list

In [139]:
sp_iNat =  df_iNat_val.Specie.unique()

sp_ebird = df_jpg100_meta_sel['Parent Species'].unique()

set_diff =  set(sp_iNat) - set(sp_ebird)
print(f'#_sp in iNat  : {len(sp_iNat)}')
print(f'#_sp in ebird : {len(sp_ebird)}')
print(f'#_sp_lack in ebird : {len(set_diff):,d}, {set_diff}') 

assert len(set_diff)==0

#_sp in iNat  : 1486
#_sp in ebird : 11215
#_sp_lack in ebird : 0, set()


## 4. 依據該物種名索引取得檔案路徑
- 參照`Average Community Rating`與`Number of Ratings`取top10

In [160]:
# sp_ = 'Lanius collurio' # 'Accipiter badius'
sp_ = 'Hydrophasianus chirurgus'

def get_path(df_:pd.DataFrame, sp_:str)->tuple:
    mask = df_['Parent Species'] == sp_
    df_ = (df_[mask]
           .sort_values(by=col_rating, ascending=False)
           .head(10))
    
    return df_['Dir_FName'].values
    # return df_ 

# print(sp_)
paths = get_path(df_jpg100_meta_sel, sp_)
paths

array(['Jacanidae/Hydrophasianus_chirurgus_0_714944',
       'Jacanidae/Hydrophasianus_chirurgus_0_205014781',
       'Jacanidae/Hydrophasianus_chirurgus_0_206055751',
       'Jacanidae/Hydrophasianus_chirurgus_0_205593231',
       'Jacanidae/Hydrophasianus_chirurgus_0_108262961',
       'Jacanidae/Hydrophasianus_chirurgus_0_115911651',
       'Jacanidae/Hydrophasianus_chirurgus_0_126261371',
       'Jacanidae/Hydrophasianus_chirurgus_0_139014661',
       'Jacanidae/Hydrophasianus_chirurgus_0_243700561',
       'Jacanidae/Hydrophasianus_chirurgus_0_26123191'], dtype=object)

## 5. 照檔案路徑取得資料後複製到所屬的資料夾

In [150]:
dir_ebird = Path('downloaded/jpg_top100')

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder.to(device)

start_time = time.time()
for i, (images, *_) in enumerate(data_loader):
    images = images.to(device)
    
    with torch.no_grad():
        embedding_ = encoder.forward_features(images)
        # embedding_ = embedding_.detach().cpu().numpy()

    if i == 0 :
        embeddings = embedding_
    else:
        embeddings = torch.cat((embeddings, embedding_), axis=0)    
    
    passing_time = str(datetime.timedelta(seconds=int(time.time() - start_time )))
    info = f'Progress:{i+1:4d}, {100*(i+1)/len(data_loader):3.2f}%. ' 
    info += f'| Feature_shape: {embeddings.shape}.' 
    info += f'| Time: {passing_time}. '
    print(info, end='\r')


In [206]:
start_time = time.time()
for idx, rows in df_iNat_val.iterrows():
    dir_, family_, sp_ = rows
    
    paths = get_path(df_jpg100_meta_sel, sp_)
    for idx_p, path in enumerate(paths):
        src = dir_ebird.joinpath(path  + '.jpg')
        dst = ebird_val.joinpath(dir_, Path(path).name + '.jpg')
        shutil.copyfile(src, dst)
        # print(f'\t{idx_p:2d}, Copy imgs : {Path(path).name}')
    
    passing_time = str(datetime.timedelta(seconds=int(time.time() - start_time )))
    # print(f'{idx:4d}, {family_:10s}, {sp_:15s}, data dst : {dir_}')
    info = f'Progress : {idx:4d}, Time: {passing_time} '
    info += f'| Taxon: {family_:15s}, {sp_:25s}, n:{len(paths)}  '
    info += f'| Data dst : {dir_:<40s}'
    print(info, end='\r')
    # break

Progress : 1485, Time: 0:01:20 | Taxon: Alcedinidae    , Megaceryle alcyon        , n:10  | Data dst : 03557_Animalia_Chordata_Aves_Coraciiformes_Alcedinidae_Megaceryle_alcyoneollaristelisrissisuss