### 資料來源:
- gbif: [Data-set of Moth Specimen from TESRI](https://www.gbif.org/dataset/f3f25fcf-2930-4cf1-a495-6b31d7fa0252)

In [1]:
import re
import os
from pathlib import Path
import numpy as np
import pandas as pd
from skimage import io 
from PIL import Image
import urllib.request
import matplotlib.pyplot  as plt
%matplotlib inline 

dir_data = Path('./data')
dir_meta = Path('./meta')
dir_meta.mkdir(exist_ok=True)


#### 檢視 "tesri_species_list.csv" 資料概況
- 物種欄位
    - 沒有缺值
    - 種名的空格以"_"呈現
    - 沒有 sp. 等不確定的種名
    - !! 沒有重複的種名


In [2]:

df_tesri_sp = pd.read_csv(dir_meta.joinpath('tesri_species_list.csv'))

df_tesri_sp['Species'] = df_tesri_sp.Species.apply(lambda x : x.replace('_',' '))

print(df_tesri_sp.describe())
print('\nAny NaN in Species column? :　', df_tesri_sp.Species.isnull().all())

df_tesri_sp.query('Species.str.endswith("sp\.")')

print('\n檢視種名的組成，以" "分割種名回傳的長度: ' , {len(sp.split(' ')) for sp in df_tesri_sp.Species.unique()})



          Family  Subfamily     Genus         Species
count       1868       1868      1868            1868
unique        47        114      1086            1868
top     Erebidae  Ennominae  Mythimna  Evonima aperta
freq         472        229        23               1

Any NaN in Species column? :　 False

檢視種名的組成，以" "分割種名回傳的長度:  {2, 3}



#### 讀取gbig "verbatim.txt"


In [3]:
df_verbatim = pd.read_csv(dir_data.joinpath('verbatim.txt'), sep='\t')
print(f'Number of data : {len(df_verbatim):,d}')
print('Number of columns : ', len(df_verbatim.columns.values))

# drop columns which all value is NaN 
df_verbatim.dropna(axis=1, how='all', inplace=True)
print(f'Number of columns : {len(df_verbatim.columns.values)} (after drop na by columns) ', )

# drop 'scientificName' column which value is NaN 
df_verbatim.dropna(subset=['scientificName'], inplace=True)
print(f'Number of data : {len(df_verbatim):,d} (after drop na by "scientificName" column)')

# drop 'associatedMedia column which value is NaN 
df_verbatim.dropna(subset=['associatedMedia'], inplace=True)
print(f'Number of data : {len(df_verbatim):,d} (after drop na by "associatedMedia" column)')

print('\ncolumns : ', df_verbatim.columns.values)
df_verbatim


Number of data : 109,653
Number of columns :  219
Number of columns : 36 (after drop na by columns) 
Number of data : 81,479 (after drop na by "scientificName" column)
Number of data : 81,229 (after drop na by "associatedMedia" column)

columns :  ['gbifID' 'identifier' 'collectionID' 'institutionCode' 'basisOfRecord'
 'occurrenceID' 'individualCount' 'sex' 'lifeStage' 'associatedMedia'
 'associatedReferences' 'eventID' 'parentEventID' 'eventDate'
 'samplingProtocol' 'sampleSizeValue' 'sampleSizeUnit' 'country'
 'countryCode' 'county' 'municipality' 'locality' 'verbatimElevation'
 'decimalLatitude' 'decimalLongitude' 'geodeticDatum' 'identifiedBy'
 'scientificName' 'kingdom' 'phylum' 'class' 'order' 'family' 'genus'
 'taxonRank' 'vernacularName']


Unnamed: 0,gbifID,identifier,collectionID,institutionCode,basisOfRecord,occurrenceID,individualCount,sex,lifeStage,associatedMedia,...,identifiedBy,scientificName,kingdom,phylum,class,order,family,genus,taxonRank,vernacularName
0,1934768067,A36-20130606-040,A36-20130606-040,tesri,PreservedSpecimen,A36-20130606-040,1,unknow,adult,http://farm8.staticflickr.com/7429/9554379312_...,...,施禮正,Chrysaeglia magnifica,Animalia,Arthropoda,Insecta,Lepidoptera,Erebidae,Chrysaeglia,species,閃光苔蛾
1,1934768377,A36-20130606-041,A36-20130606-041,tesri,PreservedSpecimen,A36-20130606-041,1,unknow,adult,http://farm6.staticflickr.com/5479/9551585327_...,...,施禮正,Spilarctia nydia werneri,Animalia,Arthropoda,Insecta,Lepidoptera,Erebidae,Spilarctia,subspecies,黑鬚污燈蛾
3,1934767936,A36-20130606-043,A36-20130606-043,tesri,PreservedSpecimen,A36-20130606-043,1,unknow,adult,http://farm4.staticflickr.com/3671/9554375768_...,...,施禮正,Barsine sauteri,Animalia,Arthropoda,Insecta,Lepidoptera,Erebidae,Barsine,species,東方葩苔蛾
4,1934767969,A36-20130606-048,A36-20130606-048,tesri,PreservedSpecimen,A36-20130606-048,1,unknow,adult,http://farm8.staticflickr.com/7446/9554369076_...,...,施禮正,Narosoideus vulpina,Animalia,Arthropoda,Insecta,Lepidoptera,Limacodidae,Narosoideus,species,狡娜刺蛾
5,1934768183,A36-20130606-049,A36-20130606-049,tesri,PreservedSpecimen,A36-20130606-049,1,unknow,adult,http://farm4.staticflickr.com/3757/9551574537_...,...,施禮正,Xyleutes strix,Animalia,Arthropoda,Insecta,Lepidoptera,Cossidae,Xyleutes,species,梟斑蠹蛾
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109648,2550500679,V38-20191102-067,V38-20191102-067,tesri,PreservedSpecimen,V38-20191102-067,1,femal,adult,https:\/\/live.staticflickr.com\/65535\/493925...,...,林旭宏,Alcis taiwanovariegata,Animalia,Arthropoda,Insecta,Lepidoptera,Geometridae,Alcis,species,臺灣暗斑霜尺蛾
109649,1934783697,A52-20160704-020,A52-20160704-020,tesri,PreservedSpecimen,A52-20160704-020,1,unknow,adult,https://farm8.staticflickr.com/7741/2902005066...,...,林旭宏,Chrysorabdia vilemani,Animalia,Arthropoda,Insecta,Lepidoptera,Erebidae,Chrysorabdia,species,雙帶苔蛾
109650,2550500777,V38-20191102-066,V38-20191102-066,tesri,PreservedSpecimen,V38-20191102-066,1,unknow,adult,https:\/\/live.staticflickr.com\/65535\/493931...,...,林旭宏,Fascellina chromataria,Animalia,Arthropoda,Insecta,Lepidoptera,Geometridae,Fascellina,species,褐缺口尺蛾
109651,2550500710,V38-20191102-065,V38-20191102-065,tesri,PreservedSpecimen,V38-20191102-065,1,femal,adult,https:\/\/live.staticflickr.com\/65535\/493931...,...,林旭宏,Orthocabera sericea sericea,Animalia,Arthropoda,Insecta,Lepidoptera,Geometridae,Orthocabera,subspecies,山茶斜帶尺蛾


In [4]:
columns_sel = ['gbifID', 'collectionID', 'family', 'genus', 'scientificName', 'taxonRank', 'sex', 'associatedMedia']
df_verbatim_ = df_verbatim[columns_sel]
df_verbatim_['associatedMedia'] = df_verbatim_.associatedMedia.apply(lambda x : str(x).replace('\\', ''))
df_verbatim_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,gbifID,collectionID,family,genus,scientificName,taxonRank,sex,associatedMedia
0,1934768067,A36-20130606-040,Erebidae,Chrysaeglia,Chrysaeglia magnifica,species,unknow,http://farm8.staticflickr.com/7429/9554379312_...
1,1934768377,A36-20130606-041,Erebidae,Spilarctia,Spilarctia nydia werneri,subspecies,unknow,http://farm6.staticflickr.com/5479/9551585327_...
3,1934767936,A36-20130606-043,Erebidae,Barsine,Barsine sauteri,species,unknow,http://farm4.staticflickr.com/3671/9554375768_...
4,1934767969,A36-20130606-048,Limacodidae,Narosoideus,Narosoideus vulpina,species,unknow,http://farm8.staticflickr.com/7446/9554369076_...
5,1934768183,A36-20130606-049,Cossidae,Xyleutes,Xyleutes strix,species,unknow,http://farm4.staticflickr.com/3757/9551574537_...
...,...,...,...,...,...,...,...,...
109648,2550500679,V38-20191102-067,Geometridae,Alcis,Alcis taiwanovariegata,species,femal,https://live.staticflickr.com/65535/4939251788...
109649,1934783697,A52-20160704-020,Erebidae,Chrysorabdia,Chrysorabdia vilemani,species,unknow,https://farm8.staticflickr.com/7741/2902005066...
109650,2550500777,V38-20191102-066,Geometridae,Fascellina,Fascellina chromataria,species,unknow,https://live.staticflickr.com/65535/4939319476...
109651,2550500710,V38-20191102-065,Geometridae,Orthocabera,Orthocabera sericea sericea,subspecies,femal,https://live.staticflickr.com/65535/4939319478...


In [5]:
df_verbatim_.taxonRank.unique()

array(['species', 'subspecies'], dtype=object)

In [6]:
df_verbatim_.query('taxonRank == "subspecies"')

Unnamed: 0,gbifID,collectionID,family,genus,scientificName,taxonRank,sex,associatedMedia
1,1934768377,A36-20130606-041,Erebidae,Spilarctia,Spilarctia nydia werneri,subspecies,unknow,http://farm6.staticflickr.com/5479/9551585327_...
7,1934768426,A36-20130606-045,Erebidae,Cyana,Cyana hamata hamata,subspecies,male,http://farm8.staticflickr.com/7313/9554373082_...
30,1934762292,A35-20130605-001,Erebidae,Asota,Asota egens confinis,subspecies,unknow,http://farm6.staticflickr.com/5470/9387118195_...
31,1934762338,A35-20130605-002,Erebidae,Asota,Asota plana lacteata,subspecies,unknow,http://farm4.staticflickr.com/3677/9389892518_...
32,1934762258,A35-20130605-003,Erebidae,Asota,Asota heliconia zebrina,subspecies,unknow,http://farm3.staticflickr.com/2834/9389893916_...
...,...,...,...,...,...,...,...,...
109637,1934783716,A52-20160704-011,Geometridae,Ourapteryx,Ourapteryx similaria horishana,subspecies,unknow,https://farm9.staticflickr.com/8121/2956288298...
109644,2550500635,V38-20191102-062,Geometridae,Hyposidra,Hyposidra talaca talaca,subspecies,male,https://live.staticflickr.com/65535/4939252035...
109647,1934783674,A52-20160704-021,Erebidae,Asota,Asota heliconia zebrina,subspecies,unknow,https://farm9.staticflickr.com/8483/2901820475...
109651,2550500710,V38-20191102-065,Geometridae,Orthocabera,Orthocabera sericea sericea,subspecies,femal,https://live.staticflickr.com/65535/4939319478...


In [7]:
df_multimedia = pd.read_csv(dir_data.joinpath('multimedia.txt'), sep='\t')
df_multimedia.dropna(axis=1, how='all', inplace=True)
df_multimedia

Unnamed: 0,gbifID,type,format,identifier
0,1934756035,StillImage,image/jpeg,http://farm3.staticflickr.com/2619/4115988779_...
1,1934756040,StillImage,image/jpeg,http://farm3.staticflickr.com/2588/4116760766_...
2,1934756045,StillImage,image/jpeg,http://farm3.staticflickr.com/2749/4116763040_...
3,1934756050,StillImage,image/jpeg,http://farm3.staticflickr.com/2652/4116759522_...
4,1934756055,StillImage,image/jpeg,http://farm3.staticflickr.com/2757/4115987483_...
...,...,...,...,...
78926,2983347367,StillImage,image/jpeg,https://live.staticflickr.com/65535/5059271493...
78927,2983348367,StillImage,image/jpeg,https://live.staticflickr.com/65535/5059284350...
78928,2983349367,StillImage,image/jpeg,https://live.staticflickr.com/65535/5059271491...
78929,2983350362,StillImage,image/jpeg,https://live.staticflickr.com/65535/5059284432...


### 處理程序 

#### 1. 根據'tesri_species_list.csv'內的物種清單，建立對應的檔案名稱、影像路徑
- 保存對應的物種名與ID的metadata
    - key : 科、學名 


In [110]:

df_tesri_sp

Unnamed: 0,Family,Subfamily,Genus,Species
0,Adelidae,Adelinae,Nemophora,Nemophora ahenea
1,Adelidae,Adelinae,Nemophora,Nemophora aritai
2,Adelidae,Adelinae,Nemophora,Nemophora aurora
3,Adelidae,Adelinae,Nemophora,Nemophora fluorites
4,Agonoxenidae,Parametriotinae,Lamprystica,Lamprystica purpurata
...,...,...,...,...
1863,Zygaenidae,Chalcosiinae,Hysteroscene,Hysteroscene hyalina
1864,Zygaenidae,Chalcosiinae,Pidorus,Pidorus atratus
1865,Zygaenidae,Chalcosiinae,Soritia,Soritia azurea
1866,Zygaenidae,Chalcosiinae,Soritia,Soritia choui


In [16]:
df_tesri_img = df_tesri_sp.merge(df_verbatim_[['collectionID','family','scientificName','sex','associatedMedia']], 
                    left_on=['Family', 'Species'],
                    right_on=['family','scientificName'],
                    how='left' 
                    )[['Family', 'Species', 'collectionID', 'scientificName', 'sex', 'associatedMedia']]
df_tesri_img

Unnamed: 0,Family,Species,collectionID,scientificName,sex,associatedMedia
0,Adelidae,Nemophora ahenea,A36-20130606-066,Nemophora ahenea,unknow,http://farm8.staticflickr.com/7433/9551556043_...
1,Adelidae,Nemophora ahenea,A37-20210407-067,Nemophora ahenea,femal,https://live.staticflickr.com/65535/5125228124...
2,Adelidae,Nemophora ahenea,V06-20130412-088,Nemophora ahenea,unknow,http://farm8.staticflickr.com/7289/8744067974_...
3,Adelidae,Nemophora ahenea,A55-20210511-230,Nemophora ahenea,male,https://live.staticflickr.com/65535/5153768924...
4,Adelidae,Nemophora aritai,A48-20160222-078,Nemophora aritai,unknow,https://farm2.staticflickr.com/1672/2614359891...
...,...,...,...,...,...,...
69908,Zygaenidae,Soritia choui,A64-20170905-001,Soritia choui,unknow,https://farm5.staticflickr.com/4494/3677386082...
69909,Zygaenidae,Soritia choui,A64-20170905-003,Soritia choui,unknow,https://farm5.staticflickr.com/4344/3741338091...
69910,Zygaenidae,Soritia choui,A64-20210830-003,Soritia choui,femal,https://live.staticflickr.com/65535/5173436643...
69911,Zygaenidae,Soritia choui,A64-20210830-001,Soritia choui,male,https://live.staticflickr.com/65535/5173501296...


In [89]:
df_tesri_img.describe()


Unnamed: 0,Family,Species,collectionID,scientificName,sex,associatedMedia
count,69913,69913,69768,69768,69768,69768
unique,47,1868,69768,1723,3,69764
top,Erebidae,Asota heliconia zebrina,V42-20200912-246,Asota heliconia zebrina,unknow,http://farm8.staticflickr.com/7144/6571918093_...
freq,22937,751,1,751,45694,2


In [9]:
count = df_tesri_img.groupby('Species').count().associatedMedia.rename('path_count').reset_index()
df_tesri_img_count = df_tesri_sp.merge(count, left_on='Species', right_on='Species', how='left' )
df_tesri_img_count.to_csv(dir_meta.joinpath('tesri_img_count.csv')) 
df_tesri_img_count

Unnamed: 0,Family,Subfamily,Genus,Species,path_count
0,Adelidae,Adelinae,Nemophora,Nemophora ahenea,4
1,Adelidae,Adelinae,Nemophora,Nemophora aritai,4
2,Adelidae,Adelinae,Nemophora,Nemophora aurora,8
3,Adelidae,Adelinae,Nemophora,Nemophora fluorites,4
4,Agonoxenidae,Parametriotinae,Lamprystica,Lamprystica purpurata,0
...,...,...,...,...,...
1863,Zygaenidae,Chalcosiinae,Hysteroscene,Hysteroscene hyalina,22
1864,Zygaenidae,Chalcosiinae,Pidorus,Pidorus atratus,11
1865,Zygaenidae,Chalcosiinae,Soritia,Soritia azurea,4
1866,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,5


In [10]:
print("種名無法匹配的物種數 : ", df_tesri_img.query('scientificName.isnull()').Species.size)
print("有種名，但沒有影像的數量 : ", df_tesri_img.query('scientificName.notnull() and associatedMedia.isnull()').size)

種名無法匹配的物種數 :  145
有種名，但沒有影像的數量 :  0


In [22]:
df_tesri_img.dropna(subset=['associatedMedia'], inplace=True)
df_tesri_img.reset_index(drop=True, inplace=True)
print("沒有path影像的數量 : ", df_tesri_img.query('associatedMedia.isnull()').size)

沒有path影像的數量 :  0


In [23]:
df_tesri_img['associatedMedia'] = df_tesri_img.associatedMedia.apply(lambda str_ : str_.replace('_q.jpg', '.jpg'))
df_tesri_img

Unnamed: 0,Family,Species,collectionID,scientificName,sex,associatedMedia
0,Adelidae,Nemophora ahenea,A36-20130606-066,Nemophora ahenea,unknow,http://farm8.staticflickr.com/7433/9551556043_...
1,Adelidae,Nemophora ahenea,A37-20210407-067,Nemophora ahenea,femal,https://live.staticflickr.com/65535/5125228124...
2,Adelidae,Nemophora ahenea,V06-20130412-088,Nemophora ahenea,unknow,http://farm8.staticflickr.com/7289/8744067974_...
3,Adelidae,Nemophora ahenea,A55-20210511-230,Nemophora ahenea,male,https://live.staticflickr.com/65535/5153768924...
4,Adelidae,Nemophora aritai,A48-20160222-078,Nemophora aritai,unknow,https://farm2.staticflickr.com/1672/2614359891...
...,...,...,...,...,...,...
69763,Zygaenidae,Soritia choui,A64-20170905-002,Soritia choui,unknow,https://farm5.staticflickr.com/4416/2359234584...
69764,Zygaenidae,Soritia choui,A64-20170905-001,Soritia choui,unknow,https://farm5.staticflickr.com/4494/3677386082...
69765,Zygaenidae,Soritia choui,A64-20170905-003,Soritia choui,unknow,https://farm5.staticflickr.com/4344/3741338091...
69766,Zygaenidae,Soritia choui,A64-20210830-003,Soritia choui,femal,https://live.staticflickr.com/65535/5173436643...


In [24]:
# df_tesri_img[df_tesri_img.associatedMedia.isnull()]
df_tesri_img.query('not associatedMedia.str.endswith("jpg")')

Unnamed: 0,Family,Species,collectionID,scientificName,sex,associatedMedia


In [25]:
path_list = df_tesri_img.associatedMedia.values
print(len(path_list))

path_list_ = [path for path in path_list if path.lower().endswith('.jpg')]
print(len(path_list_))

path_list_not = [path for path in path_list if not path.lower().endswith('.jpg')]
print(len(path_list_not))
set(path_list_not)


# path_list_ = [path for path in path_list if str(path).lower().endswith('_q.jpg')]
# print(len(path_list_))

# path_list_not = [path for path in path_list if not str(path).lower().endswith('_q.jpg')]
# print(len(path_list_not))

69768
69768
0


set()

In [26]:
df_tesri_img.to_csv(dir_meta.joinpath('tesri_img_path.csv'))

In [36]:
id =  "V36-20180127-160" 
df_tesri_img.query('collectionID == "V04-20130509-159" ')

Unnamed: 0,Family,Species,collectionID,scientificName,sex,associatedMedia
65336,Pyralidae,Endotricha metacuralis,V04-20130509-159,Endotricha metacuralis,unknow,http://farm4.staticflickr.com/3710/9156300915_...


In [None]:
with open(dir_meta.joinpath('path_list.txt'), 'w') as file:
    for path in path_list:
        file.write(path + '\t\n' )

#### 2.根據檔案路徑下載
- 避免伺服器當掉， 設置sleep?
- 寫成.py檔，開幾個terminal分流下載
- 等待設定
    - 加個try except的迴圈，except發生時 就加入等待時間 每一個連結多一次except，就拉長等待時間，然後訂一個上限 譬如超過5次或10次就放棄那個連結  

##### 見"get_tesri_imgs.py"

#### 3. 檢查抓取結果

In [24]:
df_imgs = pd.read_csv("meta/imgs.txt", header=None,  names=['filename'])
df_imgs

Unnamed: 0,filename
0,A01-20090819-001.jpg
1,A01-20090819-002.jpg
2,A01-20090819-003.jpg
3,A01-20090819-004.jpg
4,A01-20090819-005.jpg
...,...
69659,V48-20210905-087.jpg
69660,V48-20210905-088.jpg
69661,V48-20210905-089.jpg
69662,V48-20210905-091.jpg


### 3.檢查bbox 

In [4]:
df_bboxes = pd.read_csv('../moth_thermal_project/yolov4/moth_all_bboxes_tesri_imgs_YY.csv', index_col=0)
df_bboxes['file'] = df_bboxes.file.apply(lambda path: Path(path).stem)
df_bboxes

Unnamed: 0,file,bboxes
0,A61-20171018-184,"67.5809383392334,31.886962890625053,422.491598..."
1,V33-20170811-245,"80.67727088928223,35.87327384948731,408.557653..."
2,A76-20190409-129,"137.29925155639648,98.5918560028076,370.206403..."
3,V19-20151016-009,"71.48327827453616,19.238918304443352,451.12323..."
4,V48-20210905-042,"27.653264999389638,55.21928501129147,450.70738..."
...,...,...
69659,V17-20150606-049,"47.678565979003885,14.682095527648904,467.0315..."
69660,V31-20180222-010,"88.05942535400393,37.131647109985366,417.37966..."
69661,V40-20190419-117,"63.29630613327025,9.26065826416019,425.2367138..."
69662,A42-20190129-050,"39.69497680664064,34.25918006896973,436.755609..."


In [63]:
df_bboxes.query('bboxes.isnull()').file.to_csv(dir_meta.joinpath('bbox_null.csv'))

  """Entry point for launching an IPython kernel.


### 根據類群建立目錄


In [43]:
df_tesri_sp

Unnamed: 0,Family,Subfamily,Genus,Species
0,Adelidae,Adelinae,Nemophora,Nemophora ahenea
1,Adelidae,Adelinae,Nemophora,Nemophora aritai
2,Adelidae,Adelinae,Nemophora,Nemophora aurora
3,Adelidae,Adelinae,Nemophora,Nemophora fluorites
4,Agonoxenidae,Parametriotinae,Lamprystica,Lamprystica purpurata
...,...,...,...,...
1863,Zygaenidae,Chalcosiinae,Hysteroscene,Hysteroscene hyalina
1864,Zygaenidae,Chalcosiinae,Pidorus,Pidorus atratus
1865,Zygaenidae,Chalcosiinae,Soritia,Soritia azurea
1866,Zygaenidae,Chalcosiinae,Soritia,Soritia choui


In [46]:
df_bboxes = pd.read_csv(dir_meta.joinpath('moth_all_bboxes_tersi_imgs_forYY.csv'), index_col=0)
df_file = df_bboxes.file.apply(lambda path: Path(path).stem)
df_file

0        A61-20171018-184
1        V33-20170811-245
2        A76-20190409-129
3        V19-20151016-009
4        V48-20210905-042
               ...       
69659    V17-20150606-049
69660    V31-20180222-010
69661    V40-20190419-117
69662    A42-20190129-050
69663    A47-20160425-131
Name: file, Length: 69664, dtype: object

In [55]:
df_tesri_img = pd.read_csv(dir_meta.joinpath('tesri_img_path.csv'), index_col=0)
df_tesri_img_ = df_tesri_img.merge(df_file, 
                    left_on='collectionID',
                    right_on='file',
                    how='right' 
                   )
df_tesri_img_

Unnamed: 0,Family,Species,collectionID,scientificName,sex,associatedMedia,file
0,Adelidae,Nemophora ahenea,A36-20130606-066,Nemophora ahenea,unknow,http://farm8.staticflickr.com/7433/9551556043_...,A36-20130606-066
1,Adelidae,Nemophora ahenea,A37-20210407-067,Nemophora ahenea,femal,https://live.staticflickr.com/65535/5125228124...,A37-20210407-067
2,Adelidae,Nemophora ahenea,V06-20130412-088,Nemophora ahenea,unknow,http://farm8.staticflickr.com/7289/8744067974_...,V06-20130412-088
3,Adelidae,Nemophora ahenea,A55-20210511-230,Nemophora ahenea,male,https://live.staticflickr.com/65535/5153768924...,A55-20210511-230
4,Adelidae,Nemophora aritai,A48-20160222-078,Nemophora aritai,unknow,https://farm2.staticflickr.com/1672/2614359891...,A48-20160222-078
...,...,...,...,...,...,...,...
69659,Zygaenidae,Soritia choui,A64-20170905-002,Soritia choui,unknow,https://farm5.staticflickr.com/4416/2359234584...,A64-20170905-002
69660,Zygaenidae,Soritia choui,A64-20170905-001,Soritia choui,unknow,https://farm5.staticflickr.com/4494/3677386082...,A64-20170905-001
69661,Zygaenidae,Soritia choui,A64-20170905-003,Soritia choui,unknow,https://farm5.staticflickr.com/4344/3741338091...,A64-20170905-003
69662,Zygaenidae,Soritia choui,A64-20210830-003,Soritia choui,femal,https://live.staticflickr.com/65535/5173436643...,A64-20210830-003


In [56]:
df_tesri_img_ = df_tesri_img_.merge(df_tesri_sp,
                                    left_on=['Family','Species'],
                                    right_on=['Family','Species'],
                                    how='left'
                    )
df_tesri_filelist_taxon = df_tesri_img_[['Family','Subfamily','Genus','Species', 'file']].reset_index(drop=True)
df_tesri_filelist_taxon.to_csv(dir_meta.joinpath('tesri_filelist_taxon.csv'))


Unnamed: 0,Family,Subfamily,Genus,Species,file
0,Adelidae,Adelinae,Nemophora,Nemophora ahenea,A36-20130606-066
1,Adelidae,Adelinae,Nemophora,Nemophora ahenea,A37-20210407-067
2,Adelidae,Adelinae,Nemophora,Nemophora ahenea,V06-20130412-088
3,Adelidae,Adelinae,Nemophora,Nemophora ahenea,A55-20210511-230
4,Adelidae,Adelinae,Nemophora,Nemophora aritai,A48-20160222-078
...,...,...,...,...,...
69659,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,A64-20170905-002
69660,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,A64-20170905-001
69661,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,A64-20170905-003
69662,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,A64-20210830-003


In [57]:
df_tesri_filelist_taxon = pd.read_csv(dir_meta.joinpath('tesri_filelist_taxon.csv'), index_col=0)
df_tesri_filelist_taxon

Unnamed: 0,Family,Subfamily,Genus,Species,file
0,Adelidae,Adelinae,Nemophora,Nemophora ahenea,A36-20130606-066
1,Adelidae,Adelinae,Nemophora,Nemophora ahenea,A37-20210407-067
2,Adelidae,Adelinae,Nemophora,Nemophora ahenea,V06-20130412-088
3,Adelidae,Adelinae,Nemophora,Nemophora ahenea,A55-20210511-230
4,Adelidae,Adelinae,Nemophora,Nemophora aritai,A48-20160222-078
...,...,...,...,...,...
69659,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,A64-20170905-002
69660,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,A64-20170905-001
69661,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,A64-20170905-003
69662,Zygaenidae,Chalcosiinae,Soritia,Soritia choui,A64-20210830-003


In [65]:
df_filelist_count = df_tesri_filelist_taxon.groupby(by=['Family', 'Subfamily']).count().file.sort_values(ascending=False)
df_filelist_count

Family          Subfamily    
Geometridae     Ennominae        13100
Erebidae        Arctiinae         6647
                Erebinae          5877
Noctuidae       Noctuinae         4509
Crambidae       Spilomelinae      3383
                                 ...  
Hyblaeidae      Hypocalinae          3
Lecithoceridae  Unassigned           3
Pterophoridae   Ochyroticinae        2
Papilionidae    Papilioninae         2
Sesiidae        Sesiinae             2
Name: file, Length: 120, dtype: int64

In [71]:
df_filelist_count[df_filelist_count > 2000]

Family       Subfamily   
Geometridae  Ennominae       13100
Erebidae     Arctiinae        6647
             Erebinae         5877
Noctuidae    Noctuinae        4509
Crambidae    Spilomelinae     3383
Erebidae     Lymantriinae     3171
Name: file, dtype: int64

---

### 檢查問題影像

In [15]:
dir_broken = Path('broken images')
file_broken= [path.stem.split('_cropped')[0] for path in dir_broken.glob('*.png')]
print(len(file_broken))


2


In [45]:
# 檢查兩張破損的影像在原始flickr連結是正常的
# 抓到的原始圖檔比flickr完整

for file in file_broken:
    link =  df_verbatim_.query('collectionID==@file').associatedMedia.values[0]
    img = io.imread(link)
    # io.imshow(img)
    Image.fromarray(img).show()
    ;
    

In [79]:
# 檢查bbox看似也沒問題
df_bboxes = pd.read_csv('meta/moth_all_bboxes_tersi_imgs_forYY.csv', index_col=0)

df_bboxes_img_broken = df_bboxes.query(
    'file.str.contains(@file_broken[0]) or file.str.contains(@file_broken[1])'
    )

df_bboxes_img_broken.to_csv('../moth_thermal_project/yolov4/moth_all_bboxes_img_broken.csv')
df_bboxes_img_broken = pd.read_csv('../moth_thermal_project/yolov4/moth_all_bboxes_img_broken.csv', index_col=0)
df_bboxes_img_broken

Unnamed: 0,file,bboxes
12421,../../Moth_Specimen_TESRI/tesri_imgs_YY/V14-20...,"0,42.571967124938936,491.4109706878662,312.318..."
50561,../../Moth_Specimen_TESRI/tesri_imgs_YY/V42-20...,"74.3729591369629,73.46022510528566,440.9010887..."


In [78]:
df_bboxes.file = df_bboxes.file.str.replace('tesri_imgs_forYY', 'tesri_imgs_YY')
df_bboxes.to_csv('meta/moth_all_bboxes_tersi_imgs_forYY.csv')