In [1]:
import os
import gzip
import shutil
import pandas as pd
from tqdm import tqdm

In [2]:
DATA_DIR = 'D:/species_classifier/data/inaturalist/'
AWS_BASE = 'aws s3 --no-sign-request --region us-east-1 cp s3://inaturalist-open-data/'

### Downloading Data Files

In [4]:
!aws --version

aws-cli/1.22.28 Python/3.7.7 Windows/10 botocore/1.23.28


File association not found for extension .py


In [5]:
os.system(AWS_BASE + 'observations.csv.gz ' + DATA_DIR + 'observations.csv.gz')

0

In [6]:
os.system(AWS_BASE + 'photos.csv.gz ' + DATA_DIR + 'photos.csv.gz')

0

In [7]:
os.system(AWS_BASE + 'taxa.csv.gz ' + DATA_DIR + 'taxa.csv.gz')

0

### Unzipping Data Files

In [11]:
with gzip.open(DATA_DIR + 'observations.csv.gz', 'rb') as f_in:
    with open(DATA_DIR + 'observations.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
os.remove(DATA_DIR + 'observations.csv.gz')

In [12]:
with gzip.open(DATA_DIR + 'photos.csv.gz', 'rb') as f_in:
    with open(DATA_DIR + 'photos.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
os.remove(DATA_DIR + 'photos.csv.gz')

In [10]:
with gzip.open(DATA_DIR + 'taxa.csv.gz', 'rb') as f_in:
    with open(DATA_DIR + 'taxa.csv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
os.remove(DATA_DIR + 'taxa.csv.gz')

### Create Unified File

In [3]:
def process_partition(partition):
    photo_keys = {}
    observation_keys = {}
    observation_dict = {}
    par_len = len(partition)
    
    with open(DATA_DIR + 'observations.csv', 'r') as observations_file:
        first_line = True
        for observation_line in observations_file:
            if first_line:
                observation_line = observation_line.replace('\r', '').replace('\n', '').split('\t')
                for i in range(len(observation_line)):
                    observation_keys[observation_line[i]] = i
                first_line = False
                continue
            observation_line = observation_line.replace('\r', '').replace('\n', '').split('\t')
            
            if observation_line[observation_keys['observation_uuid']][-1*par_len:] == partition:
                observation_dict[observation_line[observation_keys['observation_uuid']]] = {
                    'taxon_id': observation_line[observation_keys['taxon_id']],
                    'photos': []
                }
                
    with open(DATA_DIR + 'photos.csv', 'r') as photos_file:
        first_line = True
        for photo_line in photos_file:
            if first_line:
                photo_line = photo_line.replace('\r', '').replace('\n', '').split('\t')
                for i in range(len(photo_line)):
                    photo_keys[photo_line[i]] = i
                first_line = False
                continue
            photo_line = photo_line.replace('\r', '').replace('\n', '').split('\t')
            
            if photo_line[photo_keys['observation_uuid']][-1*par_len:] == partition:
                try:
                    observation_dict[photo_line[photo_keys['observation_uuid']]]['photos'].append((photo_line[photo_keys['photo_id']], photo_line[photo_keys['extension']]))
                except KeyError as e:
                    pass
    
    output_count = 0
    print('\nPartition: ' + partition)
    print('\tObservations: ' + str(len(observation_dict)))
    with open(DATA_DIR + 'inaturalist_data.csv', 'a') as output_file:
        for observation_uuid in observation_dict:
            for photo in observation_dict[observation_uuid]['photos']:
                try:
                    output_line = photo[0] + '\t'
                    output_line += photo[1] + '\t'
                    output_line += observation_dict[observation_uuid]['taxon_id'] + '\n'
                    output_file.write(output_line)
                    output_count += 1
                except Exception as e:
                    pass
    print('\tPhotos Added: ' + str(output_count))

def create_output_data(partition_len=1):
    photo_keys = {}
    output_count = 0
    with open(DATA_DIR + 'inaturalist_data.csv', 'w') as output_file:
        output_file.write('')
    
    print('Getting Partitions...')
    partitions = set()
    with open(DATA_DIR + 'photos.csv', 'r') as photos_file:
        first_line = True
        for photo_line in photos_file:
            if first_line:
                photo_line = photo_line.replace('\r', '').replace('\n', '').split('\t')
                for i in range(len(photo_line)):
                    photo_keys[photo_line[i]] = i
                first_line = False
                continue
            photo_line = photo_line.replace('\r', '').replace('\n', '').split('\t')
            
            partitions.add(photo_line[photo_keys['observation_uuid']][-1*partition_len:])
    print('\tFound ' + str(len(list(partitions))) + ' Partitions')
    
    print('Iterating Through Partitions...')
    for partition in tqdm(partitions):
        process_partition(partition)

In [4]:
create_output_data()

Getting Partitions...


  0%|                                                                                           | 0/16 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
71731it [00:00, 714047.14it/s][A

	Found 16 Partitions
Iterating Through Partitions...



147689it [00:00, 725540.30it/s][A
226494it [00:00, 741691.27it/s][A
304152it [00:00, 750849.05it/s][A
381085it [00:00, 754628.22it/s][A
457407it [00:00, 755484.03it/s][A
524613it [00:00, 721246.01it/s][A
600344it [00:00, 730818.73it/s][A
677569it [00:00, 741160.62it/s][A
751790it [00:01, 741044.84it/s][A
825201it [00:01, 737217.90it/s][A
901206it [00:01, 742123.73it/s][A
980483it [00:01, 755241.82it/s][A
1055824it [00:01, 752980.29it/s][A
1130725it [00:01, 745666.85it/s][A
1205030it [00:01, 744306.37it/s][A
1279278it [00:01, 673724.53it/s][A
1347802it [00:01, 670714.05it/s][A
1425110it [00:01, 698209.84it/s][A
1502249it [00:02, 718596.42it/s][A
1580603it [00:02, 735526.65it/s][A
1656583it [00:02, 740884.94it/s][A
1734316it [00:02, 751025.26it/s][A
1809764it [00:02, 744977.42it/s][A
1888104it [00:02, 756023.42it/s][A
1966556it [00:02, 762647.37it/s][A
2042987it [00:02, 665067.86it/s][A
2115310it [00:02, 680585.85it/s][A
2186359it [00:03, 688200.29it/s][A
225


Partition: e
	Observations: 5602190


  6%|█████                                                                           | 1/16 [05:22<1:20:37, 322.53s/it]
0it [00:00, ?it/s][A
45079it [00:00, 449710.73it/s][A

	Photos Added: 9592657



114393it [00:00, 501854.88it/s][A
187453it [00:00, 552936.94it/s][A
262153it [00:00, 599326.60it/s][A
322131it [00:00, 597830.15it/s][A
393667it [00:00, 628454.59it/s][A
470375it [00:00, 663333.59it/s][A
547103it [00:00, 691401.76it/s][A
623941it [00:00, 711694.73it/s][A
699952it [00:01, 724879.79it/s][A
773857it [00:01, 728980.57it/s][A
852231it [00:01, 743143.18it/s][A
928410it [00:01, 747778.33it/s][A
1008057it [00:01, 760155.44it/s][A
1084266it [00:01, 760724.13it/s][A
1160351it [00:01, 751186.54it/s][A
1235506it [00:01, 735332.76it/s][A
1309139it [00:01, 673739.49it/s][A
1377590it [00:01, 656998.97it/s][A
1446382it [00:02, 665161.60it/s][A
1517162it [00:02, 676657.76it/s][A
1589517it [00:02, 689602.47it/s][A
1663483it [00:02, 702430.22it/s][A
1735472it [00:02, 707077.21it/s][A
1806422it [00:02, 707539.98it/s][A
1880838it [00:02, 716152.92it/s][A
1952598it [00:02, 712604.93it/s][A
2026905it [00:02, 721351.84it/s][A
2099627it [00:02, 721480.04it/s][A
217


Partition: b
	Observations: 5599343


 12%|██████████                                                                      | 2/16 [10:45<1:15:15, 322.56s/it]
0it [00:00, ?it/s][A
57115it [00:00, 566343.58it/s][A


	Photos Added: 9582354


135258it [00:00, 615985.54it/s][A
212848it [00:00, 656157.61it/s][A
291800it [00:00, 689810.18it/s][A
364608it [00:00, 699957.53it/s][A
445542it [00:00, 727834.83it/s][A
526558it [00:00, 749473.28it/s][A
606088it [00:00, 762072.31it/s][A
686495it [00:00, 773856.36it/s][A
762128it [00:01, 730952.13it/s][A
844502it [00:01, 756045.23it/s][A
926117it [00:01, 771549.68it/s][A
1007279it [00:01, 782472.64it/s][A
1085943it [00:01, 782287.99it/s][A
1164776it [00:01, 782209.64it/s][A
1246801it [00:01, 792616.86it/s][A
1327133it [00:01, 795026.77it/s][A
1406648it [00:01, 789568.47it/s][A
1486376it [00:01, 790671.73it/s][A
1567155it [00:02, 793942.94it/s][A
1648053it [00:02, 798307.91it/s][A
1728597it [00:02, 799544.47it/s][A
1808562it [00:02, 701149.29it/s][A
1883558it [00:02, 713558.37it/s][A
1962554it [00:02, 733284.97it/s][A
2043058it [00:02, 752860.19it/s][A
2124123it [00:02, 767405.22it/s][A
2204076it [00:02, 774765.73it/s][A
2283057it [00:02, 778763.69it/s][A
236


Partition: c
	Observations: 5600745


 19%|███████████████                                                                 | 3/16 [15:48<1:08:38, 316.79s/it]

	Photos Added: 9587677



0it [00:00, ?it/s][A
52527it [00:00, 521508.05it/s][A
128344it [00:00, 575076.07it/s][A
187403it [00:00, 579212.74it/s][A
268189it [00:00, 631845.81it/s][A
339903it [00:00, 546071.06it/s][A
419367it [00:00, 602440.20it/s][A
501559it [00:00, 653500.97it/s][A
580277it [00:00, 688205.31it/s][A
661299it [00:00, 720650.32it/s][A
743044it [00:01, 745690.59it/s][A
818362it [00:01, 747379.96it/s][A
898458it [00:01, 762584.28it/s][A
979473it [00:01, 774493.43it/s][A
1057276it [00:01, 767021.19it/s][A
1134539it [00:01, 768229.71it/s][A
1211547it [00:01, 727217.46it/s][A
1292883it [00:01, 749611.18it/s][A
1374054it [00:01, 766701.60it/s][A
1452720it [00:02, 771471.24it/s][A
1533104it [00:02, 779206.01it/s][A
1613151it [00:02, 783282.27it/s][A
1695422it [00:02, 793250.36it/s][A
1774914it [00:02, 783825.29it/s][A
1853439it [00:02, 782638.34it/s][A
1931803it [00:02, 766347.37it/s][A
2010064it [00:02, 770276.04it/s][A
2089575it [00:02, 776171.63it/s][A
2169859it [00:02, 7


Partition: d
	Observations: 5595737


 25%|████████████████████                                                            | 4/16 [21:03<1:03:13, 316.16s/it]
0it [00:00, ?it/s][A
59977it [00:00, 599364.74it/s][A


	Photos Added: 9580861


137724it [00:00, 643143.96it/s][A
214313it [00:00, 674943.38it/s][A
291011it [00:00, 700053.11it/s][A
347596it [00:00, 649243.25it/s][A
425027it [00:00, 681834.68it/s][A
503196it [00:00, 708425.99it/s][A
580936it [00:00, 725977.57it/s][A
660017it [00:00, 743024.72it/s][A
740118it [00:01, 758871.48it/s][A
820423it [00:01, 770817.51it/s][A
901449it [00:01, 780526.29it/s][A
979046it [00:01, 587477.18it/s][A
1058924it [00:01, 637165.68it/s][A
1135382it [00:01, 670148.67it/s][A
1210241it [00:01, 690211.77it/s][A
1283664it [00:01, 699440.93it/s][A
1356004it [00:01, 692021.91it/s][A
1433478it [00:02, 714340.63it/s][A
1512205it [00:02, 733529.32it/s][A
1590486it [00:02, 747497.01it/s][A
1671280it [00:02, 762911.08it/s][A
1748227it [00:02, 762178.34it/s][A
1826649it [00:02, 766651.85it/s][A
1905155it [00:02, 771606.67it/s][A
1983560it [00:02, 773004.17it/s][A
2062664it [00:02, 777803.79it/s][A
2140672it [00:02, 776570.33it/s][A
2218416it [00:03, 774779.67it/s][A
2296


Partition: 7
	Observations: 5600757


 31%|█████████████████████████▋                                                        | 5/16 [26:21<58:04, 316.80s/it]
0it [00:00, ?it/s][A
15903it [00:00, 158466.08it/s][A

	Photos Added: 9592052



87950it [00:00, 206834.21it/s][A
162682it [00:00, 263955.01it/s][A
239915it [00:00, 328541.24it/s][A
317568it [00:00, 397201.39it/s][A
390219it [00:00, 459153.06it/s][A
469382it [00:00, 525189.05it/s][A
548061it [00:00, 580876.80it/s][A
625403it [00:00, 627410.96it/s][A
702946it [00:01, 664946.93it/s][A
780688it [00:01, 693673.74it/s][A
855596it [00:01, 691114.33it/s][A
931104it [00:01, 707680.53it/s][A
1006941it [00:01, 721940.73it/s][A
1083945it [00:01, 734726.54it/s][A
1161336it [00:01, 745487.66it/s][A
1238374it [00:01, 750647.62it/s][A
1316226it [00:01, 758198.85it/s][A
1392598it [00:01, 752985.50it/s][A
1471080it [00:02, 760660.41it/s][A
1547694it [00:02, 760496.16it/s][A
1624033it [00:02, 759905.65it/s][A
1701751it [00:02, 763280.36it/s][A
1779761it [00:02, 766456.32it/s][A
1857559it [00:02, 768285.40it/s][A
1934443it [00:02, 740243.13it/s][A
2010650it [00:02, 745643.68it/s][A
2088292it [00:02, 754220.61it/s][A
2166660it [00:02, 761586.93it/s][A
2242


Partition: 2
	Observations: 5598214


 38%|██████████████████████████████▊                                                   | 6/16 [31:44<53:06, 318.63s/it]
0it [00:00, ?it/s][A
2020it [00:00, 16970.24it/s][A

	Photos Added: 9593947



71152it [00:00, 23988.63it/s][A
143494it [00:00, 33787.97it/s][A
214745it [00:00, 47305.02it/s][A
288448it [00:00, 65765.80it/s][A
343223it [00:00, 89291.74it/s][A
416992it [00:00, 121261.78it/s][A
492851it [00:00, 162098.10it/s][A
566474it [00:00, 211503.93it/s][A
639618it [00:01, 268608.09it/s][A
715452it [00:01, 332998.12it/s][A
786393it [00:01, 395857.17it/s][A
862254it [00:01, 461621.76it/s][A
934488it [00:01, 500846.29it/s][A
1006701it [00:01, 550706.89it/s][A
1079541it [00:01, 593133.33it/s][A
1151406it [00:01, 625534.33it/s][A
1223330it [00:01, 649654.21it/s][A
1296257it [00:01, 671209.89it/s][A
1372604it [00:02, 696220.70it/s][A
1448739it [00:02, 712581.82it/s][A
1522460it [00:02, 709850.58it/s][A
1599410it [00:02, 725380.43it/s][A
1676214it [00:02, 736298.18it/s][A
1752112it [00:02, 742435.32it/s][A
1828993it [00:02, 749634.35it/s][A
1904445it [00:02, 564150.70it/s][A
1979098it [00:02, 608081.64it/s][A
2045873it [00:03, 567987.21it/s][A
2114118it 


Partition: 6
	Observations: 5602757


 44%|███████████████████████████████████▉                                              | 7/16 [37:10<48:08, 320.93s/it]
0it [00:00, ?it/s][A
51362it [00:00, 508966.82it/s][A

	Photos Added: 9589905



126193it [00:00, 562234.81it/s][A
201459it [00:00, 607959.95it/s][A
277223it [00:00, 644988.40it/s][A
352114it [00:00, 672541.79it/s][A
430869it [00:00, 702028.03it/s][A
506377it [00:00, 715798.24it/s][A
581222it [00:00, 724204.12it/s][A
651578it [00:00, 648769.12it/s][A
727428it [00:01, 676939.07it/s][A
802099it [00:01, 696390.93it/s][A
877264it [00:01, 711243.43it/s][A
953545it [00:01, 725860.66it/s][A
1032199it [00:01, 741466.07it/s][A
1107416it [00:01, 744046.30it/s][A
1186432it [00:01, 757021.01it/s][A
1264249it [00:01, 762748.19it/s][A
1340677it [00:01, 759278.23it/s][A
1416716it [00:01, 739203.36it/s][A
1497271it [00:02, 757429.37it/s][A
1576368it [00:02, 765670.06it/s][A
1655129it [00:02, 767988.18it/s][A
1732066it [00:02, 703095.94it/s][A
1810264it [00:02, 724554.23it/s][A
1889115it [00:02, 740947.57it/s][A
1966928it [00:02, 749957.39it/s][A
2045687it [00:02, 760228.66it/s][A
2123157it [00:02, 763910.36it/s][A
2199858it [00:02, 763935.54it/s][A
227


Partition: 1
	Observations: 5598867


 50%|█████████████████████████████████████████                                         | 8/16 [42:33<42:52, 321.58s/it]
0it [00:00, ?it/s][A
58269it [00:00, 580582.39it/s][A

	Photos Added: 9587465



134595it [00:00, 625035.30it/s][A
193246it [00:00, 612794.13it/s][A
270254it [00:00, 652741.99it/s][A
344512it [00:00, 676154.88it/s][A
423879it [00:00, 706528.35it/s][A
504738it [00:00, 733431.02it/s][A
583524it [00:00, 747853.70it/s][A
655758it [00:00, 712194.58it/s][A
734821it [00:01, 733229.04it/s][A
814524it [00:01, 751036.71it/s][A
893717it [00:01, 762228.48it/s][A
971895it [00:01, 766629.49it/s][A
1052509it [00:01, 776428.33it/s][A
1131973it [00:01, 780856.48it/s][A
1210012it [00:01, 778863.05it/s][A
1288359it [00:01, 778502.86it/s][A
1366175it [00:01, 761999.70it/s][A
1443263it [00:01, 763198.91it/s][A
1524489it [00:02, 775654.33it/s][A
1603298it [00:02, 779162.53it/s][A
1681272it [00:02, 729032.66it/s][A
1754870it [00:02, 710460.72it/s][A
1833464it [00:02, 731060.32it/s][A
1915404it [00:02, 754189.47it/s][A
1996259it [00:02, 768112.56it/s][A
2077049it [00:02, 777529.20it/s][A
2155157it [00:02, 772362.02it/s][A
2235788it [00:02, 781280.35it/s][A
231


Partition: 0
	Observations: 5601069


 56%|██████████████████████████████████████████████▏                                   | 9/16 [48:00<37:42, 323.21s/it]
0it [00:00, ?it/s][A
54169it [00:00, 541389.77it/s][A


	Photos Added: 9584871


128191it [00:00, 588631.02it/s][A
200701it [00:00, 623085.81it/s][A
275112it [00:00, 654636.25it/s][A
351419it [00:00, 683642.93it/s][A
431340it [00:00, 712831.59it/s][A
513076it [00:00, 739387.16it/s][A
591665it [00:00, 750841.51it/s][A
664457it [00:00, 700450.61it/s][A
744187it [00:01, 725531.67it/s][A
816323it [00:01, 690404.46it/s][A
894526it [00:01, 715155.62it/s][A
970884it [00:01, 728622.20it/s][A
1045705it [00:01, 732805.15it/s][A
1127005it [00:01, 753756.81it/s][A
1206111it [00:01, 763834.54it/s][A
1283849it [00:01, 766052.85it/s][A
1360615it [00:01, 763661.82it/s][A
1438994it [00:01, 767676.08it/s][A
1519544it [00:02, 775843.41it/s][A
1598772it [00:02, 780149.95it/s][A
1678110it [00:02, 782113.33it/s][A
1756641it [00:02, 781338.96it/s][A
1834807it [00:02, 749290.26it/s][A
1910038it [00:02, 617553.96it/s][A
1989373it [00:02, 659882.59it/s][A
2063851it [00:02, 682689.62it/s][A
2145340it [00:02, 716305.85it/s][A
2224086it [00:03, 734725.58it/s][A
2304


Partition: 3
	Observations: 5600548


 62%|██████████████████████████████████████████████████▋                              | 10/16 [53:25<32:21, 323.55s/it]
0it [00:00, ?it/s][A
41402it [00:00, 410067.64it/s][A

	Photos Added: 9591592



117327it [00:00, 475446.37it/s][A
195001it [00:00, 537105.64it/s][A
267457it [00:00, 581158.87it/s][A
341527it [00:00, 620153.36it/s][A
422269it [00:00, 665953.61it/s][A
503324it [00:00, 703550.93it/s][A
583619it [00:00, 728917.96it/s][A
656651it [00:00, 696605.81it/s][A
735205it [00:01, 719586.10it/s][A
816596it [00:01, 743485.28it/s][A
891476it [00:01, 706111.85it/s][A
971951it [00:01, 731989.33it/s][A
1050029it [00:01, 744456.15it/s][A
1126280it [00:01, 747652.99it/s][A
1207245it [00:01, 763275.83it/s][A
1287630it [00:01, 774538.38it/s][A
1368750it [00:01, 784248.31it/s][A
1447416it [00:01, 783186.24it/s][A
1528304it [00:02, 789208.47it/s][A
1609272it [00:02, 793368.52it/s][A
1690550it [00:02, 798248.75it/s][A
1770861it [00:02, 797653.72it/s][A
1851043it [00:02, 796745.61it/s][A
1930755it [00:02, 761949.96it/s][A
2007298it [00:02, 701543.27it/s][A
2084657it [00:02, 720067.02it/s][A
2165543it [00:02, 743863.12it/s][A
2245900it [00:02, 759341.02it/s][A
232


Partition: f
	Observations: 5601073


 69%|███████████████████████████████████████████████████████▋                         | 11/16 [58:48<26:57, 323.60s/it]
0it [00:00, ?it/s][A
57360it [00:00, 569359.84it/s][A

	Photos Added: 9594646



127345it [00:00, 601924.74it/s][A
198796it [00:00, 631389.71it/s][A
277745it [00:00, 670572.17it/s][A
346565it [00:00, 675279.98it/s][A
407972it [00:00, 649731.00it/s][A
480715it [00:00, 670795.84it/s][A
553640it [00:00, 686826.34it/s][A
628341it [00:00, 703334.79it/s][A
703211it [00:01, 714848.87it/s][A
780796it [00:01, 730548.41it/s][A
857729it [00:01, 740826.20it/s][A
935496it [00:01, 750906.67it/s][A
1015092it [00:01, 762715.82it/s][A
1091190it [00:01, 753147.78it/s][A
1166411it [00:01, 752569.56it/s][A
1241603it [00:01, 748103.30it/s][A
1317302it [00:01, 748991.05it/s][A
1394585it [00:01, 753905.11it/s][A
1469964it [00:02, 653418.06it/s][A
1547298it [00:02, 683722.05it/s][A
1623355it [00:02, 704892.21it/s][A
1701211it [00:02, 725432.02it/s][A
1778481it [00:02, 737496.22it/s][A
1853162it [00:02, 738553.82it/s][A
1930174it [00:02, 747372.49it/s][A
2005393it [00:02, 747039.10it/s][A
2080435it [00:02, 740628.93it/s][A
2157767it [00:02, 748944.25it/s][A
223


Partition: 9
	Observations: 5600372


 75%|███████████████████████████████████████████████████████████▎                   | 12/16 [1:04:18<21:41, 325.47s/it]
0it [00:00, ?it/s][A
59074it [00:00, 586016.72it/s][A

	Photos Added: 9593412



132409it [00:00, 623278.08it/s][A
208457it [00:00, 657505.55it/s][A
285209it [00:00, 685992.97it/s][A
340729it [00:00, 627834.47it/s][A
420402it [00:00, 669159.67it/s][A
501083it [00:00, 704827.92it/s][A
568582it [00:00, 687630.11it/s][A
646540it [00:00, 712333.80it/s][A
726086it [00:01, 734505.14it/s][A
805248it [00:01, 749995.73it/s][A
884044it [00:01, 759168.92it/s][A
963539it [00:01, 769047.29it/s][A
1043133it [00:01, 774892.17it/s][A
1120540it [00:01, 773843.50it/s][A
1200115it [00:01, 779745.46it/s][A
1278061it [00:01, 777527.83it/s][A
1356456it [00:01, 778972.76it/s][A
1434341it [00:01, 766584.24it/s][A
1515350it [00:02, 776983.96it/s][A
1593108it [00:02, 686570.19it/s][A
1671382it [00:02, 712269.75it/s][A
1747406it [00:02, 723352.94it/s][A
1826771it [00:02, 742293.67it/s][A
1907172it [00:02, 757668.25it/s][A
1987244it [00:02, 768299.86it/s][A
2066032it [00:02, 772240.20it/s][A
2143843it [00:02, 772481.20it/s][A
2221367it [00:02, 769738.92it/s][A
230


Partition: 8
	Observations: 5601621


 81%|████████████████████████████████████████████████████████████████▏              | 13/16 [1:09:40<16:13, 324.50s/it]
0it [00:00, ?it/s][A
52894it [00:00, 524560.72it/s][A

	Photos Added: 9593466



125850it [00:00, 572527.64it/s][A
195512it [00:00, 604684.97it/s][A
264266it [00:00, 626105.96it/s][A
339903it [00:00, 651230.34it/s][A
417797it [00:00, 684155.56it/s][A
492841it [00:00, 701761.38it/s][A
572536it [00:00, 726063.27it/s][A
650744it [00:00, 740401.11it/s][A
723352it [00:01, 579252.34it/s][A
801335it [00:01, 626936.37it/s][A
876406it [00:01, 658293.42it/s][A
949667it [00:01, 678497.44it/s][A
1026417it [00:01, 702662.38it/s][A
1105248it [00:01, 725914.47it/s][A
1179422it [00:01, 701477.97it/s][A
1258426it [00:01, 725446.04it/s][A
1337672it [00:01, 742543.99it/s][A
1413848it [00:02, 746310.54it/s][A
1494129it [00:02, 761004.13it/s][A
1573457it [00:02, 769735.82it/s][A
1652218it [00:02, 774395.01it/s][A
1732431it [00:02, 780822.17it/s][A
1812676it [00:02, 785524.46it/s][A
1893320it [00:02, 789533.52it/s][A
1972382it [00:02, 782948.81it/s][A
2050766it [00:02, 776884.35it/s][A
2128528it [00:02, 775409.12it/s][A
2206121it [00:03, 726281.98it/s][A
227


Partition: 4
	Observations: 5602229


 88%|█████████████████████████████████████████████████████████████████████▏         | 14/16 [1:14:57<10:44, 322.22s/it]
0it [00:00, ?it/s][A
60951it [00:00, 604522.26it/s][A


	Photos Added: 9593950


137194it [00:00, 643957.73it/s][A
216472it [00:00, 680685.80it/s][A
294917it [00:00, 707758.97it/s][A
367839it [00:00, 713919.15it/s][A
446706it [00:00, 732874.77it/s][A
525239it [00:00, 747325.23it/s][A
602920it [00:00, 755310.69it/s][A
682252it [00:00, 764673.61it/s][A
760893it [00:01, 770387.86it/s][A
836440it [00:01, 729891.95it/s][A
915552it [00:01, 746033.17it/s][A
995205it [00:01, 758746.82it/s][A
1074592it [00:01, 768228.36it/s][A
1154199it [00:01, 775914.90it/s][A
1232460it [00:01, 777357.82it/s][A
1310162it [00:01, 635132.17it/s][A
1382846it [00:01, 659400.83it/s][A
1462796it [00:01, 695607.17it/s][A
1543792it [00:02, 724692.90it/s][A
1625178it [00:02, 747265.48it/s][A
1706171it [00:02, 764097.62it/s][A
1783903it [00:02, 762323.23it/s][A
1861062it [00:02, 745111.01it/s][A
1941252it [00:02, 759776.51it/s][A
2019569it [00:02, 766091.00it/s][A
2098205it [00:02, 769989.29it/s][A
2178250it [00:02, 778440.47it/s][A
2259136it [00:03, 786076.33it/s][A
2337


Partition: 5
	Observations: 5596842


 94%|██████████████████████████████████████████████████████████████████████████     | 15/16 [1:20:11<05:19, 319.56s/it]
0it [00:00, ?it/s][A
56590it [00:00, 562111.85it/s][A


	Photos Added: 9584854


133565it [00:00, 611419.12it/s][A
211778it [00:00, 653956.05it/s][A
286597it [00:00, 679318.33it/s][A
343464it [00:00, 641359.84it/s][A
424038it [00:00, 682516.03it/s][A
504627it [00:00, 714971.20it/s][A
583913it [00:00, 735007.44it/s][A
662926it [00:00, 748566.21it/s][A
742372it [00:01, 761314.26it/s][A
817637it [00:01, 756765.54it/s][A
892714it [00:01, 721127.51it/s][A
968806it [00:01, 731531.83it/s][A
1047274it [00:01, 745553.93it/s][A
1128010it [00:01, 761602.37it/s][A
1208526it [00:01, 773072.18it/s][A
1289497it [00:01, 783011.55it/s][A
1369826it [00:01, 787354.69it/s][A
1449572it [00:01, 789738.09it/s][A
1529284it [00:02, 791243.54it/s][A
1608455it [00:02, 787223.74it/s][A
1691128it [00:02, 796987.92it/s][A
1770882it [00:02, 789415.18it/s][A
1849879it [00:02, 787749.23it/s][A
1928694it [00:02, 779164.90it/s][A
2006660it [00:02, 735906.49it/s][A
2085871it [00:02, 750422.79it/s][A
2165751it [00:02, 763391.58it/s][A
2246778it [00:02, 775214.91it/s][A
2327


Partition: a
	Observations: 5598239


100%|███████████████████████████████████████████████████████████████████████████████| 16/16 [1:25:21<00:00, 320.10s/it]

	Photos Added: 9585535





### Filter for Animal Species

In [4]:
inaturalist_df = pd.read_csv(DATA_DIR + 'inaturalist_data.csv', delimiter='\t', header=None)
inaturalist_df = inaturalist_df.dropna()
inaturalist_df[0] = inaturalist_df[0].astype(int)
inaturalist_df[1] = inaturalist_df[1].astype(str)
inaturalist_df[2] = inaturalist_df[2].astype(int)
inaturalist_df

Unnamed: 0,0,1,2
0,21223,jpg,67438
1,21236,jpg,124171
2,21238,jpg,67559
3,21283,jpg,555970
4,21285,jpg,555970
...,...,...,...
153429239,249325291,jpg,453937
153429240,249325363,jpg,321690
153429241,249323740,jpeg,50822
153429242,249325544,jpg,48419


In [3]:
taxa_df = pd.read_csv(DATA_DIR + 'taxa.csv', delimiter='\t')
taxa_df = taxa_df[(taxa_df['rank'] == 'species') & taxa_df['active']] # Getting only Active Species
taxa_df = taxa_df[taxa_df['ancestry'].str.contains('/2/', na=False)] # Getting only Chordates
taxa_df

Unnamed: 0,taxon_id,ancestry,rank_level,rank,name,active
2,14866,48460/1/2/355675/3/7251/14841/14865,10.0,species,Gracula ptilogenys,True
3,11218,48460/1/2/355675/3/7251/10732/559715/979788,10.0,species,Cranioleuca erythrops,True
5,10773,48460/1/2/355675/3/7251/10732/559715/10768,10.0,species,Thripadectes holostictus,True
6,11624,48460/1/2/355675/3/7251/10732/559715/11613,10.0,species,Phacellodomus rufifrons,True
8,13857,48460/1/2/355675/3/7251/13685/13847,10.0,species,Passer pyrrhonotus,True
...,...,...,...,...,...,...
1393606,1424864,48460/1/2/355675/26036/26172/85552/35001/35002,10.0,species,Heloderma exasperatum,True
1393607,1424865,48460/1/2/355675/26036/26172/85552/35001/35002,10.0,species,Heloderma charlesbogerti,True
1393643,1424866,48460/1/2/355675/26036/26172/85552/35001/35002,10.0,species,Heloderma alvarezi,True
1394063,1428033,48460/1/2/355675/47178/49216/1303593/513686/90170,10.0,species,Nemacheilus cacao,True


In [7]:
taxon_id_set = set(list(taxa_df['taxon_id']))
inaturalist_df = inaturalist_df[inaturalist_df[2].isin(taxon_id_set)]
inaturalist_df

Unnamed: 0,0,1,2
7,21329,jpg,4328
8,21543,jpg,204452
9,21550,jpg,4665
10,21551,jpg,4665
11,21552,jpg,4665
...,...,...,...
153429169,249324626,jpeg,14886
153429178,249324706,jpeg,357
153429226,249325313,jpg,1416307
153429229,249325412,jpeg,4981


### Filter for Photo Count by Species

In [24]:
species_counts = inaturalist_df.groupby(2).count()[0].sort_values()
species_counts = species_counts[species_counts >= 1000]
species_counts

2
792951      1000
117570      1001
12711       1003
6643        1003
1081        1003
           ...  
42223     180888
4956      200795
13858     201496
5212      206921
6930      302345
Name: 0, Length: 2652, dtype: int64

In [25]:
taxon_id_set = set(list(species_counts.index))
inaturalist_df = inaturalist_df[inaturalist_df[2].isin(taxon_id_set)]
inaturalist_df

Unnamed: 0,0,1,2
7,21329,jpg,4328
9,21550,jpg,4665
10,21551,jpg,4665
11,21552,jpg,4665
12,21553,jpg,4665
...,...,...,...
153429168,249324554,jpeg,46017
153429169,249324626,jpeg,14886
153429178,249324706,jpeg,357
153429229,249325412,jpeg,4981


In [29]:
inaturalist_df.to_csv(DATA_DIR + 'inaturalist_data.csv', sep='\t', header=False, index=False) 

### Train/Test Split