# Cleaning locations datasets
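Locations are removed from the species' location datasets when their coordinates were dropped as outliers from the corresponding bioclimatic dataset: only locations whose (longitude, latitude) pair still appears in the cleaned bioclimatic data are kept.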

In [1]:
# Importing useful packages
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# Loading train, extra-train and test data
def _ids_and_locations_frame(ids, locations):
    """Put `ids` next to `locations` column-wise and label the columns id/lat/long.

    Both inputs are aligned on their positional index, so if they differ in
    length the shorter one is padded with NaN (no error is raised).
    """
    frame = pd.concat([pd.DataFrame(ids), pd.DataFrame(locations)], axis=1)
    frame.columns = ['id', 'lat', 'long']
    return frame


# --- Train data ---
species_train = np.load('species_train.npz')
# All species IDs, as strings (NB: not necessarily starting at 0 nor 1).
species_ids = species_train['taxon_ids'].astype(str)
# Mapping: species ID -> taxon name.
species_names = dict(zip(species_ids, species_train['taxon_names']))
# Species ID recorded for each training datapoint.
train_ids = species_train['train_ids'].astype(str)
# 2D array: column 0 is the latitude, column 1 the longitude of each datapoint.
train_locations = species_train['train_locs']
train_latitudes, train_longitudes = train_locations[:, 0], train_locations[:, 1]
train_dataDF = _ids_and_locations_frame(train_ids, train_locations)
display(train_dataDF)

# --- Extra-train data (same layout as the train data) ---
extra_species_train = np.load('species_train_extra.npz')
extra_train_ids = extra_species_train['train_ids'].astype(str)
extra_train_locations = extra_species_train['train_locs']
extra_train_latitudes, extra_train_longitudes = extra_train_locations[:, 0], extra_train_locations[:, 1]
extra_train_dataDF = _ids_and_locations_frame(extra_train_ids, extra_train_locations)
display(extra_train_dataDF)

# --- Test data ---
# allow_pickle=True lets NumPy unpickle object arrays stored in the archive;
# only use it on files from a trusted source.
species_test = np.load('species_test.npz', allow_pickle=True)
# Species (taxon) IDs, as strings; zipped one-to-one with 'test_pos_inds' below.
test_ids = species_test['taxon_ids'].astype(str)
# 2D array with each test location's latitude and longitude.
test_locations = species_test['test_locs']
# species ID -> entry of species_test['test_pos_inds'].
# Per the original author's note: 'test_pos_inds' is a list of lists, each list
# holding the indices in test_locs where a given species is present; the species
# can be assumed absent from all other locations.
test_pos_inds = dict(zip(test_ids, species_test['test_pos_inds']))
# NOTE(review): test_ids comes from 'taxon_ids' (it is paired with test_pos_inds
# above), not from a per-location array, yet it is paired positionally with
# test_locations here. The displayed frame has NaN ids in its last rows, so the
# two lengths differ and the 'id' value on a row does not appear to describe
# that row's location. Confirm the intended layout before relying on
# test_dataDF['id'].
test_dataDF = _ids_and_locations_frame(test_ids, test_locations)
display(test_dataDF)

Unnamed: 0,id,lat,long
0,31529,-18.286728,143.481247
1,31529,-13.099798,130.783646
2,31529,-13.965274,131.695145
3,31529,-12.853950,132.800507
4,31529,-12.196790,134.279327
...,...,...,...
272032,145031,33.716885,73.203621
272033,145031,24.600239,72.730560
272034,145031,18.849600,80.654129
272035,145031,21.073837,75.945656


Unnamed: 0,id,lat,long
0,14,-22.128679,-46.795666
1,14,-22.833548,-47.105415
2,14,-20.454288,-54.581146
3,14,-22.676571,-45.841366
4,14,-22.310076,-42.497063
...,...,...,...
1067587,1369303,3.339513,101.247047
1067588,1369303,3.176027,101.830063
1067589,1369303,3.339002,101.244804
1067590,1369303,3.342500,101.246635


Unnamed: 0,id,lat,long
0,31529,9.630478,-173.535599
1,3117,3.839375,-162.544464
2,116872,4.289169,-167.944778
3,13392,3.879849,-169.720459
4,13456,-6.237210,-169.554123
...,...,...,...
288117,,-23.468565,110.252289
288118,,-18.319242,107.135307
288119,,-21.390581,111.551872
288120,,-21.996183,109.184601


In [3]:
# Cleaning datasets based on bioclim outliers
def _keep_known_locations(frame, reference, keys=('long', 'lat')):
    """Return the rows of `frame` whose `keys` values also occur in `reference`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to filter; must contain the `keys` columns.
    reference : pandas.DataFrame
        Frame providing the set of accepted `keys` combinations.
    keys : sequence of str
        Columns that together identify a location.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `frame`, in their original order and with their
        original index labels.

    Notes
    -----
    Matching is exact equality on the key values (no tolerance), done with a
    vectorized merge instead of building one Python tuple per row.
    """
    keys = list(keys)
    # De-duplicate so the left merge below yields exactly one row per row of `frame`.
    known = reference[keys].drop_duplicates()
    # A left merge preserves the row order of `frame`; '_merge' is 'both' where
    # the coordinates were found in `known` and 'left_only' otherwise.
    origin = frame[keys].merge(known, on=keys, how='left', indicator=True)['_merge']
    # Convert to a plain boolean array so the mask is applied positionally and
    # the original index of `frame` is kept.
    return frame[(origin == 'both').to_numpy()]


# Coordinates are read as float32 because they are compared for exact equality
# with the species locations.
# NOTE(review): this presumes train_locs / test_locs are float32 as well - confirm.
bioclimatic_train = pd.read_csv(Path('../bioclimatic/cleaned_bioclimatic_train.csv'), dtype={'id': str, 'lat':np.float32, 'long':np.float32})
#bioclimatic_train_extra = pd.read_csv(Path('../bioclimatic/cleaned_bioclimatic_train_extra.csv'), dtype={'id': str})
bioclimatic_test = pd.read_csv(Path('../bioclimatic/cleaned_bioclimatic_test.csv'), dtype={'lat':np.float32, 'long':np.float32})

# Keep only the locations that are still present in the cleaned bioclimatic data.
# The extra-train set is currently not cleaned (its lines are left disabled).
train_dataDF = _keep_known_locations(train_dataDF, bioclimatic_train)
#extra_train_dataDF = _keep_known_locations(extra_train_dataDF, bioclimatic_train_extra)
test_dataDF = _keep_known_locations(test_dataDF, bioclimatic_test)
display(train_dataDF)
#display(extra_train_dataDF)
display(test_dataDF)

# Persist the cleaned datasets (the index is not written).
# NOTE(review): the displayed test frame shows NaN in its 'id' column; check how
# test_dataDF['id'] is built before relying on that column of species_test.csv.
train_dataDF.to_csv('species_train.csv', index=False)
#extra_train_dataDF.to_csv('species_train_extra.csv', index=False)
test_dataDF.to_csv('species_test.csv', index=False)

Unnamed: 0,id,lat,long
0,31529,-18.286728,143.481247
1,31529,-13.099798,130.783646
2,31529,-13.965274,131.695145
3,31529,-12.853950,132.800507
4,31529,-12.196790,134.279327
...,...,...,...
272032,145031,33.716885,73.203621
272033,145031,24.600239,72.730560
272034,145031,18.849600,80.654129
272035,145031,21.073837,75.945656


Unnamed: 0,id,lat,long
1833,,-4.431143,-171.202988
2231,,-2.811569,-171.774796
2405,,-48.206493,-74.041061
2408,,-45.039169,-73.180496
2410,,-40.356495,-66.358994
...,...,...,...
288104,,-23.839869,117.268517
288107,,-22.654541,119.709366
288110,,-24.801586,117.922195
288111,,-22.258629,115.336998
