## Locations data cleaning

Outliers are removed from the species' locations based on the corresponding outliers in the bioclimatic data.

### Preliminary code

In [1]:
# Importing useful packages
from pathlib import Path
import numpy as np
import pandas as pd

In [None]:
# Loading train data
species_train = np.load('species_train.npz')
species_ids = species_train['taxon_ids'].astype(str)                # list of all species' IDs
species_names = species_train['taxon_names']                        # list of all species' taxon names
species_names = dict(zip(species_ids,species_names))                # dictionary of all species' IDs -> taxonomy names
train_ids = species_train['train_ids'].astype(str)                  # 1D array of each location's relevant specie IDs
train_locations = species_train['train_locs'].astype(np.float64)    # 2D array with each datapoint's latitude and longitude
train_latitudes = train_locations[:,0]
train_longitudes = train_locations[:,1]
train_dataDF = pd.concat([pd.DataFrame(train_ids),pd.DataFrame(train_locations)],axis=1)
train_dataDF.columns = ['id', 'lat', 'long']
display(train_dataDF)

# Preparing extra-train data
extra_species_train = np.load('species_train_extra.npz')
extra_train_ids = extra_species_train['train_ids'].astype(str)                  # 1D array of each location's relevant specie IDs
extra_train_locations = extra_species_train['train_locs'].astype(np.float64)    # 2D array with each datapoint's latitude and longitude
extra_train_longitudes = extra_train_locations[:,1]
extra_train_latitudes = extra_train_locations[:,0]
extra_train_dataDF = pd.concat([pd.DataFrame(extra_train_ids),pd.DataFrame(extra_train_locations)],axis=1)
extra_train_dataDF.columns = ['id', 'lat', 'long']
display(extra_train_dataDF)

# Preparing test data 
species_test = np.load('species_test.npz', allow_pickle=True)
taxon_ids = species_test['taxon_ids'].astype(str)               # 1D array of each location's relevant specie IDs
test_locations = species_test['test_locs'].astype(np.float64)   # 2D array with each datapoint's latitude and longitude
test_pos_inds = species_test['test_pos_inds']                   # list of lists, each list corresponds to the indices in test_locs where a given species is present (it can be assumed that they are not present in the other locations)
test_pos_inds = dict(zip(taxon_ids, test_pos_inds))
rows = []
for taxon_id,indices in test_pos_inds.items():
    for idx in indices:
        lat,long = test_locations[idx]
        rows.append({'id':taxon_id,'lat':lat,'long':long})
test_dataDF = pd.DataFrame(rows, columns=['id','lat','long'])
display(test_dataDF)

Unnamed: 0,id,lat,long
0,31529,-18.286728,143.481247
1,31529,-13.099798,130.783646
2,31529,-13.965274,131.695145
3,31529,-12.853950,132.800507
4,31529,-12.196790,134.279327
...,...,...,...
272032,145031,33.716885,73.203621
272033,145031,24.600239,72.730560
272034,145031,18.849600,80.654129
272035,145031,21.073837,75.945656


Unnamed: 0,id,lat,long
0,14,-22.128679,-46.795666
1,14,-22.833548,-47.105415
2,14,-20.454288,-54.581146
3,14,-22.676571,-45.841366
4,14,-22.310076,-42.497063
...,...,...,...
1067587,1369303,3.339513,101.247047
1067588,1369303,3.176027,101.830063
1067589,1369303,3.339002,101.244804
1067590,1369303,3.342500,101.246635


Unnamed: 0,id,lat,long
0,31529,-19.884237,126.052979
1,31529,-20.219316,124.723953
2,31529,-20.053690,125.386505
3,31529,-19.973000,126.462440
4,31529,-19.962839,124.980362
...,...,...,...
1706641,145031,31.998211,72.541458
1706642,145031,26.927755,69.225052
1706643,145031,23.349318,70.605515
1706644,145031,23.706282,68.259659


### Cleaning data

In [3]:
# Cleaning datasets based on bioclim outliers
bioclimatic_train = pd.read_csv(Path('../bioclimatic/cleaned_bioclimatic_train.csv'), dtype={'id': str, 'long': np.float32, 'lat': np.float32})
bioclimatic_train_extra = pd.read_csv(Path('../bioclimatic/cleaned_bioclimatic_train_extra.csv'), dtype={'id': str,'long':np.float32,'lat':np.float32})
bioclimatic_test = pd.read_csv(Path('../bioclimatic/cleaned_bioclimatic_test.csv'), dtype={'id': str, 'long': np.float32, 'lat': np.float32})
cleaned_train_dataDF = train_dataDF[train_dataDF[['long', 'lat']].apply(tuple, axis=1).isin(list(bioclimatic_train[['long', 'lat']].apply(tuple, axis=1).unique()))]
cleaned_extra_train_dataDF = extra_train_dataDF[extra_train_dataDF[['long', 'lat']].apply(tuple, axis=1).isin(bioclimatic_train_extra[['long', 'lat']].apply(tuple, axis=1))]
cleaned_test_dataDF = test_dataDF[test_dataDF[['long', 'lat']].apply(tuple, axis=1).isin(bioclimatic_test[['long', 'lat']].apply(tuple, axis=1))]
cleaned_train_dataDF[['long', 'lat']] = train_dataDF.loc[cleaned_train_dataDF.index, ['long', 'lat']] # restoring np.float64 values
cleaned_extra_train_dataDF[['long', 'lat']] = extra_train_dataDF.loc[cleaned_extra_train_dataDF.index, ['long', 'lat']]
cleaned_test_dataDF[['long', 'lat']] = test_dataDF.loc[cleaned_test_dataDF.index, ['long', 'lat']]
display(cleaned_train_dataDF)
display(cleaned_extra_train_dataDF)
display(cleaned_test_dataDF)
cleaned_train_dataDF.to_csv('species_train.csv', index=False)
cleaned_extra_train_dataDF.to_csv('species_train_extra.csv', index=False)
cleaned_test_dataDF.to_csv('species_test.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_train_dataDF[['long', 'lat']] = train_dataDF.loc[cleaned_train_dataDF.index, ['long', 'lat']] # restoring np.float64 values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_extra_train_dataDF[['long', 'lat']] = extra_train_dataDF.loc[cleaned_extra_train_dataDF.index, ['long', 'lat']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

Unnamed: 0,id,lat,long
0,31529,-18.286728,143.481247
1,31529,-13.099798,130.783646
2,31529,-13.965274,131.695145
3,31529,-12.853950,132.800507
4,31529,-12.196790,134.279327
...,...,...,...
272032,145031,33.716885,73.203621
272033,145031,24.600239,72.730560
272034,145031,18.849600,80.654129
272035,145031,21.073837,75.945656


Unnamed: 0,id,lat,long
0,14,-22.128679,-46.795666
1,14,-22.833548,-47.105415
2,14,-20.454288,-54.581146
3,14,-22.676571,-45.841366
4,14,-22.310076,-42.497063
...,...,...,...
1067587,1369303,3.339513,101.247047
1067588,1369303,3.176027,101.830063
1067589,1369303,3.339002,101.244804
1067590,1369303,3.342500,101.246635


Unnamed: 0,id,lat,long
0,31529,-19.884237,126.052979
1,31529,-20.219316,124.723953
2,31529,-20.053690,125.386505
3,31529,-19.973000,126.462440
4,31529,-19.962839,124.980362
...,...,...,...
1706641,145031,31.998211,72.541458
1706642,145031,26.927755,69.225052
1706643,145031,23.349318,70.605515
1706644,145031,23.706282,68.259659
