# Bioclimatic data extraction
For each long-lat instance present in both the species' train and test set, the value of the following variables of interest are extracted:

- BIO1: Annual Mean Temperature
- BIO5: Max Temperature of Warmest Month
- BIO6: Min Temperature of Coldest Month
- BIO12: Annual Precipitation
- BIO15: Precipitation Seasonality (Coefficient of Variation)

### Preliminary code

In [1]:
# Importing useful libraries
import numpy as np
import pandas as pd
import tifffile
import rasterio
from rasterio.transform import xy
from rasterio.transform import rowcol
from pathlib import Path

### Checking data extraction system
**Process:**
1. Checking the coordinate system used by tiff, allowing to query a pixel and get the corresponding features for a location
2. Checking the coordinate system used by the training and test sets, to ensure that the mapping is coherent

In [2]:
# 1) Checking what coordinate system the tiff uses

input_files = ['wc2.1_10m_bio_1.tif', 'wc2.1_10m_bio_5.tif', 'wc2.1_10m_bio_12.tif', 'wc2.1_10m_bio_15.tif']

# Open the TIFF file
for file in input_files:
    with rasterio.open(file) as dataset:
    # Check the coordinate reference system
        crs = dataset.crs
        print(f"Coordinate Reference System: {crs}")

    # Check the transformation matrix (maps pixel to coordinates)
        transform = dataset.transform
        print(f"Transformation: {transform}")
# Example pixel coordinates (row, col)
row, col = 540, 1080

print("An example:")
# Get geographic coordinates (longitude, latitude)
with rasterio.open('wc2.1_10m_bio_6.tif') as dataset:
    lon, lat = xy(dataset.transform, row, col)
    print(f"Coordinates for pixel ({row}, {col}): Lon: {lon}, Lat: {lat}")
    pixel_value = dataset.read(1)[row, col]
    print(f"Pixel value at ({row}, {col}): {pixel_value}")

Coordinate Reference System: EPSG:4326
Transformation: | 0.17, 0.00,-180.00|
| 0.00,-0.17, 90.00|
| 0.00, 0.00, 1.00|
Coordinate Reference System: EPSG:4326
Transformation: | 0.17, 0.00,-180.00|
| 0.00,-0.17, 90.00|
| 0.00, 0.00, 1.00|
Coordinate Reference System: EPSG:4326
Transformation: | 0.17, 0.00,-180.00|
| 0.00,-0.17, 90.00|
| 0.00, 0.00, 1.00|
Coordinate Reference System: EPSG:4326
Transformation: | 0.17, 0.00,-180.00|
| 0.00,-0.17, 90.00|
| 0.00, 0.00, 1.00|
An example:
Coordinates for pixel (540, 1080): Lon: 0.08333333333331439, Lat: -0.0833333333333286
Pixel value at (540, 1080): -3.3999999521443642e+38


*They are all in the same reference system with the same transformation matrix (as expected)*

Wikipedia :EPSG:4326 - WGS 84, latitude/longitude coordinate system based on the Earth's center of mass, used by the Global Positioning System among others.

In [None]:
# 2) Checking what coordinate system the training and test sets use

# Loading training data    
data = np.load(Path('../species/species_train.npz'))
train_locs = data['train_locs']  # 2D array, rows are number of datapoints and columns are "latitude" and "longitude"
train_ids = data['train_ids'].astype(str)
    
# Checking that they also follow EPSG:4326
# Check the latitude and longitude ranges
latitudes = train_locs[:, 0]  # first column (latitude)
longitudes = train_locs[:, 1]  # second column (longitude)

# Check if latitudes and longitudes are within valid ranges
print((-90 <= latitudes).all() and (latitudes <= 90).all())
print((-180 <= longitudes).all() and (longitudes <= 180).all())


# Loading test data
data_test = np.load(Path('../species/species_test.npz'), allow_pickle=True)
test_locs = data_test['test_locs']   

# Checking that they also follow EPSG:4326
# Check the latitude and longitude ranges
latitudes = test_locs[:, 0]  # first column (latitude)
longitudes = test_locs[:, 1]  # second column (longitude)

# Check if latitudes and longitudes are within valid ranges
print((-90 <= latitudes).all() and (latitudes <= 90).all())
print((-180 <= longitudes).all() and (longitudes <= 180).all())

True
True
True
True


### Extracting data

In [4]:
# Input files corresponding to the BIO variables
input_files = ['wc2.1_10m_bio_1.tif',  # BIO1: Annual Mean Temperature
               'wc2.1_10m_bio_5.tif',  # BIO5: Max Temperature of Warmest Month
               'wc2.1_10m_bio_6.tif',  # BIO6 = Min Temperature of Coldest Month
               'wc2.1_10m_bio_12.tif', # BIO12: Annual Precipitation
               'wc2.1_10m_bio_15.tif'] # BIO15: Precipitation Seasonality (Coefficient of Variation)

In [14]:
# Extracting train-set's bioclimatic data

# Empty list to store the values from each TIFF file
mapped_values = np.zeros((train_locs.shape[0], len(input_files)))

for idx, tiff_file in enumerate(input_files):
    with rasterio.open(tiff_file) as dataset:
        transform = dataset.transform
        
        # Convert lat/lon coordinates to (row, col) pixel coordinates for all locations
        lon_lats = train_locs[:, ::-1]  # reverse to get (lon, lat) for rasterio -> different requirements
        rows_cols = np.array([rowcol(transform, lon, lat) for lon, lat in lon_lats])
        
        # Extract the pixel values from the raster for each (row, col)
        for i, (row, col) in enumerate(rows_cols):
            row, col = int(row), int(col) # potential issues
            
            # Read the value from the TIFF file at the corresponding pixel
            mapped_values[i, idx] = dataset.read(1)[row, col]  # layer/band 1
        print("done layer")

column_names = ['lat', 'long', 'BIO1', 'BIO5', 'BI06', 'BIO12', 'BIO15']
train_locs_extended = pd.DataFrame(np.hstack((train_locs, mapped_values)), columns=column_names)

train_locs_extended.insert(0, "id", train_ids) # insert also the ids column

# Display and save the dataset
display(train_locs_extended.head())
train_locs_extended.to_csv('bioclimatic_train.csv', index=False)

done layer
done layer
done layer
done layer
done layer


Unnamed: 0,id,lat,long,BIO1,BIO5,BI06,BIO12,BIO15
0,31529,-18.286728,143.481247,25.830833,36.812252,12.26175,778.0,118.931313
1,31529,-13.099798,130.783646,26.970772,35.256001,15.94425,1637.0,106.943298
2,31529,-13.965274,131.695145,27.042313,37.1035,15.0215,1190.0,112.085449
3,31529,-12.85395,132.800507,27.847281,36.48975,17.33075,1438.0,111.659103
4,31529,-12.19679,134.279327,27.310499,35.029251,18.0895,1250.0,112.806938


In [13]:
# Extracting test-set's bioclimatic data

# Input files corresponding to the BIO variables (add any input files that you want)
input_files = ['wc2.1_10m_bio_1.tif',  # BIO1: Annual Mean Temperature
               'wc2.1_10m_bio_5.tif',  # BIO5: Max Temperature of Warmest Month
               'wc2.1_10m_bio_6.tif',  # BIO6 = Min Temperature of Coldest Month
               'wc2.1_10m_bio_12.tif', # BIO12: Annual Precipitation
               'wc2.1_10m_bio_15.tif'] # BIO15: Precipitation Seasonality (Coefficient of Variation)

# Empty list to store the values from each TIFF file
mapped_values = np.zeros((test_locs.shape[0], len(input_files)))

for idx, tiff_file in enumerate(input_files):
    with rasterio.open(tiff_file) as dataset:
        transform = dataset.transform
        
        # Convert lat/lon coordinates to (row, col) pixel coordinates for all locations
        lon_lats = test_locs[:, ::-1]  # reverse to get (lon, lat) for rasterio -> different requirements
        rows_cols = np.array([rowcol(transform, lon, lat) for lon, lat in lon_lats])
        
        # Extract the pixel values from the raster for each (row, col)
        for i, (row, col) in enumerate(rows_cols):
            row, col = int(row), int(col) # potential issues
            
            # Read the value from the TIFF file at the corresponding pixel
            mapped_values[i, idx] = dataset.read(1)[row, col]  # layer/band 1
        print("done layer")

column_names = ['lat', 'long', 'BIO1', 'BIO5', 'BI06', 'BIO12', 'BIO15']
test_locs_extended = pd.DataFrame(np.hstack((test_locs, mapped_values)), columns=column_names)

# Display and save the dataset
display(test_locs_extended.head())
test_locs_extended.to_csv('bioclimatic_test.csv', index=False)

done layer
done layer
done layer
done layer
done layer


Unnamed: 0,lat,long,BIO1,BIO5,BI06,BIO12,BIO15
0,9.630478,-173.535599,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38
1,3.839375,-162.544464,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38
2,4.289169,-167.944778,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38
3,3.879849,-169.720459,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38
4,-6.23721,-169.554123,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38,-3.4000000000000003e+38
