In [6]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import tifffile
from rasterio.transform import rowcol
from rasterio.transform import xy

**Variables that could be of interest:**
- BIO1: Annual Mean Temperature
- BIO12: Annual Precipitation
- BIO5: Max Temperature of Warmest Month
- BIO6: Min Temperature of Coldest Month
- BIO15: Precipitation Seasonality (Coefficient of Variation)

# Checking data extraction system

**Process:**
1. Check what coordinate system the tiff uses so that you can query a pixel to get the corresponding features for a location of interest.
2. Check what coordinate system the training set uses, to make sure its locations can be mapped onto the raster pixels.
 

In [7]:
# 1) Checking what coordinate system the tiff uses

# Build paths with pathlib so the notebook also runs on non-Windows systems
# (a hard-coded '\\' separator is only understood by Windows).
bioclim_dir = Path('Data') / 'bioclimatic'

# Every raster used in this notebook, including BIO6, which the example below
# and the extraction step both read.
input_files = ['wc2.1_10m_bio_1.tif', 'wc2.1_10m_bio_5.tif', 'wc2.1_10m_bio_6.tif',
               'wc2.1_10m_bio_12.tif', 'wc2.1_10m_bio_15.tif']

for file in input_files:
    with rasterio.open(bioclim_dir / file) as dataset:
        # Coordinate reference system of the raster
        crs = dataset.crs
        print(f"Coordinate Reference System: {crs}")

        # Affine transform that maps pixel indices to coordinates in that CRS
        transform = dataset.transform
        print(f"Transformation: {transform}")

# Example pixel coordinates (row, col)
row, col = 540, 1080

print("An example:")
with rasterio.open(bioclim_dir / 'wc2.1_10m_bio_6.tif') as dataset:
    # xy() returns (x, y), i.e. (longitude, latitude), for the given pixel
    lon, lat = xy(dataset.transform, row, col)
    print(f"Coordinates for pixel ({row}, {col}): Lon: {lon}, Lat: {lat}")

    # NOTE(review): a value around -3.4e38 here is presumably the raster's
    # nodata sentinel rather than a temperature -- compare with dataset.nodata.
    pixel_value = dataset.read(1)[row, col]
    print(f"Pixel value at ({row}, {col}): {pixel_value}")

Coordinate Reference System: EPSG:4326
Transformation: | 0.17, 0.00,-180.00|
| 0.00,-0.17, 90.00|
| 0.00, 0.00, 1.00|
Coordinate Reference System: EPSG:4326
Transformation: | 0.17, 0.00,-180.00|
| 0.00,-0.17, 90.00|
| 0.00, 0.00, 1.00|
Coordinate Reference System: EPSG:4326
Transformation: | 0.17, 0.00,-180.00|
| 0.00,-0.17, 90.00|
| 0.00, 0.00, 1.00|
Coordinate Reference System: EPSG:4326
Transformation: | 0.17, 0.00,-180.00|
| 0.00,-0.17, 90.00|
| 0.00, 0.00, 1.00|
An example:
Coordinates for pixel (540, 1080): Lon: 0.08333333333331439, Lat: -0.0833333333333286
Pixel value at (540, 1080): -3.3999999521443642e+38


*They are all in the same reference system with the same transformation matrix (as expected)*

From Wikipedia: EPSG:4326 (WGS 84) is a latitude/longitude coordinate system based on the Earth's center of mass, used by the Global Positioning System among others.

In [8]:
# 2) Check what coordinate system the training set uses

# Loading training data. The path is built with pathlib so it also resolves on
# non-Windows systems (a hard-coded '\\' separator is Windows-only).
# `data` is intentionally left open in case other cells read more arrays from it.
data = np.load(Path('Data') / 'species' / 'species_train.npz')
train_locs = data['train_locs']  # 2D array, rows are number of datapoints and columns are "latitude" and "longitude"
train_ids = data['train_ids']

# Sanity check that the values are plausible EPSG:4326 degrees. This is a
# necessary condition only: it confirms the ranges, not the CRS itself.
latitudes = train_locs[:, 0]  # first column (latitude)
longitudes = train_locs[:, 1]  # second column (longitude)

# Both lines should print True: latitude in [-90, 90], longitude in [-180, 180]
print((-90 <= latitudes).all() and (latitudes <= 90).all())
print((-180 <= longitudes).all() and (longitudes <= 180).all())

True
True


# Extracting data

In [9]:
# Prototype on the first 100 observations before scaling up to the full set
train_locs_test, train_ids_test = train_locs[:100], train_ids[:100]
train_locs_test.shape

(100, 2)

In [10]:
# Input files corresponding to the BIO variables (add any input files that you want)
input_files = ['wc2.1_10m_bio_1.tif',  # BIO1: Annual Mean Temperature
               'wc2.1_10m_bio_5.tif',  # BIO5: Max Temperature of Warmest Month
               'wc2.1_10m_bio_6.tif',  # BIO6: Min Temperature of Coldest Month
               'wc2.1_10m_bio_12.tif', # BIO12: Annual Precipitation
               'wc2.1_10m_bio_15.tif'] # BIO15: Precipitation Seasonality (Coefficient of Variation)

# Build paths with pathlib so the notebook also runs on non-Windows systems
bioclim_dir = Path('Data') / 'bioclimatic'

# One row per location, one column per raster; NaN until a value is written
mapped_values = np.full((train_locs_test.shape[0], len(input_files)), np.nan)

# train_locs columns are (latitude, longitude); rasterio wants x (lon) before y (lat)
lats = train_locs_test[:, 0].astype(float)
lons = train_locs_test[:, 1].astype(float)

for idx, tiff_file in enumerate(input_files):
    with rasterio.open(bioclim_dir / tiff_file) as dataset:
        # Read band 1 ONCE per file (not once per location) and turn nodata
        # cells into NaN, so they do not leak through as a huge sentinel value.
        band = dataset.read(1, masked=True).astype('float64').filled(np.nan)

        # Convert all (lon, lat) pairs to (row, col) pixel indices in one call
        rows, cols = rowcol(dataset.transform, lons, lats)
        rows = np.atleast_1d(np.asarray(rows, dtype=int))
        cols = np.atleast_1d(np.asarray(cols, dtype=int))

        # A point exactly on the southern/eastern edge (lat == -90 or lon == 180)
        # maps one pixel past the raster; clip it back onto the last row/column.
        rows = np.clip(rows, 0, dataset.height - 1)
        cols = np.clip(cols, 0, dataset.width - 1)

        # Vectorised lookup of every location's pixel value for this raster
        mapped_values[:, idx] = band[rows, cols]

# NOTE(review): 'BI06' is spelled with a digit zero (should read 'BIO6'). It is
# kept as-is here because other cells may select the column by this exact
# label; rename it everywhere in one go.
column_names = ['Latitude', 'Longitude', 'BIO1', 'BIO5', 'BI06', 'BIO12', 'BIO15']
train_locs_extended = pd.DataFrame(np.hstack((train_locs_test, mapped_values)), columns=column_names)

train_locs_extended.insert(0, "ID", train_ids_test) # insert also the ID column

# Print or save the DataFrame as needed
train_locs_extended.head()

Unnamed: 0,ID,Latitude,Longitude,BIO1,BIO5,BI06,BIO12,BIO15
0,31529,-18.286728,143.481247,25.830833,36.812252,12.26175,778.0,118.931313
1,31529,-13.099798,130.783646,26.970772,35.256001,15.94425,1637.0,106.943298
2,31529,-13.965274,131.695145,27.042313,37.1035,15.0215,1190.0,112.085449
3,31529,-12.85395,132.800507,27.847281,36.48975,17.33075,1438.0,111.659103
4,31529,-12.19679,134.279327,27.310499,35.029251,18.0895,1250.0,112.806938
