In [40]:
import pandas as pd
import geopandas as gpd
from pyproj import CRS
import hdbscan
import numpy as np
import pickle


# Load the per-species occurrence tables (one CSV per species, read in the
# order listed below).
_raw_csv_paths = [
    "bat_final_data.csv",
    "brown_hare_final_data.csv",
    "dormice_final_data.csv",
    "hedgehog_final_data.csv",
    "red_squirrel_final_data.csv",
]
bat, brown_hare, dormice, hedgehog, red_squirrel = map(pd.read_csv, _raw_csv_paths)

In [41]:
# drop na and duplicates
_coord_cols = ['decimalLatitude', 'decimalLongitude']


def _clean_records(records):
    """Drop rows lacking a coordinate, then rows repeating the same coordinate and eventDate."""
    located = records.dropna(subset=_coord_cols)
    return located.drop_duplicates(subset=_coord_cols + ['eventDate'])


bat = _clean_records(bat)
brown_hare = _clean_records(brown_hare)
dormice = _clean_records(dormice)
hedgehog = _clean_records(hedgehog)
red_squirrel = _clean_records(red_squirrel)

In [42]:
# time-based features


def _with_time_features(records):
    """Return `records` with `eventDate` parsed to datetime and `month` / `day_of_year` derived from it."""
    return records.assign(
        # Keyword order matters: the later callables read the already-parsed eventDate.
        eventDate=lambda d: pd.to_datetime(d['eventDate']),
        month=lambda d: d['eventDate'].dt.month,
        day_of_year=lambda d: d['eventDate'].dt.dayofyear,
    )


bat = _with_time_features(bat)
brown_hare = _with_time_features(brown_hare)
dormice = _with_time_features(dormice)
hedgehog = _with_time_features(hedgehog)
red_squirrel = _with_time_features(red_squirrel)

In [43]:
# Drop the raw date and the species name; month / day_of_year were already
# derived from eventDate in the cell above.
_date_species_cols = ['eventDate', 'species']
bat, brown_hare, dormice, hedgehog, red_squirrel = (
    frame.drop(columns=_date_species_cols)
    for frame in (bat, brown_hare, dormice, hedgehog, red_squirrel)
)

In [44]:
# Bat region clustering
#
# Projects the bat sightings to a metric CRS, clusters them with HDBSCAN and
# stores the resulting integer label on every record as `region_id`.

# Build WGS84 (EPSG:4326) points from the longitude/latitude columns.
gdf = gpd.GeoDataFrame(
    bat,
    geometry=gpd.points_from_xy(bat['decimalLongitude'], bat['decimalLatitude']),
    crs='EPSG:4326',
)

# Reproject to EPSG:32630 (UTM zone 30N) so that distances are in metres.
utm_crs = CRS("EPSG:32630")
gdf_utm = gdf.to_crs(utm_crs)

# (n, 2) array of projected x/y, read through the vectorised GeoSeries
# accessors instead of a per-row Python lambda.
coords_utm = np.column_stack([gdf_utm.geometry.x, gdf_utm.geometry.y])

# HDBSCAN clustering
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=2,               # a pair of sightings is enough to form a region
    min_samples=1,
    cluster_selection_epsilon=800.0,  # 800 metres (coordinates are projected)
    metric='euclidean',
)

# Fit and obtain one label per record (-1 marks noise).
labels = clusterer.fit_predict(coords_utm)

# Persist the fitted clusterer. This is written before the noise relabelling
# below.
with open('bat_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer, f)

# Give every noise point (-1) its own unique id, numbered consecutively after
# the largest real cluster label.
noise_points = np.flatnonzero(labels == -1)
max_label = labels.max()
labels[noise_points] = max_label + 1 + np.arange(noise_points.size)

gdf_utm['region_id'] = labels

# gdf_utm keeps bat's index, so this index-aligned assignment is one-to-one.
bat['region_id'] = gdf_utm['region_id']



In [45]:
# Brown hare region clustering
#
# Projects the brown hare sightings to a metric CRS, clusters them with
# HDBSCAN and stores the resulting integer label on every record as `region_id`.

# Build WGS84 (EPSG:4326) points from the longitude/latitude columns.
gdf_brown_hare = gpd.GeoDataFrame(
    brown_hare,
    geometry=gpd.points_from_xy(brown_hare['decimalLongitude'], brown_hare['decimalLatitude']),
    crs='EPSG:4326',
)

# Reproject to EPSG:32630 (UTM zone 30N) so that distances are in metres.
utm_crs_brown_hare = CRS("EPSG:32630")
gdf_brown_hare_utm = gdf_brown_hare.to_crs(utm_crs_brown_hare)

# (n, 2) array of projected x/y, read through the vectorised GeoSeries
# accessors instead of a per-row Python lambda.
coords_utm_brown_hare = np.column_stack(
    [gdf_brown_hare_utm.geometry.x, gdf_brown_hare_utm.geometry.y]
)

# HDBSCAN clustering
clusterer_brown_hare = hdbscan.HDBSCAN(
    min_cluster_size=2,               # a pair of sightings is enough to form a region
    min_samples=1,
    cluster_selection_epsilon=800.0,  # 800 metres (coordinates are projected)
    metric='euclidean',
)

# Fit and obtain one label per record (-1 marks noise).
labels_brown_hare = clusterer_brown_hare.fit_predict(coords_utm_brown_hare)

# Persist the fitted clusterer. This is written before the noise relabelling
# below.
with open('brown_hare_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer_brown_hare, f)

# Give every noise point (-1) its own unique id, numbered consecutively after
# the largest real cluster label.
noise_points_brown_hare = np.flatnonzero(labels_brown_hare == -1)
max_label_brown_hare = labels_brown_hare.max()
labels_brown_hare[noise_points_brown_hare] = (
    max_label_brown_hare + 1 + np.arange(noise_points_brown_hare.size)
)

gdf_brown_hare_utm['region_id'] = labels_brown_hare

# The GeoDataFrame keeps brown_hare's index, so this aligned assignment is one-to-one.
brown_hare['region_id'] = gdf_brown_hare_utm['region_id']



In [46]:
# Dormice region clustering
#
# Projects the dormice sightings to a metric CRS, clusters them with HDBSCAN
# and stores the resulting integer label on every record as `region_id`.

# Build WGS84 (EPSG:4326) points from the longitude/latitude columns.
gdf_dormice = gpd.GeoDataFrame(
    dormice,
    geometry=gpd.points_from_xy(dormice['decimalLongitude'], dormice['decimalLatitude']),
    crs='EPSG:4326',
)

# Reproject to EPSG:32630 (UTM zone 30N) so that distances are in metres.
utm_crs_dormice = CRS("EPSG:32630")
gdf_dormice_utm = gdf_dormice.to_crs(utm_crs_dormice)

# (n, 2) array of projected x/y, read through the vectorised GeoSeries
# accessors instead of a per-row Python lambda.
coords_utm_dormice = np.column_stack(
    [gdf_dormice_utm.geometry.x, gdf_dormice_utm.geometry.y]
)

# HDBSCAN clustering
clusterer_dormice = hdbscan.HDBSCAN(
    min_cluster_size=2,               # a pair of sightings is enough to form a region
    min_samples=1,
    cluster_selection_epsilon=800.0,  # 800 metres (coordinates are projected)
    metric='euclidean',
)

# Fit and obtain one label per record (-1 marks noise).
labels_dormice = clusterer_dormice.fit_predict(coords_utm_dormice)

# Persist the fitted clusterer. This is written before the noise relabelling
# below.
with open('dormice_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer_dormice, f)

# Give every noise point (-1) its own unique id, numbered consecutively after
# the largest real cluster label.
noise_points_dormice = np.flatnonzero(labels_dormice == -1)
max_label_dormice = labels_dormice.max()
labels_dormice[noise_points_dormice] = (
    max_label_dormice + 1 + np.arange(noise_points_dormice.size)
)

gdf_dormice_utm['region_id'] = labels_dormice

# The GeoDataFrame keeps dormice's index, so this aligned assignment is one-to-one.
dormice['region_id'] = gdf_dormice_utm['region_id']



In [47]:
# Hedgehog region clustering
#
# Projects the hedgehog sightings to a metric CRS, clusters them with HDBSCAN
# and stores the resulting integer label on every record as `region_id`.

# Build WGS84 (EPSG:4326) points from the longitude/latitude columns.
gdf_hedgehog = gpd.GeoDataFrame(
    hedgehog,
    geometry=gpd.points_from_xy(hedgehog['decimalLongitude'], hedgehog['decimalLatitude']),
    crs='EPSG:4326',
)

# Reproject to EPSG:32630 (UTM zone 30N) so that distances are in metres.
utm_crs_hedgehog = CRS("EPSG:32630")
gdf_hedgehog_utm = gdf_hedgehog.to_crs(utm_crs_hedgehog)

# (n, 2) array of projected x/y, read through the vectorised GeoSeries
# accessors instead of a per-row Python lambda.
coords_utm_hedgehog = np.column_stack(
    [gdf_hedgehog_utm.geometry.x, gdf_hedgehog_utm.geometry.y]
)

# HDBSCAN clustering
clusterer_hedgehog = hdbscan.HDBSCAN(
    min_cluster_size=2,               # a pair of sightings is enough to form a region
    min_samples=1,
    cluster_selection_epsilon=800.0,  # 800 metres (coordinates are projected)
    metric='euclidean',
)

# Fit and obtain one label per record (-1 marks noise).
labels_hedgehog = clusterer_hedgehog.fit_predict(coords_utm_hedgehog)

# Persist the fitted clusterer. This is written before the noise relabelling
# below.
with open('hedgehog_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer_hedgehog, f)

# Give every noise point (-1) its own unique id, numbered consecutively after
# the largest real cluster label.
noise_points_hedgehog = np.flatnonzero(labels_hedgehog == -1)
max_label_hedgehog = labels_hedgehog.max()
labels_hedgehog[noise_points_hedgehog] = (
    max_label_hedgehog + 1 + np.arange(noise_points_hedgehog.size)
)

gdf_hedgehog_utm['region_id'] = labels_hedgehog

# The GeoDataFrame keeps hedgehog's index, so this aligned assignment is one-to-one.
hedgehog['region_id'] = gdf_hedgehog_utm['region_id']



In [48]:
# Red squirrel region clustering
#
# Projects the red squirrel sightings to a metric CRS, clusters them with
# HDBSCAN and stores the resulting integer label on every record as `region_id`.

# Build WGS84 (EPSG:4326) points from the longitude/latitude columns.
gdf_red_squirrel = gpd.GeoDataFrame(
    red_squirrel,
    geometry=gpd.points_from_xy(red_squirrel['decimalLongitude'], red_squirrel['decimalLatitude']),
    crs='EPSG:4326',
)

# Reproject to EPSG:32630 (UTM zone 30N) so that distances are in metres.
utm_crs_red_squirrel = CRS("EPSG:32630")
gdf_red_squirrel_utm = gdf_red_squirrel.to_crs(utm_crs_red_squirrel)

# (n, 2) array of projected x/y, read through the vectorised GeoSeries
# accessors instead of a per-row Python lambda.
coords_utm_red_squirrel = np.column_stack(
    [gdf_red_squirrel_utm.geometry.x, gdf_red_squirrel_utm.geometry.y]
)

# HDBSCAN clustering
clusterer_red_squirrel = hdbscan.HDBSCAN(
    min_cluster_size=2,               # a pair of sightings is enough to form a region
    min_samples=1,
    cluster_selection_epsilon=800.0,  # 800 metres (coordinates are projected)
    metric='euclidean',
)

# Fit and obtain one label per record (-1 marks noise).
labels_red_squirrel = clusterer_red_squirrel.fit_predict(coords_utm_red_squirrel)

# Persist the fitted clusterer. This is written before the noise relabelling
# below.
with open('red_squirrel_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer_red_squirrel, f)

# Give every noise point (-1) its own unique id, numbered consecutively after
# the largest real cluster label.
noise_points_red_squirrel = np.flatnonzero(labels_red_squirrel == -1)
max_label_red_squirrel = labels_red_squirrel.max()
labels_red_squirrel[noise_points_red_squirrel] = (
    max_label_red_squirrel + 1 + np.arange(noise_points_red_squirrel.size)
)

gdf_red_squirrel_utm['region_id'] = labels_red_squirrel

# The GeoDataFrame keeps red_squirrel's index, so this aligned assignment is one-to-one.
red_squirrel['region_id'] = gdf_red_squirrel_utm['region_id']



In [49]:
# Drop the raw coordinates now that each record carries a region_id.
_lat_lon_cols = ['decimalLatitude', 'decimalLongitude']
bat, brown_hare, dormice, hedgehog, red_squirrel = (
    frame.drop(columns=_lat_lon_cols)
    for frame in (bat, brown_hare, dormice, hedgehog, red_squirrel)
)

In [50]:
from sklearn.preprocessing import StandardScaler

# Environmental index columns that get standardised per species.
env_vars = ['BSI', 'LST', 'MNDWI', 'NDBI', 'NDSI', 'NDVI', 'NDWI', 'SAVI', 'UI']

# A single StandardScaler instance is re-fitted on each species in turn and
# pickled straight after every fit, so each file holds that species'
# statistics. After the loop `scaler` holds the red squirrel fit.
scaler = StandardScaler()
for frame, scaler_path in (
    (bat, 'bat_scaler.pkl'),
    (brown_hare, 'brown_hare_scaler.pkl'),
    (dormice, 'dormice_scaler.pkl'),
    (hedgehog, 'hedgehog_scaler.pkl'),
    (red_squirrel, 'red_squirrel_scaler.pkl'),
):
    # Overwrite the columns in place with their standardised values.
    frame[env_vars] = scaler.fit_transform(frame[env_vars])
    # save the scaler
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)

In [51]:
# Presence / proximity flag columns, listed per species.
bat_bool_columns = [
    'ceratopogonidae_presence',
    'chironomidae_presence',
    'cats_presence',
    'wind_turbines_presence',
    'near_road',
]
dormice_bool_columns = [
    'hazel_presence',
    'birch_presence',
    'beech_presence',
    'honeysuckle_presence',
    'oak_presence',
    'hawthorn_presence',
]
hedgehog_bool_columns = ['badger_presence', 'ground_beetles_presence', 'near_road']
brown_hare_bool_columns = ['winter_wheat_presence']
red_squirrel_bool_columns = ['sitka_spruce_presence', 'grey_squirrel_presence']

# Cast each species' flag columns to bool in place.
# NOTE(review): astype(bool) turns NaN into True - confirm these columns
# contain no missing values.
for frame, flag_cols in (
    (bat, bat_bool_columns),
    (dormice, dormice_bool_columns),
    (hedgehog, hedgehog_bool_columns),
    (brown_hare, brown_hare_bool_columns),
    (red_squirrel, red_squirrel_bool_columns),
):
    frame[flag_cols] = frame[flag_cols].astype(bool)

In [52]:
# One-hot encode Land_cover separately for each species, so each frame only
# gets indicator columns for the categories present in its own data.
bat, brown_hare, dormice, hedgehog, red_squirrel = (
    pd.get_dummies(frame, columns=['Land_cover'])
    for frame in (bat, brown_hare, dormice, hedgehog, red_squirrel)
)


In [54]:
# Display the preprocessed red squirrel frame as a sanity check on the
# scaling, boolean casts and one-hot encoding applied in the cells above.
red_squirrel

Unnamed: 0,BSI,LST,MNDWI,NDBI,NDSI,NDVI,NDWI,SAVI,UI,sitka_spruce_presence,...,Land_cover_Inland rock,Land_cover_Littoral rock,Land_cover_Littoral sediment,Land_cover_Neutral grassland,Land_cover_Saltmarsh,Land_cover_Saltwater,Land_cover_Suburban,Land_cover_Supralittoral rock,Land_cover_Supralittoral sediment,Land_cover_Urban
0,1.395999,0.704739,1.190194,0.318967,1.190194,-1.428353,1.528635,-1.428620,-0.318967,False,...,False,False,False,False,False,False,False,False,False,False
1,-0.444714,0.492250,-0.071536,-0.805157,-0.071536,0.663558,-0.506304,0.663807,0.805157,False,...,False,False,False,False,False,False,False,False,False,False
2,2.252173,0.519514,1.990485,0.134052,1.990485,-2.388220,2.460455,-2.388818,-0.134052,False,...,False,False,False,False,False,False,False,False,False,False
3,-0.399284,0.433378,-0.083044,-0.347021,-0.083044,0.451493,-0.113816,0.451885,0.347021,False,...,False,False,False,False,False,False,True,False,False,False
4,-0.431666,-0.233311,-0.929215,1.443395,-0.929215,-0.310041,-0.486236,-0.311057,-1.443395,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37488,0.820917,0.791313,0.616713,0.497967,0.616713,-0.828457,0.922715,-0.828659,-0.497967,False,...,False,False,False,False,False,False,False,False,False,False
37489,2.467499,1.405014,2.403377,-0.542954,2.403377,-2.457251,2.586302,-2.457876,0.542954,False,...,False,False,False,False,False,False,False,False,False,False
37490,-1.075908,0.502881,-0.824936,-0.440819,-0.824936,1.245877,-1.063531,1.246321,0.440819,False,...,False,False,False,False,False,False,False,False,False,False
37491,0.049248,1.203749,-0.271477,1.400403,-0.271477,-0.470055,0.390742,-0.470308,-1.400403,False,...,False,False,False,False,False,False,True,False,False,False


In [55]:
# Write each preprocessed frame to its own CSV, without the index column.
for frame, out_path in (
    (bat, 'bat_final_data_preprocessed.csv'),
    (brown_hare, 'brown_hare_final_data_preprocessed.csv'),
    (dormice, 'dormice_final_data_preprocessed.csv'),
    (hedgehog, 'hedgehog_final_data_preprocessed.csv'),
    (red_squirrel, 'red_squirrel_final_data_preprocessed.csv'),
):
    frame.to_csv(out_path, index=False)