In [11]:
import pandas as pd
import geopandas as gpd
from pyproj import CRS
import hdbscan
import numpy as np
import pickle


# Per-species input tables (tagged 'presence' later in the notebook).
bat, brown_hare, dormice, hedgehog, red_squirrel = (
    pd.read_csv(csv_path)
    for csv_path in (
        "bat_final_data.csv",
        "brown_hare_final_data.csv",
        "dormice_final_data.csv",
        "hedgehog_final_data.csv",
        "red_squirrel_final_data.csv",
    )
)

# Matching "tgb_" tables (tagged 'background' later), same species order.
tgb_bat, tgb_brown_hare, tgb_dormice, tgb_hedgehog, tgb_red_squirrel = (
    pd.read_csv(csv_path)
    for csv_path in (
        "tgb_bat_final_data.csv",
        "tgb_brown_hare_final_data.csv",
        "tgb_dormice_final_data.csv",
        "tgb_hedgehog_final_data.csv",
        "tgb_red_squirrel_final_data.csv",
    )
)

In [12]:
# Clean every table the same way: discard rows containing any missing value
# first, then discard rows that are exact repeats of an earlier row.
bat = bat.dropna().drop_duplicates()
brown_hare = brown_hare.dropna().drop_duplicates()
dormice = dormice.dropna().drop_duplicates()
hedgehog = hedgehog.dropna().drop_duplicates()
red_squirrel = red_squirrel.dropna().drop_duplicates()

tgb_bat = tgb_bat.dropna().drop_duplicates()
tgb_brown_hare = tgb_brown_hare.dropna().drop_duplicates()
tgb_dormice = tgb_dormice.dropna().drop_duplicates()
tgb_hedgehog = tgb_hedgehog.dropna().drop_duplicates()
tgb_red_squirrel = tgb_red_squirrel.dropna().drop_duplicates()


In [13]:
# Strip the event date and species name columns from all ten tables.
(
    bat, brown_hare, dormice, hedgehog, red_squirrel,
    tgb_bat, tgb_brown_hare, tgb_dormice, tgb_hedgehog, tgb_red_squirrel,
) = (
    frame.drop(columns=['eventDate', 'species'])
    for frame in (
        bat, brown_hare, dormice, hedgehog, red_squirrel,
        tgb_bat, tgb_brown_hare, tgb_dormice, tgb_hedgehog, tgb_red_squirrel,
    )
)

In [14]:
# Record where each row came from before the tables are merged: the species
# tables are marked 'presence', the TGB tables 'background'.
for presence_frame in (bat, brown_hare, dormice, hedgehog, red_squirrel):
    presence_frame['type'] = 'presence'

for background_frame in (tgb_bat, tgb_brown_hare, tgb_dormice, tgb_hedgehog, tgb_red_squirrel):
    background_frame['type'] = 'background'

# Stack each species' TGB rows beneath its own rows and renumber the index 0..n-1.
bat, brown_hare, dormice, hedgehog, red_squirrel = (
    pd.concat([species_rows, tgb_rows], ignore_index=True)
    for species_rows, tgb_rows in (
        (bat, tgb_bat),
        (brown_hare, tgb_brown_hare),
        (dormice, tgb_dormice),
        (hedgehog, tgb_hedgehog),
        (red_squirrel, tgb_red_squirrel),
    )
)

In [15]:
# Bat region clustering
#
# Assigns every row of `bat` a spatial `region_id`: HDBSCAN clusters keep their
# label, and each point HDBSCAN marks as noise (-1) becomes its own region.

# Convert to GeoDataFrame (coordinates are declared as lon/lat, EPSG:4326)
gdf = gpd.GeoDataFrame(
    bat,
    geometry=gpd.points_from_xy(bat['decimalLongitude'], bat['decimalLatitude']),
    crs='EPSG:4326',
)

# Reproject to EPSG:32630 so Euclidean distances are in metres, not degrees
utm_crs = CRS("EPSG:32630")
gdf_utm = gdf.to_crs(utm_crs)

# Extract UTM X/Y for clustering as an (n, 2) array (vectorised; no per-point lambda)
coords_utm = np.column_stack((gdf_utm.geometry.x, gdf_utm.geometry.y))

# HDBSCAN clustering
# NOTE(review): fitted without prediction_data=True; hdbscan.approximate_predict
# requires it (or generate_prediction_data()) — confirm how the pickle is consumed.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=2,       # ensures even sparse regions form clusters
    min_samples=1,
    cluster_selection_epsilon=250.0,  # 250 meters
    metric='euclidean',  # coordinates are projected, so this is metres
)

# Fit and take a private copy of the labels: fit_predict may hand back the
# estimator's own labels_ array, and the relabelling below must not rewrite it.
labels = clusterer.fit_predict(coords_utm).copy()

# Persist the fitted clusterer (its labels still use -1 for noise)
with open('bat_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer, f)

# Assign unique cluster IDs to noise points (-1): max_label + 1, + 2, ...
noise_points = np.flatnonzero(labels == -1)
max_label = labels.max()
labels[noise_points] = max_label + 1 + np.arange(noise_points.size)

gdf_utm['region_id'] = labels

bat['region_id'] = gdf_utm['region_id']



In [16]:
# Brown hare region clustering
#
# Assigns every row of `brown_hare` a spatial `region_id`: HDBSCAN clusters keep
# their label, and each point HDBSCAN marks as noise (-1) becomes its own region.

# Convert to GeoDataFrame (coordinates are declared as lon/lat, EPSG:4326)
gdf_brown_hare = gpd.GeoDataFrame(
    brown_hare,
    geometry=gpd.points_from_xy(brown_hare['decimalLongitude'], brown_hare['decimalLatitude']),
    crs='EPSG:4326',
)
# Reproject to EPSG:32630 so Euclidean distances are in metres, not degrees
utm_crs_brown_hare = CRS("EPSG:32630")
gdf_brown_hare_utm = gdf_brown_hare.to_crs(utm_crs_brown_hare)
# Extract UTM X/Y for clustering as an (n, 2) array (vectorised; no per-point lambda)
coords_utm_brown_hare = np.column_stack(
    (gdf_brown_hare_utm.geometry.x, gdf_brown_hare_utm.geometry.y)
)
# HDBSCAN clustering
# NOTE(review): fitted without prediction_data=True; hdbscan.approximate_predict
# requires it (or generate_prediction_data()) — confirm how the pickle is consumed.
clusterer_brown_hare = hdbscan.HDBSCAN(
    min_cluster_size=2,       # ensures even sparse regions form clusters
    min_samples=1,
    cluster_selection_epsilon=250.0,  # 250 meters
    metric='euclidean',  # coordinates are projected, so this is metres
)
# Fit and take a private copy of the labels: fit_predict may hand back the
# estimator's own labels_ array, and the relabelling below must not rewrite it.
labels_brown_hare = clusterer_brown_hare.fit_predict(coords_utm_brown_hare).copy()
# Persist the fitted clusterer (its labels still use -1 for noise)
with open('brown_hare_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer_brown_hare, f)
# Assign unique cluster IDs to noise points (-1): max_label + 1, + 2, ...
noise_points_brown_hare = np.flatnonzero(labels_brown_hare == -1)
max_label_brown_hare = labels_brown_hare.max()
labels_brown_hare[noise_points_brown_hare] = (
    max_label_brown_hare + 1 + np.arange(noise_points_brown_hare.size)
)
gdf_brown_hare_utm['region_id'] = labels_brown_hare
brown_hare['region_id'] = gdf_brown_hare_utm['region_id']



In [17]:
# Dormice region clustering
#
# Assigns every row of `dormice` a spatial `region_id`: HDBSCAN clusters keep
# their label, and each point HDBSCAN marks as noise (-1) becomes its own region.

# Convert to GeoDataFrame (coordinates are declared as lon/lat, EPSG:4326)
gdf_dormice = gpd.GeoDataFrame(
    dormice,
    geometry=gpd.points_from_xy(dormice['decimalLongitude'], dormice['decimalLatitude']),
    crs='EPSG:4326',
)
# Reproject to EPSG:32630 so Euclidean distances are in metres, not degrees
utm_crs_dormice = CRS("EPSG:32630")
gdf_dormice_utm = gdf_dormice.to_crs(utm_crs_dormice)
# Extract UTM X/Y for clustering as an (n, 2) array (vectorised; no per-point lambda)
coords_utm_dormice = np.column_stack(
    (gdf_dormice_utm.geometry.x, gdf_dormice_utm.geometry.y)
)
# HDBSCAN clustering
# NOTE(review): fitted without prediction_data=True; hdbscan.approximate_predict
# requires it (or generate_prediction_data()) — confirm how the pickle is consumed.
clusterer_dormice = hdbscan.HDBSCAN(
    min_cluster_size=2,       # ensures even sparse regions form clusters
    min_samples=1,
    cluster_selection_epsilon=250.0,  # 250 meters
    metric='euclidean',  # coordinates are projected, so this is metres
)
# Fit and take a private copy of the labels: fit_predict may hand back the
# estimator's own labels_ array, and the relabelling below must not rewrite it.
labels_dormice = clusterer_dormice.fit_predict(coords_utm_dormice).copy()
# Persist the fitted clusterer (its labels still use -1 for noise)
with open('dormice_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer_dormice, f)
# Assign unique cluster IDs to noise points (-1): max_label + 1, + 2, ...
noise_points_dormice = np.flatnonzero(labels_dormice == -1)
max_label_dormice = labels_dormice.max()
labels_dormice[noise_points_dormice] = (
    max_label_dormice + 1 + np.arange(noise_points_dormice.size)
)
gdf_dormice_utm['region_id'] = labels_dormice
dormice['region_id'] = gdf_dormice_utm['region_id']



In [18]:
# Hedgehog region clustering
#
# Assigns every row of `hedgehog` a spatial `region_id`: HDBSCAN clusters keep
# their label, and each point HDBSCAN marks as noise (-1) becomes its own region.

# Convert to GeoDataFrame (coordinates are declared as lon/lat, EPSG:4326)
gdf_hedgehog = gpd.GeoDataFrame(
    hedgehog,
    geometry=gpd.points_from_xy(hedgehog['decimalLongitude'], hedgehog['decimalLatitude']),
    crs='EPSG:4326',
)
# Reproject to EPSG:32630 so Euclidean distances are in metres, not degrees
utm_crs_hedgehog = CRS("EPSG:32630")
gdf_hedgehog_utm = gdf_hedgehog.to_crs(utm_crs_hedgehog)
# Extract UTM X/Y for clustering as an (n, 2) array (vectorised; no per-point lambda)
coords_utm_hedgehog = np.column_stack(
    (gdf_hedgehog_utm.geometry.x, gdf_hedgehog_utm.geometry.y)
)
# HDBSCAN clustering
# NOTE(review): fitted without prediction_data=True; hdbscan.approximate_predict
# requires it (or generate_prediction_data()) — confirm how the pickle is consumed.
clusterer_hedgehog = hdbscan.HDBSCAN(
    min_cluster_size=2,       # ensures even sparse regions form clusters
    min_samples=1,
    cluster_selection_epsilon=250.0,  # 250 meters
    metric='euclidean',  # coordinates are projected, so this is metres
)
# Fit and take a private copy of the labels: fit_predict may hand back the
# estimator's own labels_ array, and the relabelling below must not rewrite it.
labels_hedgehog = clusterer_hedgehog.fit_predict(coords_utm_hedgehog).copy()
# Persist the fitted clusterer (its labels still use -1 for noise)
with open('hedgehog_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer_hedgehog, f)
# Assign unique cluster IDs to noise points (-1): max_label + 1, + 2, ...
noise_points_hedgehog = np.flatnonzero(labels_hedgehog == -1)
max_label_hedgehog = labels_hedgehog.max()
labels_hedgehog[noise_points_hedgehog] = (
    max_label_hedgehog + 1 + np.arange(noise_points_hedgehog.size)
)
gdf_hedgehog_utm['region_id'] = labels_hedgehog
hedgehog['region_id'] = gdf_hedgehog_utm['region_id']



In [19]:
# Red squirrel region clustering
#
# Assigns every row of `red_squirrel` a spatial `region_id`: HDBSCAN clusters keep
# their label, and each point HDBSCAN marks as noise (-1) becomes its own region.

# Convert to GeoDataFrame (coordinates are declared as lon/lat, EPSG:4326)
gdf_red_squirrel = gpd.GeoDataFrame(
    red_squirrel,
    geometry=gpd.points_from_xy(red_squirrel['decimalLongitude'], red_squirrel['decimalLatitude']),
    crs='EPSG:4326',
)
# Reproject to EPSG:32630 so Euclidean distances are in metres, not degrees
utm_crs_red_squirrel = CRS("EPSG:32630")
gdf_red_squirrel_utm = gdf_red_squirrel.to_crs(utm_crs_red_squirrel)
# Extract UTM X/Y for clustering as an (n, 2) array (vectorised; no per-point lambda)
coords_utm_red_squirrel = np.column_stack(
    (gdf_red_squirrel_utm.geometry.x, gdf_red_squirrel_utm.geometry.y)
)
# HDBSCAN clustering
# NOTE(review): fitted without prediction_data=True; hdbscan.approximate_predict
# requires it (or generate_prediction_data()) — confirm how the pickle is consumed.
clusterer_red_squirrel = hdbscan.HDBSCAN(
    min_cluster_size=2,       # ensures even sparse regions form clusters
    min_samples=1,
    cluster_selection_epsilon=250.0,  # 250 meters
    metric='euclidean',  # coordinates are projected, so this is metres
)
# Fit and take a private copy of the labels: fit_predict may hand back the
# estimator's own labels_ array, and the relabelling below must not rewrite it.
labels_red_squirrel = clusterer_red_squirrel.fit_predict(coords_utm_red_squirrel).copy()
# Persist the fitted clusterer (its labels still use -1 for noise)
with open('red_squirrel_region_clusterer.pkl', 'wb') as f:
    pickle.dump(clusterer_red_squirrel, f)
# Assign unique cluster IDs to noise points (-1): max_label + 1, + 2, ...
noise_points_red_squirrel = np.flatnonzero(labels_red_squirrel == -1)
max_label_red_squirrel = labels_red_squirrel.max()
labels_red_squirrel[noise_points_red_squirrel] = (
    max_label_red_squirrel + 1 + np.arange(noise_points_red_squirrel.size)
)
gdf_red_squirrel_utm['region_id'] = labels_red_squirrel
red_squirrel['region_id'] = gdf_red_squirrel_utm['region_id']



In [10]:
# Remove the raw latitude/longitude columns from each species table.
bat = bat.drop(['decimalLatitude', 'decimalLongitude'], axis='columns')
brown_hare = brown_hare.drop(['decimalLatitude', 'decimalLongitude'], axis='columns')
dormice = dormice.drop(['decimalLatitude', 'decimalLongitude'], axis='columns')
hedgehog = hedgehog.drop(['decimalLatitude', 'decimalLongitude'], axis='columns')
red_squirrel = red_squirrel.drop(['decimalLatitude', 'decimalLongitude'], axis='columns')

In [20]:
# Columns of each species table that are to be stored as booleans.
bat_bool_columns = [
    'ceratopogonidae_presence',
    'chironomidae_presence',
    'cats_presence',
    'wind_turbines_presence',
    'near_road',
]
dormice_bool_columns = [
    'hazel_presence',
    'birch_presence',
    'beech_presence',
    'honeysuckle_presence',
    'oak_presence',
    'hawthorn_presence',
]
hedgehog_bool_columns = ['badger_presence', 'ground_beetles_presence', 'near_road']
brown_hare_bool_columns = ['winter_wheat_presence']
red_squirrel_bool_columns = ['sitka_spruce_presence', 'grey_squirrel_presence']

# Cast only the listed columns to bool; every other column keeps its dtype.
bat = bat.astype(dict.fromkeys(bat_bool_columns, bool))
dormice = dormice.astype(dict.fromkeys(dormice_bool_columns, bool))
hedgehog = hedgehog.astype(dict.fromkeys(hedgehog_bool_columns, bool))
brown_hare = brown_hare.astype(dict.fromkeys(brown_hare_bool_columns, bool))
red_squirrel = red_squirrel.astype(dict.fromkeys(red_squirrel_bool_columns, bool))

In [21]:
# One-hot encode the 'Land_cover' column of every species table.
bat, brown_hare, dormice, hedgehog, red_squirrel = (
    pd.get_dummies(frame, columns=['Land_cover'])
    for frame in (bat, brown_hare, dormice, hedgehog, red_squirrel)
)


In [31]:
# Write each preprocessed species table to its own CSV, omitting the row index.
for frame, out_path in (
    (bat, 'bat_final_data_preprocessed.csv'),
    (brown_hare, 'brown_hare_final_data_preprocessed.csv'),
    (dormice, 'dormice_final_data_preprocessed.csv'),
    (hedgehog, 'hedgehog_final_data_preprocessed.csv'),
    (red_squirrel, 'red_squirrel_final_data_preprocessed.csv'),
):
    frame.to_csv(out_path, index=False)

In [22]:
# Disabled alternative export: saves the dataframes under "*_with_coords" file names
# bat.to_csv('bat_final_data_preprocessed_with_coords.csv', index=False)
# brown_hare.to_csv('brown_hare_final_data_preprocessed_with_coords.csv', index=False)
# dormice.to_csv('dormice_final_data_preprocessed_with_coords.csv', index=False)
# hedgehog.to_csv('hedgehog_final_data_preprocessed_with_coords.csv', index=False)
# red_squirrel.to_csv('red_squirrel_final_data_preprocessed_with_coords.csv', index=False)