In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt

# Load your data
df = pd.read_csv('../00_data/solar_sites_coords.csv')

coords = df[['latitude', 'longitude']].to_numpy()
kms_per_radian = 6371.0088
epsilon = 0.1 / kms_per_radian  # 100 meters

db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine')
df['cluster'] = db.fit_predict(np.radians(coords))

# Take one point per cluster
df_dedup = df.groupby('cluster').first().reset_index()
df_dedup

Unnamed: 0,cluster,id,latitude,longitude
0,0,2674178575,12.658645,120.416232
1,1,3268411673,16.307015,119.785458
2,2,6380661297,14.961935,120.906647
3,3,6761083556,7.070329,125.622841
4,4,8947255151,11.642965,122.307870
...,...,...,...,...
142,142,1383901465,15.648852,120.399136
143,143,1383902503,16.069367,120.635435
144,144,1383904331,16.074920,120.643048
145,145,1383905226,16.072856,120.639324


In [3]:
import folium
# from folium.plugins import MarkerCluster

m = folium.Map(location=[df_dedup.latitude.mean(), df_dedup.longitude.mean()], zoom_start=6)
# marker_cluster = MarkerCluster().add_to(m)

for _, row in df_dedup.iterrows():
    folium.CircleMarker(
        location=(row['latitude'], row['longitude']),
        radius=2,
        color='blue',
        fill=True,
        fill_opacity=0.7
    ).add_to(m)

m.save("deduplicated_pv_sites_NO_MARKER.html")


# Sample Spatially Diverse Unlabeled Data

In [2]:
from shapely.geometry import Point
import random

# Philippines bounding box (rough)
bbox = {
    'min_lat': 5.58100332277,
    'max_lat': 18.5052273625,
    'min_lon': 117.17427453,
    'max_lon': 126.537423944,
}

# Create unlabeled candidate points
num_candidates = 2000
random.seed(42)
candidates = [
    Point(random.uniform(bbox['min_lon'], bbox['max_lon']),
          random.uniform(bbox['min_lat'], bbox['max_lat']))
    for _ in range(num_candidates)
]

# Build GeoDataFrames
gdf_pos = gpd.GeoDataFrame(geometry=gpd.points_from_xy(df_dedup.longitude, df_dedup.latitude), crs='EPSG:4326')
gdf_candidates = gpd.GeoDataFrame(geometry=candidates, crs='EPSG:4326')

# Buffer positives (e.g., 500m exclusion zone)
gdf_pos_buffered = gdf_pos.to_crs(epsg=32651).buffer(500).to_crs(epsg=4326)

# Remove candidates near positives
mask = ~gdf_candidates.geometry.apply(lambda pt: any(pt.within(poly) for poly in gdf_pos_buffered))
filtered_candidates = gdf_candidates[mask]
n_available = len(filtered_candidates)
n_to_sample = min(num_candidates, n_available)

gdf_unlabeled = filtered_candidates.sample(n=n_to_sample, random_state=42).reset_index(drop=True)

In [6]:
gdf_unlabeled

Unnamed: 0,geometry
0,POINT (120.17926 17.41872)
1,POINT (124.18688 12.93756)
2,POINT (117.9815 11.19637)
3,POINT (120.98421 17.66557)
4,POINT (120.10823 16.9671)
...,...
1995,POINT (126.05825 9.87465)
1996,POINT (123.26051 11.26895)
1997,POINT (123.61653 5.9254)
1998,POINT (120.06798 11.91951)


In [19]:
import folium

m2 = folium.Map(location=[df_dedup.latitude.mean(), df_dedup.longitude.mean()], zoom_start=6)

# Softer gray-blue for unlabeled points
for _, row in gdf_unlabeled.iterrows():
    folium.CircleMarker(
        location=(row.geometry.y, row.geometry.x),
        radius=2,
        color='#A0AEC0',  # muted gray-blue
        fill=True,
        fill_opacity=0.15
    ).add_to(m2)

# Strong green-teal for positive points
for _, row in df_dedup.iterrows():
    folium.CircleMarker(
        location=(row['latitude'], row['longitude']),
        radius=2,
        color='#2C7A7B',  # teal
        fill=True,
        fill_opacity=0.8
    ).add_to(m2)

m2.save("labeled_and_unlabeled.html")


In [14]:
df_dedup

Unnamed: 0,cluster,id,latitude,longitude
0,0,2674178575,12.658645,120.416232
1,1,3268411673,16.307015,119.785458
2,2,6380661297,14.961935,120.906647
3,3,6761083556,7.070329,125.622841
4,4,8947255151,11.642965,122.307870
...,...,...,...,...
142,142,1383901465,15.648852,120.399136
143,143,1383902503,16.069367,120.635435
144,144,1383904331,16.074920,120.643048
145,145,1383905226,16.072856,120.639324


In [None]:
# Positive sites
gdf_pos['label'] = 1
gdf_pos['longitude'] = df_dedup.longitude
gdf_pos['latitude'] = df_dedup.latitude
gdf_pos['id'] = df_dedup.id

# Unlabeled sites
gdf_unlabeled['label'] = 0
gdf_unlabeled['longitude'] = gdf_unlabeled.geometry.x
gdf_unlabeled['latitude'] = gdf_unlabeled.geometry.y
gdf_unlabeled['id'] = ['unl_' + str(i) for i in range(len(gdf_unlabeled))]

# Combine and save
combined = pd.concat([gdf_pos, gdf_unlabeled], ignore_index=True)
combined

Unnamed: 0,geometry,label,longitude,latitude,id
0,POINT (120.41623 12.65865),1,120.416232,12.658645,2674178575
1,POINT (119.78546 16.30702),1,119.785458,16.307015,3268411673
2,POINT (120.90665 14.96193),1,120.906647,14.961935,6380661297
3,POINT (125.62284 7.07033),1,125.622841,7.070329,6761083556
4,POINT (122.30787 11.64297),1,122.307870,11.642965,8947255151
...,...,...,...,...,...
20141,POINT (122.98338 11.41626),0,122.983377,11.416262,unl_19994
20142,POINT (125.97356 10.1455),0,125.973563,10.145500,unl_19995
20143,POINT (117.58548 14.56979),0,117.585480,14.569789,unl_19996
20144,POINT (123.61653 5.9254),0,123.616530,5.925397,unl_19997


In [None]:
combined[['id', 'longitude', 'latitude', 'label']].to_csv('../00_data/pv_labeled_data.csv', index=False)

# Random Sample per Barangay

In [7]:
# Positive sites
gdf_pos = gpd.GeoDataFrame(geometry=gpd.points_from_xy(df_dedup.longitude, df_dedup.latitude), crs='EPSG:4326')
gdf_pos['label'] = 1
gdf_pos['longitude'] = df_dedup.longitude
gdf_pos['latitude'] = df_dedup.latitude
gdf_pos['id'] = df_dedup.id
gdf_pos

Unnamed: 0,geometry,label,longitude,latitude,id
0,POINT (120.41623 12.65865),1,120.416232,12.658645,2674178575
1,POINT (119.78546 16.30702),1,119.785458,16.307015,3268411673
2,POINT (120.90665 14.96193),1,120.906647,14.961935,6380661297
3,POINT (125.62284 7.07033),1,125.622841,7.070329,6761083556
4,POINT (122.30787 11.64297),1,122.307870,11.642965,8947255151
...,...,...,...,...,...
142,POINT (120.39914 15.64885),1,120.399136,15.648852,1383901465
143,POINT (120.63544 16.06937),1,120.635435,16.069367,1383902503
144,POINT (120.64305 16.07492),1,120.643048,16.074920,1383904331
145,POINT (120.63932 16.07286),1,120.639324,16.072856,1383905226


In [8]:
import geopandas as gpd

gdf_boundaries = gpd.read_file("../00_data/PH_Adm4_BgySubMuns.shp")
gdf_boundaries

  result = read_func(


Unnamed: 0,adm1_psgc,adm2_psgc,adm3_psgc,adm4_psgc,adm4_en,geo_level,len_crs,area_crs,len_km,area_km2,geometry
0,100000000,102800000,102801000,102801001,Adams,Bgy,45997,111184551,45,111.0,"POLYGON ((280486.139 2048388.148, 283183.575 2..."
1,100000000,102800000,102802000,102802001,Bani,Bgy,5982,1761135,5,1.0,"POLYGON ((247619.019 2022359.605, 247623.585 2..."
2,100000000,102800000,102802000,102802002,Buyon,Bgy,9117,3875134,9,3.0,"POLYGON ((249124.075 2019071.573, 249782.73 20..."
3,100000000,102800000,102802000,102802003,Cabaruan,Bgy,7745,2987648,7,2.0,"POLYGON ((245180.248 2021560.567, 246674.214 2..."
4,100000000,102800000,102802000,102802004,Cabulalaan,Bgy,4502,1018354,4,1.0,"POLYGON ((244550.24 2023086.477, 244585.568 20..."
...,...,...,...,...,...,...,...,...,...,...,...
42012,1900000000,1999900000,1999908000,1999908004,Lagunde,Bgy,14257,6423026,14,6.0,"POLYGON ((679278.134 782440.633, 679278.237 78..."
42013,1900000000,1999900000,1999908000,1999908005,Macasendeg,Bgy,14420,11386837,14,11.0,"POLYGON ((673037.738 779700.887, 672394.148 77..."
42014,1900000000,1999900000,1999908000,1999908006,Manaulanan,Bgy,21329,7760288,21,7.0,"POLYGON ((678960.645 780490.878, 678987.69 780..."
42015,1900000000,1999900000,1999908000,1999908007,Pamalian,Bgy,10548,3346944,10,3.0,"POLYGON ((678011.707 780670.924, 678040.822 77..."


In [9]:
gdf_boundaries.geom_type.value_counts()

Polygon         40886
MultiPolygon     1117
Name: count, dtype: int64

In [10]:
import numpy as np
from shapely.geometry import Point
import random

# Step 1: Normalize the area column to get probabilities
gdf_boundaries['sampling_prob'] = gdf_boundaries['area_crs'] / gdf_boundaries['area_crs'].sum()

# Step 2: Choose how many total points you want (e.g. 20,000)
n_samples = 2000

# Step 3: Sample barangay indices by area-weighted probability (with replacement)
random.seed(42)
np.random.seed(42)
sampled_indices = np.random.choice(
    gdf_boundaries.index,
    size=n_samples,
    replace=True,  # Allows larger barangays to appear multiple times
    p=gdf_boundaries['sampling_prob'].values
)

# Step 4: Create a new GeoDataFrame of just the sampled barangays
sampled_barangays = gdf_boundaries.loc[sampled_indices].copy()
sampled_barangays.reset_index(drop=True, inplace=True)

# Step 5: Function to sample a random point within a polygon
def random_point_within(poly):
    if poly:
        minx, miny, maxx, maxy = poly.bounds
        for _ in range(100):  # attempt limit
            p = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
            if poly.contains(p):
                return p
    return None

# Step 6: Generate random point inside each selected barangay
sampled_barangays.to_crs(epsg=4326, inplace=True)
sampled_barangays['rand_point'] = sampled_barangays.geometry.apply(random_point_within)

# Step 7: Clean and build final GeoDataFrame
gdf_unlabeled = sampled_barangays.dropna(subset=['rand_point']).copy()
gdf_unlabeled = gpd.GeoDataFrame(geometry=gdf_unlabeled['rand_point'], crs='EPSG:4326')
gdf_unlabeled['longitude'] = gdf_unlabeled.geometry.x
gdf_unlabeled['latitude'] = gdf_unlabeled.geometry.y
gdf_unlabeled['label'] = 0
gdf_unlabeled['id'] = ['unl_' + str(i) for i in range(len(gdf_unlabeled))]
gdf_unlabeled

Unnamed: 0,geometry,longitude,latitude,label,id
0,POINT (123.15942 10.31659),123.159420,10.316585,0,unl_0
1,POINT (119.11538 10.16416),119.115385,10.164159,0,unl_1
2,POINT (124.4069 6.18207),124.406897,6.182069,0,unl_2
3,POINT (123.71743 8.46488),123.717426,8.464881,0,unl_3
4,POINT (120.76483 15.64637),120.764834,15.646374,0,unl_4
...,...,...,...,...,...
1995,POINT (126.28912 7.38133),126.289118,7.381335,0,unl_1995
1996,POINT (121.88947 6.50527),121.889470,6.505267,0,unl_1996
1997,POINT (122.24607 18.42042),122.246072,18.420424,0,unl_1997
1998,POINT (122.08381 18.20398),122.083808,18.203984,0,unl_1998


In [16]:
import folium
# from folium.plugins import MarkerCluster

m = folium.Map(location=[gdf_unlabeled.latitude.mean(), gdf_unlabeled.longitude.mean()], zoom_start=6)
# marker_cluster = MarkerCluster().add_to(m)

for _, row in gdf_unlabeled.iterrows():
    folium.CircleMarker(
        location=(row['latitude'], row['longitude']),
        radius=2,
        color='blue',
        fill=True,
        fill_opacity=0.7
    ).add_to(m)

m.save("unlabeled_barangays.html")


In [12]:
from shapely.geometry import Point
import random

# Philippines bounding box
bbox = {
    'min_lat': 5.5,
    'max_lat': 19,
    'min_lon': 116.8,
    'max_lon': 127,
}

# Sample 1000 sea points
sea_points = []
random.seed(42)
while len(sea_points) < 1000:
    pt = Point(random.uniform(bbox['min_lon'], bbox['max_lon']),
               random.uniform(bbox['min_lat'], bbox['max_lat']))
    if not any(poly.contains(pt) for poly in gdf_boundaries.geometry):  # ensure it's not in land
        sea_points.append(pt)

gdf_sea = gpd.GeoDataFrame(geometry=sea_points, crs='EPSG:4326')
gdf_sea['longitude'] = gdf_sea.geometry.x
gdf_sea['latitude'] = gdf_sea.geometry.y
gdf_sea['label'] = 0
gdf_sea['id'] = ['sea_' + str(i) for i in range(len(gdf_sea))]

gdf_sea

Unnamed: 0,geometry,longitude,latitude,label,id
0,POINT (123.32215 5.83765),123.322153,5.837645,0,sea_0
1,POINT (119.6053 8.51334),119.605299,8.513345,0,sea_1
2,POINT (124.31201 14.63544),124.312006,14.635443,0,sea_2
3,POINT (125.90023 6.67367),125.900232,6.673674,0,sea_3
4,POINT (119.03011 12.3223),119.030107,12.322296,0,sea_4
...,...,...,...,...,...
995,POINT (118.49301 7.37967),118.493013,7.379672,0,sea_995
996,POINT (118.90752 8.29373),118.907520,8.293727,0,sea_996
997,POINT (117.40545 10.23601),117.405450,10.236009,0,sea_997
998,POINT (119.66707 12.77338),119.667067,12.773375,0,sea_998


In [14]:
import folium
# from folium.plugins import MarkerCluster

m = folium.Map(location=[gdf_sea.latitude.mean(), gdf_sea.longitude.mean()], zoom_start=6)
# marker_cluster = MarkerCluster().add_to(m)

for _, row in gdf_sea.iterrows():
    folium.CircleMarker(
        location=(row['latitude'], row['longitude']),
        radius=2,
        color='blue',
        fill=True,
        fill_opacity=0.7
    ).add_to(m)

m.save("unlabeled_sea.html")


In [17]:
gdf_unlabeled_all = pd.concat([gdf_unlabeled, gdf_sea], ignore_index=True)

In [11]:
# Build GeoDataFrames
gdf_candidates = gdf_unlabeled.copy() # EXCLUDE sea (for now)

# Buffer positives (e.g., 500m exclusion zone)
gdf_pos_buffered = gdf_pos.to_crs(epsg=32651).buffer(500).to_crs(epsg=4326)

# Remove candidates near positives
mask = ~gdf_candidates.geometry.apply(lambda pt: any(pt.within(poly) for poly in gdf_pos_buffered))
filtered_candidates = gdf_candidates[mask]
n_available = len(filtered_candidates)
n_to_sample = min(21000, n_available)

gdf_unlabeled = filtered_candidates.sample(n=n_to_sample, random_state=42).reset_index(drop=True)
gdf_unlabeled

Unnamed: 0,geometry,longitude,latitude,label,id
0,POINT (121.378 17.5032),121.377998,17.503200,0,unl_1860
1,POINT (120.97099 15.38373),120.970989,15.383731,0,unl_353
2,POINT (124.17043 6.79996),124.170426,6.799961,0,unl_1333
3,POINT (121.36287 12.32934),121.362875,12.329338,0,unl_905
4,POINT (122.32301 16.7336),122.323011,16.733596,0,unl_1289
...,...,...,...,...,...
1995,POINT (124.34196 6.36864),124.341962,6.368640,0,unl_1130
1996,POINT (121.36942 18.28745),121.369424,18.287449,0,unl_1294
1997,POINT (120.9689 17.3665),120.968898,17.366497,0,unl_860
1998,POINT (126.08329 7.9575),126.083286,7.957502,0,unl_1459


In [12]:
combined = pd.concat([gdf_pos, gdf_unlabeled], ignore_index=True)
combined

Unnamed: 0,geometry,label,longitude,latitude,id
0,POINT (120.41623 12.65865),1,120.416232,12.658645,2674178575
1,POINT (119.78546 16.30702),1,119.785458,16.307015,3268411673
2,POINT (120.90665 14.96193),1,120.906647,14.961935,6380661297
3,POINT (125.62284 7.07033),1,125.622841,7.070329,6761083556
4,POINT (122.30787 11.64297),1,122.307870,11.642965,8947255151
...,...,...,...,...,...
2142,POINT (124.34196 6.36864),0,124.341962,6.368640,unl_1130
2143,POINT (121.36942 18.28745),0,121.369424,18.287449,unl_1294
2144,POINT (120.9689 17.3665),0,120.968898,17.366497,unl_860
2145,POINT (126.08329 7.9575),0,126.083286,7.957502,unl_1459


In [13]:
combined[['longitude', 'latitude', 'id', 'label']].to_csv('../00_data/pv_labeled_data_v3.csv', index=False)