## SETUP

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
import mlflow
from math import radians, cos, sin, asin, sqrt


sample_url = "floods_geocoordinates.csv"

sample_data = pd.read_csv(sample_url)

## TREINAMENTO

In [5]:
#------------------------HAVERSINE DISTANCE------------------------
def haversine(coordinates_1, coordinates_2):
    
    latitude_1, longitude_1 = coordinates_1
    latitude_2, longitude_2 = coordinates_2
    longitude_1, latitude_1, longitude_2, latitude_2 = map(radians, [longitude_1, latitude_1, longitude_2, latitude_2])

    # haversine formula 
    distance_longitude = longitude_2 - longitude_1 # diference between two longitude
    distance_latitude = latitude_2 - latitude_1    # diference between two latitude
    
    aux = sin(distance_latitude/2)**2 + cos(latitude_1) * cos(latitude_2) * sin(distance_longitude/2)**2
    result = 2 * asin(sqrt(aux)) 
    
    radius = 6371 # Radius of earth in kilometers
    
    return result * radius

In [7]:
from scipy.spatial.distance import pdist, squareform

sample_data = sample_data[['latitude', 'longitude']]

#---------------------DISTANCE MATRIX---------------------
distance_matrix = squareform(pdist(sample_data, (lambda u,v: haversine(u,v))))

#---------------------CLUSTERS AGGLOMERATIVE - COMPLETE---------------------
agglomerative_clustering = AgglomerativeClustering(n_clusters = None, linkage='complete', distance_threshold=0.9, compute_full_tree=True)  
rotulo_agglomerative = agglomerative_clustering.fit(distance_matrix)


## VALIDAÇÃO

In [8]:
#---------------------MÉTRICAS---------------------
silhoutte_metric = metrics.silhouette_score(distance_matrix, rotulo_agglomerative.labels_)
print("silhoutte_metric: ", silhoutte_metric)
calinski_harabasz_metric = metrics.calinski_harabasz_score(distance_matrix, rotulo_agglomerative.labels_)
print("calinski_harabasz_metric: ", calinski_harabasz_metric)
davies_bouldin_metric = metrics.davies_bouldin_score(distance_matrix, rotulo_agglomerative.labels_)
print("davies_bouldin_metric: ", davies_bouldin_metric)

silhoutte_metric:  0.622582712622008
calinski_harabasz_metric:  5816153.446100937
davies_bouldin_metric:  0.0173946483077762


## VERSIONAMENTO

In [9]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%d/%m/%Y - %H:%M:%S")
current_time

'09/02/2025 - 14:17:02'

In [10]:
mlflow.set_tracking_uri('http://localhost:5001/')
mlflow.set_experiment(f'flood_areas_identifier')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1739108914241, experiment_id='1', last_update_time=1739108914241, lifecycle_stage='active', name='flood_areas_identifier', tags={}>

In [11]:
distance_threshold_list = [100,200,300,400,500,600,700,800,900,1000,1100,1200,1300]

for i in distance_threshold_list:

    run_name = f'floods_identifier_distance_threshold_{i}'

    # Start run
    mlflow.start_run(run_name=run_name)

    #---------------------CLUSTERS AGGLOMERATIVE - WARD---------------------
    agglomerative_clustering = AgglomerativeClustering(n_clusters = None, metric='precomputed', linkage='complete', distance_threshold=i/1000, compute_full_tree=True)  
    agglomerative_clustering = agglomerative_clustering.fit(distance_matrix)

    silhoutte_metric = metrics.silhouette_score(distance_matrix, agglomerative_clustering.labels_)
    calinski_harabasz_metric = metrics.calinski_harabasz_score(distance_matrix, agglomerative_clustering.labels_)
    davies_bouldin_metric = metrics.davies_bouldin_score(distance_matrix, agglomerative_clustering.labels_)

    parametros = {
        "n_clusters": None,
        "affinity": 'precomputed',
        "linkage": "complete",
        "distance_threshold": i,
        "compute_full_tree": True,
    }

    print(parametros)

    metricas = {
        "num_generated_clusters": len(set(agglomerative_clustering.labels_)),
        "silhouette": silhoutte_metric,
        "calinski_harabasz": calinski_harabasz_metric,
        "davies_bouldin_metric": davies_bouldin_metric,
    }

    print(metricas)

    mlflow.set_tag("data", current_time)
    mlflow.log_params(parametros)
    mlflow.log_metrics(metricas)

    mlflow.sklearn.log_model(agglomerative_clustering, "agglomerative_clustering")

    mlflow.end_run()

{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 100, 'compute_full_tree': True}
{'num_generated_clusters': 506, 'silhouette': np.float64(0.6456452328143671), 'calinski_harabasz': np.float64(320821.95671159046), 'davies_bouldin_metric': np.float64(0.08448773714470238)}




🏃 View run floods_identifier_distance_threshold_100 at: http://localhost:5001/#/experiments/1/runs/6c2685d428ce41038cf47b0f0a248e10
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 200, 'compute_full_tree': True}
{'num_generated_clusters': 434, 'silhouette': np.float64(0.6432629638676016), 'calinski_harabasz': np.float64(65414.09020961857), 'davies_bouldin_metric': np.float64(0.1323203027572419)}




🏃 View run floods_identifier_distance_threshold_200 at: http://localhost:5001/#/experiments/1/runs/a12e1109dff445bab69331ceb6375b43
🧪 View experiment at: http://localhost:5001/#/experiments/1
{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 300, 'compute_full_tree': True}
{'num_generated_clusters': 374, 'silhouette': np.float64(0.6371885691143028), 'calinski_harabasz': np.float64(29248.00633842535), 'davies_bouldin_metric': np.float64(0.18035678038265587)}




🏃 View run floods_identifier_distance_threshold_300 at: http://localhost:5001/#/experiments/1/runs/0ba311cfda6b4b88a6faa895c7e489ef
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 400, 'compute_full_tree': True}
{'num_generated_clusters': 325, 'silhouette': np.float64(0.6422741489271767), 'calinski_harabasz': np.float64(18049.26736773855), 'davies_bouldin_metric': np.float64(0.23849658834855714)}




🏃 View run floods_identifier_distance_threshold_400 at: http://localhost:5001/#/experiments/1/runs/6ec4f43da07f4c72a6ab7dca0b25225c
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 500, 'compute_full_tree': True}
{'num_generated_clusters': 287, 'silhouette': np.float64(0.627543240246473), 'calinski_harabasz': np.float64(11797.753517548437), 'davies_bouldin_metric': np.float64(0.28600236666940065)}




🏃 View run floods_identifier_distance_threshold_500 at: http://localhost:5001/#/experiments/1/runs/a659087f3e9342388a7d64f05aa8ce2a
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 600, 'compute_full_tree': True}
{'num_generated_clusters': 256, 'silhouette': np.float64(0.6336616200712654), 'calinski_harabasz': np.float64(9366.01910530452), 'davies_bouldin_metric': np.float64(0.3166701510838368)}




🏃 View run floods_identifier_distance_threshold_600 at: http://localhost:5001/#/experiments/1/runs/062e7ed9a71d41e89023941176d1e310
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 700, 'compute_full_tree': True}
{'num_generated_clusters': 234, 'silhouette': np.float64(0.6330914097510535), 'calinski_harabasz': np.float64(8224.67659537765), 'davies_bouldin_metric': np.float64(0.3347316911566836)}




🏃 View run floods_identifier_distance_threshold_700 at: http://localhost:5001/#/experiments/1/runs/6915720b5ed742899d67730bb1c9a5b2
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 800, 'compute_full_tree': True}
{'num_generated_clusters': 217, 'silhouette': np.float64(0.6303548579598637), 'calinski_harabasz': np.float64(7275.816355435488), 'davies_bouldin_metric': np.float64(0.3527423031621193)}




🏃 View run floods_identifier_distance_threshold_800 at: http://localhost:5001/#/experiments/1/runs/528e650f3a0f452f9f492a9d571a26b8
🧪 View experiment at: http://localhost:5001/#/experiments/1
{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 900, 'compute_full_tree': True}
{'num_generated_clusters': 199, 'silhouette': np.float64(0.6255070788595858), 'calinski_harabasz': np.float64(6237.508216167444), 'davies_bouldin_metric': np.float64(0.36036177368683)}




🏃 View run floods_identifier_distance_threshold_900 at: http://localhost:5001/#/experiments/1/runs/1895d67dfefc4ea9be6f5ae52814afc8
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 1000, 'compute_full_tree': True}
{'num_generated_clusters': 183, 'silhouette': np.float64(0.622038657717427), 'calinski_harabasz': np.float64(5525.159166869797), 'davies_bouldin_metric': np.float64(0.36751446558720346)}




🏃 View run floods_identifier_distance_threshold_1000 at: http://localhost:5001/#/experiments/1/runs/db357fc7569343b2b95ac525c42782f0
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 1100, 'compute_full_tree': True}
{'num_generated_clusters': 172, 'silhouette': np.float64(0.6038880820411042), 'calinski_harabasz': np.float64(4772.1663687048995), 'davies_bouldin_metric': np.float64(0.41736906550860586)}




🏃 View run floods_identifier_distance_threshold_1100 at: http://localhost:5001/#/experiments/1/runs/6d81f2781d3a4b6b9e55180124e13ea5
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 1200, 'compute_full_tree': True}
{'num_generated_clusters': 158, 'silhouette': np.float64(0.5869519849181912), 'calinski_harabasz': np.float64(4114.094689937568), 'davies_bouldin_metric': np.float64(0.45732807494293476)}




🏃 View run floods_identifier_distance_threshold_1200 at: http://localhost:5001/#/experiments/1/runs/c0bedfddb7cf44febb26aecf4aa44aa5
🧪 View experiment at: http://localhost:5001/#/experiments/1




{'n_clusters': None, 'affinity': 'precomputed', 'linkage': 'complete', 'distance_threshold': 1300, 'compute_full_tree': True}
{'num_generated_clusters': 151, 'silhouette': np.float64(0.5888104681800227), 'calinski_harabasz': np.float64(4043.4187548152804), 'davies_bouldin_metric': np.float64(0.46798412947398227)}




🏃 View run floods_identifier_distance_threshold_1300 at: http://localhost:5001/#/experiments/1/runs/7fb504446767426289748e221187633f
🧪 View experiment at: http://localhost:5001/#/experiments/1
