In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from geopy.distance import geodesic
import time
from tqdm import tqdm
import logging


def setup_logger(verbose=True):
    """
    Configure and return the logger used by the clustering pipeline.

    Parameters:
    - verbose: When True the logger emits INFO and above; otherwise only
      WARNING and above.

    Returns:
    - The "RadialDistanceClustering" Logger instance
    """
    level = logging.INFO if verbose else logging.WARNING
    # basicConfig only installs a handler the first time it is called (it is a
    # no-op once the root logger already has handlers), so it cannot be relied
    # on to change the level on later calls.
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    logger = logging.getLogger("RadialDistanceClustering")
    # Set the level on the named logger itself so that `verbose` is honoured
    # on every call, regardless of how the root logger was configured earlier.
    logger.setLevel(level)
    return logger


def haversine_distance_matrix(coords, logger, verbose=True):
    """
    Calculate the pairwise haversine distance matrix for geospatial points.

    Distances are great-circle distances on a sphere of mean Earth radius,
    computed one row at a time with NumPy so that only the upper triangle is
    evaluated and no per-pair Python call is needed.

    Parameters:
    - coords: array-like of (latitude, longitude) points in decimal degrees
    - logger: Logger instance for logging progress
    - verbose: Enable progress bar

    Returns:
    - Symmetric (n, n) numpy array of distances in kilometers, zero diagonal
    """
    earth_radius_km = 6371.0088  # mean Earth radius

    # reshape(-1, 2) lets an empty input produce a well-formed (0, 2) array.
    points = np.radians(np.asarray(coords, dtype=float).reshape(-1, 2))
    lat = points[:, 0]
    lng = points[:, 1]
    cos_lat = np.cos(lat)

    n = len(points)
    distance_matrix = np.zeros((n, n))

    logger.info(f"Computing distance matrix for {n} points...")

    # Only build the progress bar when it will actually be displayed.
    progress_bar = None
    if verbose:
        progress_bar = tqdm(
            total=(n * (n - 1)) // 2,
            desc="Pairwise Distance",
            unit="pairs"
        )

    try:
        for i in range(n - 1):
            # Haversine formula for point i against every later point.
            d_lat = lat[i + 1:] - lat[i]
            d_lng = lng[i + 1:] - lng[i]
            a = (
                np.sin(d_lat / 2.0) ** 2
                + cos_lat[i] * cos_lat[i + 1:] * np.sin(d_lng / 2.0) ** 2
            )
            # Clip guards against rounding pushing `a` marginally outside [0, 1].
            row = 2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

            distance_matrix[i, i + 1:] = row
            distance_matrix[i + 1:, i] = row  # Symmetry

            if progress_bar is not None:
                progress_bar.update(n - i - 1)
    finally:
        # Close the bar even if the computation is interrupted.
        if progress_bar is not None:
            progress_bar.close()

    logger.info("Distance matrix computation completed.")
    return distance_matrix


def cluster_with_radial_distance(
    file_path, lat_col='latitude', lng_col='longitude',
    max_radius_km=5, output_file='clustered_results.csv',
    sample_size=10000, verbose=True
):
    """
    Cluster geospatial points from a CSV file with DBSCAN on a precomputed
    distance matrix, optionally sampling the data first.

    Note: max_radius_km is passed to DBSCAN as `eps` with min_samples=1, so it
    is the maximum distance between two directly linked points. Points can be
    chained through neighbours, so a cluster's overall extent may exceed it.

    Parameters:
    - file_path: Path to the CSV file
    - lat_col: Name of the latitude column
    - lng_col: Name of the longitude column
    - max_radius_km: Neighbourhood distance (DBSCAN eps) in kilometers
    - output_file: Output CSV file to save the labelled points
    - sample_size: Maximum number of points to cluster; larger datasets are
      randomly sampled down to this size (fixed seed). None disables sampling.
    - verbose: Enable detailed logging

    Returns:
    - Clustered DataFrame (input rows plus a 'cluster' label column)
    - Cluster centroids: array of mean (latitude, longitude) per cluster,
      ordered by cluster label

    Raises:
    - ValueError: if a coordinate column is missing or no rows remain to cluster
    """
    logger = setup_logger(verbose)

    # Start timer (perf_counter is monotonic, unlike wall-clock time)
    start_time = time.perf_counter()
    logger.info(f"Starting clustering with max radius {max_radius_km} km.")

    # Read data
    logger.info(f"Reading data from {file_path}...")
    df = pd.read_csv(file_path)
    if lat_col not in df.columns or lng_col not in df.columns:
        raise ValueError(f"Columns {lat_col} or {lng_col} not found in the dataset.")

    # Drop rows with missing coordinates; copy so the later column assignment
    # writes to an independent frame rather than a possible view of `df`.
    df_clean = df.dropna(subset=[lat_col, lng_col]).copy()
    logger.info(f"Cleaned dataset: {len(df_clean)} points (removed {len(df) - len(df_clean)} invalid points).")

    # Sample data if needed
    if sample_size is not None and len(df_clean) > sample_size:
        logger.info(f"Sampling data to {sample_size} points from {len(df_clean)} points.")
        df_clean = df_clean.sample(n=sample_size, random_state=42).reset_index(drop=True)

    # Fail early with a clear message instead of an opaque error from DBSCAN.
    if df_clean.empty:
        raise ValueError("No points with valid coordinates available for clustering.")

    coords = df_clean[[lat_col, lng_col]].values

    # Calculate the pairwise distance matrix
    distance_matrix = haversine_distance_matrix(coords, logger, verbose)

    # Perform clustering using DBSCAN; min_samples=1 means every point is
    # assigned to a cluster (no noise label).
    logger.info("Performing clustering...")
    db = DBSCAN(eps=max_radius_km, min_samples=1, metric='precomputed')
    labels = db.fit_predict(distance_matrix)
    logger.info(f"Clustering complete. Total clusters formed: {len(np.unique(labels))}.")

    # Assign cluster labels to DataFrame (positional, labels follow row order)
    df_clean['cluster'] = labels

    # Compute cluster centroids as the arithmetic mean of member coordinates
    logger.info("Calculating centroids...")
    cluster_centroids = df_clean.groupby('cluster')[[lat_col, lng_col]].mean().values

    # Save results
    df_clean.to_csv(output_file, index=False)
    logger.info(f"Clustered results saved to {output_file}.")

    # Performance summary
    end_time = time.perf_counter()
    logger.info(f"Clustering completed in {end_time - start_time:.2f} seconds.")

    return df_clean, cluster_centroids


# Main execution
def main():
    """
    Run the clustering pipeline with this notebook's settings and print the
    centroid of every cluster. Any error is reported on stdout instead of
    being raised.
    """
    settings = dict(
        file_path='data-1733399738886.csv',  # Replace with your actual file path
        lat_col='lat',  # Replace with your dataset's latitude column
        lng_col='lng',  # Replace with your dataset's longitude column
        max_radius_km=20,  # Adjust the radius as needed
        sample_size=10000,  # Set sample size for clustering
        output_file='clustered_results.csv',
        verbose=True,
    )

    try:
        _, centers = cluster_with_radial_distance(**settings)

        print("\nCentroid Summary:")
        for number, center in enumerate(centers, start=1):
            print(f"Centroid {number}: {center}")

    except Exception as err:
        print(f"An error occurred: {err}")


# Run the pipeline only when executed directly (script or notebook cell),
# not when this code is imported as a module.
if __name__ == "__main__":
    main()

2024-12-05 18:39:33,536 - INFO: Starting clustering with max radius 20 km.
2024-12-05 18:39:33,537 - INFO: Reading data from data-1733399738886.csv...
2024-12-05 18:39:33,614 - INFO: Cleaned dataset: 70313 points (removed 0 invalid points).
2024-12-05 18:39:33,614 - INFO: Sampling data to 10000 points from 70313 points.
2024-12-05 18:39:33,622 - INFO: Computing distance matrix for 10000 points...

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 


[A