<a href="https://colab.research.google.com/github/amien1410/amien-scrapers/blob/main/2_Stages_Clustering_Updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install python-geohash
import geohash
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from dataclasses import dataclass
from sklearn.cluster import DBSCAN
from geopy.distance import geodesic
from typing import Optional, List, Dict, Union, Tuple

Collecting python-geohash
  Downloading python-geohash-0.8.5.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python-geohash
  Building wheel for python-geohash (setup.py) ... [?25l[?25hdone
  Created wheel for python-geohash: filename=python_geohash-0.8.5-cp310-cp310-linux_x86_64.whl size=41538 sha256=3d8137d859e6a822ff5885f3df00ce36b20123d29011b3b1dd8e64a145a2ce4d
  Stored in directory: /root/.cache/pip/wheels/19/e8/74/3f800ffdbb57c27a3fee3a695c7009769356448837c1f4f899
Successfully built python-geohash
Installing collected packages: python-geohash
Successfully installed python-geohash-0.8.5


In [None]:
# @title
class TwoStageClustering:
    """
    Two-stage DBSCAN clustering of vehicle locations.

    Stage one clusters each vehicle's points separately (rows are grouped by
    ``sim_number``). Stage two clusters the centroids of those per-vehicle
    clusters across all vehicles. Both stages use DBSCAN with the haversine
    metric, so coordinates are converted to radians before fitting and the
    neighbourhood radius is stored in radians.

    NOTE: a later cell in this notebook defines another class with the same
    name; once that cell runs, it replaces this definition.
    """

    # Divisor used to turn a distance in meters into an angle in radians.
    EARTH_RADIUS_M = 6371000.0

    def __init__(self, eps: float = 50, min_samples: int = 10):
        """
        Initialize TwoStageClustering with DBSCAN parameters.

        Args:
            eps: DBSCAN neighbourhood radius in meters. It is stored on the
                instance in radians and used for both clustering stages.
            min_samples: DBSCAN ``min_samples``, used for both stages.
        """
        self.eps = eps / self.EARTH_RADIUS_M  # Convert meters to radians
        self.min_samples = min_samples
        self._setup_logging()

    def _setup_logging(self) -> None:
        """
        Configure logging for the clustering process.

        ``force=True`` replaces any handlers already attached to the root
        logger, so creating an instance resets the global logging setup.
        """
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            force=True,
        )
        self.logger = logging.getLogger(__name__)

    def first_level_clustering(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Perform first-level clustering on individual vehicles.

        Args:
            df: Input dataframe with ``sim_number``, ``latitude`` and
                ``longitude`` columns. It is not modified.

        Returns:
            The rows that belong to a cluster, with an added
            ``first_level_cluster`` column holding the DBSCAN label within the
            vehicle. Noise rows (label -1) are dropped. An empty dataframe is
            returned when no vehicle produced a cluster.
        """
        self.logger.info("Starting first-level clustering...")

        # DBSCAN rejects NaN input, so rows without coordinates are dropped up front.
        df = df.dropna(subset=["latitude", "longitude"])

        all_clusters = []

        for sim_number, group in df.groupby("sim_number"):
            coords = group[["latitude", "longitude"]].to_numpy()
            labels = DBSCAN(
                eps=self.eps, min_samples=self.min_samples, metric="haversine"
            ).fit(np.radians(coords)).labels_

            # assign() builds a new frame instead of writing into the groupby
            # slice, which avoids chained-assignment warnings.
            labelled = group.assign(first_level_cluster=labels)
            valid_clusters = labelled[labelled["first_level_cluster"] != -1]

            if not valid_clusters.empty:
                all_clusters.append(valid_clusters)
                self.logger.info(
                    f"Vehicle {sim_number}: Found "
                    f"{valid_clusters['first_level_cluster'].nunique()} clusters"
                )

        return pd.concat(all_clusters, ignore_index=True) if all_clusters else pd.DataFrame()

    def second_level_clustering(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Perform second-level clustering on centroids of first-level clusters.

        Args:
            df: Dataframe with first-level clustering results (columns
                ``sim_number``, ``first_level_cluster``, ``latitude``,
                ``longitude``).

        Returns:
            One row per first-level centroid that falls in a second-level
            cluster, with ``second_level_cluster``, ``cluster_id`` and
            ``geohash`` columns. The result is empty when the input is empty
            or when every centroid is labelled as noise.
        """
        self.logger.info("Starting second-level clustering...")

        if df.empty:
            # DBSCAN cannot be fitted on zero samples.
            self.logger.warning("No first-level clusters to process in second-level clustering.")
            return pd.DataFrame()

        centroids = df.groupby(["sim_number", "first_level_cluster"]).agg(
            {"latitude": "mean", "longitude": "mean"}
        ).reset_index()

        coords = centroids[["latitude", "longitude"]].to_numpy()

        labels = DBSCAN(
            eps=self.eps, min_samples=self.min_samples, metric="haversine"
        ).fit(np.radians(coords)).labels_

        centroids = centroids.assign(second_level_cluster=labels)
        # copy() so the columns added below are written to an independent frame.
        centroids = centroids[centroids["second_level_cluster"] != -1].copy()

        # NOTE(review): this id is unique per centroid row, not per
        # second_level_cluster label - confirm that is the intended meaning.
        centroids["cluster_id"] = [
            f"cluster_{i}" for i in range(len(centroids))
        ]
        # A plain list is used instead of DataFrame.apply(axis=1): on an empty
        # frame apply() returns a DataFrame, which cannot be assigned to a
        # single column and made the all-noise case crash.
        centroids["geohash"] = [
            geohash.encode(lat, lon, precision=6)
            for lat, lon in zip(centroids["latitude"], centroids["longitude"])
        ]

        self.logger.info(
            f"Second-level clustering identified {centroids['second_level_cluster'].nunique()} clusters."
        )
        return centroids

    def process(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Execute the two-stage clustering process.

        Args:
            df: Input dataframe containing stationary location data.

        Returns:
            Tuple of dataframes for first-level and second-level clustering
            results. Both are empty when the first stage finds no clusters.
        """
        first_level_result = self.first_level_clustering(df)
        if first_level_result.empty:
            self.logger.warning("No clusters identified in the first-level clustering.")
            return pd.DataFrame(), pd.DataFrame()

        second_level_result = self.second_level_clustering(first_level_result)
        return first_level_result, second_level_result

# Load input data
df = pd.read_csv("cluster_df.csv")

# Initialize clustering process
clustering = TwoStageClustering(eps=50, min_samples=10)

# Perform clustering
first_level, second_level = clustering.process(df)

# Set to True to write the results to CSV. Saving is off by default, so the
# messages below report what actually happened instead of always claiming
# that files were written.
SAVE_RESULTS = False

# Save (optionally) and report results
if not first_level.empty:
    if SAVE_RESULTS:
        first_level.to_csv("first_level_clustering.csv", index=False)
        print("First-level clustering results saved as 'first_level_clustering.csv'.")
    else:
        print(f"First-level clustering produced {len(first_level)} rows (not saved).")

if not second_level.empty:
    if SAVE_RESULTS:
        second_level.to_csv("second_level_clustering.csv", index=False)
        print("Second-level clustering results saved as 'second_level_clustering.csv'.")
    else:
        print(f"Second-level clustering produced {len(second_level)} rows (not saved).")

print("\nClustering process complete.")

In [2]:
import pandas as pd
import numpy as np
import logging
import geohash
from typing import Tuple, Optional
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import seaborn as sns

class TwoStageClustering:
    """
    Two-stage DBSCAN clustering of vehicle locations with per-stage parameters.

    Stage one clusters each vehicle's points separately (rows are grouped by
    ``sim_number``) and attaches the centroid of every cluster. Stage two
    clusters those centroids across all vehicles. Both stages use DBSCAN with
    the haversine metric, so coordinates are converted to radians before
    fitting and the radii are stored in radians.
    """

    # Divisor used to turn a distance in meters into an angle in radians.
    EARTH_RADIUS_M = 6371000.0

    def __init__(self,
                 first_level_eps: float = 50,
                 first_level_min_samples: int = 2,
                 second_level_eps: float = 150,
                 second_level_min_samples: int = 2):
        """
        Initialize TwoStageClustering with configurable DBSCAN parameters.

        Args:
            first_level_eps: Maximum distance (in meters) for first-level clustering
            first_level_min_samples: Minimum number of samples to form a cluster in first-level clustering
            second_level_eps: Maximum distance (in meters) for second-level clustering
            second_level_min_samples: Minimum number of samples to form a cluster in second-level clustering
        """
        # Convert meters to radians (Earth's radius)
        self.first_level_eps = first_level_eps / self.EARTH_RADIUS_M
        self.first_level_min_samples = first_level_min_samples
        self.second_level_eps = second_level_eps / self.EARTH_RADIUS_M
        self.second_level_min_samples = second_level_min_samples

        self._setup_logging()

    def _setup_logging(self) -> None:
        """
        Configure logging for the clustering process.

        ``force=True`` replaces any handlers already attached to the root
        logger, so creating an instance resets the global logging setup.
        """
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            force=True
        )
        self.logger = logging.getLogger(__name__)

    def _normalize_coordinates(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop rows whose coordinates are missing or out of range.

        Args:
            df: Input dataframe with latitude and longitude columns

        Returns:
            The rows whose latitude is within [-90, 90] and whose longitude is
            within [-180, 180]. The input dataframe is not modified.
        """
        # Remove rows with invalid or missing coordinates
        df = df.dropna(subset=['latitude', 'longitude'])

        in_range = (
            df['latitude'].between(-90, 90) &
            df['longitude'].between(-180, 180)
        )
        return df[in_range]

    def first_level_clustering(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Perform first-level clustering on individual vehicles.

        Args:
            df: Input dataframe with ``sim_number``, ``latitude`` and
                ``longitude`` columns. It is not modified.

        Returns:
            The rows that belong to a cluster, with three added columns:
            ``first_level_cluster`` (``"<sim_number>_<label>"``),
            ``centroid_latitude`` and ``centroid_longitude`` (mean coordinates
            of the row's cluster). Noise rows are dropped. An empty dataframe
            is returned when no vehicle produced a cluster.
        """
        self.logger.info("Starting first-level clustering...")

        # Normalize coordinates
        df = self._normalize_coordinates(df)

        all_clusters = []

        for sim_number, group in df.groupby("sim_number"):
            coords = group[["latitude", "longitude"]].to_numpy()

            # Perform DBSCAN clustering
            labels = DBSCAN(
                eps=self.first_level_eps,
                min_samples=self.first_level_min_samples,
                metric="haversine"
            ).fit(np.radians(coords)).labels_

            # assign() builds a new frame instead of writing into the groupby
            # slice, which avoids chained-assignment warnings.
            labelled = group.assign(first_level_cluster=labels)

            # Filter out noise points (label -1)
            valid_clusters = labelled[labelled['first_level_cluster'] != -1].copy()
            if valid_clusters.empty:
                continue

            # Prefix the label with the vehicle id so identifiers are unique
            # across vehicles.
            valid_clusters['first_level_cluster'] = (
                valid_clusters['sim_number'].astype(str) + '_' +
                valid_clusters['first_level_cluster'].astype(str)
            )

            # Attach each cluster's centroid to its rows. transform('mean')
            # broadcasts the per-cluster mean back onto every member row, which
            # gives the same values as aggregating and merging back.
            by_cluster = valid_clusters.groupby('first_level_cluster')
            valid_clusters = valid_clusters.assign(
                centroid_latitude=by_cluster['latitude'].transform('mean'),
                centroid_longitude=by_cluster['longitude'].transform('mean'),
            )

            all_clusters.append(valid_clusters)
            self.logger.info(
                f"Vehicle {sim_number}: Found {valid_clusters['first_level_cluster'].nunique()} clusters"
            )

        return pd.concat(all_clusters, ignore_index=True) if all_clusters else pd.DataFrame()

    def second_level_clustering(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Perform second-level clustering on centroids of first-level clusters.

        Args:
            df: Dataframe with first-level clustering results (columns
                ``first_level_cluster``, ``centroid_latitude``,
                ``centroid_longitude``).

        Returns:
            One row per first-level cluster whose centroid falls in a
            second-level cluster, with ``second_level_cluster`` and ``geohash``
            columns. The result is empty (but keeps these columns) when the
            input is empty or when every centroid is labelled as noise.
        """
        self.logger.info("Starting second-level clustering...")

        if df.empty:
            # DBSCAN cannot be fitted on zero samples.
            self.logger.warning("No first-level clusters to process in second-level clustering.")
            return pd.DataFrame(columns=[
                'first_level_cluster', 'centroid_latitude', 'centroid_longitude',
                'second_level_cluster', 'geohash',
            ])

        # Extract centroids for second-level clustering
        centroids = df.groupby('first_level_cluster')[
            ['centroid_latitude', 'centroid_longitude']
        ].first().reset_index()

        coords = centroids[['centroid_latitude', 'centroid_longitude']].to_numpy()

        # Perform DBSCAN on centroids
        labels = DBSCAN(
            eps=self.second_level_eps,
            min_samples=self.second_level_min_samples,
            metric="haversine"
        ).fit(np.radians(coords)).labels_

        centroids = centroids.assign(second_level_cluster=labels)

        # Filter out noise points; copy() so the geohash column is written to
        # an independent frame rather than a slice.
        centroids = centroids[centroids['second_level_cluster'] != -1].copy()

        # Geohash of every remaining centroid. A plain list is used instead of
        # DataFrame.apply(axis=1): on an empty frame apply() returns a
        # DataFrame, which cannot be assigned to a single column and made the
        # all-noise case crash.
        centroids['geohash'] = [
            geohash.encode(lat, lon, precision=6)
            for lat, lon in zip(
                centroids['centroid_latitude'], centroids['centroid_longitude']
            )
        ]

        self.logger.info(
            f"Second-level clustering identified {centroids['second_level_cluster'].nunique()} clusters."
        )

        return centroids

    def visualize_first_level_clustering(self, df: pd.DataFrame,
                                         max_vehicles: Optional[int] = 20) -> None:
        """
        Save a stacked scatter plot of first-level clusters, one panel per vehicle.

        The figure is written to ``first_level_clustering.png``.

        Args:
            df: Dataframe with first-level clustering results
            max_vehicles: Maximum number of vehicles to draw, taken in order of
                first appearance. One panel is drawn per vehicle, so an
                unbounded count makes the figure unreadable and very slow to
                build. Pass ``None`` to plot every vehicle.

        Raises:
            ValueError: If ``max_vehicles`` is smaller than 1.
        """
        if max_vehicles is not None and max_vehicles < 1:
            raise ValueError("max_vehicles must be at least 1 or None")

        vehicles = df['sim_number'].unique()
        if len(vehicles) == 0:
            self.logger.warning("No vehicles to plot for first-level clustering.")
            return

        if max_vehicles is not None and len(vehicles) > max_vehicles:
            self.logger.warning(
                f"Plotting only the first {max_vehicles} of {len(vehicles)} vehicles."
            )
            vehicles = vehicles[:max_vehicles]

        n_panels = len(vehicles)
        # Grow the figure with the number of panels so each stays legible. The
        # upper bound is meant to keep the saved image within matplotlib's
        # size limit at the default dpi (TODO confirm for custom dpi settings).
        height = min(max(10, 4 * n_panels), 600)
        fig, axes = plt.subplots(n_panels, 1, figsize=(15, height), squeeze=False)
        try:
            for ax, vehicle in zip(axes[:, 0], vehicles):
                vehicle_data = df[df['sim_number'] == vehicle]

                sns.scatterplot(
                    data=vehicle_data,
                    x='longitude',
                    y='latitude',
                    hue='first_level_cluster',
                    palette='deep',
                    ax=ax
                )
                ax.set_title(f'First-Level Clustering for Vehicle {vehicle}')
                ax.set_xlabel('Longitude')
                ax.set_ylabel('Latitude')

            fig.tight_layout()
            fig.savefig('first_level_clustering.png')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

    def visualize_second_level_clustering(self, first_level_df: pd.DataFrame, second_level_df: pd.DataFrame) -> None:
        """
        Save a scatter plot of first-level centroids coloured by second-level cluster.

        The figure is written to ``second_level_clustering.png``.

        Args:
            first_level_df: Dataframe with first-level clustering results
            second_level_df: Dataframe with second-level clustering results
        """
        # Merge first-level centroids with second-level cluster labels.
        # Centroids that are absent from second_level_df get a missing label.
        centroids = first_level_df.groupby('first_level_cluster')[
            ['centroid_latitude', 'centroid_longitude']
        ].first().reset_index()

        centroids = centroids.merge(
            second_level_df[['first_level_cluster', 'second_level_cluster']],
            on='first_level_cluster',
            how='left'
        )

        fig, ax = plt.subplots(figsize=(12, 8))
        try:
            sns.scatterplot(
                data=centroids,
                x='centroid_longitude',
                y='centroid_latitude',
                hue='second_level_cluster',
                palette='Set2',
                ax=ax
            )
            ax.set_title('Second-Level Clustering of First-Level Cluster Centroids')
            ax.set_xlabel('Longitude')
            ax.set_ylabel('Latitude')
            fig.savefig('second_level_clustering.png')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

    def process(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
        """
        Execute the two-stage clustering process.

        Besides returning the results, this writes
        ``first_level_clustering.png``, ``second_level_clustering.png`` (only
        when second-level clusters exist), ``first_level_clusters.csv`` and
        ``second_level_clusters.csv`` to the current working directory.

        Args:
            df: Input dataframe containing stationary location data.

        Returns:
            Tuple of dataframes for first-level and second-level clustering
            results. When the first stage finds no clusters the tuple is
            ``(empty dataframe, None)`` and nothing is written.
        """
        first_level_result = self.first_level_clustering(df)
        if first_level_result.empty:
            self.logger.warning("No clusters identified in the first-level clustering.")
            return pd.DataFrame(), None

        second_level_result = self.second_level_clustering(first_level_result)

        # Perform visualizations
        self.visualize_first_level_clustering(first_level_result)
        if second_level_result.empty:
            # There is nothing to colour by, so the second plot is skipped.
            self.logger.warning("No clusters identified in the second-level clustering.")
        else:
            self.visualize_second_level_clustering(first_level_result, second_level_result)

        # Save results to CSV
        first_level_result.to_csv('first_level_clusters.csv', index=False)
        second_level_result.to_csv('second_level_clusters.csv', index=False)

        return first_level_result, second_level_result

def main(
    data_path: str = "/content/stationary_events_4mil_rows.csv",
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    """
    Load the stationary-event data and run the two-stage clustering on it.

    Args:
        data_path: CSV file to read. The default is the path this notebook
            originally hard-coded.

    Returns:
        The ``(first_level, second_level)`` results returned by
        ``TwoStageClustering.process``.
    """
    # Load input data
    df = pd.read_csv(data_path)

    # Both stages use a 50 m radius and require at least 10 samples per
    # cluster, overriding the class defaults.
    clustering = TwoStageClustering(
        first_level_eps=50,      # 50 meters for first-level
        first_level_min_samples=10,
        second_level_eps=50,    # 50 meters for second-level
        second_level_min_samples=10
    )

    # Perform clustering and hand the results back to the caller instead of
    # discarding them.
    return clustering.process(df)

if __name__ == "__main__":
    main()

2024-11-27 14:49:45,141 - INFO - Starting first-level clustering...
2024-11-27 14:49:45,286 - INFO - Vehicle 1: Found 1 clusters
2024-11-27 14:49:45,308 - INFO - Vehicle 4: Found 1 clusters
2024-11-27 14:49:45,320 - INFO - Vehicle 5: Found 1 clusters
2024-11-27 14:49:45,345 - INFO - Vehicle 9: Found 1 clusters
2024-11-27 14:49:45,368 - INFO - Vehicle 10: Found 1 clusters
2024-11-27 14:49:45,381 - INFO - Vehicle 11: Found 1 clusters
2024-11-27 14:49:45,394 - INFO - Vehicle 12: Found 1 clusters
2024-11-27 14:49:45,494 - INFO - Vehicle 19: Found 3 clusters
2024-11-27 14:49:45,507 - INFO - Vehicle 20: Found 2 clusters
2024-11-27 14:49:45,519 - INFO - Vehicle 21: Found 2 clusters
2024-11-27 14:49:45,542 - INFO - Vehicle 24: Found 2 clusters
2024-11-27 14:49:45,556 - INFO - Vehicle 25: Found 1 clusters
2024-11-27 14:49:45,570 - INFO - Vehicle 26: Found 1 clusters
2024-11-27 14:49:45,585 - INFO - Vehicle 27: Found 5 clusters
2024-11-27 14:49:45,600 - INFO - Vehicle 28: Found 9 clusters
2024-1

In [3]:
# Reload the first-level results that TwoStageClustering.process() saved.
# process() writes to a path relative to the working directory, so the same
# relative path is read here rather than an absolute one.
df = pd.read_csv("first_level_clusters.csv")
df

Unnamed: 0,sim_number,latitude,longitude,first_level_cluster,centroid_latitude,centroid_longitude
0,1,53.313702,-6.360803,1_0,53.313157,-6.360686
1,1,53.313240,-6.360750,1_0,53.313157,-6.360686
2,1,53.313198,-6.360745,1_0,53.313157,-6.360686
3,1,53.313251,-6.360727,1_0,53.313157,-6.360686
4,1,53.313190,-6.360745,1_0,53.313157,-6.360686
...,...,...,...,...,...,...
137151,10717,-0.466228,-78.566460,10717_0,-0.465864,-78.566503
137152,10717,-0.465613,-78.566444,10717_0,-0.465864,-78.566503
137153,10717,-0.465608,-78.566689,10717_0,-0.465864,-78.566503
137154,10717,-0.466165,-78.566391,10717_0,-0.465864,-78.566503


In [4]:
# Reload the second-level results that TwoStageClustering.process() saved.
# process() writes to a path relative to the working directory, so the same
# relative path is read here rather than an absolute one.
df2 = pd.read_csv("second_level_clusters.csv")
df2

Unnamed: 0,first_level_cluster,centroid_latitude,centroid_longitude,second_level_cluster,geohash
0,1005_0,52.811918,-0.137435,0,gcruky
1,10086_0,51.460890,22.588274,58,u90hf0
2,10113_0,52.723634,-2.459501,1,gcq7bn
3,1034_0,-0.357352,-78.448765,2,6rbjyy
4,1038_2,-33.805353,150.836709,3,r3gr8m
...,...,...,...,...,...
1044,988_0,-0.357400,-78.448490,2,6rbjyy
1045,9916_0,52.928826,-0.155520,14,gcrvh7
1046,998_0,-26.018823,28.265842,11,kek5gp
1047,999_0,46.249305,3.614727,23,u04x47
