In [1]:
import pandas as pd
import numpy as np
import folium
from sklearn.cluster import DBSCAN
from folium.plugins import HeatMap

In [None]:
# Step 1: Load the large dataset
df = pd.read_csv("datasets/Tabelle_Purchase_Value.csv")  # Replace with your CSV file path

# Step 2: Randomly sample up to 350,000 rows from the dataset.
# The request is capped at the number of available rows because
# DataFrame.sample (without replacement) raises ValueError when n > len(df).
sample_size = min(350000, len(df))
sampled_df = df.sample(n=sample_size, random_state=42)  # Fixed seed for reproducibility

# Step 3: Names of the latitude / longitude columns (adjust to your dataset)
D_CUS_DeliveryLatitude_column = 'D_CUS_DeliveryLatitude'
D_CUS_DeliveryLongitude_column = 'D_CUS_DeliveryLongitude'

# Step 4: Keep only the two coordinate columns and drop rows where either is missing
coords = sampled_df[[D_CUS_DeliveryLatitude_column, D_CUS_DeliveryLongitude_column]].dropna()

# Two-column array in [latitude, longitude] order, as read from the CSV
# NOTE(review): assumes the values are numeric degrees — confirm against the data
coordinates = coords.to_numpy()

# Step 5: Apply DBSCAN for clustering
# With metric='haversine' the input must be [lat, lon] in radians and eps is an
# angular distance in radians as well. A ground distance therefore has to be
# divided by the Earth's radius: the previous eps=0.01 rad corresponded to
# roughly 64 km, not the intended ~1 km neighbourhood.
EARTH_RADIUS_KM = 6371.0088
eps_km = 1.0  # neighbourhood radius in kilometres; adjust as needed
db = DBSCAN(
    eps=eps_km / EARTH_RADIUS_KM,
    min_samples=100,
    metric='haversine',
    algorithm='ball_tree',  # tree-based neighbour search that supports haversine
)
labels = db.fit_predict(np.radians(coordinates))  # one cluster id per row of `coordinates`

# Step 6: Derive one hotspot per cluster as the mean position of its members
# (label -1 marks noise points and is excluded)
cluster_ids = [cid for cid in np.unique(labels) if cid != -1]
hotspot_coords = [
    np.mean(coordinates[labels == cid], axis=0)  # column-wise mean -> [lat, lon]
    for cid in cluster_ids
]

# Tabulate the centroids under the same latitude/longitude column names
hotspot_df = pd.DataFrame(
    hotspot_coords,
    columns=[D_CUS_DeliveryLatitude_column, D_CUS_DeliveryLongitude_column],
)

# Show the hotspot coordinates
print(hotspot_df)

# Step 7: Create a map centered on the mean of the cleaned coordinates.
# `coords` (rows with both latitude and longitude present) is used so the
# centre is computed from exactly the points that are clustered and plotted.
center_lat = coords[D_CUS_DeliveryLatitude_column].mean()
center_lon = coords[D_CUS_DeliveryLongitude_column].mean()
heatmap_map = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Step 8: Prepare data for the HeatMap (display purchase activity).
# `coords` already holds the NaN-free [lat, lon] pairs, so it is reused
# instead of repeating the column selection and dropna on the sample.
heat_data = coords.values.tolist()

# Colour stops chosen to match the yellow / orange / red legend added in
# Step 10; without an explicit gradient the plugin falls back to its own
# default colour ramp. Keys are given as strings so they serialise as plain
# object keys regardless of the installed folium version.
legend_gradient = {"0.4": "yellow", "0.65": "orange", "1.0": "red"}
HeatMap(heat_data, radius=12, gradient=legend_gradient).add_to(heatmap_map)

# Step 9: Add one red marker per hotspot centroid
hotspot_positions = hotspot_df[
    [D_CUS_DeliveryLatitude_column, D_CUS_DeliveryLongitude_column]
].itertuples(index=False, name=None)
for hotspot_lat, hotspot_lon in hotspot_positions:
    folium.Marker(
        location=[hotspot_lat, hotspot_lon],
        popup="Hotspot",
        icon=folium.Icon(color="red", icon="info-sign"),
    ).add_to(heatmap_map)

# Step 10: Overlay a static HTML legend on the map page.
# The box is placed with fixed pixel offsets from the bottom-left of the viewport.
legend_html = '''
<div style="
    position: fixed; 
    bottom: 550px; left: 390px; width: 150px; height: 100px; 
    background-color: white; z-index:1000; font-size:14px;
    border:1px solid black; padding: 10px;">
    <b>Heatmap Intensity</b><br>
    <i style="background:yellow; width:10px; height:10px; display:inline-block;"></i> Low<br>
    <i style="background:orange; width:10px; height:10px; display:inline-block;"></i> Medium<br>
    <i style="background:red; width:10px; height:10px; display:inline-block;"></i> High<br>
</div>
'''
legend_element = folium.Element(legend_html)
map_root = heatmap_map.get_root()
map_root.html.add_child(legend_element)

# Step 11: Write the map to a standalone HTML file and report where it went
output_file = 'heatmap_with_hotspots.html'
heatmap_map.save(output_file)
print(f"Heatmap with hotspots saved as '{output_file}'")
