In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KernelDensity
import folium
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
import sys
!{sys.executable} -m pip install scikit-learn folium

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting folium
  Using cached folium-0.20.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting branca>=0.6.0 (from folium)
  Using cached branca-0.8.2-py3-none-any.whl.metadata (1.7 kB)
Collecting xyzservices (from folium)
  Using cached xyzservices-2025.11.0-py3-none-any.whl.metadata (4.3 kB)
Using cached scikit_learn-1.8.0-cp311-cp311-win_amd64.whl (8.1 MB)
Using cached folium-0.20.0-py2.py3-none-any.whl (113 kB)
Using cached branca-0.8.2-py3-none-any.whl (26 kB)
Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Using cached scipy-1.16.3-cp31

In [4]:
import os

In [6]:


# ÿßÿµŸÑÿßÿ≠ ŸÜÿßŸÖ ŸÅÿß€åŸÑ ÿ®Ÿá cleaned_data.csv (ŸÜÿßŸÖ€å ⁄©Ÿá preprocess.py ÿ™ŸàŸÑ€åÿØ ŸÖ€å‚Äå⁄©ŸÜÿØ)
INPUT_FILE = '../data/cleaned_data.csv'
OUTPUT_FILE = '../data/analyzed_data.csv'

if os.path.exists(INPUT_FILE):
    df = pd.read_csv(INPUT_FILE)
    print(f"‚úÖ Data loaded successfully. Shape: {df.shape}")
else:
    print(f"‚ùå Error: File not found at {INPUT_FILE}")
    print("Please run 'preprocess.py' first.")

‚úÖ Data loaded successfully. Shape: (12818, 53)


In [7]:
print("Running DBSCAN clustering...")

# ⁄Øÿ±ŸÅÿ™ŸÜ ŸÖÿÆÿ™ÿµÿßÿ™ (ÿ™Ÿàÿ¨Ÿá: ÿ™ÿ±ÿ™€åÿ® Longitude Ÿà Latitude ÿ®ÿ±ÿß€å ÿ®ÿ±ÿÆ€å ÿßŸÑ⁄ØŸàÿ±€åÿ™ŸÖ‚ÄåŸáÿß ŸÖŸáŸÖ ÿßÿ≥ÿ™)
coords = df[['Longitude', 'Latitude']].values

# ÿ™ŸÜÿ∏€åŸÖÿßÿ™ DBSCAN
# eps=0.01: ÿ¥ÿπÿßÿπ ŸáŸÖÿ≥ÿß€å⁄Ø€å (ÿ≠ÿØŸàÿØ €± ⁄©€åŸÑŸàŸÖÿ™ÿ±)
# min_samples=10: ÿ≠ÿØÿßŸÇŸÑ ÿ™ÿπÿØÿßÿØ ÿ™ÿµÿßÿØŸÅ ÿ®ÿ±ÿß€å ÿ™ÿ¥⁄©€åŸÑ €å⁄© ⁄©ÿßŸÜŸàŸÜ ÿÆÿ∑ÿ±
db = DBSCAN(eps=0.01, min_samples=10).fit(coords)

# ÿßŸÅÿ≤ŸàÿØŸÜ ÿ≥ÿ™ŸàŸÜ Cluster ÿ®Ÿá ÿØ€åÿ™ÿßŸÅÿ±€åŸÖ
df['Cluster'] = db.labels_

# ⁄Øÿ≤ÿßÿ±ÿ¥ ŸÜÿ™ÿß€åÿ¨
n_clusters = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
n_noise = list(db.labels_).count(-1)

print(f"üìç Total Clusters found: {n_clusters}")
print(f"üóëÔ∏è Noise points (outliers): {n_noise}")

Running DBSCAN clustering...
üìç Total Clusters found: 111
üóëÔ∏è Noise points (outliers): 10182


In [8]:
print("Running KDE for density estimation...")

# ÿßÿ¨ÿ±ÿß€å KDE ÿ®ÿß ⁄©ÿ±ŸÜŸÑ ⁄ØŸàÿ≥€å
kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(coords)

# ŸÖÿ≠ÿßÿ≥ÿ®Ÿá ÿßŸÖÿ™€åÿßÿ≤ ŸÑ⁄Øÿßÿ±€åÿ™ŸÖ€å
log_dens = kde.score_samples(coords)

# ÿ™ÿ®ÿØ€åŸÑ ÿ®Ÿá ⁄Ü⁄ØÿßŸÑ€å ŸÜÿ±ŸÖÿßŸÑ Ÿà ÿ∞ÿÆ€åÿ±Ÿá ÿØÿ± ÿ≥ÿ™ŸàŸÜ Density
df['Density'] = np.exp(log_dens)

print("‚úÖ KDE calculation complete. 'Density' column added.")
print(df[['Latitude', 'Longitude', 'Cluster', 'Density']].head())

Running KDE for density estimation...
‚úÖ KDE calculation complete. 'Density' column added.
    Latitude  Longitude  Cluster   Density
0  53.488762  -1.117674       -1  0.388904
1  53.488762  -1.117674       -1  0.388904
2  51.508536   0.063809       -1  1.367807
3  51.547797  -0.393649      110  1.125135
4  51.492533  -0.416336       -1  1.621488


In [9]:
df.to_csv(OUTPUT_FILE, index=False)
print(f"‚úÖ Analyzed data saved to: {OUTPUT_FILE}")

‚úÖ Analyzed data saved to: ../data/analyzed_data.csv


In [10]:
# ÿ≥ÿßÿÆÿ™ ŸÜŸÇÿ¥Ÿá ÿ®ÿß ÿ™ŸÖÿ±⁄©ÿ≤ ÿ®ÿ± ŸÖ€åÿßŸÜ⁄Ø€åŸÜ ŸÖÿÆÿ™ÿµÿßÿ™ ÿØÿßÿØŸá‚ÄåŸáÿß
m = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=6)

# ŸÅ€åŸÑÿ™ÿ± ⁄©ÿ±ÿØŸÜ ŸÜŸÇÿßÿ∑ ŸÜŸà€åÿ≤ (ŸÅŸÇÿ∑ ŸÜŸÖÿß€åÿ¥ ŸÜŸÇÿßÿ∑ ÿÆÿ∑ÿ±ŸÜÿß⁄©)
high_risk_points = df[df['Cluster'] != -1]

print(f"üó∫Ô∏è Plotting {len(high_risk_points)} high-risk points...")

for _, row in high_risk_points.iterrows():
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=3,
        color='red',
        fill=True,
        fill_color='darkred',
        fill_opacity=0.7,
        popup=f"Cluster: {row['Cluster']} | Density: {row['Density']:.4f}"
    ).add_to(m)

# ŸÜŸÖÿß€åÿ¥ ŸÜŸÇÿ¥Ÿá
m

üó∫Ô∏è Plotting 2636 high-risk points...
