In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Load the taxi-trip CSV into a DataFrame. The path points under
# /content/drive (presumably a Colab-mounted Google Drive — confirm before
# running elsewhere). The final expression previews the first five rows.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/NewYorkCityTaxiTripDuration.csv",
)
df.head(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [3]:
# Clustering features: only the pickup coordinates, latitude first and
# longitude second (later positional indexing relies on this column order).
X = df.loc[:, ["pickup_latitude", "pickup_longitude"]]
print(X.head(5))

   pickup_latitude  pickup_longitude
0        40.767937        -73.982155
1        40.738564        -73.980415
2        40.763939        -73.979027
3        40.719971        -74.010040
4        40.793209        -73.973053


In [4]:
from sklearn.preprocessing import StandardScaler
# Standardise both coordinate columns with a default StandardScaler so that
# the DBSCAN eps values below are expressed in scaled units, not degrees.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Quick sanity check of the first five scaled rows.
print(X_scaled[0:5])

[[ 0.51749362 -0.12226117]
 [-0.37581901 -0.09772722]
 [ 0.39591029 -0.07814311]
 [-0.94127431 -0.51555758]
 [ 1.28609052  0.00611164]]


In [None]:
from sklearn.cluster import DBSCAN
# Three DBSCAN experiments that differ only in the neighbourhood radius
# (eps = 0.2, 0.3, 0.5); min_samples is held at 5 for all of them.
# NOTE(review): the rendered cell has no execution count, so whether these
# fits complete on the full dataset is unverified — confirm runtime/memory.
db1, db2, db3 = (DBSCAN(eps=radius, min_samples=5) for radius in (0.2, 0.3, 0.5))

# fit_predict returns one label per row of X_scaled.
labels_1 = db1.fit_predict(X_scaled)
labels_2 = db2.fit_predict(X_scaled)
labels_3 = db3.fit_predict(X_scaled)

In [None]:
def evaluate(labels, name):
    """Print a short summary of one clustering run.

    Parameters
    ----------
    labels : array-like of int
        One cluster label per sample; the value -1 is treated as noise.
        Lists and numpy arrays are both accepted, and the sequence may be
        empty.
    name : str
        Heading printed above the summary.

    Returns
    -------
    None
        The summary (cluster count excluding noise, number of noise points,
        and noise ratio rounded to 3 decimals) is written to stdout.
    """
    labels = np.asarray(labels)
    total = labels.size

    is_noise = labels == -1
    noise_points = int(np.count_nonzero(is_noise))
    # Distinct labels among the non-noise samples only.
    n_clusters = int(np.unique(labels[~is_noise]).size)
    # Guard the division so an empty labelling reports 0.0 instead of raising.
    noise_ratio = noise_points / total if total else 0.0

    print("\n", name)
    print("Clusters (excluding noise):", n_clusters)
    print("Noise points:", noise_points)
    print("Noise ratio:", round(noise_ratio, 3))

# Summarise each DBSCAN experiment in the order the models were fitted.
for _labels, _heading in (
    (labels_1, "Experiment 1 (eps=0.2)"),
    (labels_2, "Experiment 2 (eps=0.3)"),
    (labels_3, "Experiment 3 (eps=0.5)"),
):
    evaluate(_labels, _heading)


In [None]:
from sklearn.metrics import silhouette_score

def silhouette_calc(labels, name, sample_size=None, random_state=None):
    """Print and return the silhouette score of the non-noise samples.

    Samples labelled -1 (noise) are dropped before scoring. The feature rows
    are read from the notebook-level ``X_scaled`` array, so ``labels`` must
    align with it row for row.

    Parameters
    ----------
    labels : array-like of int
        One cluster label per row of ``X_scaled``; lists and numpy arrays
        are both accepted.
    name : str
        Prefix for the printed line.
    sample_size : int or None, optional
        Forwarded to ``silhouette_score``. When set, the score is estimated
        on a random subset instead of every retained sample. ``None`` (the
        default) scores all retained samples, as before.
    random_state : int, RandomState or None, optional
        Forwarded to ``silhouette_score``; only relevant with ``sample_size``.

    Returns
    -------
    float or int
        The silhouette score, or the sentinel ``-1`` when fewer than two
        clusters remain after removing noise (the score is undefined then).
    """
    # Coerce first: comparing a plain list with -1 is not elementwise.
    labels = np.asarray(labels)
    clustered = labels != -1
    kept_labels = labels[clustered]

    # Silhouette needs at least two distinct clusters.
    if np.unique(kept_labels).size <= 1:
        print(name, "Silhouette Score: Not Applicable")
        return -1

    score = silhouette_score(
        X_scaled[clustered],
        kept_labels,
        sample_size=sample_size,
        random_state=random_state,
    )
    print(name, "Silhouette Score:", round(score, 3))
    return score

# Score the three experiments in order; map() is consumed left to right by
# the unpacking, so the printed lines appear as Experiment 1, 2, 3.
s1, s2, s3 = map(
    silhouette_calc,
    (labels_1, labels_2, labels_3),
    ("Experiment 1", "Experiment 2", "Experiment 3"),
)


In [None]:
import matplotlib.pyplot as plt

def plot_clusters(labels, title):
    """Scatter-plot the pickup points, one colour per cluster label.

    Reads the notebook-level ``X`` frame positionally: column 1 is drawn on
    the x-axis (labelled "Longitude") and column 0 on the y-axis (labelled
    "Latitude"). Points labelled -1 are drawn in black as "Noise"; every
    other label gets matplotlib's next default colour.

    Parameters
    ----------
    labels : numpy.ndarray of int
        One label per row of ``X``.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(6, 5))

    for label in set(labels):
        members = labels == label

        # Noise gets a fixed colour and legend entry; clusters use defaults.
        if label == -1:
            extra = {"c": "black", "label": "Noise"}
        else:
            extra = {"label": f"Cluster {label}"}

        ax.scatter(X.iloc[members, 1], X.iloc[members, 0], s=5, **extra)

    ax.set_title(title)
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.legend()
    plt.show()

# One figure per experiment, in the same order as the fits.
for _labels, _title in (
    (labels_1, "DBSCAN eps=0.2"),
    (labels_2, "DBSCAN eps=0.3"),
    (labels_3, "DBSCAN eps=0.5"),
):
    plot_clusters(_labels, _title)


In [None]:
# Map each tested eps value to the silhouette score of its experiment.
results = dict(zip((0.2, 0.3, 0.5), (s1, s2, s3)))

# Pick the eps with the highest score; on ties max() keeps the first key.
# NOTE(review): a run reported as "Not Applicable" is stored as -1 and still
# takes part in this comparison — confirm that is the intended ranking.
best_eps = max(results.keys(), key=lambda eps: results[eps])
print("\nBest eps value =", best_eps)
