In [15]:
import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler


In [16]:
# Load the cleaned dataset from an Excel workbook located via a relative path.
# The whole frame is handed to RobustScaler in the next cell, so every column is
# presumably numeric already — TODO confirm against the cleaning step.
df = pd.read_excel("smartseg_cleaned.xlsx")

In [17]:
# The clustering cell further down appends `segment_id` and `outlier_flag` to
# `df`. Those are results, not features, so they are dropped here; without this
# the cell would scale the cluster labels as inputs when it is re-run.
derived_cols = ['segment_id', 'outlier_flag']
feature_df = df.drop(columns=derived_cols, errors='ignore')

# Scale features with median/IQR statistics (RobustScaler).
scaler = RobustScaler()
X_scaled = scaler.fit_transform(feature_df)

# Reduce to three principal components. random_state only matters when
# scikit-learn selects a randomized SVD solver, but pinning it keeps reruns identical.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X_scaled)

In [25]:
# Fit the 3-component PCA on the scaled features. This `pca` instance is the one
# that is clustered on and persisted later in the notebook.
# random_state only matters when scikit-learn selects a randomized SVD solver,
# but pinning it keeps the projection identical across reruns.
pca = PCA(n_components=3, random_state=42)
pca_data = pca.fit_transform(X_scaled)

# Wrap the projection in a DataFrame with named component columns.
pca_df = pd.DataFrame(pca_data, columns=['PCA1', 'PCA2', 'PCA3'])

# NOTE(review): in the recorded run the first component alone explains ~99.8% of
# the variance, which suggests a single feature may dominate after scaling — verify.
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance captured:", round(sum(pca.explained_variance_ratio_) * 100, 2), "%")


Explained variance ratio: [9.98020230e-01 8.27557376e-04 2.91180052e-04]
Total variance captured: 99.91 %


In [26]:
def find_best_eps(X, min_samples=5):
    """Estimate a DBSCAN ``eps`` from the sorted k-distance curve.

    For each row of ``X`` the distance to the farthest of its ``min_samples``
    nearest neighbours is taken. ``X`` is passed explicitly as the query set,
    so each row is included among its own neighbours, which matches DBSCAN's
    convention that ``min_samples`` counts the point itself. The distances are
    sorted ascending and the value just before the largest jump between
    consecutive distances is returned.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to analyse.
    min_samples : int, default=5
        Neighbourhood size; should equal the ``min_samples`` later given to DBSCAN.

    Returns
    -------
    float
        The selected distance rounded to 2 decimals. The rounded value can be
        0.0 for very tight data, which DBSCAN rejects, so callers should
        range-check the result.

    Raises
    ------
    ValueError
        If ``X`` has fewer than 2 rows or fewer rows than ``min_samples``.
    """
    n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
    if n_samples < 2:
        # np.diff would be empty and np.argmax would fail with an opaque message.
        raise ValueError(
            f"find_best_eps needs at least 2 samples to locate a jump, got {n_samples}"
        )
    if min_samples > n_samples:
        raise ValueError(
            f"min_samples={min_samples} exceeds the number of samples ({n_samples})"
        )

    neigh = NearestNeighbors(n_neighbors=min_samples)
    nbrs = neigh.fit(X)
    distances, _ = nbrs.kneighbors(X)

    # Last column holds the distance to the farthest of the k neighbours.
    k_distances = np.sort(distances[:, -1])
    jumps = np.diff(k_distances)
    eps = k_distances[np.argmax(jumps)]
    return round(float(eps), 2)
    
# Bounds and fallback are the notebook's original hard-coded values, now named.
# An estimate outside [EPS_MIN, EPS_MAX] is replaced by EPS_FALLBACK.
EPS_MIN, EPS_MAX, EPS_FALLBACK = 1, 10, 3

# min_samples here must match the value passed to DBSCAN in the clustering cell.
best_eps = find_best_eps(pca_df, min_samples=5)
if not (EPS_MIN <= best_eps <= EPS_MAX):  # this form also rejects NaN
    # Report the override so the fallback is never applied silently.
    print(f"Estimated eps {best_eps} is outside [{EPS_MIN}, {EPS_MAX}]; using fallback {EPS_FALLBACK}")
    best_eps = EPS_FALLBACK
print(f"🔍 Auto-selected (adjusted) eps: {best_eps}")


🔍 Auto-selected (adjusted) eps: 3.87


In [27]:
# Cluster the PCA-projected points with the eps chosen in the previous cell.
dbscan = DBSCAN(min_samples=5, eps=best_eps)
labels = dbscan.fit_predict(pca_df)

# Record each row's cluster id and flag rows labelled -1 (noise).
df['outlier_flag'] = labels == -1
df['segment_id'] = labels

# Rows per label, ordered by label so that -1 (if present) comes first.
cluster_counts = df['segment_id'].value_counts().sort_index()

# Count the distinct non-noise labels and the rows labelled as noise.
num_clusters = sum(1 for segment in cluster_counts.index if segment != -1)
num_noise = cluster_counts.get(-1, 0)

print("\nCluster counts:", cluster_counts, sep="\n")
print(f"\nTotal clusters (excluding noise): {num_clusters}")
print(f"Total noise points: {num_noise}")



Cluster counts:
segment_id
-1       1
 0    9333
 1     668
Name: count, dtype: int64

Total clusters (excluding noise): 2
Total noise points: 1


In [30]:
# Silhouette is computed on non-noise rows only (label -1 is excluded).
# NOTE(review): scoring uses X_scaled (full scaled feature space) while the
# clustering was fitted on pca_df — confirm that this mismatch is intended.
mask = labels != -1
n_found = len(set(labels[mask]))
if n_found > 1:
    score = silhouette_score(X_scaled[mask], labels[mask])
    print(f"🧭 Silhouette Score: {score:.3f}")
else:
    # Fewer than two clusters remain, so the score is not computed.
    score = None
    print("⚠️ Clusters too small or only noise present — silhouette score unavailable.")


🧭 Silhouette Score: 0.543


In [31]:
# Bundle the already-fitted scaler, PCA and DBSCAN objects so they can be
# reloaded together from one file.
# NOTE(review): DBSCAN provides fit/fit_predict but no predict() or transform(),
# so the loaded pipeline cannot assign segments to unseen rows without refitting
# every step (pipeline.fit_predict). Only the preprocessing steps are reusable
# as-is, e.g. pipeline[:-1].transform(new_rows).
pipeline = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('dbscan', dbscan),
])

# Persist to the working directory; load only from trusted sources (pickle format).
joblib.dump(pipeline, 'customer_segmentation.pkl')
print("✅ Model pipeline saved successfully!")


✅ Model pipeline saved successfully!
