<a href="https://colab.research.google.com/github/anjalijadhav002/Zeotap_Datascience_Task/blob/main/Anjali_Jadhav_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load inputs and build one feature row per customer ---------------------
# Both CSV files are read from the current working directory; 'CustomerID' is
# the key used for every merge and group-by below.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach the customer attributes to each transaction. A left merge keeps every
# transaction row even when its CustomerID has no match in `customers`.
merged_data = transactions.merge(customers, how='left', on='CustomerID')

# Collapse the transactions to a single row per customer:
#   Quantity   -> sum over the customer's transactions
#   TotalValue -> sum over the customer's transactions
#   Price      -> mean over the customer's transactions
# Only customers that appear in `transactions` end up in this table.
customer_data = (
    merged_data
    .groupby('CustomerID')
    .agg(
        Quantity=('Quantity', 'sum'),
        TotalValue=('TotalValue', 'sum'),
        Price=('Price', 'mean'),
    )
    .reset_index()
)

# One-hot encode Region (columns come out as 'Region_<value>') and join the
# indicator columns onto the per-customer aggregates.
region_dummies = pd.get_dummies(customers[['CustomerID', 'Region']], columns=['Region'])
customer_data = customer_data.merge(region_dummies, how='left', on='CustomerID')


# --- Scale the numeric aggregates and assemble the clustering matrix --------
# The three aggregate columns are standardised in place so that no single one
# dominates the Euclidean distances used by KMeans.
numerical_features = ['Quantity', 'TotalValue', 'Price']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_data[numerical_features])
customer_data[numerical_features] = scaled_values

# Every column of `region_dummies` after the first is used as a region
# indicator (the first column is skipped; presumably it is 'CustomerID').
region_features = region_dummies.columns[1:].tolist()
features = customer_data[numerical_features + region_features]


# --- Model selection: choose k by the Davies-Bouldin index ------------------
# For every candidate k, fit KMeans on `features` and record the
# Davies-Bouldin index of the resulting labelling (lower is better).
# Each fitted model is kept so the winning one can be reused afterwards
# instead of being refit from scratch.
db_scores = []
k_values = range(2, 11)
fitted_models = {}
for k in k_values:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4 (and warns about it on 1.2/1.3), so leaving it
    # unset makes the clustering depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(features)
    db_index = davies_bouldin_score(features, labels)
    db_scores.append(db_index)
    fitted_models[k] = kmeans


# Plot the index against k to make the choice visually inspectable.
plt.figure(figsize=(10, 6))
plt.plot(k_values, db_scores, marker='o', linestyle='--', color='b')
plt.title("Davies-Bouldin Index vs. Number of Clusters")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Davies-Bouldin Index")
plt.xticks(k_values)
plt.grid()
plt.show()


# The best k is the one with the smallest index; on ties the smallest k wins
# because list.index returns the first occurrence.
optimal_k = k_values[db_scores.index(min(db_scores))]
print(f"Optimal Number of Clusters: {optimal_k}")

# Reuse the model that was already fitted and scored for optimal_k. This
# guarantees the exported labels are exactly the ones whose Davies-Bouldin
# score was computed above, and avoids a redundant fit.
kmeans = fitted_models[optimal_k]
customer_data['Cluster'] = kmeans.labels_


# Export the CustomerID -> Cluster assignment.
# NOTE(review): 'FirstName_LastName' looks like an unfilled template
# placeholder -- confirm the intended output file name.
customer_data[['CustomerID', 'Cluster']].to_csv('FirstName_LastName_Clustering.csv', index=False)


from sklearn.decomposition import PCA

# --- 2-D visualisation of the clusters --------------------------------------
# Project the clustering features onto their first two principal components
# and store the coordinates next to the cluster labels for plotting.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(features)
first_component, second_component = pca_features.T
customer_data['PCA1'] = first_component
customer_data['PCA2'] = second_component

# Scatter the customers in PCA space, coloured by their assigned cluster.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=customer_data,
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title("Customer Segmentation (PCA Visualization)")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
# Anchor the legend just outside the right edge so it does not cover points.
ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


# Final summary: the chosen number of clusters and the lowest Davies-Bouldin
# index recorded in `db_scores`.
summary_lines = [
    "\nClustering Metrics:",
    f"Optimal Number of Clusters: {optimal_k}",
    f"Davies-Bouldin Index: {min(db_scores)}",
]
print("\n".join(summary_lines))