# Clustering Analysis 

# 1. Data Preprocessing

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Preprocessing pipeline: load the workbook, mean-impute missing values,
# drop rows flagged as outliers, then standardize what is left. The final
# result is exposed as `df_scaled` for the clustering cells below.
# NOTE(review): machine-specific absolute path -- consider a relative path.
DATA_PATH = r"C:\Users\G.S.AZARUDDIN\Downloads\EastWestAirlines.xlsx"

# The source is an Excel workbook, so it has to go through the Excel reader;
# pd.read_csv cannot parse .xlsx content.
# NOTE(review): presumably the records live on a sheet named "data"; when no
# such sheet exists the first sheet is used instead -- confirm against the file.
with pd.ExcelFile(DATA_PATH) as workbook:
    sheet_name = "data" if "data" in workbook.sheet_names else workbook.sheet_names[0]
    df = workbook.parse(sheet_name)

# Mean imputation and scaling are only defined for numeric columns.
# NOTE(review): if the sheet carries a pure identifier column it should
# probably be dropped here so it does not act as a clustering feature.
numeric_df = df.select_dtypes(include="number")

imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(
    imputer.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

# random_state pins the forest so the same rows are removed on every run.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = iso_forest.fit_predict(df_imputed)
# fit_predict returns 1 for inliers and -1 for outliers; keep the inliers.
df_clean = df_imputed[outliers == 1]

scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_clean), columns=df_clean.columns)

# 2. Implementing Clustering Algorithms

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Elbow analysis followed by the final K-Means fit.
# Always cluster on the feature columns only: once a 'Cluster' column has been
# written to df_scaled (by this cell or a later one), re-running the cell
# would otherwise feed the labels back in as a feature.
kmeans_features = df_scaled.drop(columns='Cluster', errors='ignore')

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across environments.
K_range = range(1, 11)
inertia = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(kmeans_features).inertia_
    for k in K_range
]

plt.figure(figsize=(8, 6))
plt.plot(K_range, inertia, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

# NOTE(review): hard-coded; presumably read off the elbow plot above.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
df_scaled['Cluster'] = kmeans.fit_predict(kmeans_features)

# The labels also stay available as kmeans.labels_ after 'Cluster' is reused.
silhouette_avg = silhouette_score(kmeans_features, df_scaled['Cluster'])
print(f'Silhouette Score for K-Means: {silhouette_avg}')


# b. DBSCAN Clustering

In [None]:
from sklearn.cluster import DBSCAN
# DBSCAN on the scaled features.
# df_scaled already carries the K-Means labels in 'Cluster'; they must not be
# part of the distance computation, so strip the column before fitting.
dbscan_features = df_scaled.drop(columns='Cluster', errors='ignore')

# NOTE(review): eps/min_samples are untuned defaults-style values -- confirm
# they give a sensible number of clusters for this data.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(dbscan_features)

# Overwrites the K-Means labels in 'Cluster'; label -1 marks noise points.
df_scaled['Cluster'] = dbscan_labels

# silhouette_score is only defined for 2 <= n_labels <= n_samples - 1, and
# DBSCAN may return a single label (e.g. everything flagged as noise).
n_dbscan_labels = len(set(dbscan_labels))
if 2 <= n_dbscan_labels <= len(dbscan_features) - 1:
    silhouette_avg = silhouette_score(dbscan_features, dbscan_labels)
    print(f'Silhouette Score for DBSCAN: {silhouette_avg}')
else:
    silhouette_avg = float('nan')
    print(f'Silhouette Score for DBSCAN: undefined ({n_dbscan_labels} distinct label(s) found)')



# 3. Cluster Analysis and Interpretation

In [None]:
# Text summary of both models: the fitted K-Means centroids, then how many
# rows carry each label in the 'Cluster' column (last written by DBSCAN).
centers = kmeans.cluster_centers_
print("K-Means Cluster Centers:", centers, sep="\n")

label_counts = df_scaled['Cluster'].value_counts()
print("DBSCAN Cluster Counts:", label_counts, sep="\n")


# 4. Visualization

# K-Means

In [None]:
# Scatter of the first two columns coloured by K-Means label.
# df_scaled['Cluster'] was overwritten with the DBSCAN labels earlier, so the
# K-Means assignment is taken from the fitted model instead of that column.
kmeans_plot_df = df_scaled.assign(Cluster=kmeans.labels_)

plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=kmeans_plot_df,
    x=kmeans_plot_df.columns[0],
    y=kmeans_plot_df.columns[1],
    hue='Cluster',
    palette='viridis',
)
plt.title('K-Means Clustering')
plt.show()

# Hierarchical Clustering

In [None]:
# Scatter of the first two columns coloured by hierarchical-cluster label.
# No hierarchical model is fitted earlier in the file, and 'Cluster' holds the
# DBSCAN labels at this point, so the model is fitted here on the feature
# columns only.
from sklearn.cluster import AgglomerativeClustering

hier_features = df_scaled.drop(columns='Cluster', errors='ignore')

# NOTE(review): reuses optimal_k so the plot is comparable with K-Means;
# confirm the cluster count (e.g. with a dendrogram) before drawing conclusions.
hierarchical = AgglomerativeClustering(n_clusters=optimal_k)
hier_plot_df = hier_features.assign(Cluster=hierarchical.fit_predict(hier_features))

plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=hier_plot_df,
    x=hier_plot_df.columns[0],
    y=hier_plot_df.columns[1],
    hue='Cluster',
    palette='viridis',
)
plt.title('Hierarchical Clustering')
plt.show()

# DBSCAN

In [None]:
# Scatter of the first two columns of df_scaled, coloured by the 'Cluster'
# column (which holds the labels written by the DBSCAN cell).
x_col, y_col = df_scaled.columns[:2]

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=df_scaled,
    x=x_col,
    y=y_col,
    hue='Cluster',
    palette='viridis',
    ax=ax,
)
ax.set_title('DBSCAN Clustering')
plt.show()

# 5. Evaluation and Performance Metrics

In [None]:
from sklearn.metrics import silhouette_score
# Silhouette comparison of the two models on the same feature matrix.
# Labels are read from the fitted estimators: the shared 'Cluster' column only
# holds whichever model wrote to it last, so using it for both scores would
# report the same number twice.
eval_features = df_scaled.drop(columns='Cluster', errors='ignore')

# K-Means
silhouette_kmeans = silhouette_score(eval_features, kmeans.labels_)
print(f'Silhouette Score for K-Means: {silhouette_kmeans}')

# DBSCAN
# silhouette_score needs 2 <= n_labels <= n_samples - 1; DBSCAN can return a
# single label (for instance when every point is marked as noise, -1).
n_dbscan_labels = len(set(dbscan.labels_))
if 2 <= n_dbscan_labels <= len(eval_features) - 1:
    silhouette_dbscan = silhouette_score(eval_features, dbscan.labels_)
    print(f'Silhouette Score for DBSCAN: {silhouette_dbscan}')
else:
    silhouette_dbscan = float('nan')
    print(f'Silhouette Score for DBSCAN: undefined ({n_dbscan_labels} distinct label(s) found)')