In [None]:
import pandas as pd

# Read the airline spreadsheet into a DataFrame
df = pd.read_excel('/content/EastWestAirlines.xlsx')

# Print a preview of the leading rows, followed by the dtype of every column
for summary in (df.head(), df.dtypes):
    print(summary)


In [None]:
# Coerce every column to a numeric dtype; any value that cannot be parsed
# becomes NaN instead of raising.
df_converted = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Show the resulting dtypes, then a preview of the converted rows
for summary in (df_converted.dtypes, df_converted.head()):
    print(summary)


In [None]:
# Two-step clean-up of the coerced frame:
#   1. remove columns in which every value is NaN,
#   2. remove rows that still contain at least one NaN.
df_clean = (
    df_converted
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='any')
)

# Preview what survived the clean-up
print(df_clean.head())


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the cleaned features, but only when cleaning left some data behind.
# NOTE: `scaled_data` is assigned only in the non-empty branch of this cell.
if df_clean.empty:
    print("No numeric data available for scaling.")
else:
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df_clean)

    # Report the dimensions of the scaled array
    print(f"Scaled Data Shape: {scaled_data.shape}")


In [None]:
# Report how many rows remain in the cleaned dataset
print("Number of samples: {}".format(len(df_clean)))


In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# Elbow method: fit K-Means once per candidate cluster count and record the inertia.
# The candidate range is capped at 10, or at the row count of df_clean if smaller.
max_clusters = min(df_clean.shape[0], 10)
K = range(1, max_clusters + 1)
inertia = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Draw inertia against the number of clusters
fig, ax = plt.subplots()
ax.plot(K, inertia, 'bo-')
ax.set_title('Elbow Method For Optimal K')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()

# Final K-Means fit with the chosen cluster count
optimal_k = 2  # Hand-picked value; revise after inspecting the elbow curve
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans_labels = kmeans.fit_predict(scaled_data)

# Silhouette score of the final K-Means labelling
silhouette_kmeans = silhouette_score(scaled_data, kmeans_labels)
print(f'Silhouette Score for K-Means: {silhouette_kmeans:.2f}')


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Build the linkage matrix with Ward's method
# (other linkage methods include 'complete', 'average', etc.)
linked = linkage(scaled_data, method='ward')

# Render the dendrogram for the linkage matrix
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(linked, ax=ax)
ax.set_title('Dendrogram')
ax.set_xlabel('Samples')
ax.set_ylabel('Euclidean distances')
plt.show()

# Agglomerative clustering into 3 clusters.
# The distance is passed as `metric`, the keyword newer scikit-learn versions
# use in place of `affinity`.
hierarchical = AgglomerativeClustering(
    n_clusters=3,
    metric='euclidean',
    linkage='ward',
)
hierarchical_labels = hierarchical.fit_predict(scaled_data)

# Silhouette score of the hierarchical labelling
silhouette_hierarchical = silhouette_score(scaled_data, hierarchical_labels)
print(f'Silhouette Score for Hierarchical Clustering: {silhouette_hierarchical:.2f}')


In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Apply DBSCAN (experiment with different epsilon and min_samples)
dbscan = DBSCAN(eps=0.5, min_samples=2)  # Adjust eps and min_samples as needed
dbscan_labels = dbscan.fit_predict(scaled_data)

# DBSCAN labels noise points as -1. Noise is not a cluster, so it must not be
# counted when deciding whether more than one cluster exists, and it must not
# be scored as if it were a cluster of its own.
clustered_mask = dbscan_labels != -1
n_dbscan_clusters = len(set(dbscan_labels[clustered_mask]))

# Silhouette Score for DBSCAN, computed over clustered (non-noise) points only
# and only when at least two real clusters were found.
if n_dbscan_clusters > 1:
    silhouette_dbscan = silhouette_score(
        scaled_data[clustered_mask],
        dbscan_labels[clustered_mask],
    )
    print(f'Silhouette Score for DBSCAN: {silhouette_dbscan:.2f}')
else:
    print("DBSCAN did not form clusters or only formed one cluster.")


In [None]:
# Report the dimensions of the scaled feature array
print("Scaled Data Shape: {}".format(scaled_data.shape))


In [None]:
# Single feature visualization for K-Means: the first scaled feature plotted
# against row position, coloured by K-Means label.
# seaborn is imported in this cell because no earlier visible cell brings `sns`
# into scope; without it the cell fails with NameError on a fresh kernel.
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=range(len(scaled_data)),
    y=scaled_data[:, 0],
    hue=kmeans_labels,
    palette='Set1',
    ax=ax,
)
ax.set_title('K-Means Clustering (Single Feature)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Feature Value')
ax.legend(title='K-Means Clusters')
plt.show()


In [None]:
# Single feature visualization for Hierarchical Clustering: the first scaled
# feature plotted against row position, coloured by hierarchical label.
# seaborn is imported in this cell because no earlier visible cell brings `sns`
# into scope; without it the cell fails with NameError on a fresh kernel.
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=range(len(scaled_data)),
    y=scaled_data[:, 0],
    hue=hierarchical_labels,
    palette='Set2',
    ax=ax,
)
ax.set_title('Hierarchical Clustering (Single Feature)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Feature Value')
ax.legend(title='Hierarchical Clusters')
plt.show()


In [None]:
# Single feature visualization for DBSCAN Clustering: the first scaled feature
# plotted against row position, coloured by DBSCAN label (-1 marks noise).
# seaborn is imported in this cell because no earlier visible cell brings `sns`
# into scope; without it the cell fails with NameError on a fresh kernel.
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=range(len(scaled_data)),
    y=scaled_data[:, 0],
    hue=dbscan_labels,
    palette='Set3',
    ax=ax,
)
ax.set_title('DBSCAN Clustering (Single Feature)')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Feature Value')
ax.legend(title='DBSCAN Clusters')
plt.show()
