In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Load the dataset
# The CSV location is kept in one named constant so the notebook can be
# pointed at a different copy of the file without editing the load logic.
# NOTE(review): '/content' looks like a hosted-notebook working directory --
# adjust DATA_PATH when running elsewhere.
DATA_PATH = '/content/wine.csv'
df = pd.read_csv(DATA_PATH)

# Check the first few rows (left as the last expression so the notebook
# renders it as a rich table)
df.head()


In [None]:
# Exploratory overview of `df`: summary table, per-column histograms,
# correlation heatmap, and a pairwise scatter matrix.

# Summary statistics, emitted as plain text
summary_stats = df.describe()
print(summary_stats)

# One histogram per column, 20 bins each, on a single large canvas
df.hist(figsize=(20, 15), bins=20)
plt.show()

# Annotated heatmap of the pairwise correlations between columns
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

# Pairwise scatter matrix over the whole frame (no column subset is applied)
sns.pairplot(data=df)
plt.show()


In [None]:
# Standardize the data
# A later cell writes a 'Cluster_Original' column back into `df`. Dropping it
# here (a no-op when it is absent) keeps this cell idempotent: re-running it
# after clustering never feeds the cluster labels back in as a feature.
feature_frame = df.drop(columns=['Cluster_Original'], errors='ignore')
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)

# Implement PCA with every component retained, to inspect explained variance
pca = PCA()
pca_data = pca.fit_transform(scaled_data)

# Scree plot to determine optimal number of components
# The x values start at 1 so each point lines up with the number of
# components it accounts for (the implicit index would start at 0).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(8, 6))
plt.plot(component_counts, cumulative_variance, marker='o')
plt.xticks(component_counts)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot')
plt.grid(True)
plt.show()

# Choose optimal number of components (e.g., 2)
# `pca` and `pca_data` are rebound here; downstream cells see the
# 2-component projection, not the full decomposition above.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)


In [None]:
# Apply KMeans on the original dataset (the standardized feature matrix)
# n_init is pinned explicitly: scikit-learn's default moved from 10 to 'auto'
# in version 1.4 (with a FutureWarning in 1.2-1.3), so relying on the default
# can give different clusterings depending on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters_original = kmeans.fit_predict(scaled_data)

# Add cluster labels to the dataframe
# NOTE: this mutates `df` in place; any cell that treats every column of `df`
# as a feature must exclude 'Cluster_Original' if it is re-run afterwards.
df['Cluster_Original'] = clusters_original

# Visualize the clustering results
# Only the first two columns of `df` are used as plot axes; the clustering
# itself was fitted on all of `scaled_data`.
sns.scatterplot(x=df.iloc[:, 0], y=df.iloc[:, 1], hue=df['Cluster_Original'], palette='Set1')
plt.title('K-Means Clustering (Original Data)')
plt.show()

# Evaluate clustering performance on the same matrix the model was fitted on
silhouette_original = silhouette_score(scaled_data, clusters_original)
davies_bouldin_original = davies_bouldin_score(scaled_data, clusters_original)
print(f'Silhouette Score (Original Data): {silhouette_original}')
print(f'Davies-Bouldin Index (Original Data): {davies_bouldin_original}')


In [None]:
# Apply KMeans on the PCA-transformed data
# n_init is pinned explicitly: scikit-learn's default moved from 10 to 'auto'
# in version 1.4 (with a FutureWarning in 1.2-1.3). Fixing it keeps this run
# reproducible across versions and comparable with any other KMeans fit that
# uses the same settings.
kmeans_pca = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters_pca = kmeans_pca.fit_predict(pca_data)

# Visualize the clustering results from PCA-transformed data
# The two plotted axes are the first two columns of `pca_data`.
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters_pca, cmap='Set1', marker='o')
plt.title('K-Means Clustering (PCA Data)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Evaluate clustering performance in the PCA space the model was fitted on
silhouette_pca = silhouette_score(pca_data, clusters_pca)
davies_bouldin_pca = davies_bouldin_score(pca_data, clusters_pca)
print(f'Silhouette Score (PCA Data): {silhouette_pca}')
print(f'Davies-Bouldin Index (PCA Data): {davies_bouldin_pca}')


In [None]:
# Recap of both clusterings: the two silhouette scores first, then the two
# Davies-Bouldin indices, one line per metric.
comparison_lines = [
    f'Silhouette Score (Original Data): {silhouette_original}',
    f'Silhouette Score (PCA Data): {silhouette_pca}',
    f'Davies-Bouldin Index (Original Data): {davies_bouldin_original}',
    f'Davies-Bouldin Index (PCA Data): {davies_bouldin_pca}',
]
print('\n'.join(comparison_lines))

# Analysis of performance differences
