# PCA 


In [None]:

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score


## Load Wine Dataset

In [None]:

# Fetch the bundled Wine dataset and wrap its feature matrix in a DataFrame
# whose columns carry the feature names supplied with the dataset.
wine = load_wine()
df = pd.DataFrame(
    data=wine.data,
    columns=wine.feature_names,
)

# Preview the first rows (shown as the cell output when run in a notebook).
df.head()


## Task 1: Exploratory Data Analysis (EDA)

In [None]:

# Structural overview of the frame: column names, dtypes and non-null counts
# (printed to stdout by pandas).
df.info()
# Per-feature summary statistics; as the last expression of the cell this is
# what gets rendered as the cell output when run in a notebook.
df.describe()


In [None]:

# One histogram per feature, drawn on a shared 14x10 inch figure so that the
# shape of every distribution can be inspected at a glance.
df.hist(figsize=(14, 10))

# Re-flow the subplot grid so titles and tick labels do not overlap.
plt.tight_layout()
plt.show()


In [None]:

# Correlation heatmap of all feature pairs.
# The colour scale is pinned to the full correlation range [-1, 1]. Without
# explicit limits the diverging 'coolwarm' palette is stretched over the
# observed min/max of the matrix, so its neutral midpoint would not line up
# with zero correlation and equally strong positive/negative correlations
# would be drawn with unequal intensity.
plt.figure(figsize=(12, 10))
sns.heatmap(df.corr(), cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Heatmap - Wine Dataset")
plt.show()


## Task 2: PCA – Dimensionality Reduction

In [None]:

# Bring every feature to zero mean and unit variance so that no single feature
# dominates PCA or the distance computations in KMeans because of its scale.
# The scaler is fitted first and then applied to the same frame.
scaler = StandardScaler().fit(df)
scaled_data = scaler.transform(df)


In [None]:

# Fit PCA keeping every component, to inspect how the variance is distributed.
pca = PCA()
pca_data = pca.fit_transform(scaled_data)

# Cumulative explained-variance plot.
# Entry i of the cumulative sum is the variance captured by the first i + 1
# components, so the curve is plotted against 1..n rather than the default
# 0-based index; otherwise every point reads one component too few on an axis
# labelled "Number of Principal Components".
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance, marker='o')
plt.xticks(component_counts)  # integer ticks only: one per component count
plt.xlabel("Number of Principal Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("Scree Plot")
plt.show()


In [None]:

# PCA retaining 90% of the variance.
# A float n_components in (0, 1) asks scikit-learn to keep as many components
# as are needed for the cumulative explained variance to exceed that fraction.
pca_final = PCA(n_components=0.90)
pca_transformed = pca_final.fit_transform(scaled_data)
# (n_samples, n_components_kept) - shows how many components were retained.
pca_transformed.shape


## Task 3: Clustering on Original Dataset

In [None]:

# KMeans clustering on the full feature set (standardized, no PCA).
# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4 (a single run for k-means++ initialisation) and emits a
# FutureWarning on 1.2/1.3 when it is omitted. Pinning it keeps the result
# reproducible across versions and less sensitive to one unlucky start.
kmeans_original = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters_original = kmeans_original.fit_predict(scaled_data)

# Internal validation metrics: higher silhouette is better, lower
# Davies-Bouldin is better.
print("Silhouette Score:", silhouette_score(scaled_data, clusters_original))
print("Davies-Bouldin Index:", davies_bouldin_score(scaled_data, clusters_original))


In [None]:

# Scatter of the first two features, coloured by the cluster assigned on the
# full feature set. The plotted values are the standardized ones; the axis
# labels are the corresponding column names.
fig, ax = plt.subplots()
ax.scatter(scaled_data[:, 0], scaled_data[:, 1], c=clusters_original)
ax.set_xlabel(df.columns[0])
ax.set_ylabel(df.columns[1])
ax.set_title("Clustering on Original Wine Data")
plt.show()


## Task 4: Clustering on PCA-transformed Dataset

In [None]:

# KMeans clustering on the PCA-reduced representation.
# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4 (a single run for k-means++ initialisation) and emits a
# FutureWarning on 1.2/1.3 when it is omitted. Pinning it keeps the result
# reproducible across versions and less sensitive to one unlucky start.
kmeans_pca = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters_pca = kmeans_pca.fit_predict(pca_transformed)

# Internal validation metrics, computed in the reduced PCA space: higher
# silhouette is better, lower Davies-Bouldin is better.
print("Silhouette Score:", silhouette_score(pca_transformed, clusters_pca))
print("Davies-Bouldin Index:", davies_bouldin_score(pca_transformed, clusters_pca))


In [None]:

# Scatter of the first two principal components, coloured by the cluster
# assigned in the PCA-reduced space.
fig, ax = plt.subplots()
ax.scatter(pca_transformed[:, 0], pca_transformed[:, 1], c=clusters_pca)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("Clustering on PCA-transformed Wine Data")
plt.show()


## Task 5: Comparison and Analysis


### Comparison of Results
- PCA-based clustering shows better separation.
- Noise and redundancy are reduced after PCA.
- Silhouette score improves after PCA.

### Trade-offs
- PCA improves clustering performance.
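- Principal components are linear combinations of the original features, so clusters can no longer be interpreted directly in terms of individual features.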


## Task 6: Conclusion and Insights


### Key Findings
- PCA effectively reduces dimensionality while preserving variance.
- Clustering on PCA data performs better than original data.
- Wine dataset has correlated features, benefiting from PCA.

### Practical Implications
- PCA is useful for high-dimensional datasets.
- Clustering quality (as measured by the silhouette score and Davies-Bouldin index) can improve after dimensionality reduction.

### Recommendation
- Use PCA before clustering when features are correlated.
- Use original data when interpretability is required.
