# Day 8: Unsupervised Learning

Today we explore **Unsupervised Learning** - finding patterns in data WITHOUT labels!

### Topics Covered:
1. Supervised vs Unsupervised Learning
2. K-Means Clustering
3. The Elbow Method
4. Principal Component Analysis (PCA)
5. **Mini Project: Customer Segmentation**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so the notebook also runs on older installs.
plot_style = 'seaborn-v0_8-whitegrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-whitegrid'
plt.style.use(plot_style)

# Fixed seed so the randomly generated data in this notebook is reproducible
np.random.seed(42)
print("Libraries loaded!")

## 1. Supervised vs Unsupervised

| Aspect | Supervised | Unsupervised |
|--------|------------|---------------|
| **Labels** | Has labels (y) | No labels |
| **Goal** | Predict outcomes | Find patterns |
| **Examples** | Classification, Regression | Clustering, Dimensionality Reduction |

In [None]:
# Side-by-side comparison: the same points drawn with and without labels
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
labeled_ax, unlabeled_ax = axes

# Two Gaussian blobs. The draw order (x1, y1, x2, y2) is kept so the seed
# produces the same points every time.
np.random.seed(42)
x1, y1 = (np.random.normal(2, 0.5, 50) for _ in range(2))
x2, y2 = (np.random.normal(4, 0.5, 50) for _ in range(2))

# Supervised view - every point carries its class label
labeled_groups = [
    (x1, y1, '#3498db', 'Class A'),
    (x2, y2, '#e74c3c', 'Class B'),
]
for xs, ys, colour, class_name in labeled_groups:
    labeled_ax.scatter(xs, ys, c=colour, label=class_name, s=60)
labeled_ax.set_title('Supervised: Data HAS Labels', fontsize=12, fontweight='bold')
labeled_ax.legend()

# Unsupervised view - identical points, but the class information is dropped
all_x = np.concatenate([x1, x2])
all_y = np.concatenate([y1, y2])
unlabeled_ax.scatter(all_x, all_y, c='gray', s=60)
unlabeled_ax.set_title('Unsupervised: Find the Groups!', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 2. K-Means Clustering

**Goal:** Partition data into K clusters where each point belongs to the nearest centroid.

**Algorithm:**
1. Choose K (number of clusters)
2. Initialize K random centroids
3. Assign each point to nearest centroid
4. Update centroids to cluster means
5. Repeat until convergence

In [None]:
# Create sample clustered data
from sklearn.datasets import make_blobs

# 300 two-feature points scattered around 4 centres. y_true holds the blob each
# point was generated from; it is kept for reference only.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.8, random_state=42)

# Plot everything in a single colour so no grouping is given away
raw_fig, raw_ax = plt.subplots(figsize=(8, 6))
raw_ax.scatter(X[:, 0], X[:, 1], c='gray', s=50, alpha=0.6)
raw_ax.set_title('Raw Data - Can You See the Clusters?', fontsize=12, fontweight='bold')
raw_ax.set_xlabel('Feature 1')
raw_ax.set_ylabel('Feature 2')
plt.show()

In [None]:
# Apply K-Means
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(X)
clusters = kmeans.labels_  # cluster index assigned to each row of X

# Colour each point by its cluster and overlay the fitted centroids
km_fig, km_ax = plt.subplots(figsize=(10, 6))
scatter = km_ax.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', s=50, alpha=0.7)
centroid_x, centroid_y = kmeans.cluster_centers_.T
km_ax.scatter(centroid_x, centroid_y, c='red', marker='X', s=200,
              edgecolors='black', linewidth=2, label='Centroids')
km_ax.set_title('K-Means Clustering (K=4)', fontsize=12, fontweight='bold')
km_ax.set_xlabel('Feature 1')
km_ax.set_ylabel('Feature 2')
km_ax.legend()
km_fig.colorbar(scatter, ax=km_ax, label='Cluster')
plt.show()

# Numeric summary of the fitted model
print(f"Cluster Centers:\n{kmeans.cluster_centers_}")
print(f"\nInertia (within-cluster sum of squares): {kmeans.inertia_:.2f}")

In [None]:
# Visualize K-Means steps
fig, axes = plt.subplots(2, 3, figsize=(14, 9))

# The first panel shows the unclustered data; the other five show capped runs
raw_panel, *iteration_panels = axes.ravel()

raw_panel.scatter(X[:, 0], X[:, 1], c='gray', s=30)
raw_panel.set_title('Step 1: Raw Data')

for i, ax in enumerate(iteration_panels, start=1):
    # Refit from the same seed each time, allowing at most i iterations,
    # so each panel shows the state reached after i update steps.
    km = KMeans(n_clusters=4, random_state=42, n_init=1, max_iter=i)
    km.fit(X)
    centroid_x, centroid_y = km.cluster_centers_.T
    ax.scatter(X[:, 0], X[:, 1], c=km.labels_, cmap='viridis', s=30)
    ax.scatter(centroid_x, centroid_y, c='red', marker='X', s=150, edgecolors='black')
    ax.set_title(f'Iteration {i}')

plt.suptitle('K-Means Algorithm Steps', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 3. The Elbow Method - Finding Optimal K

**Problem:** How do we choose the right number of clusters?

**Solution:** Plot inertia against K and look for the "elbow" - the point after which adding more clusters no longer reduces inertia by much.

In [None]:
# Elbow Method
# Fit one K-Means model per candidate K and record two quality measures:
#   inertia    - within-cluster sum of squares (lower is tighter)
#   silhouette - higher is better
inertias = []
silhouettes = []
K_range = range(2, 11)

for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(X, km.labels_))

# Take the marked K from the scores themselves instead of hard-coding it,
# so the plots and the printed result can never disagree.
best_k = K_range[int(np.argmax(silhouettes))]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (axis, values, line format, title, y-label) for the two diagnostic plots
panels = [
    (axes[0], inertias, 'bo-', 'Elbow Method', 'Inertia'),
    (axes[1], silhouettes, 'go-', 'Silhouette Score (Higher = Better)', 'Silhouette Score'),
]
for ax, values, line_format, title, y_label in panels:
    ax.plot(K_range, values, line_format, linewidth=2, markersize=8)
    ax.axvline(x=best_k, color='r', linestyle='--', label=f'Optimal K={best_k}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

print(f"Best K by Silhouette: {best_k}")

## 4. PCA - Dimensionality Reduction

**Goal:** Reduce the number of features while keeping most of the information (variance).

**Use Cases:**
- Visualization (reduce to 2D/3D)
- Speed up ML algorithms
- Remove noise and redundancy

In [None]:
# Create high-dimensional data
np.random.seed(42)
n_samples = 200

# Two independent latent signals drive all six features
X_pca = np.random.randn(n_samples, 2)
latent_a = X_pca[:, 0]
latent_b = X_pca[:, 1]

# Noise for features 3-6, drawn in column order so the seeded stream is
# consumed in a fixed sequence
noise_3 = np.random.randn(n_samples) * 0.1
noise_4 = np.random.randn(n_samples) * 0.1
noise_5 = np.random.randn(n_samples) * 0.2
noise_6 = np.random.randn(n_samples) * 0.1

# Features 3-6 are noisy, rescaled copies of the two latent signals
X_high = np.column_stack([
    latent_a,
    latent_b,
    latent_a + noise_3,
    latent_b + noise_4,
    latent_a * 2 + noise_5,
    latent_b * 0.5 + noise_6,
])

print(f"Original shape: {X_high.shape} (6 features)")

# Apply PCA
# Standardise first so every feature has the same weight going into PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_high)

# No n_components given: keep every component so the full variance
# breakdown can be inspected
pca = PCA()
X_pca_result = pca.fit_transform(X_scaled)

# Explained variance
print("\nExplained Variance Ratio:")
for component_number, var in enumerate(pca.explained_variance_ratio_, start=1):
    print(f"  PC{component_number}: {var*100:.1f}%")
print(f"\nTotal with 2 components: {sum(pca.explained_variance_ratio_[:2])*100:.1f}%")

In [None]:
# Visualize PCA
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
variance_ax, projection_ax = axes

# Cumulative Variance
ratios = pca.explained_variance_ratio_
cumulative = np.cumsum(ratios)
# Number the components from the fitted model rather than assuming six,
# so the plot still works if the feature count changes
component_ids = np.arange(1, len(ratios) + 1)

variance_ax.bar(component_ids, ratios, alpha=0.7, label='Individual')
variance_ax.plot(component_ids, cumulative, 'ro-', label='Cumulative')
variance_ax.axhline(y=0.95, color='g', linestyle='--', label='95% threshold')
variance_ax.set_title('Explained Variance by Component', fontsize=12, fontweight='bold')
variance_ax.set_xlabel('Principal Component')
variance_ax.set_ylabel('Explained Variance Ratio')
variance_ax.legend()

# 2D visualization: project the data onto the first two principal components
projection_ax.scatter(X_pca_result[:, 0], X_pca_result[:, 1], c='#3498db', alpha=0.6)
projection_ax.set_title('Data in 2D (PCA)', fontsize=12, fontweight='bold')
projection_ax.set_xlabel('PC1')
projection_ax.set_ylabel('PC2')

plt.tight_layout()
plt.show()

---
## Mini Project: Customer Segmentation

**Goal:** Segment mall customers based on their annual income and spending score.

In [None]:
# Create customer dataset
np.random.seed(42)

# 5 customer segments. Each feature is a (mean, std) pair for a normal
# distribution; 'n' is the number of customers drawn for that segment.
segments = [
    {'age': (20, 5), 'income': (20, 5), 'score': (80, 10), 'n': 40},   # Young high spenders
    {'age': (25, 5), 'income': (80, 15), 'score': (20, 10), 'n': 40},  # Young high income, low spend
    {'age': (45, 10), 'income': (50, 15), 'score': (50, 15), 'n': 50}, # Middle aged moderate
    {'age': (55, 10), 'income': (70, 20), 'score': (80, 10), 'n': 35}, # Older high spenders
    {'age': (40, 10), 'income': (30, 10), 'score': (30, 10), 'n': 35}, # Moderate income low spend
]

# Total customer count, derived from the segment sizes so the two cannot disagree
n = sum(seg['n'] for seg in segments)

# Output column -> key of its (mean, std) pair in each segment
feature_keys = {'Age': 'age', 'Annual_Income': 'income', 'Spending_Score': 'score'}

# Draw each segment as one (customers x features) block. The block is filled
# row by row, i.e. age, income, score for each customer in turn.
segment_blocks = []
for seg in segments:
    means = [seg[key][0] for key in feature_keys.values()]
    stds = [seg[key][1] for key in feature_keys.values()]
    segment_blocks.append(
        np.random.normal(means, stds, size=(seg['n'], len(feature_keys)))
    )

customers = pd.DataFrame(np.vstack(segment_blocks), columns=list(feature_keys))

# Keep every feature inside a plausible range, then truncate to whole numbers
valid_ranges = {'Age': (18, 70), 'Annual_Income': (10, 150), 'Spending_Score': (1, 100)}
for column, (lowest, highest) in valid_ranges.items():
    customers[column] = customers[column].clip(lowest, highest).astype(int)

# 1-based sequential ID as the first column
customers.insert(0, 'CustomerID', range(1, len(customers) + 1))

print(" MALL CUSTOMER DATASET")
print(f"Total Customers: {len(customers)}")
print(customers.head(10))
print("\nStatistics:")
print(customers.describe().round(1))

In [None]:
# EDA
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

# (column, bar colour, panel title, x-axis label) for each histogram, left to right
histograms = [
    ('Age', '#3498db', 'Age Distribution', 'Age'),
    ('Annual_Income', '#2ecc71', 'Income Distribution (K$)', 'Annual Income'),
    ('Spending_Score', '#e74c3c', 'Spending Score Distribution', 'Spending Score (1-100)'),
]

# One 20-bin histogram per feature
for panel, (column, colour, title, x_label) in zip(axes, histograms):
    panel.hist(customers[column], bins=20, color=colour, edgecolor='white')
    panel.set_title(title)
    panel.set_xlabel(x_label)

plt.tight_layout()
plt.show()

In [None]:
# Find optimal K
# Cluster on income and spending only
X_cluster = customers[['Annual_Income', 'Spending_Score']]

inertias = []
sil_scores = []
K_range = range(2, 11)

# Fit one model per candidate K and record inertia and silhouette score
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_cluster)
    inertias.append(km.inertia_)
    sil_scores.append(silhouette_score(X_cluster, km.labels_))

# K used for the rest of the project: one cluster per generated customer segment
chosen_k = 5
# K that the silhouette score actually prefers. It is reported separately so
# the output states what was measured rather than an assumed result.
silhouette_k = K_range[int(np.argmax(sil_scores))]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (axis, values, line format, title, y-label) for the two diagnostic plots
panels = [
    (axes[0], inertias, 'bo-', 'Elbow Method', 'Inertia'),
    (axes[1], sil_scores, 'go-', 'Silhouette Score', 'Score'),
]
for ax, values, line_format, title, y_label in panels:
    ax.plot(K_range, values, line_format, linewidth=2)
    ax.axvline(x=chosen_k, color='r', linestyle='--', label=f'Chosen K={chosen_k}')
    ax.set_title(title)
    ax.set_xlabel('K')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

print(f"Best K by silhouette: {silhouette_k}")
print(f"Chosen K: {chosen_k} (one cluster per customer segment in the data)")

In [None]:
# Apply K-Means with K=5
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(X_cluster)
customers['Cluster'] = kmeans.labels_  # cluster index for every customer

# Visualize clusters
# Centroid columns follow X_cluster's order: income first, then spending
centroid_income, centroid_spending = kmeans.cluster_centers_.T

seg_fig, seg_ax = plt.subplots(figsize=(10, 7))
scatter = seg_ax.scatter(customers['Annual_Income'], customers['Spending_Score'],
                         c=customers['Cluster'], cmap='viridis', s=80, alpha=0.7)
seg_ax.scatter(centroid_income, centroid_spending,
               c='red', marker='X', s=300, edgecolors='black', linewidth=2)
seg_ax.set_title('Customer Segments', fontsize=14, fontweight='bold')
seg_ax.set_xlabel('Annual Income (K$)')
seg_ax.set_ylabel('Spending Score (1-100)')
seg_fig.colorbar(scatter, ax=seg_ax, label='Cluster')
plt.show()

In [None]:
# Analyze Clusters
from itertools import permutations

# Per-cluster averages and sizes
cluster_analysis = customers.groupby('Cluster').agg(
    Avg_Age=('Age', 'mean'),
    Avg_Income=('Annual_Income', 'mean'),
    Avg_Spending=('Spending_Score', 'mean'),
    Count=('CustomerID', 'count'),
).round(1)

# Name the segments
# K-Means cluster ids carry no meaning, so names are assigned from each
# cluster's averages rather than from its id. Each name is given a target
# position on the (income, spending) plane, as a fraction of the range spanned
# by the cluster averages (0 = lowest cluster, 1 = highest cluster).
archetypes = {
    'Moderate All': (0.5, 0.5),
    'High Income Low Spend': (1.0, 0.0),
    # NOTE(review): assumed to be the low-income / high-spend group - confirm
    'Young Budget Shoppers': (0.0, 1.0),
    'High Income High Spend': (1.0, 1.0),
    'Low Income Low Spend': (0.0, 0.0),
}
if len(cluster_analysis) > len(archetypes):
    raise ValueError(
        f"{len(cluster_analysis)} clusters found but only "
        f"{len(archetypes)} segment names are defined"
    )

# Rescale the cluster averages to the 0-1 range used by the archetypes
profile = cluster_analysis[['Avg_Income', 'Avg_Spending']]
spread = (profile.max() - profile.min()).replace(0, 1)  # avoid 0/0 if all clusters tie
positions = ((profile - profile.min()) / spread).to_numpy()


def _naming_mismatch(names):
    """Return the total squared distance between clusters and a candidate naming."""
    targets = np.array([archetypes[name] for name in names])
    return float(((positions - targets) ** 2).sum())


# Try every one-to-one assignment of names to clusters (at most 5! = 120)
# and keep the closest overall fit, so no name is used twice.
best_names = min(permutations(archetypes, len(positions)), key=_naming_mismatch)
segment_names = {
    int(cluster): name for cluster, name in zip(cluster_analysis.index, best_names)
}

cluster_analysis['Segment'] = cluster_analysis.index.map(segment_names)

print(" CUSTOMER SEGMENTS")
print("="*60)
print(cluster_analysis)

In [None]:
# Detailed segment visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
profile_ax, share_ax = axes

# Segment profiles: average income and spending per cluster, side by side
cluster_analysis[['Avg_Income', 'Avg_Spending']].plot(
    kind='bar', ax=profile_ax, color=['#3498db', '#e74c3c'], rot=0)
profile_ax.set_title('Segment Profiles', fontweight='bold')
profile_ax.set_xlabel('Cluster')
profile_ax.legend(['Avg Income', 'Avg Spending'])

# Segment sizes
# Take the slice count and labels from cluster_analysis itself, so each label
# stays aligned with its count whatever the number or order of clusters.
n_segments = len(cluster_analysis)
colors = plt.cm.viridis(np.linspace(0.2, 0.8, n_segments))
share_ax.pie(cluster_analysis['Count'],
             labels=[segment_names[i] for i in cluster_analysis.index],
             autopct='%1.1f%%', colors=colors, explode=[0.02] * n_segments)
share_ax.set_title('Segment Distribution', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Business Recommendations
rule = "=" * 60
print("\n" + rule)
print(" BUSINESS INSIGHTS & RECOMMENDATIONS")
print(rule)

# Suggested marketing strategy for each named segment
recommendations = {
    'High Income High Spend': 'VIP treatment, exclusive products, loyalty rewards',
    'High Income Low Spend': 'Target with premium campaigns, needs convincing',
    'Young Budget Shoppers': 'Discounts, trendy items, social media marketing',
    'Moderate All': 'Standard promotions, balanced offerings',
    'Low Income Low Spend': 'Budget products, clearance sales'
}

# One blank-line-separated entry per segment, in the order listed above
report = "\n".join(
    f"\n {segment}:\n   Strategy: {rec}"
    for segment, rec in recommendations.items()
)
print(report)

In [None]:
# Final Summary
rule = "=" * 60

# Recap of the day's topics, printed between two rules
takeaways = """
 KEY TAKEAWAYS:

 1. K-MEANS CLUSTERING
    - Partitions data into K groups
    - Uses centroids and distances
    - Need to choose K beforehand

 2. ELBOW METHOD
    - Plot inertia vs K
    - Look for the "elbow" bend
    - Silhouette score helps confirm

 3. PCA
    - Reduces dimensions
    - Keeps most variance
    - Great for visualization

 4. CUSTOMER SEGMENTATION
    - Real business application
    - Enables targeted marketing
    - Data-driven decisions
"""

summary_lines = (
    "\n" + rule,
    " DAY 8 COMPLETE!",
    rule,
    takeaways,
    rule,
    " Next: Day 9 - Model Evaluation & Optimization!",
)
for line in summary_lines:
    print(line)

---
## Practice Exercises

1. Try clustering with 3 features (include Age)
2. Apply PCA before clustering
3. Experiment with different K values
4. Try hierarchical clustering (`scipy.cluster.hierarchy`)

---
**Next Up:** Day 9 - Model Evaluation & Hyperparameter Tuning!