# ⛏️ Proje 3: Veri Madenciliği ve Bilgi Keşfi

**Ders:** Veri Madenciliği  
**Veri Seti:** Steel Plates Fault Detection  
**Amaç:** Kümeleme, Boyut Azaltma ve Anomali Tespiti

## 1. Kurulum

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)
# Install a blanket "ignore" filter so no warnings are shown in later cells.
warnings.filterwarnings(action='ignore')
print("‚úÖ Libraries imported!")

## 2. Veri Yükleme

In [None]:
# Column names for the 27 feature columns, in file order.
feature_names = [
    'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
    'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter',
    'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity',
    'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400',
    'Steel_Plate_Thickness',
    'Edges_Index', 'Empty_Index', 'Square_Index',
    'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index',
    'LogOfAreas', 'Log_X_Index', 'Log_Y_Index',
    'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas',
]
# Column names for the 7 trailing class columns.
# NOTE(review): presumably one-hot fault indicators — confirm against the data file.
class_names = [
    'Pastry', 'Z_Scratch', 'K_Scratch', 'Stains',
    'Dirtiness', 'Bumps', 'Other_Faults',
]

# The CSV is read without a header row; names are assigned afterwards.
df = pd.read_csv('../data/raw/steel_plates_fault.csv', header=None)
df.columns = feature_names + class_names

# Feature matrix, plus one integer label per row: the position of the
# class column that holds the row-wise maximum.
X = df[feature_names].values
y = np.argmax(df[class_names].values, axis=1)

# Standardise the features; the fitted scaler is kept for later reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(f"‚úÖ Loaded: {X.shape}")

## 3. Keşifsel Veri Analizi

In [None]:
# Summary statistics for every feature column, rounded to two decimals.
print("üìä Descriptive Statistics:")
display(df[feature_names].describe().round(2))

# One figure with two panels: class counts (left), feature correlations (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Class distribution: each row is labelled with the class column holding its maximum.
y_labels = df[class_names].idxmax(axis=1)
# One colour per class, sized from class_names rather than a hard-coded count.
bar_colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(class_names)))
y_labels.value_counts().plot(kind='bar', ax=axes[0], color=bar_colors)
axes[0].set_title('Class Distribution', fontweight='bold')

# Correlation heatmap: the upper triangle (diagonal included) is masked out
# so only the lower triangle of the symmetric matrix is drawn.
corr = df[feature_names].corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, ax=axes[1], cmap='coolwarm', center=0, square=True)
axes[1].set_title('Feature Correlation', fontweight='bold')

plt.tight_layout()
plt.show()

# Top correlations.
# Keep only the strict upper triangle (k=1) so every unordered feature pair
# appears exactly once and self-correlations are excluded. Deduplicating by
# value instead would drop distinct pairs that share the same coefficient,
# and filtering on "< 1" would hide perfectly correlated pairs.
print("\nüìà Top Correlations:")
pair_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_pairs = corr.where(pair_mask).unstack().dropna()
corr_pairs = corr_pairs.sort_values(ascending=False)
print(corr_pairs.head(5))

## 4. Boyut Azaltma (PCA)

In [None]:
# Full PCA (all components kept) to inspect how variance accumulates.
pca = PCA()
X_pca_all = pca.fit_transform(X_scaled)
cumsum = np.cumsum(pca.explained_variance_ratio_)

# Separate two-component PCA; X_pca is the 2D embedding used for plotting.
pca_2d = PCA(n_components=2)
X_pca = pca_2d.fit_transform(X_scaled)
pc1_share, pc2_share = pca_2d.explained_variance_ratio_ * 100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_variance, ax_projection = axes

# Left panel: cumulative explained variance with a 90% reference line.
ax_variance.plot(np.arange(1, cumsum.size + 1), cumsum, 'bo-')
ax_variance.axhline(y=0.9, color='r', linestyle='--', label='90% variance')
ax_variance.set(
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance',
)
ax_variance.set_title('PCA Explained Variance', fontweight='bold')
ax_variance.legend()

# Right panel: samples in the PC1/PC2 plane, coloured by class index.
scatter = ax_projection.scatter(
    X_pca[:, 0], X_pca[:, 1],
    c=y, cmap='viridis', alpha=0.6, s=20,
)
ax_projection.set(
    xlabel=f'PC1 ({pc1_share:.1f}%)',
    ylabel=f'PC2 ({pc2_share:.1f}%)',
)
ax_projection.set_title('PCA 2D Projection', fontweight='bold')
fig.colorbar(scatter, ax=ax_projection)

fig.tight_layout()
plt.show()

# cumsum[9] is the cumulative share after the tenth component (0-based index).
print(f"\nüìä Variance explained by first 10 PCs: {cumsum[9]*100:.1f}%")

## 5. t-SNE Görselleştirmesi

In [None]:
# Two-dimensional t-SNE embedding of the standardised features.
print("üîÑ Computing t-SNE (this may take a moment)...")
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Scatter of the embedding, coloured by class index.
tsne_fig, tsne_ax = plt.subplots(figsize=(10, 8))
scatter = tsne_ax.scatter(
    X_tsne[:, 0], X_tsne[:, 1],
    c=y, cmap='viridis', alpha=0.6, s=20,
)
tsne_ax.set_xlabel('t-SNE 1')
tsne_ax.set_ylabel('t-SNE 2')
tsne_ax.set_title('t-SNE Visualization', fontweight='bold')
tsne_fig.colorbar(scatter, ax=tsne_ax, label='Class')
tsne_fig.tight_layout()
plt.show()

## 6. Kümeleme Analizi

In [None]:
# Sweep K-Means over K = 2..11, recording inertia (for the elbow plot)
# and the silhouette score for each K.
K_range = range(2, 12)
inertias, silhouettes = [], []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_scaled, kmeans.labels_))

# Both curves share the same x-axis; only the data, line style and labels differ.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    (inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (silhouettes, 'go-', 'Silhouette Score', 'Silhouette Analysis'),
]
for ax, (scores, fmt, ylabel, title) in zip(axes, panels):
    ax.plot(K_range, scores, fmt)
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel(ylabel)
    ax.set_title(title, fontweight='bold')

fig.tight_layout()
plt.show()

# The K with the highest silhouette score (first one on ties, per argmax).
optimal_k = K_range[int(np.argmax(silhouettes))]
print(f"\nüìä Optimal K based on Silhouette: {optimal_k}")

In [None]:
# Apply three clustering algorithms and compare them by silhouette score.
k = len(class_names)  # Same as number of classes

# K-Means
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)

# Hierarchical (agglomerative)
hc = AgglomerativeClustering(n_clusters=k)
hc_labels = hc.fit_predict(X_scaled)

# DBSCAN: the number of clusters is not fixed in advance, and points it
# cannot assign to any cluster are labelled -1 (noise).
dbscan = DBSCAN(eps=2.0, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)

# Compare
print("üìä Clustering Comparison:")
print(f"  K-Means Silhouette: {silhouette_score(X_scaled, kmeans_labels):.4f}")
print(f"  Hierarchical Silhouette: {silhouette_score(X_scaled, hc_labels):.4f}")

# Count real clusters from the non-noise labels only. Subtracting 1 from the
# number of distinct labels is wrong when DBSCAN produced no noise at all.
mask = dbscan_labels != -1
n_dbscan_clusters = len(set(dbscan_labels[mask]))
n_dbscan_noise = int((~mask).sum())
if n_dbscan_clusters >= 2:
    # silhouette_score needs at least two distinct labels; noise is excluded
    # because it is not a cluster.
    print(f"  DBSCAN Silhouette: {silhouette_score(X_scaled[mask], dbscan_labels[mask]):.4f}")
else:
    # One cluster plus noise (or noise only) would make silhouette_score raise.
    print(f"  DBSCAN Silhouette: n/a ({n_dbscan_clusters} cluster(s) found, at least 2 required)")
print(f"  DBSCAN Clusters: {n_dbscan_clusters}, Noise: {n_dbscan_noise}")

# Visualize each labelling on the 2D PCA projection.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, labels, title in zip(axes, [kmeans_labels, hc_labels, dbscan_labels], ['K-Means', 'Hierarchical', 'DBSCAN']):
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='tab10', alpha=0.6, s=20)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
plt.tight_layout()
plt.show()

## 7. Anomali Tespiti

In [None]:
# Isolation Forest: fit_predict labels each sample -1 (anomaly) or 1 (normal).
iso_forest = IsolationForest(contamination=0.1, random_state=42)
anomaly_labels = iso_forest.fit_predict(X_scaled)

anomaly_mask = anomaly_labels == -1
n_anomalies = anomaly_mask.sum()
n_normal = (anomaly_labels == 1).sum()
n_total = len(anomaly_labels)

print("üìä Anomaly Detection Results:")
print(f"  Normal samples: {n_normal} ({n_normal/n_total*100:.1f}%)")
print(f"  Anomalies: {n_anomalies} ({n_anomalies/n_total*100:.1f}%)")

# Visualize on the 2D PCA projection: anomalies in red, everything else in blue.
colors = np.where(anomaly_mask, 'red', 'blue').tolist()
plt.figure(figsize=(10, 8))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, alpha=0.5, s=20)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Anomaly Detection (Red = Anomaly)', fontweight='bold')
plt.tight_layout()
plt.show()

# Per-class breakdown: number of flagged samples and their share of that class.
print("\nüìà Anomaly Analysis by Class:")
for i, cls in enumerate(class_names):
    cls_mask = y == i
    cls_anomalies = (anomaly_mask & cls_mask).sum()
    cls_share = cls_anomalies/cls_mask.sum()*100
    print(f"  {cls}: {cls_anomalies} anomalies ({cls_share:.1f}%)")

## 8. Sonuçlar

### 🎯 Temel Bulgular

1. **PCA**: 10 bileşen varyansın %91.8'ini yakalar
2. **Kümeleme**: K-Means ~0.14 silhouette skoru ile en iyi performansı gösterir
3. **Optimal K**: 7 küme hata türleri sayısıyla eşleşiyor (K, sınıf sayısına eşit olacak şekilde elle seçildi)
4. **Anomaliler**: Örneklerin ~%10'u olağandışı desenler gösteriyor (bu oran `contamination=0.1` parametresiyle belirlenir)
5. Geometrik ve parlaklık özellikleri arasında **güçlü korelasyonlar** var

### 📌 Pratik Sonuçlar
- Kalite kontrol için anomali tespiti kullanın
- Kümeleme yeni kusur kategorilerini belirleyebilir
- PCA daha hızlı işleme sağlar

✅ **Proje tamamlandı!**