# Supervised Learning Algorithms

This notebook demonstrates:
- Regression algorithms (Linear, Polynomial)
- Classification algorithms (Decision Trees, Random Forest, KNN)
- Differential Privacy implementations

## 1. Setup and Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.datasets import load_iris, load_breast_cancer, make_classification

# Regression
from sklearn.linear_model import LinearRegression

# Classification
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Differential Privacy
try:
    from diffprivlib.models import LinearRegression as DPLinearRegression
    from diffprivlib.models import DecisionTreeClassifier as DPDecisionTreeClassifier
except ImportError:
    !pip install diffprivlib
    from diffprivlib.models import LinearRegression as DPLinearRegression
    from diffprivlib.models import DecisionTreeClassifier as DPDecisionTreeClassifier

plt.style.use('seaborn-v0_8')
%matplotlib inline

## 2. Regression Examples

### 2.1 Simple Linear Regression (House Price Prediction)

In [None]:
# Toy housing data: ten homes from 750 to 1200 sq ft in 50 sq ft steps,
# stored as a single-feature column, with their sale prices as the target.
X = np.arange(750, 1201, 50).reshape(-1, 1)
y = np.array([150000, 165000, 175000, 185000, 195000, 210000, 220000, 230000, 245000, 250000])

# Ordinary least squares fit (fit() returns the estimator itself).
model = LinearRegression().fit(X, y)

# In-sample predictions, used both for the fitted line and for the MSE below.
y_pred = model.predict(X)

# Observed prices as points, fitted line on top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, color='blue', label='Actual Prices')
ax.plot(X, y_pred, color='red', linewidth=2, label='Predicted Prices')
ax.set_title('House Price Prediction', fontsize=16)
ax.set_xlabel('Size (sq ft)', fontsize=14)
ax.set_ylabel('Price ($)', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True)
plt.show()

# Report the learned line and its training error.
print(f"Model equation: Price = {model.intercept_:.2f} + {model.coef_[0]:.2f}*Size")
print(f"MSE: {mean_squared_error(y, y_pred):.2f}")

### 2.2 Polynomial Regression

In [None]:
# Synthetic cubic signal with Gaussian noise; the seed keeps the draw repeatable.
np.random.seed(42)
X = np.linspace(-3, 3, 100).reshape(-1, 1)
noise = np.random.normal(0, 3, 100).reshape(-1, 1)
y = 0.5*X**3 + X**2 - 2*X + noise

# Expand the single feature into polynomial terms up to degree 3.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)

# A linear model on the expanded features is a cubic fit in the original feature.
model = LinearRegression().fit(X_poly, y)

# In-sample predictions for the fitted curve.
y_pred = model.predict(X_poly)

# Noisy samples as points, fitted curve on top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, y, color='blue', label='Data points')
ax.plot(X, y_pred, color='red', linewidth=2, label='Polynomial fit (degree=3)')
ax.set_title('Polynomial Regression Example', fontsize=16)
ax.set_xlabel('Feature', fontsize=14)
ax.set_ylabel('Target', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True)
plt.show()

## 3. Classification Examples

### 3.1 Decision Tree (Iris Dataset)

In [None]:
# Iris dataset: feature matrix and integer class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Depth-limited tree so the plotted diagram stays readable.
tree = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)

# Draw the fitted tree with human-readable feature and class names.
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    tree,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    ax=ax,
)
ax.set_title('Decision Tree for Iris Classification', fontsize=16)
plt.show()

# Held-out accuracy followed by the per-class breakdown.
print(f"Test accuracy: {tree.score(X_test, y_test):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, tree.predict(X_test), target_names=iris.target_names))

### 3.2 Random Forest (Breast Cancer Detection)

In [None]:
# Breast cancer dataset: feature matrix and binary target.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the samples for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ensemble of 100 trees, seeded for reproducibility.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Per-class metrics on the held-out set.
print("Random Forest Performance:")
print(classification_report(y_test, rf.predict(X_test), target_names=data.target_names))

# Horizontal bar chart of importances, sorted ascending so the most
# important feature ends up at the top of the chart.
importances = rf.feature_importances_
order = importances.argsort()
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(np.array(data.feature_names)[order], importances[order])
ax.set_title('Feature Importance in Breast Cancer Detection', fontsize=16)
ax.set_xlabel('Importance Score', fontsize=14)
plt.show()

### 3.3 K-Nearest Neighbors (Customer Segmentation)

In [None]:
# Generate synthetic data: 200 samples, 2 features, 3 classes.
# make_classification defaults to n_informative=2 and n_redundant=2, which
# together exceed n_features=2 and make the call raise ValueError. Both
# features are therefore declared informative, with no redundant or repeated ones.
X, y = make_classification(n_samples=200, n_features=2, n_informative=2,
                           n_redundant=0, n_repeated=0, n_classes=3,
                           n_clusters_per_class=1, random_state=42)

# Scale data: KNN is distance-based, so features are standardized first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create and fit model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_scaled, y)

# Create a 100x100 meshgrid covering the scaled data with a margin of 1 on
# every side; it is used to colour the decision regions.
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))

# Predict the class of every grid point, then restore the grid shape for contourf.
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot the decision regions with the training points on top.
plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, s=50, edgecolor='k', cmap='viridis')
plt.title('KNN Decision Boundaries (k=5)', fontsize=16)
plt.xlabel('Feature 1 (scaled)', fontsize=14)
plt.ylabel('Feature 2 (scaled)', fontsize=14)
plt.colorbar(label='Class')
plt.show()

## 4. Differential Privacy in Supervised Learning

### 4.1 Differentially Private Linear Regression

In [None]:
# Same house-price data as the simple linear regression example, re-declared
# here so this cell does not depend on earlier cells having been run.
X = np.array([750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200]).reshape(-1, 1)
y = np.array([150000, 165000, 175000, 185000, 195000, 210000, 220000, 230000, 245000, 250000])

# Value ranges assumed from domain knowledge, NOT computed from the data.
# If bounds are omitted, diffprivlib derives them from the private data and
# emits a PrivacyLeakWarning, which weakens the privacy guarantee. Values
# outside the bounds are clipped. NOTE(review): confirm these ranges.
SIZE_BOUNDS = (500, 1500)          # sq ft
PRICE_BOUNDS = (100000, 300000)    # $

# Create DP model (epsilon is the privacy budget: smaller = more private, noisier).
# The earlier `data_norm=10.0` argument was dropped: it does not describe data
# in the 750-1200 range. random_state makes the injected noise repeatable
# across runs (assumes a diffprivlib release that accepts random_state).
dp_model = DPLinearRegression(epsilon=1.0, bounds_X=SIZE_BOUNDS,
                              bounds_y=PRICE_BOUNDS, random_state=42)
dp_model.fit(X, y)

# Regular (non-private) regression for comparison
reg_model = LinearRegression()
reg_model.fit(X, y)

# In-sample predictions from both models
y_pred_dp = dp_model.predict(X)
y_pred_reg = reg_model.predict(X)

# Plot comparison: data points, non-private fit, and private fit
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', alpha=0.7, label='Actual Data')
plt.plot(X, y_pred_reg, color='red', linewidth=2, label='Regular Regression')
plt.plot(X, y_pred_dp, color='green', linewidth=2, linestyle='--', label='DP Regression (ε=1.0)')
plt.title('Comparison: Regular vs Differentially Private Regression', fontsize=16)
plt.xlabel('Size (sq ft)', fontsize=14)
plt.ylabel('Price ($)', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.show()

# With only ten samples the DP noise can dominate, so expect a much larger MSE.
print("Regular Regression MSE:", mean_squared_error(y, y_pred_reg))
print("DP Regression MSE:", mean_squared_error(y, y_pred_dp))

### 4.2 Privacy-Preserving Decision Trees

In [None]:
# Load iris data again so this cell does not depend on earlier cells.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-feature (lower, upper) bounds in cm, in the dataset's feature order:
# sepal length, sepal width, petal length, petal width. They are fixed
# up front rather than computed from the training data; if bounds or classes
# are omitted, diffprivlib infers them from the private data and emits a
# PrivacyLeakWarning. NOTE(review): confirm these ranges are acceptable.
FEATURE_BOUNDS = (np.array([4.0, 2.0, 1.0, 0.1]),
                  np.array([8.0, 4.5, 7.0, 2.5]))
# Class labels are taken from the dataset metadata, not from y_train.
CLASS_LABELS = np.arange(len(iris.target_names))

# Create DP decision tree (epsilon is the privacy budget)
dp_tree = DPDecisionTreeClassifier(epsilon=0.5, max_depth=3, bounds=FEATURE_BOUNDS,
                                   classes=CLASS_LABELS, random_state=42)
dp_tree.fit(X_train, y_train)

# Regular decision tree with the same depth and seed for comparison
reg_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
reg_tree.fit(X_train, y_train)

# Visualize DP tree
plt.figure(figsize=(20, 10))
plot_tree(dp_tree, feature_names=iris.feature_names,
          class_names=iris.target_names, filled=True, rounded=True)
plt.title('Differentially Private Decision Tree (ε=0.5)', fontsize=16)
plt.show()

# Compare held-out accuracy of the two trees
print("Regular Decision Tree Accuracy:", reg_tree.score(X_test, y_test))
print("DP Decision Tree Accuracy:", dp_tree.score(X_test, y_test))

## 5. Summary

This notebook demonstrated:
- Regression techniques (Linear and Polynomial)
- Classification algorithms (Decision Trees, Random Forest, KNN)
- Differential Privacy implementations for both regression and classification

Key takeaways:
- Different algorithms work better for different types of problems
- Differential Privacy adds calibrated noise to protect individual records while trying to preserve as much model utility as possible
- There's always a trade-off between model accuracy and privacy protection