# MLOps CI/CD Pipeline - Interactive Demo

This notebook demonstrates the ML pipeline in action, showing how to:
- Train a model
- Evaluate performance
- Make predictions
- Visualize results

## Setup

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Plot defaults for the notebook: seaborn's white-grid theme and a 10x6 inch
# canvas for any figure that does not request its own size.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

## 1. Load and Explore Data

In [None]:
# Load the scikit-learn wine dataset and wrap it in pandas objects so the
# feature names travel with the data.
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
# The target holds class labels (described by wine.target_names), not a
# quality score, so the Series is named for what it actually contains.
y = pd.Series(wine.target, name='wine_class')

print(f"Dataset shape: {X.shape}")
print(f"Number of classes: {len(wine.target_names)}")
print(f"\nClass names: {wine.target_names}")

# Last expression of the cell: rendered as a rich table of the first rows.
X.head()

In [None]:
# Class distribution.
# value_counts() orders its result by frequency, which would place the bars
# out of class order; sort by class label instead and swap the integer labels
# for the readable class names from wine.target_names.
class_counts = y.value_counts().sort_index()
class_counts.index = [wine.target_names[label] for label in class_counts.index]

fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', ax=ax, rot=0)  # rot=0 keeps tick labels horizontal
ax.set_title('Class Distribution')
ax.set_xlabel('Wine Class')
ax.set_ylabel('Count')
plt.show()

## 2. Data Preprocessing

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

## 3. Train Model

In [None]:
# Train a Random Forest classifier on the standardized training features.
model = RandomForestClassifier(
    n_estimators=100,   # number of trees in the forest
    max_depth=10,       # upper bound on the depth of each tree
    random_state=42,    # fixed seed so re-running the cell yields the same forest
    n_jobs=-1           # -1 = use all available CPU cores
)

model.fit(X_train_scaled, y_train)
# The check mark was previously stored as mis-decoded bytes ("âœ“"); use the
# intended character.
print("✓ Model training completed!")

## 4. Evaluate Model

In [None]:
# Predict a class for every held-out test row.
y_pred = model.predict(X_test_scaled)

# Overall accuracy plus the per-class text report, labelled with the
# dataset's class names.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=wine.target_names)

print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

In [None]:
# Confusion matrix of true vs. predicted classes on the test set, drawn as an
# annotated heatmap with the class names on both axes.
cm = confusion_matrix(y_test, y_pred)
class_labels = wine.target_names

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',          # integer counts in each cell
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

## 5. Feature Importance

In [None]:
# How many features to show. Kept in one place so the slice, the chart title
# and the printed heading cannot drift apart.
TOP_N_FEATURES = 10

# Get feature importances from the fitted forest.
importances = model.feature_importances_
# argsort is ascending; reverse it to rank from most to least important.
indices = np.argsort(importances)[::-1][:TOP_N_FEATURES]
# If the data has fewer than TOP_N_FEATURES columns the slice is shorter, so
# label the outputs with the number of features actually shown.
n_shown = len(indices)
top_names = [wine.feature_names[i] for i in indices]
top_scores = importances[indices]

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title(f'Top {n_shown} Feature Importances')
ax.bar(range(n_shown), top_scores)
ax.set_xticks(range(n_shown))
ax.set_xticklabels(top_names, rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Print importances
print(f"\nTop {n_shown} Most Important Features:")
for rank, (name, score) in enumerate(zip(top_names, top_scores), 1):
    print(f"{rank}. {name}: {score:.4f}")

## 6. Make Predictions on New Data

In [None]:
# Example prediction on a single row taken from the held-out test set.
sample_idx = 0
# Slice rather than index with a scalar so the sample stays a one-row
# DataFrame carrying the column names the scaler was fitted with.
sample = X_test.iloc[sample_idx:sample_idx+1]
sample_scaled = scaler.transform(sample)

prediction = model.predict(sample_scaled)[0]
probabilities = model.predict_proba(sample_scaled)[0]

# predict_proba columns are ordered like model.classes_, so find the predicted
# label's column there instead of assuming label value == column position.
predicted_col = list(model.classes_).index(prediction)
# True label of the same row, printed for comparison with the prediction.
actual = y_test.iloc[sample_idx]

print("Sample features:")
print(sample.T)
print(f"\nPredicted class: {wine.target_names[prediction]}")
print(f"Actual class: {wine.target_names[actual]}")
print(f"Confidence: {probabilities[predicted_col]:.2%}")
print("\nClass probabilities:")
for label, prob in zip(model.classes_, probabilities):
    print(f"  {wine.target_names[label]}: {prob:.2%}")

## 7. Model Performance Summary

In [None]:
# Reference line drawn on the chart (0.8 = 80%).
SCORE_THRESHOLD = 0.8

# Headline metrics on the test set. precision_score, recall_score and f1_score
# come from the notebook's top import cell. average='weighted' combines the
# per-class scores into a single number for this multi-class problem.
metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred, average='weighted'),
    'Recall': recall_score(y_test, y_pred, average='weighted'),
    'F1-Score': f1_score(y_test, y_pred, average='weighted')
}

# Create summary DataFrame (one row per metric, rounded for display).
summary = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Score'])
summary['Score'] = summary['Score'].round(4)

# The heading emoji was previously stored as mis-decoded bytes ("ðŸ“Š"); use
# the intended character.
print("\n📊 Model Performance Summary:")
print("=" * 40)
print(summary.to_string(index=False))
print("=" * 40)

# Visualize metrics as a bar chart with the threshold line for reference.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(summary['Metric'], summary['Score'], color='skyblue')
ax.set_ylim(0, 1.1)  # headroom above 1.0 for the value labels
ax.set_ylabel('Score')
ax.set_title('Model Performance Metrics')
ax.axhline(y=SCORE_THRESHOLD, color='r', linestyle='--',
           label=f'{SCORE_THRESHOLD:.0%} Threshold')
ax.legend()
# Write each score just above its bar.
for position, score in enumerate(summary['Score']):
    ax.text(position, score + 0.02, f'{score:.3f}', ha='center', fontweight='bold')
plt.show()

## Next Steps

This demo shows the core ML pipeline. In the full automated CI/CD setup:

1. **GitHub Actions** automatically runs this pipeline on every code push
2. **CML** generates reports and posts them to pull requests
3. **Automated testing** ensures model quality
4. **Deployment** happens automatically when tests pass

To see the automation in action:
```bash
# Run the full pipeline
make pipeline

# Or create a PR to see CML reports
git checkout -b feature/improve-model
# Make changes to params.yaml
git commit -am "Improve model hyperparameters"
git push -u origin feature/improve-model
# Create PR and see automated CML report!
```