# MLOps Production Pipeline - Data Exploration and Model Training

This notebook demonstrates the complete MLOps pipeline including data loading, feature engineering, model training, and deployment preparation.

## 1. Setup and Imports

In [None]:
# Install missing packages
%pip install matplotlib

# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import sys
import os

# Add src to path
sys.path.append('../src')

# Custom modules
from data.data_loader import DataLoader, DataPreprocessor
from features.feature_engineering import FeatureEngineer, FeatureScaler, FeatureSelector
from deployment.deployment_utils import ModelDeployment, create_model_artifact

# Set style
# Start from matplotlib's 'seaborn-v0_8' style sheet, then set seaborn's
# default palette to "husl".
# NOTE(review): both calls appear to touch the colour cycle, so keep the
# palette call after the style sheet — confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Marker so a reader can see the setup cell ran to completion.
print("Setup complete!")

## 2. Data Loading and Exploration

In [None]:
# Create the project data loader (presumably "../data" is its base directory;
# verify against DataLoader).
data_loader = DataLoader("../data")

# Generate the 2000-row sample dataset through the loader.
df = data_loader.generate_sample_data(n_samples=2000)

# Quick structural overview: dimensions first, then per-column info.
print("Dataset shape:", df.shape)
print("\nDataset info:")
df.info()

In [None]:
# Basic statistics
# The heading is printed; the summary table itself is shown through the
# notebook's rich display because describe() is the cell's last expression.
print("Dataset Description:")
df.describe()

In [None]:
# Class balance of the target: absolute counts (bar) next to shares (pie).
# The counts are computed once and reused for both panels and the printout.
target_counts = df['target'].value_counts()

dist_fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: counts per class, with horizontal tick labels.
target_counts.plot(kind='bar', ax=ax_bar, rot=0)
ax_bar.set_title('Target Distribution')

# Right panel: the same counts as percentages.
target_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax_pie)
ax_pie.set_title('Target Distribution (Percentage)')

dist_fig.tight_layout()
plt.show()

print(f"Target distribution:\n{target_counts}")

## 3. Data Preprocessing

In [None]:
# All preprocessing steps go through one DataPreprocessor instance.
preprocessor = DataPreprocessor()

# Step 1: clean the raw frame (kept under a new name so `df` stays untouched).
df_clean = preprocessor.clean_data(df)
print("Data shape after cleaning: {}".format(df_clean.shape))

# Step 2: encode the categorical columns.
df_encoded = preprocessor.encode_categorical(df_clean)
print("Data shape after encoding: {}".format(df_encoded.shape))

# Step 3: separate the 'target' column from the feature columns.
X, y = preprocessor.split_features_target(df_encoded, 'target')
print("Features shape: {}, Target shape: {}".format(X.shape, y.shape))

## 4. Feature Engineering

In [None]:
# Feature construction: interactions -> degree-2 polynomial terms -> scaling.
engineer = FeatureEngineer()

X_interactions = engineer.create_interaction_features(X)
print("Shape after interaction features:", X_interactions.shape)

X_poly = engineer.create_polynomial_features(X_interactions, degree=2)
print("Shape after polynomial features:", X_poly.shape)

# NOTE(review): the scaler is fit on every row, before the train/test split
# further down, so held-out rows influence the scaling statistics.
scaler = FeatureScaler(method='standard')
X_scaled = scaler.fit_transform(X_poly)
print("Shape after scaling:", X_scaled.shape)

In [None]:
# Keep 15 features using the project's univariate selector.
# NOTE(review): the selector sees the full X_scaled / y before the train/test
# split, so the reported test metrics may be optimistic. Fitting on the
# training rows only would avoid that — confirm whether FeatureSelector
# exposes a separate transform step.
selector = FeatureSelector(method='univariate', k=15)
X_selected = selector.fit_transform(X_scaled, y)

print("Shape after feature selection: {}".format(X_selected.shape))
print("Selected features: {}".format(selector.selected_features))

## 5. Model Training and Evaluation

In [None]:
# 80/20 hold-out split, stratified on the target and seeded for repeatability.
split_parts = train_test_split(
    X_selected, y, test_size=0.2, stratify=y, random_state=42
)
X_train, X_test, y_train, y_test = split_parts

for split_label, split_X in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} set shape: {split_X.shape}")

In [None]:
# Random Forest hyperparameters, gathered in one place for easy tuning.
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42,  # fixed seed so repeated runs build the same forest
    "n_jobs": -1,        # use all available cores
}
model = RandomForestClassifier(**rf_params)

print("Training model...")
model.fit(X_train, y_train)
print("Model training completed!")

In [None]:
# Score the fitted model on both splits; a large train/test accuracy gap
# would point to overfitting.
y_pred_train = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)

y_pred_test = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Training Accuracy: {:.4f}".format(train_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))

# Per-class precision / recall / F1 on the held-out rows.
print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

In [None]:
# Pair each selected feature with the forest's importance score, highest first.
importance_table = pd.DataFrame({
    'feature': X_selected.columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# The same ten rows feed both the chart and the printed table.
top_features = feature_importance.head(10)

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_features, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Top 10 Feature Importances')
importance_ax.set_xlabel('Importance')
importance_fig.tight_layout()
plt.show()

print("Top 10 Most Important Features:")
print(top_features)

## 6. Model Deployment Preparation

In [None]:
# Metrics stored with the artifact: accuracy on both splits plus the data
# dimensions the model was trained and evaluated on.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
metrics = dict(
    train_accuracy=train_accuracy,
    test_accuracy=test_accuracy,
    n_features=X_selected.shape[1],
    n_samples_train=n_train_rows,
    n_samples_test=n_test_rows,
)

# Bundle the model with its name, version, metrics and feature names.
artifact = create_model_artifact(
    model=model,
    model_name="random_forest_classifier",
    version="1.0",
    metrics=metrics,
    feature_names=X_selected.columns.tolist(),
)

print("Model artifact created: {}".format(artifact['model_filename']))
print("Model metadata: {}".format(artifact['metadata']))

In [None]:
# Smoke test: reload the artifact through the deployment wrapper and score
# one held-out row.
deployment = ModelDeployment("../models")
deployment.load_model(artifact['model_filename'])

# NOTE(review): the first test row is passed as a 1-D array; presumably
# ModelDeployment reshapes it to a single-sample batch — confirm.
sample_features = X_test.iloc[0].values
prediction = deployment.predict(sample_features)
probabilities = deployment.predict_proba(sample_features)

print("Sample prediction:", prediction[0])
print("Prediction probabilities:", probabilities[0])
print("Actual value:", y_test.iloc[0])

# Metadata reported by the deployment wrapper for the loaded model.
model_info = deployment.get_model_info()
print("\nModel Info:", model_info)

## 7. Save Processed Data

In [None]:
# Save processed data for future use: selected features plus the target.
# NOTE(review): if `y` is a pandas Series this assignment aligns on index —
# confirm X_selected and y still share the same index after preprocessing.
processed_data = X_selected.copy()
processed_data['target'] = y

data_loader.save_processed_data(processed_data, "processed_features.csv")
print("Processed data saved successfully!")

# Save reference data for monitoring.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first (a no-op when it is already there).
reference_dir = "../data/reference"
os.makedirs(reference_dir, exist_ok=True)

# Cap the sample size at the number of available rows; sampling more rows
# than exist (without replacement) raises a ValueError.
reference_size = min(200, len(X_selected))
reference_data = X_selected.sample(reference_size, random_state=42)
reference_data.to_csv(f"{reference_dir}/reference_features.csv", index=False)
print("Reference data saved for monitoring!")

## Summary

This notebook demonstrated the complete MLOps pipeline:

1. **Data Loading**: Generated a 2,000-row sample dataset with our custom data loader
2. **Data Preprocessing**: Cleaned and encoded the data
3. **Feature Engineering**: Created interaction and polynomial features, then standardized them
4. **Feature Selection**: Kept 15 features using univariate selection
5. **Model Training**: Trained a Random Forest classifier
6. **Model Evaluation**: Evaluated model performance
7. **Model Deployment**: Prepared the model for deployment with metadata
8. **Data Persistence**: Saved the processed features for future use, plus a 200-row reference sample for monitoring

The trained model is now ready for deployment using the FastAPI service!