# Project 30: Virtual Network Function (VNF) Performance Prediction

**Objective:** Build a regression model that can predict the maximum achievable throughput (in Gbps) of a VNF based on its type, allocated resources (vCPUs, RAM), and workload characteristics.

**Dataset Source:** Synthetically Generated (realistic VNF performance data)

**Model:** XGBoost Regressor for capturing complex non-linear relationships between resource allocation and VNF performance

**Instructions:**
This notebook is fully self-contained and does not require external files. Simply run all cells in sequence.

In [None]:
# ==================================================================================
#  Project 30: VNF Performance Prediction - Setup and Imports
# ==================================================================================

# Install XGBoost if not already installed
try:
    import xgboost as xgb
except ImportError:
    print("Installing XGBoost...")
    !pip install -q xgboost
    import xgboost as xgb

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)

print("All libraries imported successfully.")

In [None]:
# ==================================================================================
#  Synthetic VNF Performance Data Generation
# ==================================================================================

print("--- Generating Synthetic VNF Performance Dataset ---")

num_samples = 3000
data = []
vnf_types = ['Firewall', 'Router', 'LoadBalancer', 'IDS']

for _ in range(num_samples):
    vnf_type = random.choice(vnf_types)
    vcpus = random.randint(2, 16)
    ram_gb = random.choice([4, 8, 16, 32])
    
    # Configuration complexity affects performance
    if vnf_type == 'Firewall':
        config_complexity = np.random.randint(100, 5000)  # Number of rules
    elif vnf_type == 'IDS':
        config_complexity = np.random.randint(500, 10000)  # Number of signatures
    else:
        config_complexity = np.random.randint(10, 100)  # e.g., number of routes/VIPs

    # --- Performance Formula ---
    # Base performance is driven by vCPUs (primary factor) and RAM (secondary)
    base_throughput = (vcpus * 1.5) + (ram_gb * 0.2)
    
    # Complexity introduces a performance penalty (non-linear)
    complexity_penalty = np.log1p(config_complexity) * 0.5
    if vnf_type in ['Firewall', 'IDS']:
        complexity_penalty *= 1.5  # These are more sensitive to complexity
    
    # Add random noise
    random_noise = np.random.normal(0, 0.5)
    
    # Calculate final throughput
    throughput_gbps = base_throughput - complexity_penalty + random_noise
    throughput_gbps = max(1, throughput_gbps)  # Ensure a minimum performance
    
    data.append([vnf_type, vcpus, ram_gb, config_complexity, throughput_gbps])

df = pd.DataFrame(data, columns=['vnf_type', 'vcpus', 'ram_gb', 'config_complexity', 'throughput_gbps'])
print(f"Dataset generation complete. Created {len(df)} samples.")
print("\nDataset Sample:")
print(df.sample(10))

print("\nDataset Statistics:")
print(df.describe())

print("\nVNF Type Distribution:")
print(df['vnf_type'].value_counts())

In [None]:
# ==================================================================================
#  Data Preprocessing and Encoding
# ==================================================================================

print("--- Data Preprocessing and Encoding ---")

# Separate features and target
X = df.drop(columns=['throughput_gbps'])
y = df['throughput_gbps']

print(f"Original feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# One-hot encode the 'vnf_type' categorical feature
X_encoded = pd.get_dummies(X, columns=['vnf_type'], drop_first=True)

print(f"\nAfter encoding, feature matrix shape: {X_encoded.shape}")
print(f"Encoded columns: {list(X_encoded.columns)}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print("Data preprocessing complete.")

In [None]:
# ==================================================================================
#  Model Training with XGBoost Regressor
# ==================================================================================

print("--- Model Training ---")

# Initialize XGBoost Regressor with optimized parameters
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=150,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)

print("Training the XGBoost Regressor model...")
model.fit(X_train, y_train)
print("Training complete.")

# Display model parameters
print(f"\nModel Parameters:")
print(f"• Number of estimators: {model.n_estimators}")
print(f"• Learning rate: {model.learning_rate}")
print(f"• Max depth: {model.max_depth}")

In [None]:
# ==================================================================================
#  Model Evaluation
# ==================================================================================

print("--- Model Evaluation ---")

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"\nModel Performance Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.3f} Gbps")
print(f"  (On average, the model's throughput prediction is off by ±{mae:.3f} Gbps)")
print(f"Root Mean Square Error (RMSE): {rmse:.3f} Gbps")
print(f"R-squared (R²): {r2:.3f}")
print(f"  ({r2:.1%} of the variance in throughput can be explained by our features)")

# Calculate additional metrics
mean_actual = np.mean(y_test)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"\nAdditional Metrics:")
print(f"Mean Actual Throughput: {mean_actual:.3f} Gbps")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

In [None]:
# ==================================================================================
#  Results Visualization
# ==================================================================================

print("--- Results Visualization ---")

# Create subplots for comprehensive visualization
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# 1. Actual vs. Predicted scatter plot
ax1.scatter(y_test, y_pred, alpha=0.6, color='blue')
ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r', linewidth=2, label='Perfect Prediction')
ax1.set_xlabel('Actual Throughput (Gbps)')
ax1.set_ylabel('Predicted Throughput (Gbps)')
ax1.set_title('Actual vs. Predicted VNF Throughput')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Residuals plot
residuals = y_test - y_pred
ax2.scatter(y_pred, residuals, alpha=0.6, color='green')
ax2.axhline(y=0, color='red', linestyle='--', linewidth=2)
ax2.set_xlabel('Predicted Throughput (Gbps)')
ax2.set_ylabel('Residuals (Actual - Predicted)')
ax2.set_title('Residuals Plot')
ax2.grid(True, alpha=0.3)

# 3. Feature importance (using XGBoost's built-in plot)
feature_importance = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=True)

ax3.barh(feature_importance['feature'], feature_importance['importance'])
ax3.set_xlabel('Feature Importance')
ax3.set_title('Feature Importance in VNF Performance Prediction')
ax3.grid(True, alpha=0.3)

# 4. Distribution of predictions vs actual
ax4.hist(y_test, bins=30, alpha=0.7, label='Actual', color='blue')
ax4.hist(y_pred, bins=30, alpha=0.7, label='Predicted', color='red')
ax4.set_xlabel('Throughput (Gbps)')
ax4.set_ylabel('Frequency')
ax4.set_title('Distribution: Actual vs Predicted Throughput')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Display feature importance table
print("\n--- Feature Importance: What drives VNF performance? ---")
feature_importance_sorted = feature_importance.sort_values('importance', ascending=False)
print(feature_importance_sorted)

In [None]:
# ==================================================================================
#  Performance Prediction Examples
# ==================================================================================

print("--- Performance Prediction Examples ---")

# Create example VNF configurations for prediction
example_configs = [
    {'vnf_type': 'Firewall', 'vcpus': 4, 'ram_gb': 8, 'config_complexity': 1000},
    {'vnf_type': 'Firewall', 'vcpus': 8, 'ram_gb': 16, 'config_complexity': 1000},
    {'vnf_type': 'Router', 'vcpus': 4, 'ram_gb': 8, 'config_complexity': 50},
    {'vnf_type': 'LoadBalancer', 'vcpus': 6, 'ram_gb': 16, 'config_complexity': 20},
    {'vnf_type': 'IDS', 'vcpus': 8, 'ram_gb': 32, 'config_complexity': 5000}
]

# Convert to DataFrame and encode
examples_df = pd.DataFrame(example_configs)
examples_encoded = pd.get_dummies(examples_df, columns=['vnf_type'], drop_first=True)

# Ensure all columns from training are present
for col in X_encoded.columns:
    if col not in examples_encoded.columns:
        examples_encoded[col] = 0

# Reorder columns to match training data
examples_encoded = examples_encoded[X_encoded.columns]

# Make predictions
predictions = model.predict(examples_encoded)

print("\nVNF Performance Predictions:")
print("=" * 80)
for i, (config, pred) in enumerate(zip(example_configs, predictions)):
    print(f"Example {i+1}:")
    print(f"  VNF Type: {config['vnf_type']}")
    print(f"  vCPUs: {config['vcpus']}, RAM: {config['ram_gb']} GB")
    print(f"  Config Complexity: {config['config_complexity']}")
    print(f"  Predicted Throughput: {pred:.2f} Gbps")
    print("-" * 40)

In [None]:
# ==================================================================================
#  Conclusion
# ==================================================================================

print("--- Conclusion ---")
print(f"The XGBoost model successfully learned the complex relationships between resources, configuration, and VNF performance, achieving an R² score of {r2:.3f}.")

print("\nKey Insights:")
print(f"• Total VNF configurations analyzed: {len(df):,}")
print(f"• Average prediction error: ±{mae:.3f} Gbps")
print(f"• Model explains {r2:.1%} of throughput variance")
print(f"• Mean Absolute Percentage Error: {mape:.2f}%")

print("\nBusiness Impact:")
print("• The model provides accurate performance predictions essential for resource planning in NFV environments")
print("• Prevents both under-provisioning (SLA violations) and over-provisioning (resource waste)")
print("• Feature importance confirms that vCPUs are the most critical factor for performance")
print("• Configuration complexity acts as a significant performance penalty, especially for security VNFs")

print("\nReal-world Application:")
print("• NFV Orchestrators can use this model as a 'performance oracle'")
print("• Before deploying VNFs, ask: 'What resources are needed to guarantee X Gbps?'")
print("• Enables automatic right-sizing of VNF deployments")
print("• Leads to highly efficient, automated, and SLA-aware cloud platforms")

print("\nTechnical Validation:")
print("• The model correctly captures the non-linear relationship between configuration complexity and performance")
print("• Resource scaling effects are accurately modeled with vCPUs having the highest impact")
print("• Performance penalties for complex configurations align with real-world VNF behavior")