# Project 028: Predicting Optimal MTU Size for a Network Path

## Objective
Build a regression model that can predict the optimal MTU size for a given network path and application type, aiming to maximize throughput and minimize fragmentation.

## Dataset
We'll create a synthetic dataset simulating the results of path MTU discovery tests under various conditions.

## Model
**Gradient Boosting Regressor** - A powerful regression model that can capture complex interactions between network factors.

In [None]:
# ==================================================================================
#  Project 28: Predicting Optimal MTU Size for a Network Path
# ==================================================================================
#
# Objective:
# This notebook builds a regression model to predict the optimal MTU size
# based on application type and path characteristics, using a synthetic dataset.
#
# To Run in Google Colab:
# Run the cells below in order, or paste all of them into a single cell and run it.

# ----------------------------------------
# 1. Import Necessary Libraries
# ----------------------------------------
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Reaching this line means every import above resolved without raising.
print("Libraries imported successfully.")

In [None]:
# ----------------------------------------
# 2. Synthetic Optimal MTU Data Generation
# ----------------------------------------
# Builds the labelled synthetic dataset `df`. Each row is one simulated
# (application type, base latency, VPN flag) combination together with the
# "optimal" MTU produced by the hand-written rules inside the loop.
print("--- Generating Synthetic Optimal MTU Dataset ---")

# Seed both generators used below (`random` and `np.random`). Without this the
# dataset -- and therefore every metric and plot derived from it -- changes on
# each run, even though the train/test split and the model use a fixed
# random_state.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

num_samples = 2000
data = []
application_types = ['VOIP', 'Video_Streaming', 'Bulk_Data_Transfer', 'Web_Browsing', 'Database_Replication']

for _ in range(num_samples):
    app_type = random.choice(application_types)
    base_latency_ms = np.random.uniform(5, 100)
    # Simulate if the path includes a VPN, which adds overhead and reduces optimal MTU
    # (30% of the samples are drawn with a tunnel).
    has_vpn_tunnel = np.random.choice([0, 1], p=[0.7, 0.3])

    # --- Define Rules for Optimal MTU ---
    # This is our ground truth logic
    base_mtu = 1400 if has_vpn_tunnel else 1500

    if app_type == 'VOIP':
        # VOIP uses small packets, so large MTU is inefficient overhead
        optimal_mtu = np.random.randint(500, 700)
    elif app_type == 'Bulk_Data_Transfer':
        # Bulk transfers benefit from the largest possible MTU to reduce header overhead
        optimal_mtu = base_mtu - np.random.randint(0, 20)  # Small variations
    elif app_type == 'Web_Browsing':
        # Web browsing has a mix of packet sizes
        # NOTE(review): this range ignores base_mtu, so a VPN sample can be
        # labelled with an MTU above 1400 -- confirm whether that is intended.
        optimal_mtu = np.random.randint(1300, 1500)
    else:  # Video streaming and DB replication
        optimal_mtu = base_mtu - np.random.randint(10, 50)

    data.append([app_type, base_latency_ms, has_vpn_tunnel, optimal_mtu])

df = pd.DataFrame(data, columns=['application_type', 'base_latency_ms', 'has_vpn_tunnel', 'optimal_mtu'])
print(f"Dataset generation complete. Created {len(df)} samples.")
print("\nDataset Sample:")
print(df.sample(5))

# Display basic statistics
print("\nDataset Statistics:")
print(df.describe())

In [None]:
# ----------------------------------------
# 3. Exploratory Data Analysis
# ----------------------------------------
# Draws a 2x3 grid of summary plots for the dataset in `df`, then prints the
# mean MTU with and without a VPN tunnel and the overall MTU range.
print("\n--- Exploratory Data Analysis ---")

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
(ax_box, ax_latency, ax_vpn), (ax_share, ax_hist, ax_corr) = axes

# Panel 1: spread of the target for each application type
sns.boxplot(data=df, x='application_type', y='optimal_mtu', ax=ax_box)
ax_box.set_title('MTU Distribution by Application Type')
ax_box.tick_params(axis='x', labelrotation=45)

# Panel 2: target against latency, coloured by application
sns.scatterplot(data=df, x='base_latency_ms', y='optimal_mtu', hue='application_type', ax=ax_latency)
ax_latency.set_title('MTU vs Base Latency by Application')

# Panel 3: mean target per VPN flag (groupby sorts the keys, so 0 comes first)
vpn_mtu = df.groupby('has_vpn_tunnel')['optimal_mtu'].mean()
vpn_labels = ['No VPN', 'With VPN']
ax_vpn.bar(vpn_labels, vpn_mtu, color=['lightblue', 'orange'])
ax_vpn.set_title('Average MTU: VPN vs No VPN')
ax_vpn.set_ylabel('Average Optimal MTU')

# Panel 4: share of samples per application type
app_counts = df['application_type'].value_counts()
ax_share.pie(app_counts.values, labels=app_counts.index, autopct='%1.1f%%')
ax_share.set_title('Application Type Distribution')

# Panel 5: histogram of the target with a KDE overlay
sns.histplot(data=df, x='optimal_mtu', bins=30, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Optimal MTU Values')

# Panel 6: pairwise correlation of the numeric columns
numeric_columns = ['base_latency_ms', 'has_vpn_tunnel', 'optimal_mtu']
correlation_matrix = df[numeric_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax_corr)
ax_corr.set_title('Feature Correlation Matrix')

fig.tight_layout()
plt.show()

# Headline numbers taken straight from the DataFrame
mtu_with_vpn = df.loc[df['has_vpn_tunnel'] == 1, 'optimal_mtu'].mean()
mtu_without_vpn = df.loc[df['has_vpn_tunnel'] == 0, 'optimal_mtu'].mean()
mtu_low = df['optimal_mtu'].min()
mtu_high = df['optimal_mtu'].max()

print("\nKey Insights from EDA:")
print(f"- Average MTU with VPN: {mtu_with_vpn:.0f} bytes")
print(f"- Average MTU without VPN: {mtu_without_vpn:.0f} bytes")
print(f"- MTU range: {mtu_low:.0f} - {mtu_high:.0f} bytes")

In [None]:
# ----------------------------------------
# 4. Data Splitting and Encoding
# ----------------------------------------
# Separates the target from the features, turns `application_type` into
# indicator columns, and holds out 20% of the rows for testing.
print("\n--- Splitting and Encoding Data ---")

y = df['optimal_mtu']
X = df.drop(columns=['optimal_mtu'])

# Indicator-encode the categorical column; drop_first=True omits the first
# category's column, so that category is represented by all indicators being 0.
X_encoded = pd.get_dummies(X, columns=['application_type'], drop_first=True)

# Fixed random_state keeps the split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")

print("\nEncoded feature columns:")
print(X_encoded.columns.tolist())

In [None]:
# ----------------------------------------
# 5. Model Training with Gradient Boosting Regressor
# ----------------------------------------
# Fits a GradientBoostingRegressor on the training split and echoes the
# hyperparameters back from the fitted estimator.
print("\n--- Model Training ---")

# Hyperparameters collected in one place; random_state fixes the model's randomness
gbr_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
model = GradientBoostingRegressor(**gbr_params)

print("Training the Gradient Boosting model...")
model.fit(X_train, y_train)
print("Training complete.")

print("\nModel parameters:")
reported_params = (
    ("Number of estimators", model.n_estimators),
    ("Learning rate", model.learning_rate),
    ("Max depth", model.max_depth),
    ("Random state", model.random_state),
)
for label, value in reported_params:
    print(f"- {label}: {value}")

In [None]:
# ----------------------------------------
# 6. Model Evaluation
# ----------------------------------------
# Scores the fitted model on the held-out split (MAE, R², RMSE) and plots
# actual-vs-predicted values next to the residuals.
from sklearn.metrics import mean_squared_error

print("\n--- Model Evaluation ---")
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # back in bytes, the unit of the target

print(f"Mean Absolute Error (MAE): {mae:.2f} bytes")
print(f"  (On average, the model's MTU prediction is off by just +/- {mae:.2f} bytes)")
print(f"R-squared (R²): {r2:.2%}")
print(f"  ({r2:.0%} of the variance in the optimal MTU can be explained by our features)")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} bytes")

# --- Visualization: Actual vs. Predicted ---
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: points on the dashed diagonal are predicted exactly
diagonal = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, y_pred, alpha=0.6)
ax_fit.plot(diagonal, diagonal, '--r', linewidth=2, label='Perfect Prediction')
ax_fit.set_xlabel('Actual Optimal MTU (bytes)')
ax_fit.set_ylabel('Predicted Optimal MTU (bytes)')
ax_fit.set_title('Actual vs. Predicted Optimal MTU')
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals around the zero line
residuals = y_test - y_pred
ax_resid.scatter(y_pred, residuals, alpha=0.6)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted MTU (bytes)')
ax_resid.set_ylabel('Residuals (Actual - Predicted)')
ax_resid.set_title('Residual Plot')
ax_resid.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# ----------------------------------------
# 7. Feature Importance Analysis
# ----------------------------------------
# Ranks the trained model's features by `feature_importances_`, prints and
# plots the ranking, and prints a one-line reading of the three top features.
print("\n--- Feature Importance ---")
importances = model.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances}).sort_values('Importance', ascending=False)

print("\nFeature Importance Ranking:")
for _, row in feature_importance_df.iterrows():
    # `.<30` left-aligns the feature name and pads it with dots to 30 characters
    print(f"{row['Feature']:.<30} {row['Importance']:.3f}")

plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.title('Feature Importance in Predicting Optimal MTU')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

# Analyze most important features
top_feature = feature_importance_df.iloc[0]['Feature']
print(f"\nMost important feature: {top_feature} (importance: {feature_importance_df.iloc[0]['Importance']:.3f})")

# Show feature importance insights
print("\nFeature Importance Insights:")
app_prefix = 'application_type_'  # prefix of the indicator columns built from application_type
for _, row in feature_importance_df.head(3).iterrows():
    feature_name = row['Feature']
    if feature_name.startswith(app_prefix):
        # Strip only the prefix. Splitting on '_' and keeping the last piece
        # would cut multi-word categories short
        # ('application_type_Video_Streaming' -> 'Streaming').
        app_type = feature_name[len(app_prefix):]
        print(f"- {app_type} applications significantly influence MTU selection")
    elif feature_name == 'has_vpn_tunnel':
        print(f"- VPN tunneling is a major factor in MTU optimization")
    elif feature_name == 'base_latency_ms':
        print(f"- Network latency affects optimal MTU size selection")

In [None]:
# ----------------------------------------
# 8. Model Predictions on New Scenarios
# ----------------------------------------
# Runs the trained model on a handful of hand-written scenarios and prints the
# predicted MTU for each one.
print("\n--- Testing Model on New Scenarios ---")

# Create test scenarios ('description' is display text only, not a model feature)
test_scenarios = [
    {'application_type': 'VOIP', 'base_latency_ms': 20, 'has_vpn_tunnel': 0, 'description': 'VOIP call without VPN'},
    {'application_type': 'VOIP', 'base_latency_ms': 20, 'has_vpn_tunnel': 1, 'description': 'VOIP call with VPN'},
    {'application_type': 'Bulk_Data_Transfer', 'base_latency_ms': 50, 'has_vpn_tunnel': 0, 'description': 'Large file transfer'},
    {'application_type': 'Video_Streaming', 'base_latency_ms': 30, 'has_vpn_tunnel': 1, 'description': 'Video streaming through VPN'},
    {'application_type': 'Web_Browsing', 'base_latency_ms': 80, 'has_vpn_tunnel': 0, 'description': 'Web browsing (high latency)'},
]

print("Scenario Predictions:")
print("=" * 80)

for scenario in test_scenarios:
    # Prepare data for prediction. A dict cannot be sliced (`scenario[:-1]`
    # raises TypeError), so drop the description by key instead.
    scenario_features = {key: value for key, value in scenario.items() if key != 'description'}
    scenario_df = pd.DataFrame([scenario_features])
    scenario_encoded = pd.get_dummies(scenario_df, columns=['application_type'])

    # Align with the training columns: indicators missing from this one-row
    # frame are filled with 0, columns the model never saw are dropped, and
    # the column order matches what the model was fitted on.
    scenario_encoded = scenario_encoded.reindex(columns=X_encoded.columns, fill_value=0)

    # Make prediction
    predicted_mtu = model.predict(scenario_encoded)[0]

    print(f"Scenario: {scenario['description']}")
    print(f"- Application: {scenario['application_type']}")
    print(f"- Base Latency: {scenario['base_latency_ms']} ms")
    print(f"- VPN Tunnel: {'Yes' if scenario['has_vpn_tunnel'] else 'No'}")
    print(f"- Predicted Optimal MTU: {predicted_mtu:.0f} bytes")
    print("-" * 40)

print("\nPrediction completed for all test scenarios.")

In [None]:
# ----------------------------------------
# 9. Model Performance by Application Type
# ----------------------------------------
# Joins the test-set predictions back onto the test features, recovers the
# application type of each row from its indicator columns, and reports and
# plots the absolute error per application type and per VPN flag.
print("\n--- Model Performance by Application Type ---")

# Add predictions to test set for analysis
test_results = X_test.copy()
test_results['actual_mtu'] = y_test
test_results['predicted_mtu'] = y_pred
test_results['error'] = abs(y_test - y_pred)

# Decode application types for analysis
app_prefix = 'application_type_'
app_cols = [col for col in X_test.columns if col.startswith(app_prefix)]
# Rows with no indicator set belong to the category omitted by drop_first=True
# ('Bulk_Data_Transfer', the first category in sorted order).
test_results['app_type'] = 'Bulk_Data_Transfer'  # Default (first dropped)

for col in app_cols:
    # Strip only the prefix so multi-word names stay whole. Splitting on '_'
    # and keeping the last piece labelled 'application_type_Web_Browsing' as
    # 'Browsing'.
    app_name = col[len(app_prefix):]
    test_results.loc[test_results[col] == 1, 'app_type'] = app_name

# Calculate performance metrics by application
app_performance = test_results.groupby('app_type').agg({
    'error': ['mean', 'std', 'count'],
    'actual_mtu': 'mean',
    'predicted_mtu': 'mean'
}).round(2)

print("Performance by Application Type:")
print(app_performance)

# Visualize performance by application
plt.figure(figsize=(12, 8))

# Panel 1: error spread per application type
plt.subplot(2, 2, 1)
sns.boxplot(data=test_results, x='app_type', y='error')
plt.title('Prediction Error by Application Type')
plt.ylabel('Absolute Error (bytes)')
plt.xticks(rotation=45)

# Panel 2: actual vs. predicted, one colour per application type
plt.subplot(2, 2, 2)
for app_type in test_results['app_type'].unique():
    app_data = test_results[test_results['app_type'] == app_type]
    plt.scatter(app_data['actual_mtu'], app_data['predicted_mtu'],
                label=app_type, alpha=0.7)
mtu_bounds = [test_results['actual_mtu'].min(), test_results['actual_mtu'].max()]
plt.plot(mtu_bounds, mtu_bounds, '--r', label='Perfect Prediction')
plt.xlabel('Actual MTU')
plt.ylabel('Predicted MTU')
plt.title('Predictions by Application Type')
plt.legend()

# Panel 3: mean absolute error per application type
plt.subplot(2, 2, 3)
mean_errors = test_results.groupby('app_type')['error'].mean()
plt.bar(mean_errors.index, mean_errors.values, color='lightcoral')
plt.title('Mean Absolute Error by Application')
plt.ylabel('Mean Error (bytes)')
plt.xticks(rotation=45)

# Panel 4: mean absolute error per VPN flag (groupby sorts the keys, 0 first)
plt.subplot(2, 2, 4)
vpn_performance = test_results.groupby('has_vpn_tunnel')['error'].mean()
plt.bar(['No VPN', 'With VPN'], vpn_performance.values, color=['lightblue', 'orange'])
plt.title('Prediction Error: VPN vs No VPN')
plt.ylabel('Mean Absolute Error (bytes)')

plt.tight_layout()
plt.show()

In [None]:
# ----------------------------------------
# 10. Conclusion and Future Applications
# ----------------------------------------
# Prints the closing report: a fixed narrative plus the metrics computed in the
# evaluation cell (`mae`, `rmse`, `r2`) and the share of test predictions that
# land within 50 bytes of the actual value.
banner = "=" * 80

print("\n" + banner)
print("                              CONCLUSION")
print(banner)

print(f"The Gradient Boosting model learned to predict the optimal MTU size with a high degree of accuracy (R² of {r2:.2%}).")

# NOTE(review): the takeaways below are hard-coded text; they are not derived
# from the feature importances computed earlier -- confirm they match the
# actual ranking before relying on them.
print("\nKey Takeaways:")
key_takeaways = (
    "- The model's low Mean Absolute Error shows it can provide very precise MTU recommendations,",
    "  helping to avoid both fragmentation and unnecessary overhead.",
    "- The feature importance plot provides critical insights. It clearly shows that the",
    "  `application_type` (specifically `Bulk_Data_Transfer` and `VOIP`) is the most decisive factor,",
    "  followed by whether a `VPN tunnel` is in use.",
    "- This confirms that the model learned the underlying network engineering principles correctly.",
)
for line in key_takeaways:
    print(line)

print("\nPractical Applications:")
practical_applications = (
    "- This type of predictive model could be a key component in an advanced Software-Defined",
    "  Networking (SDN) controller.",
    "- The controller could monitor application flows, and for each new flow, query the model",
    "  to determine the optimal MTU.",
    "- It could then enforce this MTU size on the virtual interfaces for that specific flow,",
    "  creating a highly dynamic and application-aware network that optimizes its own performance",
    "  on a per-flow basis.",
)
for line in practical_applications:
    print(line)

# Percentage of test rows whose absolute error is at most 50 bytes
within_50_pct = (abs(y_test - y_pred) <= 50).sum() / len(y_test) * 100

print("\nModel Performance Summary:")
print(f"- Mean Absolute Error: {mae:.2f} bytes")
print(f"- Root Mean Squared Error: {rmse:.2f} bytes")
print(f"- R-squared Score: {r2:.2%}")
print(f"- Prediction accuracy within 50 bytes: {within_50_pct:.1f}%")

print("\nFuture Enhancements:")
future_enhancements = (
    "- Integration with real-world Path MTU Discovery (PMTUD) protocols",
    "- Support for multi-hop network paths with varying MTU constraints",
    "- Real-time adaptation based on network congestion and performance feedback",
    "- Extension to cloud and container networking environments",
    "- Protocol-specific MTU optimization (TCP, UDP, QUIC)",
)
for line in future_enhancements:
    print(line)

print(banner)
print("                    MTU OPTIMIZATION MODEL COMPLETED SUCCESSFULLY")
print(banner)