# Project 3: Network Traffic Volume Forecasting

**Objective:** Predict future network traffic volume from historical data. Accurate forecasts are critical for capacity planning, resource allocation, and identifying bottlenecks before they occur.

**Dataset Source:** Kaggle - Internet Traffic Time Series dataset with daily traffic data from an ISP.

**Model:** Facebook's Prophet, a forecasting procedure designed for time series with strong seasonal patterns.

**Business Value:** Enables proactive network management, capacity planning, and resource optimization.

## 1. Setup Kaggle API and Download Data

In [None]:
import os

# Check if kaggle.json already exists to avoid re-uploading
if not os.path.exists('/root/.kaggle/kaggle.json'):
    print("--- Setting up Kaggle API ---")

    # Install the Kaggle library
    !pip install -q kaggle

    # For Google Colab - prompt user to upload their kaggle.json file
    try:
        from google.colab import files
        print("\nPlease upload your kaggle.json file:")
        uploaded = files.upload()

        if 'kaggle.json' not in uploaded:
            print("\nError: kaggle.json not uploaded. Please restart the cell and upload the file.")
            exit()

        print("\nkaggle.json uploaded successfully.")

        # Create the .kaggle directory and move the json file there
        !mkdir -p ~/.kaggle
        !cp kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
    except ImportError:
        print("Not running in Google Colab. Please ensure Kaggle API is configured.")
        print("Place your kaggle.json in ~/.kaggle/ directory")
else:
    print("Kaggle API already configured.")

In [None]:
print("\n--- Downloading Internet Traffic Time Series Dataset from Kaggle ---")
# Download the dataset (user: shenba, dataset: internet-traffic-time-series-data)
!kaggle datasets download -d shenba/internet-traffic-time-series-data

print("\n--- Unzipping the dataset ---")
# Unzip the downloaded file
!unzip -q internet-traffic-time-series-data.zip -d .

print("\nDataset setup complete.")

## 2. Load and Prepare the Data

In [None]:
# Data handling, plotting and evaluation-metric dependencies used by the
# cells that follow.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import warnings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

print("--- Loading and Preprocessing Data ---")

# Install the Prophet library (must run before the import below)
!pip install -q prophet

from prophet import Prophet

In [None]:
try:
    # Load the CSV file; dayfirst=True makes pandas read ambiguous dates as
    # day/month rather than month/day.
    df = pd.read_csv('internet_traffic_data.csv', parse_dates=['Date'], dayfirst=True)
except FileNotFoundError:
    print("Error: internet_traffic_data.csv not found.")
    print("Please ensure the dataset is downloaded and extracted properly.")
    # Re-raise so execution stops here with the original traceback instead
    # of continuing without a dataframe.
    raise
else:
    print("Loaded internet_traffic_data.csv successfully.")

# Fail early with a clear message when the expected columns are absent;
# otherwise the rename below would silently do nothing and the first access
# to df['y'] would raise an unrelated-looking KeyError.
missing_columns = [name for name in ('Date', 'Traffic') if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"internet_traffic_data.csv is missing expected column(s) {missing_columns}; "
        f"found {list(df.columns)}"
    )

# Prophet requires columns to be named 'ds' (datestamp) and 'y' (value)
df = df.rename(columns={'Date': 'ds', 'Traffic': 'y'})

# Later cells split the data and take the "last N days" by row position, so
# make sure the rows really are in chronological order.
df = df.sort_values('ds', kind='stable').reset_index(drop=True)

print("\nDataset preview:")
print(df.head(10))

print("\nDataset info:")
df.info()

print("\nBasic statistics:")
print(df['y'].describe())

## 3. Exploratory Data Analysis

In [None]:
# Use matplotlib's default look with seaborn's "husl" colour cycle
plt.style.use('default')
sns.set_palette("husl")

# Two stacked panels: the raw series on top, its value histogram underneath
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
series_ax, hist_ax = axes

# Top panel - the complete time series
series_ax.plot(df['ds'], df['y'], linewidth=1, alpha=0.8)
series_ax.set_title('Historical Internet Traffic Volume', fontsize=14, fontweight='bold')
series_ax.set(xlabel='Date', ylabel='Traffic Volume')

# Bottom panel - how the traffic values are distributed
hist_ax.hist(df['y'], bins=50, alpha=0.7, edgecolor='black')
hist_ax.set_title('Distribution of Traffic Values', fontsize=14, fontweight='bold')
hist_ax.set(xlabel='Traffic Volume', ylabel='Frequency')

# Same faint grid on both panels
for panel in axes:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Quick textual summary: date coverage, row count and missing targets
print(
    f"Dataset spans from {df['ds'].min()} to {df['ds'].max()}",
    f"Total data points: {len(df)}",
    f"Missing values: {df['y'].isnull().sum()}",
    sep="\n",
)

In [None]:
# Analyze seasonality patterns
df_analysis = df.copy()
df_analysis['year'] = df_analysis['ds'].dt.year
df_analysis['month'] = df_analysis['ds'].dt.month
df_analysis['day_of_week'] = df_analysis['ds'].dt.day_of_week
df_analysis['day_name'] = df_analysis['ds'].dt.day_name()

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Monthly patterns
monthly_avg = df_analysis.groupby('month')['y'].mean()
axes[0,0].bar(monthly_avg.index, monthly_avg.values, alpha=0.7)
axes[0,0].set_title('Average Traffic by Month', fontweight='bold')
axes[0,0].set_xlabel('Month')
axes[0,0].set_ylabel('Average Traffic Volume')
axes[0,0].grid(True, alpha=0.3)

# Day of week patterns
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_avg = df_analysis.groupby('day_of_week')['y'].mean()
axes[0,1].bar(range(7), dow_avg.values, alpha=0.7)
axes[0,1].set_title('Average Traffic by Day of Week', fontweight='bold')
axes[0,1].set_xlabel('Day of Week')
axes[0,1].set_ylabel('Average Traffic Volume')
axes[0,1].set_xticks(range(7))
axes[0,1].set_xticklabels(day_names, rotation=45)
axes[0,1].grid(True, alpha=0.3)

# Yearly trends
yearly_avg = df_analysis.groupby('year')['y'].mean()
axes[1,0].plot(yearly_avg.index, yearly_avg.values, marker='o', linewidth=2, markersize=8)
axes[1,0].set_title('Average Traffic by Year', fontweight='bold')
axes[1,0].set_xlabel('Year')
axes[1,0].set_ylabel('Average Traffic Volume')
axes[1,0].grid(True, alpha=0.3)

# Recent 90 days detail view
recent_data = df.tail(90)
axes[1,1].plot(recent_data['ds'], recent_data['y'], linewidth=2)
axes[1,1].set_title('Traffic Volume - Last 90 Days', fontweight='bold')
axes[1,1].set_xlabel('Date')
axes[1,1].set_ylabel('Traffic Volume')
axes[1,1].grid(True, alpha=0.3)
plt.setp(axes[1,1].xaxis.get_majorticklabels(), rotation=45)

plt.tight_layout()
plt.show()

## 4. Data Preparation for Modeling

In [None]:
# Split data for evaluation
# The most recent TEST_DAYS rows are held out so the model is scored on data
# it has not seen during training.
TEST_DAYS = 90

# Without this guard a short dataset would yield an empty (or negatively
# indexed, i.e. wrong) training set and fail later with an obscure error.
if len(df) <= TEST_DAYS:
    raise ValueError(
        f"Need more than {TEST_DAYS} rows to hold out a {TEST_DAYS}-row test set; "
        f"the dataset has only {len(df)}."
    )

split_point = len(df) - TEST_DAYS
df_train = df.iloc[:split_point].copy()
df_test = df.iloc[split_point:].copy()

print(f"\nSplitting data for evaluation:")
print(f"Training data: {df_train['ds'].min()} to {df_train['ds'].max()} ({len(df_train)} points)")
print(f"Test data: {df_test['ds'].min()} to {df_test['ds'].max()} ({len(df_test)} points)")

# Visualize the split: training history, held-out tail, and the boundary
plt.figure(figsize=(15, 6))
plt.plot(df_train['ds'], df_train['y'], label='Training Data', alpha=0.8)
plt.plot(df_test['ds'], df_test['y'], label='Test Data', alpha=0.8, color='red')
plt.axvline(df_train['ds'].max(), color='black', linestyle='--', alpha=0.7, label='Train/Test Split')
plt.title('Training and Test Data Split', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Traffic Volume')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 5. Model Training with Prophet

In [None]:
print("\n--- Model Training ---")

# Prophet settings gathered in one place:
# yearly and weekly seasonality are switched on, daily seasonality is off
# (the observations are daily), seasonal effects scale with the trend, and
# changepoint_prior_scale controls how flexible the trend is.
prophet_settings = {
    'yearly_seasonality': True,
    'weekly_seasonality': True,
    'daily_seasonality': False,
    'seasonality_mode': 'multiplicative',
    'changepoint_prior_scale': 0.05,
}
model = Prophet(**prophet_settings)

print(
    "Training the Prophet model...",
    "This may take a few minutes for large datasets...",
    sep="\n",
)

# Fit on the training portion only; the held-out rows stay unseen
model.fit(df_train)
print("Training complete.")

# Echo the configuration the fitted model reports
print(
    "\nModel Configuration:",
    f"- Yearly seasonality: {model.yearly_seasonality}",
    f"- Weekly seasonality: {model.weekly_seasonality}",
    f"- Seasonality mode: {model.seasonality_mode}",
    f"- Number of changepoints: {len(model.changepoints)}",
    sep="\n",
)

## 6. Generate Forecasts

In [None]:
# Forecast horizon = the held-out test window plus a further 90 days
extra_days_ahead = 90
future_periods = len(df_test) + extra_days_ahead

# Frame of dates to predict on, extended by `future_periods` rows
future = model.make_future_dataframe(periods=future_periods)

print(
    "\nGenerating forecast...",
    f"Forecasting {future_periods} days into the future...",
    sep="\n",
)

# Run the fitted model over every date in the frame
forecast = model.predict(future)

# Show the point forecast and its interval for the final 10 dates
print("\nForecast preview (last 10 predictions):")
preview_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
forecast_summary = forecast.tail(10)[preview_columns]
print(forecast_summary)

print(
    f"\nForecast range: {forecast['ds'].min()} to {forecast['ds'].max()}",
    f"Total forecast points: {len(forecast)}",
    sep="\n",
)

## 7. Model Evaluation and Performance Metrics

In [None]:
print("\n--- Model Evaluation ---")

# Match predictions to actuals by date rather than by row position.
# Positional slicing of `forecast` is only correct when the input has exactly
# one row per consecutive day; a gap or duplicate date would silently pair
# each actual with the wrong day's prediction.
test_dates = pd.to_datetime(df_test['ds'])
test_forecast = forecast[forecast['ds'].isin(test_dates)]

evaluation = (
    test_forecast[['ds', 'yhat']]
    .merge(df_test[['y']].assign(ds=test_dates), on='ds', how='inner')
    .dropna(subset=['y', 'yhat'])  # metrics are undefined for missing values
)
if evaluation.empty:
    raise ValueError("No forecast rows match the test-period dates; cannot evaluate the model.")

y_pred = evaluation['yhat'].values
y_true = evaluation['y'].values

# Calculate performance metrics
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)

# MAPE is undefined where the actual value is zero, so those points are
# excluded instead of letting a division by zero turn the metric into inf.
nonzero_actuals = y_true != 0
if nonzero_actuals.any():
    mape = np.mean(
        np.abs((y_true[nonzero_actuals] - y_pred[nonzero_actuals]) / y_true[nonzero_actuals])
    ) * 100
else:
    mape = float('nan')

# Coefficient of determination, computed by hand. `r2_score` here is a plain
# float variable (later cells read it); it is NaN for a constant test series,
# where the total sum of squares is zero.
ss_residual = np.sum((y_true - y_pred) ** 2)
ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
r2_score = 1 - ss_residual / ss_total if ss_total > 0 else float('nan')

print("\n=== Quantitative Evaluation Results ===")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"R-squared Score: {r2_score:.4f}")

print(f"\nRelative Performance:")
print(f"RMSE as % of mean traffic: {(rmse/np.mean(y_true)*100):.2f}%")
print(f"MAE as % of mean traffic: {(mae/np.mean(y_true)*100):.2f}%")

# Performance interpretation (a NaN MAPE fails every comparison and therefore
# lands in "Needs Improvement")
if mape < 10:
    performance = "Excellent"
elif mape < 20:
    performance = "Good"
elif mape < 30:
    performance = "Acceptable"
else:
    performance = "Needs Improvement"

print(f"\nOverall Model Performance: {performance} (MAPE: {mape:.1f}%)")

## 8. Comprehensive Forecast Visualization

In [None]:
# Three stacked panels: full forecast, test-window close-up, residual scatter
fig, axes = plt.subplots(3, 1, figsize=(16, 15))
overview_ax, test_ax, residual_ax = axes

# --- Panel 1: history, held-out actuals and the forecast beyond training ---
overview_ax.plot(df_train['ds'], df_train['y'],
                 label='Training Data', color='blue', alpha=0.7, linewidth=1)
overview_ax.plot(df_test['ds'], df_test['y'],
                 label='Actual Test Data', color='red', alpha=0.8, linewidth=2)

# Forecast rows from the split position onward
forecast_line = forecast.iloc[split_point:]
overview_ax.plot(forecast_line['ds'], forecast_line['yhat'],
                 label='Forecast', color='green', linewidth=2)

# Shaded band between the lower and upper forecast bounds
overview_ax.fill_between(forecast_line['ds'],
                         forecast_line['yhat_lower'],
                         forecast_line['yhat_upper'],
                         alpha=0.3, color='green', label='Uncertainty Interval')

# Dashed marker at the last training date, where forecasting begins
overview_ax.axvline(df_train['ds'].max(), color='black', linestyle='--',
                    alpha=0.8, label='Forecast Start')

overview_ax.set_title('Network Traffic Forecast with Uncertainty Intervals',
                      fontsize=14, fontweight='bold')
overview_ax.set(xlabel='Date', ylabel='Traffic Volume')
overview_ax.legend()

# --- Panel 2: actual vs predicted over the test window only ---
test_ax.plot(df_test['ds'], df_test['y'],
             label='Actual', color='red', linewidth=2, marker='o', markersize=4)
test_ax.plot(test_forecast['ds'], test_forecast['yhat'],
             label='Predicted', color='green', linewidth=2, marker='s', markersize=4)
test_ax.fill_between(test_forecast['ds'],
                     test_forecast['yhat_lower'],
                     test_forecast['yhat_upper'],
                     alpha=0.3, color='green', label='Prediction Interval')

test_ax.set_title('Test Period: Actual vs Predicted Traffic', fontsize=14, fontweight='bold')
test_ax.set(xlabel='Date', ylabel='Traffic Volume')
test_ax.legend()

# --- Panel 3: residuals (actual minus predicted) against the prediction ---
residuals = y_true - y_pred
residual_ax.scatter(y_pred, residuals, alpha=0.6)
residual_ax.axhline(y=0, color='red', linestyle='--', alpha=0.8)
residual_ax.set_title('Residuals Analysis (Actual - Predicted)', fontsize=14, fontweight='bold')
residual_ax.set(xlabel='Predicted Values', ylabel='Residuals')

# Same faint grid on every panel
for panel in axes:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric summary of the residuals
print(
    "\nResiduals Analysis:",
    f"Mean residual: {np.mean(residuals):.4f}",
    f"Std deviation of residuals: {np.std(residuals):.4f}",
    f"Min residual: {np.min(residuals):.2f}",
    f"Max residual: {np.max(residuals):.2f}",
    sep="\n",
)

## 9. Forecast Components Analysis

In [None]:
# Plot forecast components to understand the underlying patterns
print("\nAnalyzing forecast components...")

fig = model.plot_components(forecast, figsize=(12, 10))
plt.suptitle('Traffic Forecast Components Analysis', fontsize=16, fontweight='bold', y=0.98)
plt.show()

print("\n=== Component Analysis Insights ===")
print("1. TREND: Shows the long-term direction of traffic volume over time")
print("   - Upward trend indicates growing network usage")
print("   - Changepoints show where growth patterns shifted")

print("\n2. YEARLY SEASONALITY: Reveals annual patterns")
print("   - Peak periods may correspond to holiday seasons or business cycles")
print("   - Helpful for annual capacity planning")

print("\n3. WEEKLY SEASONALITY: Shows day-of-week patterns")
print("   - Typically lower on weekends (days 5-6)")
print("   - Peak usage often mid-week")
print("   - Critical for daily operational planning")

## 10. Future Traffic Predictions and Business Insights

In [None]:
# Extract future predictions: forecast rows dated after the last observation.
# Selecting by date (not by row position) stays correct even when the input
# has gaps or duplicate dates.
last_observed_date = pd.to_datetime(df['ds']).max()
future_predictions = forecast[forecast['ds'] > last_observed_date]
if future_predictions.empty:
    raise ValueError("The forecast does not extend beyond the last observed date.")

# Row position in `forecast` at which the future rows begin
future_start_idx = len(forecast) - len(future_predictions)

print("\n=== Future Traffic Predictions (Next 90 Days) ===")
print(f"Prediction period: {future_predictions['ds'].min()} to {future_predictions['ds'].max()}")

# Key statistics for future predictions
future_mean = future_predictions['yhat'].mean()
future_max = future_predictions['yhat'].max()
future_min = future_predictions['yhat'].min()
# Baseline for comparison: mean traffic over the training period
current_mean = df_train['y'].mean()

print(f"\nFuture Traffic Statistics:")
print(f"- Average predicted traffic: {future_mean:.2f}")
print(f"- Peak predicted traffic: {future_max:.2f}")
print(f"- Minimum predicted traffic: {future_min:.2f}")
print(f"- Historical average: {current_mean:.2f}")
print(f"- Projected growth: {((future_mean/current_mean - 1) * 100):.1f}%")

# Capacity planning insights
capacity_threshold = future_max * 1.2  # 20% buffer above peak prediction
print(f"\nCapacity Planning Recommendation:")
print(f"- Recommended capacity: {capacity_threshold:.2f} (20% buffer above peak)")
# This ratio compares the predicted peak with the historical mean; no
# installed-capacity figure is available here, so it is labelled as what it
# is rather than as a utilization.
print(f"- Predicted peak as % of historical average: {(future_max/current_mean*100):.1f}%")

# Visualize future predictions
plt.figure(figsize=(15, 8))

# Plot recent historical data for context
recent_history = df.tail(180)  # Last 180 rows (about 6 months of daily data)
plt.plot(recent_history['ds'], recent_history['y'], label='Recent Historical Data',
         color='blue', alpha=0.7, linewidth=2)

# Plot future predictions
plt.plot(future_predictions['ds'], future_predictions['yhat'],
         label='Future Predictions', color='red', linewidth=3)

plt.fill_between(future_predictions['ds'],
                 future_predictions['yhat_lower'],
                 future_predictions['yhat_upper'],
                 alpha=0.3, color='red', label='Prediction Uncertainty')

# Add capacity planning line
plt.axhline(y=capacity_threshold, color='orange', linestyle='--',
            linewidth=2, label=f'Recommended Capacity ({capacity_threshold:.0f})')

# Mark the last observed date, i.e. where real data ends
plt.axvline(last_observed_date, color='green', linestyle='--',
            alpha=0.8, label='Current Date')

plt.title('Network Traffic Forecast for Capacity Planning', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Traffic Volume')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 11. Key Insights and Business Recommendations

In [None]:
banner = "=" * 60

print("\n" + banner)
print("           NETWORK TRAFFIC FORECASTING INSIGHTS")
print(banner)

# --- Accuracy recap, using the metrics computed during evaluation ---
print(
    "\n📊 MODEL PERFORMANCE SUMMARY:",
    f"   • Forecast Accuracy (MAPE): {mape:.1f}% - {performance}",
    f"   • R-squared Score: {r2_score:.3f}",
    f"   • Model explains {r2_score*100:.1f}% of traffic variation",
    sep="\n",
)

print("\n📈 TRAFFIC PATTERN INSIGHTS:")
# Average predicted traffic per weekday (0 = Monday ... 6 = Sunday)
weekly_pattern = forecast[['ds']].assign(day_of_week=forecast['ds'].dt.dayofweek)
weekly_avg = forecast.groupby(weekly_pattern['day_of_week'])['yhat'].mean()
peak_day = weekly_avg.idxmax()
low_day = weekly_avg.idxmin()

day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
print(
    f"   • Peak traffic day: {day_names[peak_day]}",
    f"   • Lowest traffic day: {day_names[low_day]}",
    f"   • Weekly variation: {((weekly_avg.max() - weekly_avg.min()) / weekly_avg.mean() * 100):.1f}%",
    sep="\n",
)

# --- Headline numbers for the forecast horizon ---
print(
    "\n🔮 FUTURE PROJECTIONS (Next 90 Days):",
    f"   • Expected average traffic: {future_mean:.0f} units",
    f"   • Peak traffic prediction: {future_max:.0f} units",
    f"   • Growth vs historical avg: {((future_mean/current_mean - 1) * 100):+.1f}%",
    sep="\n",
)

print("\n💼 BUSINESS RECOMMENDATIONS:")
print(f"   ✓ Set capacity threshold at {capacity_threshold:.0f} units (20% buffer)")

# Grade growth by how far the predicted peak sits above the historical mean
peak_to_average = future_max / current_mean
if peak_to_average > 1.5:
    print("   ⚠️  HIGH GROWTH ALERT: Consider infrastructure expansion")
elif peak_to_average > 1.2:
    print("   📊 MODERATE GROWTH: Monitor capacity closely")
else:
    print("   ✅ STABLE GROWTH: Current capacity should be sufficient")

print(
    f"   ✓ Plan maintenance during low-traffic periods ({day_names[low_day]}s)",
    f"   ✓ Prepare for peak loads on {day_names[peak_day]}s",
    sep="\n",
)

# Reliability verdict: below 15% MAPE counts as usable for planning
if mape < 15:
    print("   ✓ Model is reliable for operational planning")
else:
    print("   ⚠️  Consider model refinements for better accuracy")

next_steps = (
    "\n🎯 NEXT STEPS:",
    "   1. Integrate forecasts into capacity planning workflows",
    "   2. Set up automated alerts for predicted peak periods",
    "   3. Update model monthly with new data",
    "   4. Consider external factors (holidays, events) for enhanced accuracy",
    "   5. Validate predictions against actual outcomes",
)
for step in next_steps:
    print(step)

print("\n" + banner)
print("     FORECAST GENERATED WITH FACEBOOK PROPHET")
print(f"     Model training completed with {len(df_train)} data points")
print(banner)

## 12. Export Results and Model Artifacts

In [None]:
import json

# Write the point forecast and its bounds to CSV under reader-friendly headers
export_headers = {
    'ds': 'Date',
    'yhat': 'Predicted_Traffic',
    'yhat_lower': 'Lower_Bound',
    'yhat_upper': 'Upper_Bound',
}
forecast_export = forecast[list(export_headers)].rename(columns=export_headers)
forecast_export.to_csv('network_traffic_forecast.csv', index=False)
print("✅ Forecast results exported to 'network_traffic_forecast.csv'")

# Assemble the summary report section by section; numeric values are cast to
# plain floats so they serialise as JSON numbers.
performance_section = {
    'mae': float(mae),
    'rmse': float(rmse),
    'mape': float(mape),
    'r2_score': float(r2_score),
}
data_section = {
    'training_period': f"{df_train['ds'].min()} to {df_train['ds'].max()}",
    'training_points': len(df_train),
    'test_points': len(df_test),
    'forecast_periods': len(future_predictions),
}
insights_section = {
    'peak_traffic_day': day_names[peak_day],
    'lowest_traffic_day': day_names[low_day],
    'projected_growth_percent': float((future_mean/current_mean - 1) * 100),
    'recommended_capacity': float(capacity_threshold),
}
summary_report = {
    'model_performance': performance_section,
    'data_info': data_section,
    'business_insights': insights_section,
}

# default=str stringifies anything json cannot encode natively
with open('forecast_summary.json', 'w') as summary_file:
    json.dump(summary_report, summary_file, indent=2, default=str)

print("✅ Summary report exported to 'forecast_summary.json'")
for line in (
    "\n📁 Generated Files:",
    "   • network_traffic_forecast.csv - Complete forecast data",
    "   • forecast_summary.json - Performance metrics and insights",
    "\n🎉 Network Traffic Forecasting Analysis Complete!",
    "   Use these forecasts for capacity planning and resource allocation.",
):
    print(line)