<a href="https://colab.research.google.com/github/bahing-rai/BigData/blob/main/busprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BUS DELAY PREDICTION SYSTEM - COMPLETE WITH INLINE VISUALIZATIONS**




Step 0: Setting up all imports and initializing the Spark session

This file contains all the necessary imports and configurations

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, expr, length, rand, randn, lit,
    avg, count, stddev, min as spark_min, max as spark_max,
    row_number, lag, dense_rank, round as spark_round, regexp_extract,
    trim
)
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml.classification import LogisticRegression, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from sklearn.metrics import confusion_matrix, classification_report
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Enable inline plotting
%matplotlib inline

print("✓ All imports loaded successfully")

# Initialize PySpark session
spark = SparkSession.builder \
    .appName("BusDelayPrediction") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

print("✓ Spark session initialized and ready to go")


Step 1: Loading the bus delay dataset

This step reads the CSV file and prepares it for processing

In [None]:
# Path of the combined bus-stop CSV (the /content directory suggests a Colab runtime)
CSV_FILE_PATH = "/content/bus_data_combined_s.csv"

# Announce the load *before* it starts so the progress output reflects what is
# actually happening (the original printed this after the read had finished).
print("\nLoading the bus delay dataset...")

# Read the file with a header row and let Spark infer the column types
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(CSV_FILE_PATH)
)

print("✓ Data loaded successfully")
print(f"  Total records: {df_raw.count():,}")
print(f"  Number of columns: {len(df_raw.columns)}")

print("\nDataset structure:")
df_raw.printSchema()

print("\nFirst 5 rows of data:")
df_raw.show(5, truncate=False)


Step 2: Cleaning and preparing the data

Removing duplicates, trimming whitespace, and handling missing values

In [None]:
print("\nCleaning the dataset...")

initial_count = df_raw.count()

# Trim whitespace on string columns only. Calling trim() on a numeric/boolean
# column would implicitly cast it to string and throw away the inferred schema.
# Column names are back-quoted so names containing dots still resolve.
string_columns = {name for name, dtype in df_raw.dtypes if dtype == 'string'}
df_trimmed = df_raw.select([
    trim(col(f"`{name}`")).alias(name) if name in string_columns else col(f"`{name}`")
    for name in df_raw.columns
])

print("✓ Whitespace trimmed from all columns")

# Remove duplicate records AFTER trimming, so rows that differ only by
# leading/trailing whitespace are recognised as duplicates too.
df_clean = df_trimmed.dropDuplicates()
clean_count = df_clean.count()
duplicates_removed = initial_count - clean_count

if duplicates_removed > 0:
    print(f"✓ Removed {duplicates_removed} duplicate records")
else:
    print("✓ No duplicates found in the dataset")

# Handle missing values with reasonable defaults
df_clean = df_clean.fillna({
    'AnnotatedStopPointRef_Indicator': 'Unknown',
    'AnnotatedStopPointRef_LocalityQualifier': 'Unknown',
    'AnnotatedStopPointRef_CommonName': 'Unknown',
    'AnnotatedStopPointRef_LocalityName': 'Unknown'
})

print("✓ Missing values handled")
# fillna() never changes the row count, so reuse the count taken above
# instead of triggering another Spark job.
print(f"\nFinal clean dataset: {clean_count:,} records ready for processing")


Cleaning the dataset...
✓ No duplicates found in the dataset
✓ Whitespace trimmed from all columns
✓ Missing values handled

Final clean dataset: 569 records ready for processing


Step 3: Creating features for the machine learning models

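Engineering 11 model features. Route- and stop-level features are derived from the raw data; the temporal, traffic and weather features, as well as the delay target itself, are randomly simulated in this cell rather than read from the dataset.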

In [None]:
print("\nCreating features for the models...")

# Base seed for every simulated column below. Each column gets its own offset:
# reusing one seed would make the "independent" random columns perfectly
# correlated. Without seeds the features, labels and all downstream metrics
# changed on every run.
# NOTE(review): Spark documents rand()/randn() as non-deterministic in general;
# a seed reproduces values only while row order/partitioning stays the same.
SIMULATION_SEED = 42

# Extract route information: the digits following "tfl_" in the source file name
df_features = df_clean.withColumn(
    'route_id',
    regexp_extract(col('source_file'), r'tfl_(\d+)', 1)
)

# Create stop sequence: rows are numbered within each source file, ordered by
# the stop reference.
# NOTE(review): this is presumably not the real travel order of the route — confirm.
window_spec = Window.partitionBy('source_file').orderBy('AnnotatedStopPointRef_StopPointRef')
df_features = df_features.withColumn('stop_sequence', row_number().over(window_spec))

# Count total stops per route
route_stats = df_features.groupBy('source_file').agg(
    count('*').alias('total_stops_in_route')
)
df_features = df_features.join(route_stats, on='source_file')

print("✓ Route-based features created")

# Stop-level features
df_features = df_features.withColumn('stop_name_length', length(col('AnnotatedStopPointRef_CommonName')))
df_features = df_features.withColumn('has_indicator', when(col('AnnotatedStopPointRef_Indicator') != 'Unknown', 1).otherwise(0))
df_features = df_features.withColumn('is_london', when(col('AnnotatedStopPointRef_LocalityQualifier') == 'Greater London', 1).otherwise(0))

print("✓ Stop-level features added")

# Route characteristics: complexity buckets by stop count (>30 -> 2, >15 -> 1, else 0)
# and position buckets (first 3 stops -> 0, last 3 stops -> 2, otherwise 1)
df_features = df_features.withColumn('route_complexity', when(col('total_stops_in_route') > 30, 2).when(col('total_stops_in_route') > 15, 1).otherwise(0))
df_features = df_features.withColumn('stop_position', when(col('stop_sequence') <= 3, 0).when(col('stop_sequence') >= col('total_stops_in_route') - 2, 2).otherwise(1))

print("✓ Route characteristics created")

# Temporal features (simulated): uniform hour 0-23 and day 0-6
df_features = df_features.withColumn('hour_of_day', (rand(seed=SIMULATION_SEED + 1) * 24).cast('int'))
df_features = df_features.withColumn('day_of_week', (rand(seed=SIMULATION_SEED + 2) * 7).cast('int'))
df_features = df_features.withColumn('is_peak_hour', when((col('hour_of_day').between(7, 9)) | (col('hour_of_day').between(17, 19)), 1).otherwise(0))
df_features = df_features.withColumn('is_weekend', when(col('day_of_week').isin([5, 6]), 1).otherwise(0))

print("✓ Temporal features added")

# Environmental conditions (simulated): three levels each, coded 0-2
df_features = df_features.withColumn('traffic_level', (rand(seed=SIMULATION_SEED + 3) * 3).cast('int'))
df_features = df_features.withColumn('weather_condition', (rand(seed=SIMULATION_SEED + 4) * 3).cast('int'))

print("✓ Environmental features added")

# Create the target variable (whether bus is delayed over 5 minutes).
# The delay is a 2-minute baseline plus fixed penalties per condition plus
# Gaussian noise with a standard deviation of 3 minutes.
df_features = df_features.withColumn(
    'delay_minutes',
    lit(2.0) +
    when(col('is_peak_hour') == 1, 8.0).otherwise(0.0) +
    when(col('is_london') == 1, 3.0).otherwise(0.0) +
    when(col('traffic_level') == 2, 6.0).when(col('traffic_level') == 1, 3.0).otherwise(0.0) +
    when(col('weather_condition') == 2, 5.0).when(col('weather_condition') == 1, 2.0).otherwise(0.0) +
    when(col('route_complexity') == 2, 4.0).when(col('route_complexity') == 1, 2.0).otherwise(0.0) +
    when(col('stop_position') == 1, 2.0).otherwise(0.0) +
    (randn(seed=SIMULATION_SEED + 5) * 3.0)
)

# Clamp negative simulated delays to zero, then derive the binary label
df_features = df_features.withColumn('delay_minutes', when(col('delay_minutes') < 0, 0.0).otherwise(col('delay_minutes')))
df_features = df_features.withColumn('is_delayed', when(col('delay_minutes') > 5, 1).otherwise(0))

print("✓ Delay target variable created")

# Cache the engineered frame so later actions (EDA, splitting, training) reuse
# one materialisation of the simulated columns instead of recomputing them.
df_features = df_features.cache()

print(f"\nFeature engineering complete with {len(df_features.columns)} total fields")
print("✓ Dataset is now ready for analysis")



Creating features for the models...
✓ Route-based features created
✓ Stop-level features added
✓ Route characteristics created
✓ Temporal features added
✓ Environmental features added
✓ Delay target variable created

Feature engineering complete with 22 total fields
✓ Dataset is now ready for analysis


Step 4: Exploratory Data Analysis

Creating visualizations to understand the data patterns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print("\nAnalyzing the data with visualizations...")


def mean_delay_by_level(frame, column, n_levels):
    """Return the mean of 'delay_minutes' for each category code 0..n_levels-1.

    The result is re-indexed onto the full range of codes so it always has
    exactly ``n_levels`` entries, in code order. A code with no rows yields NaN
    instead of being dropped, which keeps the values aligned with the fixed
    bar labels used below.
    """
    return frame.groupby(column)['delay_minutes'].mean().reindex(range(n_levels))


# Convert to Pandas for visualization
df_eda = df_features.select(
    'delay_minutes', 'is_delayed', 'is_peak_hour', 'is_london',
    'traffic_level', 'weather_condition', 'route_complexity', 'stop_position'
).toPandas()

print("\nDelay statistics from the dataset:")
print(f"  Average delay: {df_eda['delay_minutes'].mean():.2f} minutes")
print(f"  Median delay: {df_eda['delay_minutes'].median():.2f} minutes")
print(f"  Maximum delay: {df_eda['delay_minutes'].max():.2f} minutes")

delayed_count = (df_eda['is_delayed'] == 1).sum()
on_time_count = (df_eda['is_delayed'] == 0).sum()

# Guard the percentage against an empty frame (avoids ZeroDivisionError)
total_rows = len(df_eda)
delayed_pct = delayed_count / total_rows * 100 if total_rows else 0.0
on_time_pct = on_time_count / total_rows * 100 if total_rows else 0.0

print(f"\nHow many buses are delayed vs on-time:")
print(f"  Delayed buses (>5 minutes): {delayed_count} ({delayed_pct:.1f}%)")
print(f"  On-time buses: {on_time_count} ({on_time_pct:.1f}%)")

print("\nGenerating visualization charts...")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Chart 1: How delays are distributed
axes[0, 0].hist(df_eda['delay_minutes'], bins=30, color='steelblue', edgecolor='black')
axes[0, 0].set_title('Distribution of Bus Delays', fontweight='bold', fontsize=14)
axes[0, 0].set_xlabel('Delay (minutes)')
axes[0, 0].set_ylabel('Number of buses')
axes[0, 0].grid(axis='y', alpha=0.3)

# Charts 2-4: average delay per category. Each series is aligned to its label
# list via mean_delay_by_level, so a category missing from the data shows as an
# empty bar rather than shifting or duplicating another category's value.
peak_delay = mean_delay_by_level(df_eda, 'is_peak_hour', 2)
traffic_delay = mean_delay_by_level(df_eda, 'traffic_level', 3)
location_delay = mean_delay_by_level(df_eda, 'is_london', 2)

bar_panels = [
    (axes[0, 1], peak_delay, ['Off-Peak Hours', 'Peak Hours'],
     ['green', 'red'], 'How Peak Hours Affect Delays'),
    (axes[1, 0], traffic_delay, ['Low Traffic', 'Medium Traffic', 'Heavy Traffic'],
     ['green', 'orange', 'red'], 'How Traffic Affects Delays'),
    (axes[1, 1], location_delay, ['Other Areas', 'Greater London'],
     ['skyblue', 'navy'], 'How Location Affects Delays'),
]

for panel_ax, mean_delay, labels, colors, title in bar_panels:
    panel_ax.bar(labels, mean_delay.values, color=colors, edgecolor='black')
    panel_ax.set_title(title, fontweight='bold', fontsize=14)
    panel_ax.set_ylabel('Average delay (minutes)')
    panel_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()
plt.close()

print("✓ Exploratory analysis complete")


Step 5: Preparing data for model training

Splitting into training and test sets, then standardizing features

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler

print("\nPreparing data for model training...")

numeric_features = [
    'stop_name_length', 'has_indicator', 'is_london', 'route_complexity',
    'stop_position', 'hour_of_day', 'is_peak_hour', 'is_weekend',
    'traffic_level', 'weather_condition', 'total_stops_in_route'
]

# Keep only the features that actually exist in the engineered frame
numeric_features = [f for f in numeric_features if f in df_features.columns]
print(f"✓ Using {len(numeric_features)} features for the models")

# Handle any missing numeric values (one fillna call instead of one per column)
df_features = df_features.fillna({col_name: 0 for col_name in numeric_features})

print("✓ Missing values handled in numeric features")

# Cache before splitting: the frame contains randomly generated columns, and
# randomSplit evaluates its input once per split, so an uncached
# non-deterministic input can yield overlapping or inconsistent splits.
df_features = df_features.cache()

# Split the data into 80% training and 20% testing BEFORE fitting the scaler,
# so the scaling statistics never see test rows (no train/test leakage).
train_raw, test_raw = df_features.randomSplit([0.8, 0.2], seed=42)

# Create preprocessing pipeline: combine features into vectors and scale them
assembler = VectorAssembler(inputCols=numeric_features, outputCol='features')
scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withMean=True, withStd=True)
preprocessing_pipeline = Pipeline(stages=[assembler, scaler])

# Fit on the training split only, then apply the same fitted transform everywhere
fitted_pipeline = preprocessing_pipeline.fit(train_raw)
train_data = fitted_pipeline.transform(train_raw)
test_data = fitted_pipeline.transform(test_raw)

# Full preprocessed frame, kept for any cell that still refers to it
df_ml_ready = fitted_pipeline.transform(df_features)

print("✓ Feature preprocessing pipeline applied")

train_count = train_data.count()
test_count = test_data.count()
total = train_count + test_count

print(f"\nData split for training and testing:")
print(f"  Training set: {train_count:,} records ({train_count/total*100:.1f}%)")
print(f"  Test set: {test_count:,} records ({test_count/total*100:.1f}%)")

print("✓ Data is ready for model training")


Step 6: Training Logistic Regression (Baseline Model)

Using logistic regression as a baseline to compare against more advanced models

In [None]:
from pyspark.ml.classification import LogisticRegression

print("\nTraining the Logistic Regression model (baseline)...")

# Baseline classifier: predicts the binary 'is_delayed' label from the
# standardized feature vector.
lr_params = dict(
    labelCol='is_delayed',
    featuresCol='scaled_features',
    maxIter=100,
    regParam=0.01,
)
lr_classifier = LogisticRegression(**lr_params)

lr_model = lr_classifier.fit(train_data)
print("✓ Logistic Regression model trained successfully")

# Score both splits so the evaluation cell can compare train vs. test metrics
lr_train_pred, lr_test_pred = (
    lr_model.transform(split) for split in (train_data, test_data)
)

print("✓ Predictions generated on training and test data")
print("\nLogistic Regression is ready for evaluation")

Step 7: Training Gradient Boosted Trees (Primary Model)

A more advanced ensemble model that typically performs better than logistic regression

In [None]:
from pyspark.ml.classification import GBTClassifier

print("\nTraining the Gradient Boosted Trees model (main model)...")

# Primary classifier: boosted trees with a fixed seed, trained on the same
# label and feature columns as the baseline.
gbt_params = dict(
    labelCol='is_delayed',
    featuresCol='scaled_features',
    maxIter=50,
    maxDepth=5,
    seed=42,
)
gbt_classifier = GBTClassifier(**gbt_params)

gbt_model = gbt_classifier.fit(train_data)
print("✓ Gradient Boosted Trees model trained successfully")

# Score both splits so the evaluation cell can compare train vs. test metrics
gbt_train_pred, gbt_test_pred = (
    gbt_model.transform(split) for split in (train_data, test_data)
)

print("✓ Predictions generated on training and test data")
print("\nGradient Boosted Trees model is ready for evaluation")


Step 8: Evaluating and comparing both models

Measuring AUC-ROC, accuracy, and F1-score for both models (per-class precision and recall are reported in Step 9)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import pandas as pd

print("\nEvaluating model performance...")

# Evaluators: AUC from the raw prediction column, accuracy and F1 from the
# hard prediction column; all compare against the 'is_delayed' label.
auc_evaluator = BinaryClassificationEvaluator(labelCol='is_delayed', rawPredictionCol='rawPrediction')
_label_and_prediction = dict(labelCol='is_delayed', predictionCol='prediction')
acc_evaluator = MulticlassClassificationEvaluator(metricName='accuracy', **_label_and_prediction)
f1_evaluator = MulticlassClassificationEvaluator(metricName='f1', **_label_and_prediction)


def _evaluate_and_report(model_label, train_pred, test_pred):
    """Compute and print the four headline metrics for one model.

    Returns a tuple ``(train_auc, test_auc, test_accuracy, test_f1)``.
    """
    train_auc = auc_evaluator.evaluate(train_pred)
    test_auc = auc_evaluator.evaluate(test_pred)
    test_acc = acc_evaluator.evaluate(test_pred)
    test_f1 = f1_evaluator.evaluate(test_pred)

    print(f"\n{model_label} Performance:")
    print(f"  Training AUC-ROC: {train_auc:.4f}")
    print(f"  Test AUC-ROC: {test_auc:.4f}")
    print(f"  Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
    print(f"  Test F1-Score: {test_f1:.4f}")
    return train_auc, test_auc, test_acc, test_f1


def _as_4dp(values):
    """Format each metric with four decimal places for the comparison table."""
    return [f"{value:.4f}" for value in values]


# Baseline first, then the primary model
lr_train_auc, lr_test_auc, lr_test_acc, lr_test_f1 = _evaluate_and_report(
    "Logistic Regression", lr_train_pred, lr_test_pred
)
gbt_train_auc, gbt_test_auc, gbt_test_acc, gbt_test_f1 = _evaluate_and_report(
    "Gradient Boosted Trees", gbt_train_pred, gbt_test_pred
)

# Side-by-side comparison table
banner = "=" * 80
print("\n" + banner)
print("COMPARING THE TWO MODELS")
print(banner)

comparison_df = pd.DataFrame({
    'Metric': ['Train AUC', 'Test AUC', 'Accuracy', 'F1-Score'],
    'Logistic Regression': _as_4dp([lr_train_auc, lr_test_auc, lr_test_acc, lr_test_f1]),
    'Gradient Boosting': _as_4dp([gbt_train_auc, gbt_test_auc, gbt_test_acc, gbt_test_f1]),
})

print("\n" + comparison_df.to_string(index=False))

# The model with the strictly higher test F1 wins; ties go to the baseline
gbt_wins = gbt_test_f1 > lr_test_f1
winner_name = "Gradient Boosted Trees" if gbt_wins else "Logistic Regression"
winning_f1, other_f1 = (gbt_test_f1, lr_test_f1) if gbt_wins else (lr_test_f1, gbt_test_f1)
print(f"\n✓ WINNER: {winner_name} (F1-Score: {winning_f1:.4f} vs {other_f1:.4f})")


Step 9: Creating and visualizing confusion matrices

Understanding true positives, false positives, and other prediction details

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

print("\nAnalyzing detailed prediction patterns...")

class_codes = [0, 1]
class_names = ['On-Time', 'Delayed']

# Bring the test-set labels and predictions to pandas for scikit-learn
lr_pred_pd = lr_test_pred.select('is_delayed', 'prediction').toPandas()
gbt_pred_pd = gbt_test_pred.select('is_delayed', 'prediction').toPandas()

# Rows are actual classes, columns are predicted classes
lr_cm = confusion_matrix(lr_pred_pd['is_delayed'], lr_pred_pd['prediction'], labels=class_codes)
gbt_cm = confusion_matrix(gbt_pred_pd['is_delayed'], gbt_pred_pd['prediction'], labels=class_codes)

# Text rendering of both matrices
for model_label, matrix in (("Logistic Regression", lr_cm), ("Gradient Boosted Trees", gbt_cm)):
    print(f"\n{model_label} - Confusion Matrix:")
    print(f"                  Predicted On-Time  Predicted Delayed")
    print(f"  Actual On-Time        {matrix[0,0]:4d}              {matrix[0,1]:4d}")
    print(f"  Actual Delayed        {matrix[1,0]:4d}              {matrix[1,1]:4d}")

# Per-class precision / recall / F1 for both models
for model_label, pred_pd in (("Logistic Regression", lr_pred_pd), ("Gradient Boosted Trees", gbt_pred_pd)):
    print(f"\n{model_label} - Detailed Classification Report:")
    print(classification_report(pred_pd['is_delayed'], pred_pd['prediction'],
                                labels=class_codes, target_names=class_names, zero_division=0))

# Heat-map rendering of both matrices
print("\nGenerating confusion matrix visualizations...")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

heatmap_panels = [
    (axes[0], lr_cm, 'Blues', 'Logistic Regression - Confusion Matrix'),
    (axes[1], gbt_cm, 'Greens', 'Gradient Boosted Trees - Confusion Matrix'),
]

heatmap_images = []
for panel_ax, matrix, colormap, panel_title in heatmap_panels:
    heatmap_images.append(panel_ax.imshow(matrix, cmap=colormap, aspect='auto'))
    panel_ax.set_title(panel_title, fontweight='bold', fontsize=14)
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')
    panel_ax.set_xticks(class_codes)
    panel_ax.set_yticks(class_codes)
    panel_ax.set_xticklabels(class_names)
    panel_ax.set_yticklabels(class_names)

    # Annotate each cell; white text on cells darker than half the maximum
    for (row_idx, col_idx), cell_value in np.ndenumerate(matrix):
        text_color = 'white' if cell_value > matrix.max() / 2 else 'black'
        panel_ax.text(col_idx, row_idx, str(cell_value), ha='center', va='center',
                      color=text_color, fontweight='bold', fontsize=14)

im1, im2 = heatmap_images

plt.tight_layout()
plt.show()
plt.close()

print("✓ Confusion matrix analysis complete")


Step 10: Analyzing feature importance

Understanding which features have the most influence on predictions

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

print("\nAnalyzing which features matter most in the model...")

# Extract feature importance from the Gradient Boosted Trees model
gbt_importances = gbt_model.featureImportances.toArray()

# Read the feature names from the assembler that built the feature vector
# instead of re-declaring the list by hand: a second copy can drift out of sync
# with the model's vector order and silently mislabel the importances.
numeric_features = list(assembler.getInputCols())

if len(numeric_features) != len(gbt_importances):
    raise ValueError(
        f"Feature name count ({len(numeric_features)}) does not match the "
        f"model's importance vector length ({len(gbt_importances)})"
    )

importance_df = pd.DataFrame({
    'Feature': numeric_features,
    'Importance': gbt_importances
}).sort_values('Importance', ascending=False)

print("\nFeature Importance Ranking:")
print(importance_df.to_string(index=False))

print("\nGenerating feature importance chart...")

fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.viridis(np.linspace(0, 1, len(importance_df)))
ax.barh(importance_df['Feature'], importance_df['Importance'], color=colors, edgecolor='black')
# barh draws the first row at the bottom; flip the axis so the most important
# feature (first row after sorting) appears at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel('Importance Score', fontweight='bold', fontsize=12)
ax.set_title('Feature Importance - Gradient Boosted Trees Model', fontweight='bold', fontsize=14)
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
plt.close()

print("✓ Feature importance analysis complete")


Step 11: Visualizing model comparison

Creating a chart to compare performance metrics side-by-side

In [None]:
import matplotlib.pyplot as plt
import numpy as np

print("\nCreating a visual comparison of both models...")

metrics = ['Train AUC', 'Test AUC', 'Test Accuracy', 'Test F1-Score']
lr_values = [lr_train_auc, lr_test_auc, lr_test_acc, lr_test_f1]
gbt_values = [gbt_train_auc, gbt_test_auc, gbt_test_acc, gbt_test_f1]

fig, ax = plt.subplots(figsize=(10, 6))

# One group of two bars per metric; each model is shifted half a bar width
# left (-1) or right (+1) of the group centre.
x = np.arange(len(metrics))
width = 0.35

bar_series = [
    (-1, lr_values, 'Logistic Regression', '#3498db'),
    (+1, gbt_values, 'Gradient Boosted Trees', '#2ecc71'),
]
bars1, bars2 = (
    ax.bar(x + side * width / 2, values, width, label=series_label,
           color=fill, edgecolor='black')
    for side, values, series_label, fill in bar_series
)

ax.set_xlabel('Performance Metrics', fontweight='bold', fontsize=12)
ax.set_ylabel('Score', fontweight='bold', fontsize=12)
ax.set_title('Model Comparison - Performance Across All Metrics', fontweight='bold', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend(fontsize=11)
ax.grid(axis='y', alpha=0.3)
ax.set_ylim([0, 1.1])

# Write the score (three decimals) just above every bar
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.3f}',
            ha='center', va='bottom', fontweight='bold', fontsize=9)

plt.tight_layout()
plt.show()
plt.close()

print("✓ Model comparison visualization complete")


Step 12: Testing predictions on new data

Making predictions on example scenarios to show how the models work in practice

In [None]:
import pandas as pd

print("\nTesting the models with example scenarios...")

# Hand-written example journeys. Every key except 'description' is a model
# input feature; 'description' is only used for display.
scenarios = [
    {
        'stop_name_length': 15, 'has_indicator': 1, 'is_london': 1,
        'route_complexity': 2, 'stop_position': 1, 'hour_of_day': 8,
        'is_peak_hour': 1, 'is_weekend': 0, 'traffic_level': 2,
        'weather_condition': 1, 'total_stops_in_route': 35,
        'description': 'Peak hour, Greater London, High traffic, Rainy'
    },
    {
        'stop_name_length': 12, 'has_indicator': 1, 'is_london': 0,
        'route_complexity': 1, 'stop_position': 0, 'hour_of_day': 14,
        'is_peak_hour': 0, 'is_weekend': 0, 'traffic_level': 1,
        'weather_condition': 0, 'total_stops_in_route': 20,
        'description': 'Afternoon, Other area, Medium traffic, Clear skies'
    },
    {
        'stop_name_length': 18, 'has_indicator': 1, 'is_london': 1,
        'route_complexity': 2, 'stop_position': 2, 'hour_of_day': 18,
        'is_peak_hour': 1, 'is_weekend': 0, 'traffic_level': 2,
        'weather_condition': 2, 'total_stops_in_route': 40,
        'description': 'Evening peak hour, Greater London, Heavy traffic, Storm warning'
    }
]

print("\nTesting predictions on example journeys:\n")

# Build the model inputs without mutating the scenario dicts (the original
# popped 'description' out of each dict, leaving `scenarios` altered).
new_sample_data = pd.DataFrame([
    {key: value for key, value in scenario.items() if key != 'description'}
    for scenario in scenarios
])
# Explicit id so predictions can be matched back to scenarios regardless of
# the row order Spark returns.
new_sample_data.insert(0, 'scenario_id', range(1, len(scenarios) + 1))

# Score all scenarios in one batch: one Spark DataFrame and one collect per
# model, instead of a separate DataFrame and two collects per scenario.
new_sample = spark.createDataFrame(new_sample_data)
new_sample_processed = fitted_pipeline.transform(new_sample)

lr_new_pred = lr_model.transform(new_sample_processed)
gbt_new_pred = gbt_model.transform(new_sample_processed)


def _predictions_by_scenario(pred_df):
    """Collect prediction rows into a dict keyed by 'scenario_id'."""
    rows = pred_df.select('scenario_id', 'prediction', 'probability').collect()
    return {row['scenario_id']: row for row in rows}


lr_by_scenario = _predictions_by_scenario(lr_new_pred)
gbt_by_scenario = _predictions_by_scenario(gbt_new_pred)

for i, scenario in enumerate(scenarios, 1):
    description = scenario['description']
    lr_prediction = lr_by_scenario[i]
    gbt_prediction = gbt_by_scenario[i]

    lr_status = "DELAYED" if lr_prediction['prediction'] == 1 else "ON-TIME"
    gbt_status = "DELAYED" if gbt_prediction['prediction'] == 1 else "ON-TIME"

    # "Confidence" is the probability of whichever class the model chose
    print(f"Scenario {i}: {description}")
    print(f"  Logistic Regression: {lr_status} (Confidence: {max(lr_prediction['probability']):.1%})")
    print(f"  Gradient Boosting:   {gbt_status} (Confidence: {max(gbt_prediction['probability']):.1%})\n")

print("✓ Prediction examples complete")


Step 13: Saving results to files

Exporting model performance and feature importance data to CSV files

In [None]:
import pandas as pd

print("\nSaving results to CSV files...")

# One row per model with its headline metrics
results_df = pd.DataFrame(
    [
        ('Logistic Regression', lr_train_auc, lr_test_auc, lr_test_acc, lr_test_f1),
        ('Gradient Boosted Trees', gbt_train_auc, gbt_test_auc, gbt_test_acc, gbt_test_f1),
    ],
    columns=['Model', 'Train_AUC', 'Test_AUC', 'Test_Accuracy', 'Test_F1'],
)

# Write the metrics table and the feature-importance table (built in Step 10)
# to the current working directory, without the pandas index column.
csv_exports = (
    (results_df, 'model_comparison_results.csv'),
    (importance_df, 'feature_importance_results.csv'),
)
for frame, filename in csv_exports:
    frame.to_csv(filename, index=False)
    print(f"✓ Saved {filename}")

print("\nResults saved successfully!")


Step 14: Interactive Dashboard GUI

An interactive Jupyter interface for making predictions with the trained models


In [None]:
print("\nBuilding the interactive prediction dashboard...")

try:
    from ipywidgets import interact, interactive, IntSlider, Dropdown, Button, VBox, HBox, Output, HTML, Layout
    from IPython.display import display, clear_output
    import time

    # Create output widget for predictions
    prediction_output = Output()

    # Define consistent styling
    style = {'description_width': '120px'}
    layout_full = Layout(width='700px', padding='10px')
    layout_half = Layout(width='335px', padding='10px')

    # Dashboard title
    dashboard_title = HTML(
        "<div style='width: 700px; text-align: left; margin-bottom: 15px; padding: 15px 25px; background: #fafafa; border-radius: 6px; border: 1px solid #e0e0e0;'>"
        "<h1 style='color: #1a1a1a; margin: 0 0 5px 0; font-size: 24px; font-weight: 600;'>"
        "Bus Delay Prediction</h1>"
        "<p style='color: #666; margin: 0; font-size: 12px; font-weight: 400;'>"
        "ML-powered prediction system</p>"
        "</div>"
    )

    # Stop information section
    section1_title = HTML(
        "<h4 style='color: #1a1a1a; margin: 0 0 15px 0; font-size: 14px; font-weight: 600; "
        "text-transform: uppercase; letter-spacing: 0.5px;'>Stop Information</h4>"
    )

    stop_options = [
        ('Waterloo Bridge', 5),
        ('King\'s Cross', 8),
        ('Oxford Circus', 10),
        ('Piccadilly Circus', 15),
        ('Covent Garden', 12),
        ('Bank Station', 7),
        ('Liverpool Street', 14),
        ('Tower Bridge', 12),
        ('London Bridge Station', 20),
        ('Victoria Station', 14),
        ('Charing Cross', 13),
        ('Holborn', 7),
        ('Leicester Square', 14),
        ('Green Park', 10),
        ('South Kensington', 16),
        ('Knightsbridge', 12),
        ('Hyde Park Corner', 15),
        ('Sloane Square', 12),
        ('Chelsea Embankment', 18),
        ('Westminster Station', 18)
    ]

    stop_name_length_widget = Dropdown(
        options=stop_options,
        value=15,
        description='Stop:', style=style, layout=layout_full
    )
    has_indicator_widget = Dropdown(
        options=[('Yes', 1), ('No', 0)], value=1,
        description='Indicator:', style=style, layout=layout_half
    )
    is_london_widget = Dropdown(
        options=[('Greater London', 1), ('Other Areas', 0)], value=1,
        description='Location:', style=style, layout=layout_half
    )

    row1_section1 = HBox([has_indicator_widget, is_london_widget])
    section1_widgets = VBox([section1_title, stop_name_length_widget, row1_section1])

    # Route characteristics section
    section2_title = HTML(
        "<h4 style='color: #1a1a1a; margin: 20px 0 15px 0; font-size: 14px; font-weight: 600; "
        "text-transform: uppercase; letter-spacing: 0.5px;'>Route Characteristics</h4>"
    )

    route_complexity_widget = Dropdown(
        options=[('Low (5-15)', 0), ('Medium (15-30)', 1), ('High (30+)', 2)],
        value=1, description='Complexity:', style=style, layout=layout_half
    )
    stop_position_widget = Dropdown(
        options=[('Start', 0), ('Middle', 1), ('End', 2)],
        value=1, description='Position:', style=style, layout=layout_half
    )
    row2_section2 = HBox([route_complexity_widget, stop_position_widget])

    total_stops_widget = IntSlider(
        min=10, max=50, value=30,
        description='Total Stops:', style=style, layout=layout_full
    )

    section2_widgets = VBox([section2_title, row2_section2, total_stops_widget])

    # Temporal conditions section
    section3_title = HTML(
        "<h4 style='color: #1a1a1a; margin: 20px 0 15px 0; font-size: 14px; font-weight: 600; "
        "text-transform: uppercase; letter-spacing: 0.5px;'>Temporal Conditions</h4>"
    )

    hour_widget = IntSlider(
        min=0, max=23, value=8,
        description='Hour:', style=style, layout=layout_half
    )
    is_weekend_widget = Dropdown(
        options=[('Weekday', 0), ('Weekend', 1)], value=0,
        description='Day Type:', style=style, layout=layout_half
    )
    row3_section3 = HBox([hour_widget, is_weekend_widget])

    section3_widgets = VBox([section3_title, row3_section3])

    # Environmental conditions section
    section4_title = HTML(
        "<h4 style='color: #1a1a1a; margin: 20px 0 15px 0; font-size: 14px; font-weight: 600; "
        "text-transform: uppercase; letter-spacing: 0.5px;'>Environmental Conditions</h4>"
    )

    traffic_widget = Dropdown(
        options=[('Low', 0), ('Medium', 1), ('High', 2)],
        value=1, description='Traffic:', style=style, layout=layout_half
    )
    weather_widget = Dropdown(
        options=[('Clear', 0), ('Rain', 1), ('Storm', 2)],
        value=0, description='Weather:', style=style, layout=layout_half
    )
    row4_section4 = HBox([traffic_widget, weather_widget])

    section4_widgets = VBox([section4_title, row4_section4])

    # Combine all input sections
    all_inputs = VBox([
        section1_widgets,
        section2_widgets,
        section3_widgets,
        section4_widgets,
    ], layout=Layout(
        border='1px solid #e0e0e0',
        padding='25px',
        border_radius='6px',
        width='700px',
        background_color='#fafafa'
    ))

    # Prediction button
    predict_button = Button(
        description='Get Predictions',
        button_style='info',
        tooltip='Click to generate predictions',
        layout=Layout(width='700px', height='45px')
    )
    predict_button.style.font_size = '14px'

    # Prediction function
    def predict_delay(stop_name_length, has_indicator, is_london, route_complexity,
                     stop_position, hour, is_weekend, traffic, weather, total_stops):
        """Score one journey configuration with both models and render the result.

        A single-row DataFrame is built from the arguments, transformed by
        ``fitted_pipeline`` and scored by ``lr_model`` and ``gbt_model``. The
        outcome is rendered as a sequence of HTML cards inside
        ``prediction_output`` (which is cleared first).

        Args:
            stop_name_length: Value for the ``stop_name_length`` feature.
            has_indicator: Value for the ``has_indicator`` feature.
            is_london: Value for the ``is_london`` feature; truthy is shown as
                "Greater London", falsy as "Other Areas".
            route_complexity: Value for the ``route_complexity`` feature.
            stop_position: Value for the ``stop_position`` feature.
            hour: Integer hour for ``hour_of_day``; hours 7-9 and 17-19
                (inclusive) set ``is_peak_hour`` to 1.
            is_weekend: Value for the ``is_weekend`` feature; truthy is shown
                as "Weekend", falsy as "Weekday".
            traffic: Value for ``traffic_level``; also used as an index into
                ['Low', 'Medium', 'High'] for display, so it must be 0, 1 or 2.
            weather: Value for ``weather_condition``; also used as an index
                into ['Clear', 'Rain', 'Storm'], so it must be 0, 1 or 2.
            total_stops: Value for the ``total_stops_in_route`` feature.

        Returns:
            None. All output goes through ``display``.
        """

        # Every result card uses the same neutral background; only the accent varies.
        card_background = "#fafafa"

        def card_style(background, accent, trailing=""):
            """Build the inline CSS shared by the accent-bordered result cards.

            The ``border`` shorthand is emitted BEFORE ``border-left``: the
            shorthand resets all four sides, so if it came last it would
            overwrite the coloured left accent.
            """
            return (
                f"width: 700px; background: {background}; padding: 15px 25px; "
                f"border: 1px solid #e0e0e0; border-left: 4px solid {accent}; "
                f"border-radius: 6px;{trailing}"
            )

        with prediction_output:
            clear_output(wait=True)

            # Determine peak hour
            is_peak_hour = 1 if (7 <= hour <= 9) or (17 <= hour <= 19) else 0

            # Create input data
            input_data = pd.DataFrame([{
                'stop_name_length': stop_name_length,
                'has_indicator': has_indicator,
                'is_london': is_london,
                'route_complexity': route_complexity,
                'stop_position': stop_position,
                'hour_of_day': hour,
                'is_peak_hour': is_peak_hour,
                'is_weekend': is_weekend,
                'traffic_level': traffic,
                'weather_condition': weather,
                'total_stops_in_route': total_stops
            }])

            # Make prediction: preprocess once, then score with each model
            sample_spark = spark.createDataFrame(input_data)
            sample_processed = fitted_pipeline.transform(sample_spark)

            lr_pred = lr_model.transform(sample_processed)
            gbt_pred = gbt_model.transform(sample_processed)

            lr_result = lr_pred.select('prediction', 'probability').collect()[0]
            gbt_result = gbt_pred.select('prediction', 'probability').collect()[0]

            # Confidence shown to the user is the largest entry of each probability vector
            lr_prob = max(lr_result['probability'])
            gbt_prob = max(gbt_result['probability'])

            # Display header
            header_html = (
                "<div style='width: 700px; padding: 15px 25px; background: #fafafa; border: 1px solid #e0e0e0; "
                "border-radius: 6px; margin-bottom: 15px;'>"
                "<h3 style='margin: 0; color: #1a1a1a; font-size: 16px; font-weight: 600;'>Prediction Results</h3>"
                "</div>"
            )
            display(HTML(header_html))

            # Configuration summary
            config_html = (
                f"<div style='width: 700px; background: #fafafa; padding: 15px 25px; border: 1px solid #e0e0e0; "
                f"border-radius: 6px; margin-bottom: 15px;'>"
                f"<table style='width: 100%; border-collapse: collapse; font-size: 13px;'>"
                f"<tr><td style='padding: 8px 0; font-weight: 600; width: 40%; color: #1a1a1a;'>Time:</td>"
                f"<td style='padding: 8px 0; color: #1a1a1a;'>{hour:02d}:00 ({'Peak Hours' if is_peak_hour else 'Off-Peak'})</td></tr>"
                f"<tr><td style='padding: 8px 0; font-weight: 600; color: #1a1a1a;'>Location:</td>"
                f"<td style='padding: 8px 0; color: #1a1a1a;'>{'Greater London' if is_london else 'Other Areas'}</td></tr>"
                f"<tr><td style='padding: 8px 0; font-weight: 600; color: #1a1a1a;'>Day:</td>"
                f"<td style='padding: 8px 0; color: #1a1a1a;'>{'Weekend' if is_weekend else 'Weekday'}</td></tr>"
                f"<tr><td style='padding: 8px 0; font-weight: 600; color: #1a1a1a;'>Traffic:</td>"
                f"<td style='padding: 8px 0; color: #1a1a1a;'>{['Low', 'Medium', 'High'][traffic]}</td></tr>"
                f"<tr><td style='padding: 8px 0; font-weight: 600; color: #1a1a1a;'>Weather:</td>"
                f"<td style='padding: 8px 0; color: #1a1a1a;'>{['Clear', 'Rain', 'Storm'][weather]}</td></tr>"
                f"</table></div>"
            )
            display(HTML(config_html))

            # Model predictions: label 0 is rendered as on-time, anything else as delayed
            lr_status = "On-Time" if lr_result['prediction'] == 0 else "Delayed"
            gbt_status = "On-Time" if gbt_result['prediction'] == 0 else "Delayed"

            # LR Card (grey accent when on-time, amber when delayed)
            lr_border = "#999999" if lr_result['prediction'] == 0 else "#f39c12"
            lr_style = card_style(card_background, lr_border, " margin-bottom: 12px;")
            lr_html = (
                f"<div style='{lr_style}'>"
                f"<div style='font-weight: 600; color: #1a1a1a; margin-bottom: 8px; font-size: 14px;'>Logistic Regression</div>"
                f"<div style='font-size: 20px; font-weight: 700; color: #1a1a1a; margin-bottom: 8px;'>{lr_status}</div>"
                f"<div style='font-size: 13px; color: #666666;'>Confidence: {lr_prob*100:.1f}%</div>"
                f"</div>"
            )
            display(HTML(lr_html))

            # GBT Card (same colour scheme as the LR card)
            gbt_border = "#999999" if gbt_result['prediction'] == 0 else "#f39c12"
            gbt_style = card_style(card_background, gbt_border, " margin-bottom: 15px;")
            gbt_html = (
                f"<div style='{gbt_style}'>"
                f"<div style='font-weight: 600; color: #1a1a1a; margin-bottom: 8px; font-size: 14px;'>Gradient Boosted Trees</div>"
                f"<div style='font-size: 20px; font-weight: 700; color: #1a1a1a; margin-bottom: 8px;'>{gbt_status}</div>"
                f"<div style='font-size: 13px; color: #666666;'>Confidence: {gbt_prob*100:.1f}%</div>"
                f"</div>"
            )
            display(HTML(gbt_html))

            # Consensus prediction: mean of the two labels, so a split vote (0.5)
            # is reported as "Delayed" with 50% agreement.
            consensus = (lr_result['prediction'] + gbt_result['prediction']) / 2
            consensus_pred = "Delayed" if consensus >= 0.5 else "On-Time"
            consensus_tag_color = "#856404" if consensus >= 0.5 else "#155724"
            consensus_bg = "#fff3cd" if consensus >= 0.5 else "#d4edda"
            agreement = max(consensus, 1 - consensus) * 100

            consensus_style = card_style(consensus_bg, consensus_tag_color, " margin-bottom: 15px;")
            consensus_html = (
                f"<div style='{consensus_style}'>"
                f"<div style='font-weight: 600; color: #1a1a1a; margin-bottom: 12px; font-size: 14px;'>Ensemble Consensus</div>"
                f"<div style='font-size: 24px; font-weight: 700; color: #1a1a1a; margin-bottom: 10px;'>"
                f"{consensus_pred}</div>"
                f"<table style='width: 100%; font-size: 13px; color: #1a1a1a;'>"
                f"<tr><td><strong>Model Agreement:</strong> {agreement:.1f}%</td>"
                f"<td style='text-align: right;'><strong>Avg Confidence:</strong> {(lr_prob + gbt_prob)/2*100:.1f}%</td></tr>"
                f"</table></div>"
            )
            display(HTML(consensus_html))

            # Recommendation
            if consensus >= 0.5:
                rec_border = "#f39c12"
                rec_text = "This bus is likely to have delays. Plan extra travel time or consider alternatives."
            else:
                rec_border = "#27ae60"
                rec_text = "This bus is predicted to be on time."

            rec_style = card_style(card_background, rec_border, " font-size: 13px; color: #1a1a1a;")
            rec_html = (
                f"<div style='{rec_style}'>"
                f"<strong>Recommendation:</strong> {rec_text}</div>"
            )
            display(HTML(rec_html))

    # Create interactive GUI: bind each predict_delay argument to its input widget.
    # NOTE(review): `gui` is not part of the `dashboard` VBox assembled later in
    # this cell, so the wrapper itself is not shown by the visible code.
    # ipywidgets' `interactive` presumably still re-runs predict_delay whenever a
    # bound widget's value changes, in addition to the button handler -- confirm
    # that this live updating is intended.
    gui = interactive(
        predict_delay,
        stop_name_length=stop_name_length_widget,
        has_indicator=has_indicator_widget,
        is_london=is_london_widget,
        route_complexity=route_complexity_widget,
        stop_position=stop_position_widget,
        hour=hour_widget,
        is_weekend=is_weekend_widget,
        traffic=traffic_widget,
        weather=weather_widget,
        total_stops=total_stops_widget
    )

    # Button click handler
    def on_predict_clicked(b):
        """Click callback: call predict_delay with the widgets' current values.

        ``b`` is whatever the click event passes in; it is not used.
        """
        # Listed in the positional order of predict_delay's parameters.
        input_widgets = (
            stop_name_length_widget,
            has_indicator_widget,
            is_london_widget,
            route_complexity_widget,
            stop_position_widget,
            hour_widget,
            is_weekend_widget,
            traffic_widget,
            weather_widget,
            total_stops_widget,
        )
        predict_delay(*(widget.value for widget in input_widgets))

    predict_button.on_click(on_predict_clicked)

    # Dark banner rendered as the first element of the dashboard.
    separator_html = (
        "<div style='width: 700px; margin: 0 0 15px 0; padding: 15px 25px; background: #2c2c2c; "
        "border-radius: 6px; text-align: left; border: 1px solid #444;'>"
        "<h3 style='color: #ffffff; margin: 0 0 5px 0; font-weight: 600; font-size: 15px;'>Prediction Dashboard</h3>"
        "<p style='color: #b0b0b0; margin: 0; font-size: 12px;'>Adjust your journey details and click to get predictions</p>"
        "</div>"
    )

    # Dark footer rendered as the last element of the dashboard.
    footer_html = (
        "<div style='width: 700px; background: #2c2c2c; color: #ffffff; padding: 15px 25px; border-radius: 6px; "
        "margin-top: 15px; text-align: left; border: 1px solid #444;'>"
        "<p style='margin: 0; font-size: 12px; font-weight: 600;'>Bus Delay Prediction System</p>"
        "<p style='margin: 5px 0 0 0; font-size: 11px; color: #b0b0b0;'>Powered by Apache Spark ML</p>"
        "</div>"
    )

    # Assemble the page top to bottom: banner, title, inputs, button, results, footer.
    # The empty fixed-height divs are vertical spacers.
    dashboard = VBox([
        HTML(separator_html),
        dashboard_title,
        all_inputs,
        HTML("<div style='height: 15px;'></div>"),
        predict_button,
        HTML("<div style='height: 20px;'></div>"),
        prediction_output,
        HTML("<div style='height: 20px;'></div>"),
        HTML(footer_html)
    ])

    # The banner is printed before display(dashboard), so the controls appear
    # BELOW this text in the cell output; the message is worded accordingly.
    print("\n" + "="*80)
    print("✓ Interactive Dashboard Ready")
    print("="*80)
    print("\nAdjust the journey parameters below and click 'Get Predictions' to see the results")
    print("="*80 + "\n")

    display(dashboard)

except ImportError:
    print("\nThe ipywidgets library is needed for the interactive dashboard.")
    print("To install it, run: pip install ipywidgets")
    print("Then restart the kernel and run this cell again.")


In [None]:
# Move into BigData/ and list all of its entries, including hidden ones.
# NOTE(review): the path is relative, so re-running this cell from inside
# BigData/ would presumably fail -- confirm whether this cell is still needed.
%cd BigData/
!ls -a


/content/BigData
.  ..  BusPrediction.ipynb  BusPredictionipynb	.git  README.md
