In [5]:
import pandas as pd

df = pd.read_csv("enhanced_tram_data.csv")
df2 = pd.read_csv("enhanced_tram_data_2.csv")

df.merge(df2)

df.head()

Unnamed: 0,hour_of_day,day_of_week,is_weekend,month,day_of_month,temperature,precipitation,weather_condition,humidity,wind_speed,...,vehicle_number,brigade,route_segment,direction,stop_name,planned_travel_time_min,delay_minutes,delay_category,delay_ratio,is_delayed
0,12,5,True,5,31,12.4,1.2,rain,75,3.2,...,HY721,10-05,Kurdwanów–Pleszów,Kurdwanów P+R,Kurdwanów P+R 03,25,23.0,major_delay,0.92,True
1,12,5,True,5,31,12.4,1.2,rain,75,3.2,...,HG919,52-04,Czerwone Maki–Os.Piastów,Czerwone Maki P+R,Czerwone Maki P+R 01,30,19.0,major_delay,0.633333,True
2,12,5,True,5,31,12.4,1.2,rain,75,3.2,...,RY882,22-07,Borek Fałęcki–Kopiec Wandy,Borek Fałęcki,Borek Fałęcki 01,40,18.0,major_delay,0.45,True
3,12,5,True,5,31,12.4,1.2,rain,75,3.2,...,RY879,20-05,Mały Płaszów–Cichy Kącik,Mały Płaszów P+R,Mały Płaszów P+R 01,20,15.0,major_delay,0.75,True
4,12,5,True,5,31,12.4,1.2,rain,75,3.2,...,HY722,18-07,Górka Narodowa–Czerwone Maki,Górka Narodowa P+R,Górka Narodowa P+R 01,35,15.0,major_delay,0.428571,True


In [6]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [14]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load datasets
synthetic_df = pd.read_csv('synthetic_tram_data_2000.csv')
real_df = pd.read_csv('enhanced_tram_data.csv')
df2 = pd.read_csv("enhanced_tram_data_2.csv")
real_df.merge(df2)

# Define categorical features
categorical_features = [
    'weather_condition', 'vehicle_type', 'brigade', 'route_segment',
    'direction', 'stop_name', 'delay_category'
]

# Prepare features
def prepare_features(df):
    # Select features for training
    features = [
        'hour_of_day', 'day_of_week', 'month', 'day_of_month',
        'temperature', 'precipitation', 'humidity', 'wind_speed',
        'line_number', 'planned_travel_time_min', 'is_weekend', 'is_delayed'
    ] + categorical_features

    # Remove features not in dataset
    features = [f for f in features if f in df.columns and f not in ['delay_minutes', 'delay_ratio', 'delay_category', 'is_delayed']]

    X = df[features].copy()
    y = df['delay_minutes'].copy()

    return X, y

# Prepare training and test data
X_train, y_train = prepare_features(synthetic_df)
X_test, y_test = prepare_features(real_df)

# Ensure same features in both datasets
common_features = [col for col in X_train.columns if col in X_test.columns]
X_train = X_train[common_features]
X_test = X_test[common_features]
categorical_features = [f for f in categorical_features if f in common_features]

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    cat_features=categorical_features,
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50
)

# Train the model
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    plot=False
)

# Make predictions and evaluate
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f} minutes")
print(f"MAE: {mae:.2f} minutes")
print(f"R² Score: {r2:.3f}")

# Show feature importance
feature_importance = pd.DataFrame({
    'feature': common_features,
    'importance': model.get_feature_importance()
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

0:	learn: 3.0376077	test: 4.0676297	best: 4.0676297 (0)	total: 3.53ms	remaining: 3.52s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 4.060383493
bestIteration = 6

Shrink model to first 7 iterations.
RMSE: 4.06 minutes
MAE: 2.60 minutes
R² Score: 0.003

Top 10 Most Important Features:
              feature  importance
11  weather_condition   61.300102
0         hour_of_day   12.018108
1         day_of_week    6.772088
4         temperature    6.223065
10         is_weekend    6.035661
5       precipitation    5.852902
14      route_segment    1.131243
8         line_number    0.505740
15          direction    0.107818
16          stop_name    0.053274
