### Prediction pipeline

#### Imports

In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score

### With crossval

In [2]:
def predict(data, model_path):
    """
    Parameters:
    - data: pandas DataFrame containing the feature columns and the label column "delay_label"
    """
    # Define the feature columns and the target column
    feature_cols = ["bpuic", "stop_lon", "stop_lat", "avg_delay", "stddev_delay", "temp", "max_precip_hrly", "ankunftszeit"]
    target_col = "arrival_delay"

    # Split the data into features and target
    X = data[feature_cols]
    y = data[target_col]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # define the parameter grid
    param_grid = {
        'classifier__n_estimators': [20, 50, 100, 200],
        'classifier__max_depth': [5, 10, 20]
    }

    # configure cross-validation
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

    # Train the model
    grid_search.fit(X_train, y_train)

    # Save the best model
    best_model = grid_search.best_estimator_
    with open("XXX.pkl", 'wb') as model_file:
        pickle.dump(best_model, model_file)

    # Make predictions on the test data
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)

    return best_model, y_test, y_pred, y_prob

### Without crossval

### Load data

In [3]:
train_df = pd.read_csv('old_data/delay_data.csv')
train_df.columns = [
        'bpuic', 'year', 'month', 'dayofmonth', 'site', 'hour', 'ankunftszeit', 
        'linien_id', 'verkehrsmittel_text', 'delay_label', 'timestamp', 
        'arrival_delay', 'stop_lon', 'stop_lat', 'positive_delay', 'rank', 
        'temp', 'max_precip_hrly', 'avg_delay', 'stddev_delay'
    ]

def round_x(x):
    if x < 0:
        return 0
    if x > 15:
        return 15
    return x

train_df = train_df.dropna()  
train_df.ankunftszeit = pd.to_datetime(train_df.ankunftszeit, format='%d.%m.%Y %H:%M').apply(lambda x: int(x.timestamp()))
train_df.positive_delay = train_df.positive_delay.apply(lambda x: int(round(x)))
train_df.arrival_delay = train_df.delay_label.apply(lambda x: round_x(x))


In [4]:
train_df.head()

Unnamed: 0,bpuic,year,month,dayofmonth,site,hour,ankunftszeit,linien_id,verkehrsmittel_text,delay_label,timestamp,arrival_delay,stop_lon,stop_lat,positive_delay,rank,temp,max_precip_hrly,avg_delay,stddev_delay
24,8500051,2023,3,7,LSPD,16,1678205580,85:37:811,T,1,2023-03-07T16:13:00.000+01:00,1,7.589183,47.492862,0,1,6.0,0.0,0.5375,1.164631
25,8500051,2023,3,7,LSPD,6,1678171260,85:37:11,T,1,2023-03-07T06:41:00.000+01:00,1,7.589183,47.492862,1,1,1.0,0.0,0.5375,1.164631
26,8500051,2023,3,7,LSPD,17,1678210980,85:37:811,T,0,2023-03-07T17:43:00.000+01:00,0,7.589183,47.492862,0,1,6.0,0.0,0.5375,1.164631
27,8500051,2023,3,7,LSPD,7,1678174860,85:37:11,T,2,2023-03-07T07:41:00.000+01:00,2,7.589183,47.492862,2,1,1.0,0.0,0.5375,1.164631
28,8500051,2023,8,23,LSPD,7,1692774180,85:37:62,B,2,2023-08-23T07:03:00.000+02:00,2,7.589183,47.492862,1,1,21.0,0.0,0.445833,0.441247


In [5]:
train_df = train_df.dropna()
train_df = train_df.drop(["year", "month", "dayofmonth", "site", "hour", "linien_id", "verkehrsmittel_text", "delay_label", "timestamp", "positive_delay", "rank", ], axis=1)

In [6]:
train_df.head()

Unnamed: 0,bpuic,ankunftszeit,arrival_delay,stop_lon,stop_lat,temp,max_precip_hrly,avg_delay,stddev_delay
24,8500051,1678205580,1,7.589183,47.492862,6.0,0.0,0.5375,1.164631
25,8500051,1678171260,1,7.589183,47.492862,1.0,0.0,0.5375,1.164631
26,8500051,1678210980,0,7.589183,47.492862,6.0,0.0,0.5375,1.164631
27,8500051,1678174860,2,7.589183,47.492862,1.0,0.0,0.5375,1.164631
28,8500051,1692774180,2,7.589183,47.492862,21.0,0.0,0.445833,0.441247


In [7]:
train_df.arrival_delay.max()

15

### Train

In [8]:
model, y_true, y_pred, y_prob = predict(train_df, "models/random_forest_model.pkl")

### Evaluate

In [11]:
def evaluate(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_true, y_pred))
    precision = precision_score(y_true, y_pred, average='weighted')
    print(f"Precision: {precision}", "\n")
    recall = recall_score(y_true, y_pred, average='weighted')
    print(f"Recall: {recall}", "\n")
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"F1: {f1}", "\n")
    auc = roc_auc_score(y_true, y_prob, multi_class='ovr')
    print(f"AUC: {auc}")


In [12]:
print("With cross-validation")
evaluate(y_true, y_pred, y_prob)

With cross-validation
Accuracy: 0.48923910268284015


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.57      0.62      0.59     65023
           1       0.47      0.69      0.56    122031
           2       0.48      0.56      0.52    107160
           3       0.48      0.20      0.28     53291
           4       0.38      0.01      0.01     24577
           5       0.11      0.00      0.00     11795
           6       0.00      0.00      0.00      5933
           7       0.00      0.00      0.00      3228
           8       0.00      0.00      0.00      1867
           9       0.00      0.00      0.00      1238
          10       0.00      0.00      0.00       722
          11       0.00      0.00      0.00       543
          12       0.00      0.00      0.00       389
          13       0.00      0.00      0.00       294
          14       0.00      0.00      0.00       221
          15       0.46      0.14      0.21      1190

    accuracy                           0.49    399502
   macro avg       0.18   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AUC: 0.9252304701067822


### Inference locally

In [16]:
print("Mean std dev:", train_df['stddev_delay'].mean())
print("Max std dev:", train_df['stddev_delay'].max())
print("Min std dev:", train_df['stddev_delay'].min())
print("Mean avg delay:", train_df['avg_delay'].mean())
print("Max avg delay:", train_df['avg_delay'].max())
print("Min avg delay:", train_df['avg_delay'].min())

average_delay = train_df['avg_delay'].mean()
std_dev_delay = train_df['stddev_delay'].mean()

Mean std dev: 2.229761204045168
Max std dev: 304.6230883040296
Min std dev: 0.0
Mean avg delay: 0.4411801368337622
Max avg delay: 10.994334650856391
Min avg delay: -65.85974358974359


In [31]:
def perform_inference(model_path, bpuic, desired_arrival_time, linien_id, weather, avg_delay, std_dev):
    """
    Parameters:
    - model_path: path to the trained model
    - bpuic: id of stop
    - desired_arrival_time: desired arrival time
    - linien_id: id of line
    - weather: weather data
    - stops_data: stops data
    - stop_times_data: stop times data
    """

    # Load the model
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    if weather == "Sunny":
        temperature = 15
        precipitation_hrly = 0
    elif weather == "Rainy":
        temperature = 10
        precipitation_hrly = 0.5
    elif weather == "Cloudy":
        temperature = 12
        precipitation_hrly = 0.1
    prediction = model.predict([[bpuic, desired_arrival_time, linien_id, avg_delay, std_dev, temperature, precipitation_hrly]])[0]
  
    return prediction



In [42]:
model_path = "models/random_forest_model.pkl"
bpuic = 8591049
desired_arrival_time = 1676385660
linien_id = 24641.0
weather = "Cloudy"
print("Is it delayed?\n", bool(perform_inference(model_path, bpuic, desired_arrival_time, linien_id, weather, avg_delay=average_delay, std_dev=std_dev_delay)))

Is it delayed?
 False




### Inference in spark session

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import PipelineModel

def perform_inference(spark, model_path, stop_name, desired_arrival_time, weather, departure_stop_id):
    """ Examples of parameters:
     spark = SparkSession.builder.appName("LocalInference").getOrCreate()

     stops = spark.read.orc('/data/sbb/orc/timetables/stops/year=2024/month=5/day=16')
     stop_times = spark.read.orc('/data/sbb/orc/timetables/stop_times/year=2024/month=5/day=16') 
     
     """

    
    # define schema
    schema = StructType([
        StructField("stop_name", StringType(), True),
        StructField("desired_arrival_time", StringType(), True),
        StructField("temperature", DoubleType(), True),
        StructField("precip_hrly", DoubleType(), True)
    ])
    
    # create df
    data = [(stop_name, desired_arrival_time, weather['temperature'], weather['precip_hrly'])]
    input_df = spark.createDataFrame(data, schema=schema)
    
    # create desired_arrival_time column
    input_df = input_df.withColumn("desired_arrival_time", F.to_timestamp("desired_arrival_time", "dd.MM.yyyy HH:mm"))
    
    # extract arrival hour and minute and put in columns
    input_df = input_df.withColumn("arrival_hour", F.hour("desired_arrival_time"))
    input_df = input_df.withColumn("arrival_minute", F.minute("desired_arrival_time"))

  
    stop_info = stops.filter(F.col("stop_name") == stop_name).select("stop_id").first()
    if stop_info:
        stop_id = stop_info["stop_id"]
    else:
        raise ValueError(f"Stop name {stop_name} not found in the dataset")
    

    stop_time_info = stop_times.filter(F.col("stop_id") == stop_id).select("trip_id", "arrival_time").first()
    if stop_time_info:
        bpuic = stop_id
        linien_id = stop_time_info["trip_id"]
    else:
        raise ValueError(f"No stop time information found for stop_id {stop_id}")
    
    # functions lit() is used to add a new column to DataFrame by assigning a literal or constant value
    input_df = input_df.withColumn("bpuic", F.lit(bpuic))
    input_df = input_df.withColumn("linien_id", F.lit(linien_id))
    input_df = input_df.withColumn("avg_delay", F.lit(0.0))  # placeholder, replace with actual avg_delay if available
    input_df = input_df.withColumn("stddev_delay", F.lit(0.0))  # placeholder, replace with actual stddev_delay if available

    # Définir les colonnes de caractéristiques
    feature_cols = ["bpuic", "linien_id", "avg_delay", "stddev_delay", "temperature", "precip_hrly", "arrival_hour", "arrival_minute"]

    # assemble and scale
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")
    input_df = assembler.transform(input_df)

    scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=False)
    scaler_model = scaler.fit(input_df)
    input_df = scaler_model.transform(input_df)

    # load model
    cvModel = PipelineModel.load(model_path)

    # do inference
    predictions = cvModel.transform(input_df)

    # select 
    result = predictions.select("stop_name", "desired_arrival_time", "temperature", "precip_hrly", "arrival_hour", "arrival_minute", "prediction").first()

    if result:
        return {
            "stop_name": result["stop_name"],
            "desired_arrival_time": result["desired_arrival_time"],
            "temperature": result["temperature"],
            "precip_hrly": result["precip_hrly"],
            "arrival_hour": result["arrival_hour"],
            "arrival_minute": result["arrival_minute"],
            "predicted_delay": result["prediction"]
        }
    else:
        return None