In [91]:
# https://platform.olimpiada-ai.ro/problems/52

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [92]:
# Locations of the labelled training split and the unlabelled test split.
train_path = "/kaggle/input/train-delay-prediction/train.csv"
test_path = "/kaggle/input/train-delay-prediction/test.csv"

train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Show (rows, columns) of both splits as the cell output.
(train.shape, test.shape)

((4000, 12), (1000, 11))

In [93]:
# Preview the first three training rows to inspect the raw columns.
train.head(n=3)

Unnamed: 0,SampleID,departure_time,distance_km,avg_speed_kmh,num_stops,weather,weekday,special_events,num_cars,ticket_price,comfort_class,delay_minutes
0,4227,02:58,788.12,103.94,2,sunny,Fri,0,3,61.463731,intermediate,11
1,4676,05:19,408.42,96.6,7,sunny,Sat,0,13,125.058439,premium,17
2,800,18:44,440.24,92.54,1,sunny,Sun,0,12,178.797255,standard,0


In [94]:
def get_time(time_str):
    """Convert a clock string such as ``"02:58"`` to minutes since midnight.

    The string is split on ``':'``; the first field is read as hours and the
    second as minutes. Any further fields (e.g. seconds) are ignored.
    Non-zero-padded hours such as ``"2:58"`` are accepted.

    Parameters
    ----------
    time_str : str
        Time of day formatted as ``H:MM`` or ``HH:MM``.

    Returns
    -------
    int
        ``hours * 60 + minutes``.

    Raises
    ------
    ValueError
        If the string has no ``':'`` separator or a field is not an integer.
    """
    # Split on the separator instead of indexing fixed character positions,
    # so malformed input fails loudly rather than yielding a bogus number.
    hours, minutes = time_str.strip().split(':')[:2]
    return int(hours) * 60 + int(minutes)

def process_df(df):
    """Return a copy of ``df`` with engineered time features added.

    The input frame is left untouched. Calling the function again on its own
    output is safe: ``departure_time`` is only parsed while it is still
    non-numeric (i.e. still clock strings).

    Columns written on the returned frame:

    * ``departure_time`` -- minutes since midnight (via ``get_time``).
    * ``hour_of_day`` -- label ``hour_<k>`` with ``k = (minutes + 59) // 60``.
    * ``supposed_time`` -- ``distance_km / avg_speed_kmh * 60``.
    * ``supposed_arrival_time`` -- label ``hour_<k>`` computed from
      ``(departure_time + supposed_time) % 1440`` with the same ``+59 // 60``
      bucketing; a sum of exactly 1440 is labelled ``hour_24``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``departure_time``, ``distance_km`` and
        ``avg_speed_kmh``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns above added or replaced.
    """
    out = df.copy()

    # Only parse clock strings. A frame that was already processed holds
    # numeric minutes and must not be passed through get_time a second time.
    if not pd.api.types.is_numeric_dtype(out['departure_time']):
        out['departure_time'] = out['departure_time'].map(get_time)

    out['hour_of_day'] = [f'hour_{(v+59)//60}' for v in out['departure_time']]
    out['supposed_time'] = (out['distance_km'] / out['avg_speed_kmh']) * 60

    # Wrap past midnight with % 1440; an arrival landing exactly on 1440 is
    # kept as "hour_24" instead of wrapping to "hour_0".
    out['supposed_arrival_time'] = [
        f"hour_{int(((v+ex)%1440+59)//60)}" if v+ex!=1440 else f"hour_24"
        for v, ex in zip(out['departure_time'], out['supposed_time'])
    ]
    return out

# Run the same feature engineering on both splits so they share derived columns.
train, test = process_df(train), process_df(test)

In [95]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Regression target and the two groups of model inputs.
target_col = 'delay_minutes'
num_cols = [
    'departure_time', 'distance_km', 'avg_speed_kmh', 'num_stops',
    'special_events', 'num_cars', 'ticket_price', 'supposed_time',
]
cat_cols = [
    'weather', 'weekday', 'comfort_class', 'hour_of_day',
    'supposed_arrival_time',
]
# Numeric columns first, then categorical ones.
features = num_cols + cat_cols

X = train[features]
y = train[target_col]
X_test = test[features]

# Hold out 20% of the labelled rows for validation; fixed seed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap each split in a CatBoost Pool, naming the categorical columns.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

In [100]:
from catboost import CatBoostRegressor

# Training configuration: optimise and report MAE, logging every 100 iterations.
params = dict(
    iterations=300,
    loss_function='MAE',
    eval_metric='MAE',
    metric_period=100,
    max_depth=5,
    random_state=42,
)

model = CatBoostRegressor(**params)

# Fit on the training pool while tracking MAE on the validation pool.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 8.6300178	test: 8.5764616	best: 8.5764616 (0)	total: 5.36ms	remaining: 1.6s
100:	learn: 4.0328160	test: 4.2036714	best: 4.2036714 (100)	total: 379ms	remaining: 747ms
200:	learn: 3.6937388	test: 4.0182857	best: 4.0182857 (200)	total: 749ms	remaining: 369ms
299:	learn: 3.5735983	test: 4.0115073	best: 4.0115073 (299)	total: 1.11s	remaining: 0us

bestTest = 4.011507337
bestIteration = 299



<catboost.core.CatBoostRegressor at 0x7f6556b860d0>

In [101]:
# Predict the delay for every test row.
y_pred = model.predict(X_test)

# Submission frame: the test identifiers plus the predicted delay.
subm = test[['SampleID']].assign(delay_minutes=y_pred)

subm.to_csv("submission.csv", index=False)

subm.head()

Unnamed: 0,SampleID,delay_minutes
0,1501,26.134651
1,2586,10.427326
2,2653,1.640575
3,1055,14.594347
4,705,5.400649
