In [13]:
# https://platform.olimpiada-ai.ro/en/problems/75

import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [14]:
# Directory holding the competition CSVs. Change this single constant to run
# the notebook against a copy of the dataset stored somewhere else.
DATA_DIR = "/kaggle/input/smart-cargo"

train = pd.read_csv(os.path.join(DATA_DIR, "train_data.csv"))    # rows that include deliver_time
test = pd.read_csv(os.path.join(DATA_DIR, "test_data.csv"))      # rows to predict
subm = pd.read_csv(os.path.join(DATA_DIR, "sample_output.csv"))  # frame later filled in and saved as submission.csv

# Cell output: the three frame shapes.
train.shape, test.shape, subm.shape

((10000, 10), (2000, 9), (2001, 4))

In [15]:
# Row 0 of the submission: count of test rows with City A == 'Barlad' and Weather == 'Fog'.
from_barlad = test['City A'].eq('Barlad')
in_fog = test['Weather'].eq('Fog')
subm.loc[0, 'answer'] = int((from_barlad & in_fog).sum())

In [16]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,ID,City A,City B,Distance,Time of Day,Weather,Traffic,Road Quality,Driver Experience,deliver_time
0,1,Satu Mare,Suceava,352,452,Fog,154.014691,370,30,355
1,2,Ploiesti,Timisoara,519,1386,Clear,949.697532,701,2,529
2,3,Deva,Bacau,457,91,Fog,387.019309,45,26,465
3,4,Hunedoara,Focsani,447,1120,Clear,130.544017,643,6,441
4,5,Hunedoara,Arad,201,1096,Clear,619.557737,375,20,230


In [17]:
# Print column dtypes, non-null counts and memory usage for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 10000 non-null  int64  
 1   City A             10000 non-null  object 
 2   City B             10000 non-null  object 
 3   Distance           10000 non-null  int64  
 4   Time of Day        10000 non-null  int64  
 5   Weather            10000 non-null  object 
 6   Traffic            10000 non-null  float64
 7   Road Quality       10000 non-null  int64  
 8   Driver Experience  10000 non-null  int64  
 9   deliver_time       10000 non-null  int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 781.4+ KB


In [18]:
# Print column dtypes, non-null counts and memory usage for the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 2000 non-null   int64  
 1   City A             2000 non-null   object 
 2   City B             2000 non-null   object 
 3   Distance           2000 non-null   int64  
 4   Time of Day        2000 non-null   int64  
 5   Weather            2000 non-null   object 
 6   Traffic            2000 non-null   float64
 7   Road Quality       2000 non-null   int64  
 8   Driver Experience  2000 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 140.8+ KB


In [21]:
# Relative frequency of each Weather value in the training data.
weather_share = train['Weather'].value_counts(normalize=True)
weather_share

Weather
Snow     0.2504
Fog      0.2502
Clear    0.2499
Rain     0.2495
Name: proportion, dtype: float64

In [23]:
# Mean and median deliver_time for each Weather value.
(
    train
    .groupby('Weather')['deliver_time']
    .agg(['mean', 'median'])
)

Unnamed: 0_level_0,mean,median
Weather,Unnamed: 1_level_1,Unnamed: 2_level_1
Clear,376.698279,366.0
Fog,388.298561,377.5
Rain,386.534669,383.0
Snow,388.978435,379.0


In [25]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Column roles: the prediction target, the columns passed to CatBoost as
# categorical, and the full list of model inputs.
target_col = 'deliver_time'
cat_features = ['City A', 'City B', 'Weather']
features = ['City A', 'City B', 'Distance', 'Time of Day', 'Weather', 'Traffic', 'Road Quality', 'Driver Experience']

X = train[features]
y = train[target_col]
X_test = test[features]

# Hold out 10% of the training rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# CatBoost pools for the fit and validation partitions.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features)

X_train.shape, X_valid.shape

((9000, 8), (1000, 8))

In [30]:
from catboost import CatBoostRegressor

# Training configuration: MAE is both the optimised loss and the reported
# metric, and metrics are computed every 1000 iterations.
params = dict(
    iterations=10000,
    loss_function='MAE',
    eval_metric='MAE',
    metric_period=1000,
    max_depth=6,
    random_state=42,
)

model = CatBoostRegressor(**params)

# Fit on the training pool and report the metric on the validation pool.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 134.3666713	test: 128.0829891	best: 128.0829891 (0)	total: 8.98ms	remaining: 1m 29s
1000:	learn: 2.3892511	test: 2.6838088	best: 2.6838088 (1000)	total: 7.71s	remaining: 1m 9s
2000:	learn: 1.9782811	test: 2.4544048	best: 2.4544048 (2000)	total: 15s	remaining: 1m
3000:	learn: 1.7954064	test: 2.4121287	best: 2.4121287 (3000)	total: 22.1s	remaining: 51.6s
4000:	learn: 1.6850595	test: 2.3874202	best: 2.3874202 (4000)	total: 29.3s	remaining: 43.9s
5000:	learn: 1.6045477	test: 2.3720998	best: 2.3720998 (5000)	total: 36.6s	remaining: 36.5s
6000:	learn: 1.5473210	test: 2.3576345	best: 2.3576345 (6000)	total: 43.7s	remaining: 29.1s
7000:	learn: 1.5013748	test: 2.3511923	best: 2.3511923 (7000)	total: 50.7s	remaining: 21.7s
8000:	learn: 1.4623798	test: 2.3450467	best: 2.3450467 (8000)	total: 57.9s	remaining: 14.5s
9000:	learn: 1.4278478	test: 2.3417926	best: 2.3417926 (9000)	total: 1m 5s	remaining: 7.22s
9999:	learn: 1.3955237	test: 2.3386972	best: 2.3386972 (9999)	total: 1m 12s	remaining: 0us

<catboost.core.CatBoostRegressor at 0x7e85ab3312b0>

In [31]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the fitted model on the validation rows.
y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_true=y_valid, y_pred=y_pred)

print(f'MAE: {mae:.5f}')

MAE: 2.33870


In [32]:
# Predict deliver_time for every test row.
y_pred = model.predict(X_test)

# Row 0 of `subm` was filled separately; rows 1.. receive the predictions.
# Fail with a clear message if the counts do not line up.
n_slots = len(subm) - 1
if len(y_pred) != n_slots:
    raise ValueError(
        f"Expected {n_slots} predictions for submission rows 1.., got {len(y_pred)}"
    )

# Make the column float explicitly before writing float predictions into it.
# If `answer` was parsed as integer, the assignment below would otherwise rely
# on pandas' deprecated silent upcasting, whose warning is hidden by the global
# warnings filter.
subm['answer'] = subm['answer'].astype('float64')

# NOTE(review): this assignment is positional. It assumes submission rows 1..
# are in the same order as the rows of `test` — confirm against datapointID.
subm.loc[1:, 'answer'] = y_pred

subm.to_csv("submission.csv", index=False)
subm.head()

Unnamed: 0.1,Unnamed: 0,subtaskID,datapointID,answer
0,0,1,1,15.0
1,1,2,10001,387.855082
2,2,2,10002,241.468497
3,3,2,10003,478.246181
4,4,2,10004,247.594132
