In [1]:
# https://platform.olimpiada-ai.ro/problems/60

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
# Read the train and test splits of the competition data; both paths point
# into the same `/kaggle/input/flight-delay-prediction` directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/flight-delay-prediction/train.csv',
        '/kaggle/input/flight-delay-prediction/test.csv',
    )
)

# Show (rows, columns) of both frames as the cell output.
train.shape, test.shape

((254413, 17), (63604, 16))

In [3]:
# Preview the first three training rows to check column names and value formats.
train.head(3)

Unnamed: 0,sample_id,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,delay
0,194048,2010,6,YV,Mesa Airlines Inc.,MEM,"Memphis, TN: Memphis International",258.0,63.0,34.0,2.0,9.57,0.0,17.43,7.0,0.0,3514.0
1,200607,2010,1,MQ,American Eagle Airlines Inc.,SAN,"San Diego, CA: San Diego International",512.0,61.0,17.47,5.49,16.85,0.0,21.19,15.0,0.0,2444.0
2,253640,2007,3,OO,SkyWest Airlines Inc.,BTV,"Burlington, VT: Burlington International",66.0,27.0,11.0,0.0,0.0,0.0,16.0,3.0,0.0,1421.0


In [4]:
# Count missing values per column in the training frame.
train.isna().sum()

sample_id             0
year                  0
month                 0
carrier               4
carrier_name          4
airport               2
airport_name          0
arr_flights         397
arr_del15           576
carrier_ct          394
weather_ct          395
nas_ct              392
security_ct         392
late_aircraft_ct    392
arr_cancelled       392
arr_diverted        393
delay               397
dtype: int64

In [5]:
# Count missing values per column in the test frame.
test.isna().sum()

sample_id             0
year                  0
month                 0
carrier               0
carrier_name          0
airport               1
airport_name          0
arr_flights          96
arr_del15           156
carrier_ct           98
weather_ct           99
nas_ct               96
security_ct          96
late_aircraft_ct     96
arr_cancelled        96
arr_diverted         97
dtype: int64

In [6]:
from sklearn.impute import SimpleImputer

def process_df(df):
    """Return a new frame with string date features and no ``airport_name``.

    The result contains every column of ``df`` except ``airport_name``,
    followed by two added columns:

    * ``month_str`` -- ``df['month']`` cast to ``str``
    * ``year_str``  -- ``df['year']`` cast to ``str``

    The input frame is never modified. The previous implementation assigned
    the new columns and dropped ``airport_name`` in place, so calling it on a
    frame that had already lost ``airport_name`` altered the caller's data
    before raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``month``, ``year`` and ``airport_name``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.

    Raises
    ------
    KeyError
        If ``month``, ``year`` or ``airport_name`` is missing from ``df``.
    """
    # `drop` (without inplace) and `assign` both return new objects, which is
    # what keeps the caller's frame intact.
    return df.drop(columns=['airport_name']).assign(
        month_str=df['month'].astype(str),
        year_str=df['year'].astype(str),
    )

# Add the string date features and remove `airport_name` from both splits.
train = process_df(train)
test = process_df(test)

# Column roles shared by the later cells.
num_cols = [
    'year', 'month', 'arr_flights', 'security_ct', 'late_aircraft_ct', 'arr_diverted',
    'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct', 'arr_cancelled',
]
cat_cols = ['month_str', 'year_str', 'airport', 'carrier_name', 'carrier']
target_col = ['delay']

features = [*num_cols, *cat_cols]

imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values on train only and re-uses them on test.
# NOTE(review): the fit happens before the target-less rows are dropped below,
# so those rows still contribute to the fill values.
for imputer, cols in ((imputer_num, num_cols), (imputer_cat, cat_cols)):
    train[cols] = imputer.fit_transform(train[cols])
    test[cols] = imputer.transform(test[cols])

# Rows without a target value are removed from the training frame.
train = train.dropna(subset=['delay'])

train.shape, test.shape

((254016, 18), (63604, 17))

In [7]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Feature matrix / target for training, and the feature matrix to predict on.
# `target_col` is a list, so `y` is a one-column DataFrame rather than a Series.
X = train[features]
y = train[target_col]
X_test = test[features]

# Hold out 10% of the training rows for validation; fixed seed for a repeatable split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# CatBoost pools for the train split, the validation split and the full data,
# all with the same categorical columns declared.
train_pool, valid_pool, full_pool = (
    Pool(data, label, cat_features=cat_cols)
    for data, label in ((X_train, y_train), (X_valid, y_valid), (X, y))
)

In [8]:
from catboost import CatBoostRegressor

# Hyper-parameters for the regressor: MAE is both the optimised loss and the
# reported metric, and progress is logged every 200 iterations.
params = dict(
    iterations=20000,
    loss_function='MAE',
    eval_metric='MAE',
    metric_period=200,
    max_depth=6,
)

model = CatBoostRegressor(**params)

# Fit on the train pool; the validation pool is passed as the evaluation set.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 3664.3148335	test: 3835.9155551	best: 3835.9155551 (0)	total: 287ms	remaining: 23m 56s
200:	learn: 746.9620707	test: 792.0494634	best: 792.0494634 (200)	total: 35.9s	remaining: 14m 16s
400:	learn: 614.7047163	test: 649.7496198	best: 649.7496198 (400)	total: 1m 10s	remaining: 13m 27s
600:	learn: 566.0993582	test: 600.7841944	best: 600.7841944 (600)	total: 1m 46s	remaining: 12m 59s
800:	learn: 540.4592626	test: 576.3529877	best: 576.3529877 (800)	total: 2m 22s	remaining: 12m 25s
1000:	learn: 522.6648799	test: 559.5325006	best: 559.5325006 (1000)	total: 2m 57s	remaining: 11m 49s
1200:	learn: 510.2312252	test: 549.1311314	best: 549.1311314 (1200)	total: 3m 31s	remaining: 11m 10s
1400:	learn: 500.8374374	test: 541.6086520	best: 541.6086520 (1400)	total: 4m 6s	remaining: 10m 32s
1600:	learn: 492.5940797	test: 535.0144327	best: 535.0144327 (1600)	total: 4m 40s	remaining: 9m 56s
1800:	learn: 484.8162262	test: 529.7193607	best: 529.7193607 (1800)	total: 5m 15s	remaining: 9m 20s
2000:	

<catboost.core.CatBoostRegressor at 0x7cb0d4802650>

In [10]:
# Predict on the test features and round to whole integers.
raw_pred = model.predict(X_test)
y_pred = raw_pred.round().astype(int)

# Two-column submission frame: the test ids plus the predicted delay.
subm = test[['sample_id']].assign(delay=y_pred)

subm.to_csv("submission.csv", index=False)

subm.head()

Unnamed: 0,sample_id,delay
0,184678,762
1,78226,1361
2,124301,5215
3,206506,900
4,133174,1892
