In [89]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.preprocessing import LabelEncoder, StandardScaler

from catboost import CatBoostRegressor

In [90]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [91]:
print(test.head)

<bound method NDFrame.head of       row_id                 time  x  y direction
0     848835  1991-09-30 12:00:00  0  0        EB
1     848836  1991-09-30 12:00:00  0  0        NB
2     848837  1991-09-30 12:00:00  0  0        SB
3     848838  1991-09-30 12:00:00  0  1        EB
4     848839  1991-09-30 12:00:00  0  1        NB
...      ...                  ... .. ..       ...
2335  851170  1991-09-30 23:40:00  2  3        NB
2336  851171  1991-09-30 23:40:00  2  3        NE
2337  851172  1991-09-30 23:40:00  2  3        SB
2338  851173  1991-09-30 23:40:00  2  3        SW
2339  851174  1991-09-30 23:40:00  2  3        WB

[2340 rows x 5 columns]>


In [92]:
train = pd.read_csv('train.csv', index_col='row_id', parse_dates=['time'])
test = pd.read_csv('test.csv', index_col='row_id', parse_dates=['time'])

In [93]:
print(test.head)

<bound method NDFrame.head of                       time  x  y direction
row_id                                    
848835 1991-09-30 12:00:00  0  0        EB
848836 1991-09-30 12:00:00  0  0        NB
848837 1991-09-30 12:00:00  0  0        SB
848838 1991-09-30 12:00:00  0  1        EB
848839 1991-09-30 12:00:00  0  1        NB
...                    ... .. ..       ...
851170 1991-09-30 23:40:00  2  3        NB
851171 1991-09-30 23:40:00  2  3        NE
851172 1991-09-30 23:40:00  2  3        SB
851173 1991-09-30 23:40:00  2  3        SW
851174 1991-09-30 23:40:00  2  3        WB

[2340 rows x 4 columns]>


In [94]:
print(train.head)

<bound method NDFrame.head of                       time  x  y direction  congestion
row_id                                                
0      1991-04-01 00:00:00  0  0        EB          70
1      1991-04-01 00:00:00  0  0        NB          49
2      1991-04-01 00:00:00  0  0        SB          24
3      1991-04-01 00:00:00  0  1        EB          18
4      1991-04-01 00:00:00  0  1        NB          60
...                    ... .. ..       ...         ...
848830 1991-09-30 11:40:00  2  3        NB          54
848831 1991-09-30 11:40:00  2  3        NE          28
848832 1991-09-30 11:40:00  2  3        SB          68
848833 1991-09-30 11:40:00  2  3        SW          17
848834 1991-09-30 11:40:00  2  3        WB          24

[848835 rows x 5 columns]>


In [95]:
def splitTime(df, column):
    df['weekday']=df[column].dt.weekday
    df['month']=df[column].dt.month
    df['day']=df[column].dt.day
    df['hour']=df[column].dt.hour + df[column].dt.minute /60
    
    df = df.drop([column], axis =1)
    
    return df

In [96]:
train = splitTime(train, 'time')
test = splitTime(test, 'time')

In [97]:
print(train.head)

<bound method NDFrame.head of         x  y direction  congestion  weekday  month  day       hour
row_id                                                            
0       0  0        EB          70        0      4    1   0.000000
1       0  0        NB          49        0      4    1   0.000000
2       0  0        SB          24        0      4    1   0.000000
3       0  1        EB          18        0      4    1   0.000000
4       0  1        NB          60        0      4    1   0.000000
...    .. ..       ...         ...      ...    ...  ...        ...
848830  2  3        NB          54        0      9   30  11.666667
848831  2  3        NE          28        0      9   30  11.666667
848832  2  3        SB          68        0      9   30  11.666667
848833  2  3        SW          17        0      9   30  11.666667
848834  2  3        WB          24        0      9   30  11.666667

[848835 rows x 8 columns]>


In [98]:
features = [col for col in train.columns if col != 'congestion']
print(features)
normalFeatures =features

['x', 'y', 'direction', 'weekday', 'month', 'day', 'hour']


In [99]:
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [100]:
train = encode(train, 'hour', 23.99)
test = encode(test, 'hour', 23.99)
train = encode(train, 'weekday', 7)
test = encode(test, 'weekday', 7)
train = encode (train, 'day', 365)
test = encode (test, 'day', 365)
train = encode(train, 'month', 12)
test = encode(test, 'month', 12)
print(train.head)

<bound method NDFrame.head of         x  y direction  congestion  weekday  month  day       hour  hour_sin  \
row_id                                                                         
0       0  0        EB          70        0      4    1   0.000000  0.000000   
1       0  0        NB          49        0      4    1   0.000000  0.000000   
2       0  0        SB          24        0      4    1   0.000000  0.000000   
3       0  1        EB          18        0      4    1   0.000000  0.000000   
4       0  1        NB          60        0      4    1   0.000000  0.000000   
...    .. ..       ...         ...      ...    ...  ...        ...       ...   
848830  2  3        NB          54        0      9   30  11.666667  0.085887   
848831  2  3        NE          28        0      9   30  11.666667  0.085887   
848832  2  3        SB          68        0      9   30  11.666667  0.085887   
848833  2  3        SW          17        0      9   30  11.666667  0.085887   
848834  2 

In [101]:
MODEL_MAX_DEPTH = 12
MODEL_TASK_TYPE = 'GPU'
MODEL_RL = 0.02
MODEL_EVAL_METRIC ='MAE'
MODEL_LOSS_FUNCTION = 'MAE'
MODEL_ESR = 10
MODEL_VERBOSE = 1000
MODEL_ITERATIONS = 10000
features = [col for col in train.columns if col != 'congestion']
print(features)

['x', 'y', 'direction', 'weekday', 'month', 'day', 'hour', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos']


In [102]:
X_train = pd.get_dummies(train[features])
X_test = pd.get_dummies(test[features])
print(X_train.head)

<bound method NDFrame.head of         x  y  weekday  month  day       hour  hour_sin  hour_cos  weekday_sin  \
row_id                                                                          
0       0  0        0      4    1   0.000000  0.000000  1.000000          0.0   
1       0  0        0      4    1   0.000000  0.000000  1.000000          0.0   
2       0  0        0      4    1   0.000000  0.000000  1.000000          0.0   
3       0  1        0      4    1   0.000000  0.000000  1.000000          0.0   
4       0  1        0      4    1   0.000000  0.000000  1.000000          0.0   
...    .. ..      ...    ...  ...        ...       ...       ...          ...   
848830  2  3        0      9   30  11.666667  0.085887 -0.996305          0.0   
848831  2  3        0      9   30  11.666667  0.085887 -0.996305          0.0   
848832  2  3        0      9   30  11.666667  0.085887 -0.996305          0.0   
848833  2  3        0      9   30  11.666667  0.085887 -0.996305          0.0  

In [103]:
meh = []
for col in X_train.columns:
    if len(col) > 4:
        if col[-4]=="_":
            continue
    meh.append(col)
X_normal = X_train[meh]

a = set([col for col in X_train.columns])
b=set(['weekday', 'month', 'day', 'hour'])
X_cycle = X_train[list(a-b)]

X_test_normal = X_test[meh]
X_test_cycle = X_test[list(a-b)]

print(X_cycle.head)
            

<bound method NDFrame.head of         direction_NE  x  y  hour_cos  weekday_cos   day_sin  direction_NW  \
row_id                                                                      
0                  0  0  0  1.000000          1.0  0.017213             0   
1                  0  0  0  1.000000          1.0  0.017213             0   
2                  0  0  0  1.000000          1.0  0.017213             0   
3                  0  0  1  1.000000          1.0  0.017213             0   
4                  0  0  1  1.000000          1.0  0.017213             0   
...              ... .. ..       ...          ...       ...           ...   
848830             0  2  3 -0.996305          1.0  0.493776             0   
848831             1  2  3 -0.996305          1.0  0.493776             0   
848832             0  2  3 -0.996305          1.0  0.493776             0   
848833             0  2  3 -0.996305          1.0  0.493776             0   
848834             0  2  3 -0.996305          

In [104]:
y = train['congestion']

In [105]:
SEED = 2022
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything()

model = CatBoostRegressor(
    verbose=MODEL_VERBOSE,
    early_stopping_rounds=MODEL_ESR,
    random_seed=SEED,
    max_depth=10,
    learning_rate=MODEL_RL,
    iterations=5000,
    loss_function=MODEL_LOSS_FUNCTION,
    eval_metric= MODEL_EVAL_METRIC
)

In [106]:
model.fit(X_cycle, y)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 13.7202261	total: 146ms	remaining: 12m 12s
1000:	learn: 6.3810789	total: 1m 5s	remaining: 4m 20s
2000:	learn: 6.1145169	total: 2m 11s	remaining: 3m 17s
3000:	learn: 5.9616850	total: 3m 16s	remaining: 2m 11s
4000:	learn: 5.8532193	total: 4m 22s	remaining: 1m 5s
4999:	learn: 5.7688955	total: 5m 28s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7fdbc5b1d590>

In [107]:
pred_test = model.predict(X_test_cycle)

In [111]:
sub = pd.read_csv('sample_submission.csv')
sub['congestion'] = np.rint(pred_test).astype(int)
sub.to_csv('output.csv',index=False)
sub.head()

Unnamed: 0,row_id,congestion
0,848835,50
1,848836,35
2,848837,50
3,848838,25
4,848839,69
