In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
def import_data_kaggle(path=os.path.join('data'), files=('orders.csv', 'nodes.csv')):
    """Lazily read a set of CSV files from one directory into DataFrames.

    Parameters
    ----------
    path : str
        Directory that contains the CSV files (defaults to ``'data'``).
    files : iterable of str
        File names relative to ``path``, in the order they should be yielded.

    Returns
    -------
    generator of pandas.DataFrame
        One frame per entry in ``files``. A file is only read when the
        generator is advanced, e.g. by tuple unpacking at the call site.
    """
    csv_paths = (os.path.join(path, name) for name in files)
    return (pd.read_csv(csv_path) for csv_path in csv_paths)

# Load the orders table and the route-node table (orders.csv, nodes.csv) from ./data.
raw_orders, raw_nodes = import_data_kaggle()

def convert_to_datetime(df):
    """Parse the timestamp columns of ``df`` in place and return the same frame.

    ``running_time`` is required (a missing column raises ``KeyError``);
    ``completed_time`` is converted only when the frame has it.
    """
    timestamp_columns = ['running_time']
    if 'completed_time' in df.columns:
        timestamp_columns.append('completed_time')

    for column in timestamp_columns:
        df[column] = pd.to_datetime(df[column])
    return df

# Parse the timestamp columns of the orders table (the frame is converted in place).
raw_orders = convert_to_datetime(raw_orders)

In [3]:
class FeatureExtractor:
    """Build per-order model features from an orders table and its route nodes.

    Parameters
    ----------
    default_speed : float, optional
        Value substituted for missing ``speed`` entries in the nodes table
        before segment travel times are computed. The default, 31.277917, is
        the constant this notebook has always used (presumably an average
        speed of the training data — TODO confirm).
    """

    def __init__(self, default_speed=31.277917):
        self.default_speed = default_speed

    def time_to_circle(self, value, max_value):
        """Map a periodic quantity onto the unit circle.

        ``value`` is taken as a position inside a cycle of length
        ``max_value``. Returns the pair ``(cos, sin)`` of the matching angle,
        so that both ends of the cycle land next to each other.
        """
        angle = value / max_value * 2 * np.pi
        return np.cos(angle), np.sin(angle)

    def datetime_to_xy(self, df):
        """Encode the time of day of ``df['running_time']`` as ``(cos, sin)`` Series."""
        stamps = df['running_time']
        seconds_since_midnight = stamps.dt.hour * 3600 + stamps.dt.minute * 60 + stamps.dt.second
        # 86400 seconds make up the full daily cycle.
        return self.time_to_circle(seconds_since_midnight, 86400)

    def _set_difference_sizes(self, nodes):
        """Count, per order Id, the nodes used only as a start or only as a finish."""
        starts = nodes[['Id', 'node_start']].rename(columns={'node_start': 'node'})
        finishes = nodes[['Id', 'node_finish']].rename(columns={'node_finish': 'node'})
        # Each (Id, node) pair is listed at most once per role, so a pair that is
        # present exactly once overall belongs to one role only.
        endpoints = pd.concat(
            [starts.drop_duplicates(), finishes.drop_duplicates()],
            ignore_index=True,
        )
        one_sided = ~endpoints.duplicated(subset=['Id', 'node'], keep=False)
        return one_sided.astype('int64').groupby(endpoints['Id']).sum().rename('set_diff')

    def transform(self, orders, nodes):
        """Return ``orders`` enriched with node-derived and time-of-day features.

        Parameters
        ----------
        orders : pandas.DataFrame
            Needs the columns ``Id``, ``running_time`` (datetime dtype) and
            ``route_distance_km``.
        nodes : pandas.DataFrame
            Route segments with the columns ``Id``, ``node_start``,
            ``node_finish``, ``distance`` and ``speed``.

        Returns
        -------
        pandas.DataFrame
            A new frame indexed by ``Id`` holding every column of ``orders``
            plus, in this order, ``nodes_delta_time``, ``set_diff``,
            ``nodes_distance_km``, ``distance_difference``, ``xs`` and ``ys``.
            Only orders that have at least one row in ``nodes`` are returned.

        Notes
        -----
        Neither ``orders`` nor ``nodes`` is modified.
        """
        # Segment travel time. The 3.6 factor converts m / (km/h) into seconds,
        # assuming distance is in metres and speed in km/h — TODO confirm units.
        speed = nodes['speed'].fillna(self.default_speed)
        segments = pd.DataFrame({
            'Id': nodes['Id'],
            'time': (nodes['distance'] * 3.6) / speed,
            'distance': nodes['distance'],
        })
        totals = segments.groupby('Id')[['time', 'distance']].sum()

        per_order = pd.DataFrame({
            # Sum of the segment travel times of the order.
            'nodes_delta_time': totals['time'],
            # Number of unique nodes used as a start but not as a finish,
            # plus those used as a finish but not as a start.
            'set_diff': self._set_difference_sizes(nodes),
            # Cumulative segment distance, divided by 1000 to express it in km.
            'nodes_distance_km': totals['distance'] / 1000,
        }).rename_axis('Id').reset_index()

        # Join on Id instead of assigning column values by row position, so an
        # order without node rows can never shift values onto another order.
        enriched = orders.merge(per_order, on='Id', how='inner')

        # Difference between the route distance and the cumulative node distance.
        enriched['distance_difference'] = enriched['route_distance_km'] - enriched['nodes_distance_km']

        # Time of day of "running_time" as two cyclic coordinates.
        enriched['xs'], enriched['ys'] = self.datetime_to_xy(enriched)
        return enriched.set_index('Id')


In [4]:
# Work on copies so the raw frames loaded earlier are not touched by feature extraction.
orders, nodes = raw_orders.copy(), raw_nodes.copy()

# Build the per-order feature table (indexed by Id).
data = FeatureExtractor().transform(orders, nodes)

In [5]:
def get_useful(df, to_get=['nodes_delta_time', 'xs', 'set_diff', 'distance_difference', 'delta_time']):
    """Select the columns of ``df`` listed in ``to_get``.

    The default selection is the four engineered features followed by the
    ``delta_time`` column. Indexing raises ``KeyError`` when a requested
    column is missing.
    """
    selected = df[to_get]
    return selected

# Keep only the default feature columns and `delta_time`.
data = get_useful(data)

In [6]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Split the table into the predictors and the `delta_time` target.
X, y = data.drop(['delta_time'], axis='columns'), data['delta_time']

# Standardise the predictors; the fitted scaler is applied to the test features later.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Add degree-2 polynomial terms; the fitted transformer is likewise reused on the test features.
pf = PolynomialFeatures(degree=2)
X = pf.fit_transform(X)

In [7]:
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow import keras

def train_model(X_train, y_train, epochs=40, batch_size=64, verbose=1,
                hidden_units=(128, 64, 32, 16), device="/cpu:0"):
    """Fit a fully connected regression network and return it.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features; ``X_train.shape[1]`` sets the input width.
    y_train : array-like of shape (n_samples,)
        Regression target.
    epochs : int, optional
        Number of passes over the training data (default 40).
    batch_size : int, optional
        Mini-batch size passed to ``model.fit`` (default 64).
    verbose : int, optional
        Keras verbosity level for ``model.fit`` (default 1).
    hidden_units : sequence of int, optional
        Width of each hidden ``Dense`` layer, in order
        (default ``(128, 64, 32, 16)``).
    device : str, optional
        TensorFlow device scope under which training runs (default ``"/cpu:0"``).

    Returns
    -------
    keras.Model
        The compiled and fitted model.
    """
    # Every hidden layer uses SELU with the LeCun-normal initializer.
    hidden_layers = [
        keras.layers.Dense(units, activation="selu", kernel_initializer="lecun_normal")
        for units in hidden_units
    ]
    model = keras.models.Sequential([
        keras.layers.Input(shape=[X_train.shape[1]]),
        *hidden_layers,
        keras.layers.Dense(1),  # single linear output for regression
    ])

    model.compile(
        loss=keras.losses.MeanSquaredError(),
        optimizer="nadam",
        metrics=[
            keras.metrics.RootMeanSquaredError(),
            tfa.metrics.RSquare()
        ]
    )

    with tf.device(device):
        model.fit(
            X_train,
            y_train,
            epochs=epochs,
            verbose=verbose,
            batch_size=batch_size
        )

    return model

In [8]:
# Train on the scaled, polynomial-expanded features; there is no validation split here.
model = train_model(X, y)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [9]:
# Read the test orders and their route nodes with the same CSV helper that
# loads the training tables.
final_test, final_nodes = import_data_kaggle(
    path=os.path.join('data', 'kaggle'),
    files=('final_test.csv', 'nodes_test.csv'),
)

# Parse the timestamp columns of the test orders.
final_test = convert_to_datetime(final_test)

In [10]:
# Build the features for the test orders. Copies are passed, as in the training
# cell, so the loaded test frames are never altered by feature extraction.
data = FeatureExtractor().transform(final_test.copy(), final_nodes.copy())
# NOTE(review): transform() inner-joins the orders with the per-order node
# aggregates, so a test order without node rows is absent from `data` —
# confirm that the submission covers every Id.
X = data[['nodes_delta_time', 'xs', 'set_diff', 'distance_difference']]

# Apply the scaler fitted on the training features (no re-fitting).
X = scaler.transform(X)

# Apply the polynomial expansion fitted on the training features.
X = pf.transform(X)

In [11]:
# Predict the target for the transformed test features.
preds = model.predict(X)



In [12]:
# Attach the predictions to the order Ids (the index of the test feature table).
answer = pd.DataFrame(preds, index=data.index, columns=['Predicted'])

In [13]:
# Display the predictions table.
answer

Unnamed: 0_level_0,Predicted
Id,Unnamed: 1_level_1
6198,559.864075
6417,721.417297
7054,562.094971
9628,742.758545
10283,826.504578
...,...
525706,417.739960
526604,508.036102
527213,545.068970
527520,193.761337


In [14]:
# Write the predictions, with the Id index as the first column, to answer.csv.
answer.to_csv('answer.csv')

In [15]:
import pickle

# Serialise the trained model to model.pkl. The context manager closes the file
# handle even when pickling fails.
# NOTE(review): unpickling runs arbitrary code, so model.pkl must only be loaded
# from a trusted source; Keras' own `model.save(...)` may be the more portable
# format — confirm against how the model is consumed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

INFO:tensorflow:Assets written to: ram://9e39e33a-d9f0-4406-b8f9-7b496ad82b01/assets
