In [1]:
! pip install polyline python-geohash

Collecting polyline
  Downloading polyline-1.4.0-py2.py3-none-any.whl (4.4 kB)
Collecting python-geohash
  Downloading python-geohash-0.8.5.tar.gz (17 kB)
Building wheels for collected packages: python-geohash
  Building wheel for python-geohash (setup.py) ... [?25ldone
[?25h  Created wheel for python-geohash: filename=python_geohash-0.8.5-cp36-cp36m-linux_x86_64.whl size=46564 sha256=4dec027e7d9da9b467af7c431fdc825bbe66721015f55f2e22a7bc26331d77b9
  Stored in directory: /root/.cache/pip/wheels/6f/be/45/5e0a0ce5bf42f2081c5b2906d4f1f146f825ec00c3759d1bd3
Successfully built python-geohash
Installing collected packages: polyline, python-geohash
Successfully installed polyline-1.4.0 python-geohash-0.8.5


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/citymobil-eta-correction/validation.csv
/kaggle/input/citymobil-eta-correction/test.csv
/kaggle/input/citymobil-eta-correction/train.csv/train.csv


In [3]:
import itertools
import uuid

import dask.dataframe as dd
from dask.distributed import Client, progress
import geohash
import polyline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm.notebook import tqdm
from vowpalwabbit.sklearn_vw import VWRegressor

tqdm.pandas()



In [4]:
# Create the Dask client: 2 workers with 2 threads each and a '1GB' memory limit.
# NOTE(review): with no scheduler address given this presumably starts a local cluster,
# and the memory limit is presumably per worker — confirm against the dask.distributed docs.
client = Client(n_workers=2, threads_per_worker=2, memory_limit='1GB')

# Feature transformers

In [5]:
def drop_consecutive_duplicates(path):
    """
    Collapse every run of equal adjacent items in ``path`` to a single item.

    Only neighbouring repeats are removed; a value that reappears later in the
    sequence is kept. Always returns a new list.
    """
    deduplicated = []
    for point, _run in itertools.groupby(path):
        deduplicated.append(point)
    return deduplicated

def decode_polyline(path):
    """
    Decode an encoded polyline into a list of points without adjacent repeats.

    ``path`` is handed to ``polyline.decode``; the decoded points are then
    passed through ``drop_consecutive_duplicates``.
    """
    points = polyline.decode(path)
    return drop_consecutive_duplicates(points)

In [6]:
class GeohashBinarizer():
    """
    Convert routes to a sparse indicator matrix of the geohash cells they touch.

    Every route is reduced to the set of geohashes of its points; a
    ``MultiLabelBinarizer`` then maps those sets to one column per geohash.

    Parameters
    ----------
    geohash_precision : int, default=6
        Precision passed to ``geohash.encode``.
        6 => ~10000 features.
        7 => ~100000 features.

    Attributes
    ----------
    n_features_ : int
        Number of distinct geohashes seen by ``fit_transform``.
    mlb : MultiLabelBinarizer
    """
    def __init__(self, geohash_precision=6):
        self.geohash_precision = geohash_precision
        self.mlb = MultiLabelBinarizer(sparse_output=True)

    def fit_transform(self, routes):
        """
        Learn the geohash vocabulary from ``routes`` and return their features.

        routes : iterable of encoded polylines (str) or of lists of points
        """
        hashes = self.convert_to_hashes(routes)
        features = self.mlb.fit_transform(hashes)
        self.n_features_ = len(self.mlb.classes_)
        return features

    def transform(self, routes):
        """
        Encode ``routes`` with the vocabulary learned by ``fit_transform``.

        routes : iterable of encoded polylines (str) or of lists of points
        """
        hashes = self.convert_to_hashes(routes)
        return self.mlb.transform(hashes)

    def get_hashes(self, path):
        """
        Return the set of geohashes covered by a single route.

        path : encoded polyline (str) or an iterable of coordinate pairs
        """
        # Accept both forms: convert_to_hashes may pass either a raw polyline
        # or an already decoded list of points.
        if isinstance(path, str):
            path = polyline.decode(path)
        # Deduplicate points first so each distinct point is encoded only once;
        # tuple() keeps this working when points arrive as lists.
        unique_points = {tuple(point) for point in path}
        return {geohash.encode(*point, precision=self.geohash_precision)
                for point in unique_points}

    def convert_to_hashes(self, routes):
        """
        Return one geohash set per route, in input order.

        routes : iterable of encoded polylines (str) or of lists of points
        """
        # Materialise to a list instead of peeking at routes[0]: on a pandas
        # Series [0] is a label lookup (which fails once rows have been dropped)
        # and on an empty input it raises IndexError.
        routes = list(routes)
        return [self.get_hashes(path)
                for path in tqdm(routes, desc='Creating geohashes')]

    
class TurnAngleCounter:
    """
    Count the turns along each route, bucketed by direction and sharpness.

    Every interior point of a route yields one signed turn value (see
    ``turn_angle_cos``); the values of a route are histogrammed into five
    fixed buckets. The transformer keeps no state, so there is nothing to fit.
    """

    # Bucket edges for the signed turn value. The value lies in [-1, 1]; the
    # outer edges -2 and 2 only make sure the end buckets catch everything.
    BIN_EDGES = (-2, -0.95, -0.2, 0.2, 0.95, 2)
    # Output column per bucket, in the same order as the buckets above.
    # NOTE(review): the 30/60 suffixes are nominal — as a sine, 0.2 corresponds
    # to roughly 11.5 degrees and 0.95 to roughly 72 degrees; confirm intent.
    BIN_LABELS = ('left_60', 'left_30', 'straight', 'right_30', 'right_60')

    def __init__(self):
        pass

    def fit_transform(self, routes: pd.Series):
        """
        Build the turn-count table for a Series of routes.

        Parameters
        ----------
        routes : pd.Series
            Encoded polylines (str) or lists of points, one route per row.
            Must not contain missing values. Requires ``tqdm.pandas()`` to
            have been called so that ``progress_apply`` exists.

        Returns
        -------
        pd.DataFrame
            One row per route (same index), one column per bucket.
        """
        # Apply the per-route helper: progress_apply hands over one route at a
        # time, so the batch method convert_to_counts must not be used here.
        return routes.progress_apply(self._route_to_counts)

    def turn_angle_cos(self, points):
        """
        Signed turn measure at the middle of three consecutive points.

        Despite the name this is not a cosine: the first segment is rotated by
        90 degrees before the dot product, so the result is the normalised 2-D
        cross product of the two segments, i.e. the sine of the signed angle
        from the first segment to the second. 0 means the three points are
        collinear (straight on or a full reversal), +1/-1 a right-angle turn.
        For (lat, lon) points a positive value is a clockwise (right) turn.

        Returns ``nan`` when either segment has zero length.
        """
        (x1, y1), (x2, y2), (x3, y3) = points
        seg1_x, seg1_y = x2 - x1, y2 - y1
        seg2_x, seg2_y = x3 - x2, y3 - y2
        seg1_len = np.sqrt(seg1_x ** 2 + seg1_y ** 2)
        seg2_len = np.sqrt(seg2_x ** 2 + seg2_y ** 2)
        norm = seg1_len * seg2_len
        if norm == 0:
            # Repeated point: the direction is undefined, so there is no turn.
            return np.nan
        # Dot product of seg1 rotated by 90 degrees, (-seg1_y, seg1_x), with seg2.
        dot = -seg1_y * seg2_x + seg1_x * seg2_y
        return dot / norm

    def get_path_angles(self, points):
        """
        Count the turns of one decoded route per bucket.

        Parameters
        ----------
        points : sequence of (x, y) pairs
            Must support slicing. Routes with fewer than three points have no
            interior point and produce all-zero counts.

        Returns
        -------
        pd.Series
            Counts indexed by ``BIN_LABELS``.
        """
        turns = np.array(
            [self.turn_angle_cos(points[i:i + 3]) for i in range(len(points) - 2)],
            dtype=float,  # keeps an empty route a float array instead of object
        )
        # Undefined turns (zero-length segments) are not counted in any bucket.
        turns = turns[~np.isnan(turns)]
        counts, _ = np.histogram(turns, bins=list(self.BIN_EDGES))
        return pd.Series(dict(zip(self.BIN_LABELS, counts)))

    def convert_to_counts(self, routes):
        """
        Batch version: return one counts Series per route, in input order.

        routes : iterable of encoded polylines (str) or of lists of points
        """
        # Materialise first so that neither an empty input nor a pandas Series
        # with a non-default index breaks on positional access.
        routes = list(routes)
        return [self._route_to_counts(route)
                for route in tqdm(routes, desc='Counting angles')]

    def _route_to_counts(self, route):
        """Decode ``route`` if it is still an encoded polyline, then count its turns."""
        points = decode_polyline(route) if isinstance(route, str) else route
        return self.get_path_angles(points)

In [7]:
def decode_polylines_to_dd(series):
    """
    Decode a pandas Series of encoded polylines into a Dask series of point lists.

    Missing values are dropped before decoding, so the result can have fewer
    rows than ``series``. Decoding runs through ``progress_apply`` (which shows
    a progress bar), and the decoded Series is then split into Dask
    partitions of 50000 rows.
    """
    present = series.dropna()
    decoded = present.progress_apply(decode_polyline)
    return dd.from_pandas(decoded, chunksize=50000)

# In action

In [8]:
# Load the three competition files, keeping only the listed columns and using Id as the index.
# Train keeps both the encoded route and the encoded track polylines.
df = pd.read_csv("/kaggle/input/citymobil-eta-correction/train.csv/train.csv", 
                 usecols=("Id", "ETA", "EDA", "RTA", "RDA", "route", "track"),
                 index_col="Id")
# Test: no RTA/RDA and no track column is read.
df_test = pd.read_csv("/kaggle/input/citymobil-eta-correction/test.csv", 
                      usecols=("Id", "ETA", "EDA", "route"),
                      index_col="Id")
# Validation: RTA/RDA are read, the track column is not.
df_val = pd.read_csv("/kaggle/input/citymobil-eta-correction/validation.csv", 
                     usecols=("Id", "ETA", "EDA", "RTA", "RDA", "route"),
                     index_col="Id")
df.head()

Unnamed: 0_level_0,ETA,RTA,EDA,RDA,route,track
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1249.0,1076.0,7.0,8.0,stvqG{n|lFf@bCN|@NhBH~D?n@MvC_@rES`B??YFWCoAgE...,
1,1822.0,1812.0,15.0,14.0,ooamI}`qtIXtE??h@_@|@^??lAaN??jA`@??XwD`@_E|@u...,m_qtI}namIWEQASCO?@B@@A@??????????EAKEBKHGDA??...
2,410.0,336.0,2.0,2.0,u}ueI_hclHwBkGaDqJo@oA]OiCuHK[M{@??Ag@Ie@UYSIU...,wlclHk}ueIGJVH????LJEJ\AtCJbBTHFVB?n@hC\`@iBiA...
3,653.0,558.0,4.0,4.0,qy{yIshaqJuADoAS_Bb@??UiE??wIfCgOrD}FhB_Bd@??m...,upaqJ{b|yI??^Dl@RFRCTFPNP?PIXGVAH????AA?????A@...
4,518.0,463.0,6.0,6.0,gmtyI{haqJ\f@??]pA]|@i@t@[Z_@X??c@gDcAoI??aEzB...,


In [None]:
# Decode the route polylines of every split into Dask series of point lists.
# Rows with a missing route are dropped by decode_polylines_to_dd, so each result
# may be shorter than the frame it came from.
train_routes = decode_polylines_to_dd(df.route)
test_routes = decode_polylines_to_dd(df_test.route)
val_routes = decode_polylines_to_dd(df_val.route)

# The track column is deliberately not decoded here: too huge
# (presumably too large to decode within the available memory — TODO confirm).
# train_tracks = decode_polylines_to_dd(df.track)

HBox(children=(FloatProgress(value=0.0, max=655657.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=347516.0), HTML(value='')))

In [None]:
# Per-route turn features for the route column, with every column prefixed 'E_turns_'.
# NOTE(review): process_route is not defined in any cell visible above, so on a fresh kernel
# this raises NameError unless it is defined elsewhere. TurnAngleCounter looks like the
# intended replacement — TODO confirm.
est_turns = df.route.progress_apply(process_route).add_prefix('E_turns_')

In [None]:
# Per-track turn features for the track column, with every column prefixed 'R_turns_'.
# NOTE(review): track is missing for some rows (e.g. Id 0 and 4 in the preview of df), and
# no dropna() is applied here, so process_route has to cope with NaN input — TODO confirm.
act_turns = df.track.progress_apply(process_route).add_prefix('R_turns_')

In [None]:
# Combine route and track turn features side by side, aligned on the Id index.
turns_data = est_turns.join(act_turns)

In [None]:
# Persist the turn features to 'route_turns.csv' (relative to the current working directory).
turns_data.to_csv('route_turns.csv')

In [None]:
# Column-wise mean of every turn feature.
turns_data.mean()

# Points one-hot encoding

In [None]:
# NOTE(review): this re-reads train.csv with exactly the same arguments as the earlier load
# cell and rebinds df; it looks redundant unless df was modified in between — TODO confirm.
df = pd.read_csv("/kaggle/input/citymobil-eta-correction/train.csv/train.csv", 
                 usecols=("Id", "ETA", "EDA", "RTA", "RDA", "route", "track"),
                 index_col="Id")

In [None]:
# One hash collection per non-missing route; the index is the Id of the rows that had a route.
# NOTE(review): no module-level get_hashes is defined in the cells visible above (only the
# GeohashBinarizer.get_hashes method), so this raises NameError on a fresh kernel unless it
# is defined elsewhere — TODO confirm.
route_hashes = df.route.dropna().progress_apply(get_hashes)

In [None]:
# Same as for routes, applied to the non-missing tracks.
# NOTE(review): relies on a module-level get_hashes that is not defined in the cells visible
# above — TODO confirm where it comes from.
track_hashes = df.track.dropna().progress_apply(get_hashes)

In [None]:
# Plain-list copies of the hash Series.
# NOTE(review): neither th nor rh is referenced again in the visible cells — possibly leftovers.
th = track_hashes.to_list()
rh = route_hashes.to_list()

In [None]:
# Number of rows that had a non-missing route.
len(route_hashes)

In [None]:
# Feature matrix with one row per entry of route_hashes.
# NOTE(review): mlb is not defined in any cell visible above; presumably a
# MultiLabelBinarizer like the one GeohashBinarizer creates — TODO confirm.
route_X = mlb.fit_transform(route_hashes.to_list())

In [None]:
# (number of routes, number of distinct hash features)
route_X.shape

In [None]:
# Regression target: the ETA / RTA ratio of each row. Keep only rows with a non-missing
# route, because route_X is built from df.route.dropna(); without the mask the target and
# the feature matrix have different lengths whenever a route is missing.
target = (df['ETA'] / df['RTA'])[df['route'].notna()].to_numpy()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(route_X, target, test_size=0.2, random_state=256)

In [None]:
# Vowpal Wabbit regressor with the library's default hyperparameters.
vwr = VWRegressor()

In [None]:
# Fit on the training part of the split.
vwr.fit(X_train, y_train)

In [None]:
# Hold-out MAE of the predicted ETA / RTA ratio. Arguments follow scikit-learn's
# (y_true, y_pred) order; MAE is symmetric, so the value is the same either way.
mean_absolute_error(y_test, vwr.predict(X_test))