In [1]:
! pip install -q polyline python-geohash

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the attached datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/final-etarta/validation.csv
/kaggle/input/final-etarta/test_additional.csv
/kaggle/input/final-etarta/test.csv
/kaggle/input/final-etarta/train.csv/train.csv


In [19]:
import itertools

import geohash
import polyline
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm.autonotebook import tqdm
from vowpalwabbit.sklearn_vw import VWRegressor

# Register tqdm's `progress_apply` on pandas objects; it is used when decoding routes.
tqdm.pandas()

# Feature transformers

In [4]:
def drop_consecutive_duplicates(path):
    """
    Collapse every run of equal neighbouring items in ``path`` to one item.

    Only adjacent repeats are removed; a value that reappears later in the
    sequence is kept. Always returns a new list.
    """
    deduplicated = []
    for point, _run in itertools.groupby(path):
        deduplicated.append(point)
    return deduplicated

def decode_polyline(path):
    """
    Decode an encoded polyline string into a list of points.

    The string is decoded with ``polyline.decode`` and consecutive repeated
    points are then removed with ``drop_consecutive_duplicates``.
    """
    points = polyline.decode(path)
    return drop_consecutive_duplicates(points)

In [5]:
class GeohashBinarizer():
    """
    Convert polylines to sparse feature matrix

    Every route is reduced to the set of geohashes of its points, and the sets
    are multi-hot encoded with a ``MultiLabelBinarizer``.

    Parameters
    ----------
    geohash_precision : int, default=6
        Precision passed to ``geohash.encode``. Feature counts noted by the
        author for this data set:
        6 => ~10000 features.
        7 => ~100000 features.

    Attributes
    ----------
    n_features_ : int
        Number of distinct geohashes seen by ``fit_transform``.
    mlb : MultiLabelBinarizer
        Binarizer created with ``sparse_output=True``.
    """
    def __init__(self, geohash_precision=6):
        self.geohash_precision = geohash_precision
        self.mlb = MultiLabelBinarizer(sparse_output=True)

    def fit_transform(self, routes):
        """
        Learn the geohash vocabulary from ``routes`` and encode them.

        routes : polylines or lists of points

        Returns the matrix produced by ``MultiLabelBinarizer.fit_transform``
        and records the vocabulary size in ``n_features_``.
        """
        hashes = self.convert_to_hashes(routes)
        features = self.mlb.fit_transform(hashes)
        self.n_features_ = len(self.mlb.classes_)
        return features

    def transform(self, routes):
        """
        Encode ``routes`` with the vocabulary learned by ``fit_transform``.

        routes : polylines or lists of points
        """
        hashes = self.convert_to_hashes(routes)
        features = self.mlb.transform(hashes)
        return features

    def get_hashes(self, path):
        """Return the set of geohashes covering the points of one decoded route."""
        # De-duplicate the points first so each distinct point is encoded once.
        return {geohash.encode(*point, precision=self.geohash_precision)
                for point in set(path)}

    def convert_to_hashes(self, routes):
        """
        Turn an iterable of routes into a list of geohash sets.

        Routes given as encoded polyline strings are decoded first; the type
        of the first route decides which form the whole input is treated as.
        An empty input yields an empty list.
        """
        # Peek at the first element by iteration instead of ``routes[0]``:
        # on a pandas Series ``[0]`` is a label lookup that fails when the
        # index has no label 0, and on an empty input it raises IndexError.
        first = next(iter(routes), None)
        if isinstance(first, str):
            routes = [decode_polyline(path)
                      for path in tqdm(routes, desc='Decoding polylines')]
        hashes = [self.get_hashes(path)
                  for path in tqdm(routes, desc='Creating geohashes')]
        return hashes

    
class TurnAngleCounter:
    """
    Count the turns of a route, bucketed by direction and sharpness.

    For every three consecutive points a normalised turn value in [-1, 1] is
    computed (see ``turn_angle_cos``) and the values of a route are counted in
    five bins, giving the columns ``left_60``, ``left_30``, ``straight``,
    ``right_30`` and ``right_60``.
    """
    # Histogram edges for the normalised turn value; the outer edges (-2, 2)
    # lie outside the possible range so every finite value lands in a bin.
    ANGLE_BINS = [-2, -0.95, -0.2, 0.2, 0.95, 2]
    # Output column for each bin, in bin order.
    COLUMNS = ['left_60', 'left_30', 'straight', 'right_30', 'right_60']

    def __init__(self):
        pass

    def fit_transform(self, series):
        """
        Count turns for every route of a pandas Series.

        series : pandas Series of polylines or lists of points

        Returns a DataFrame with one row per route, indexed like ``series``.
        """
        return pd.DataFrame(self.convert_to_counts(series), index=series.index)

    def turn_angle_cos(self, points):
        """
        Return the normalised turn value for three consecutive points.

        The first segment is rotated by 90 degrees before taking the dot
        product with the second one, so the result is the 2D cross product of
        the two segments divided by their lengths. Despite the method name
        this is the sine of the turn angle, not its cosine.

        NOTE(review): a reversal of direction gives a value near 0 and is
        therefore counted as ``straight`` by ``get_path_angles``.
        Segments must have non-zero length, otherwise the division is by zero.
        """
        (x1, y1), (x2, y2), (x3, y3) = points
        seg1_x = x2-x1
        seg1_y = y2-y1
        seg2_x = x3-x2
        seg2_y = y3-y2
        # First segment rotated by 90 degrees.
        seg1_conj_x = -seg1_y
        seg1_conj_y = seg1_x
        seg1_len = np.sqrt(seg1_x ** 2 + seg1_y ** 2)
        seg2_len = np.sqrt(seg2_x ** 2 + seg2_y ** 2)
        dot = seg1_conj_x * seg2_x + seg1_conj_y * seg2_y
        return dot / (seg1_len * seg2_len)

    def get_path_angles(self, points):
        """
        Return the per-bin turn counts of one decoded route as a Series.

        Routes with fewer than three points have no turns and give all zeros.
        """
        # An explicit float array keeps the empty case well-typed.
        angles = np.asarray(
            [self.turn_angle_cos(points[i:i+3]) for i in range(len(points)-2)],
            dtype=float,
        )
        counts, _ = np.histogram(angles, bins=self.ANGLE_BINS)
        return pd.Series(dict(zip(self.COLUMNS, counts)))

    def convert_to_counts(self, routes):
        """
        Turn an iterable of routes into a list of per-route count Series.

        Routes given as encoded polyline strings are decoded first; the type
        of the first route decides which form the whole input is treated as.
        An empty input yields an empty list.
        """
        # Peek at the first element by iteration instead of ``routes[0]``:
        # ``fit_transform`` passes a pandas Series, where ``[0]`` is a label
        # lookup that fails when the index has no label 0.
        first = next(iter(routes), None)
        if isinstance(first, str):
            routes = [decode_polyline(path)
                      for path in tqdm(routes, desc='Decoding polylines')]
        counts = [self.get_path_angles(points)
                  for points in tqdm(routes, desc='Counting angles')]
        return counts

In [6]:
def decode_polylines_to_pd(series):
    """
    Decode a pandas Series of encoded polylines, showing a progress bar.

    Missing entries are dropped first, so the result keeps only the index
    labels of the rows that had a route.
    """
    present = series.dropna()
    return present.progress_apply(decode_polyline)

# In action

In [4]:
from pathlib import Path

In [5]:
# Folder the train / validation / test CSV files are read from.
# NOTE(review): the directory listing printed earlier shows only
# /kaggle/input/final-etarta - confirm this dataset is attached before running.
data_path = Path('/kaggle/input') / 'preprocessed-citymobil' / 'preprocessed_citymobil'

In [9]:
# Load the three splits indexed by "Id", keeping only the columns used below.
# The test split is read without the RTA / RDA columns.
df_train = pd.read_csv(
    data_path / 'train.csv',
    index_col="Id",
    usecols=("Id", "ETA", "EDA", "RTA", "RDA", "route"),
)
df_val = pd.read_csv(
    data_path / 'validation.csv',
    index_col="Id",
    usecols=("Id", "ETA", "EDA", "RTA", "RDA", "route"),
)
df_test = pd.read_csv(
    data_path / 'test_additional.csv',
    index_col="Id",
    usecols=("Id", "ETA", "EDA", "route"),
)
df_train.head()

Unnamed: 0_level_0,ETA,RTA,EDA,RDA,route
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,226.0,188.0,1.0,1.0,gnvsIaq{jHChA??uC???OPG^F^NRzKBd@AN[r@???`@`@`...
1,718.0,725.0,5.0,6.0,sqpsI}~zjHyAr]e@lMk@fLaBlb@i@rLKhBCdAUxEGlCg@f...
2,612.0,764.0,5.0,5.0,auosI}mmkH?LHd@KhC??o@w@[g@m@iAUk@??{G|OiB`Ek@...
3,1560.0,1412.0,13.0,14.0,{lhsIiffkHmKN_C?mIPwMJ??Si@gA{B??Wq@MRCJTp@hAd...
4,1528.0,893.0,9.0,10.0,yxusI{xnjHgAfG??}IuHkAqA??pIoe@VsA??dAkG`BuH??...


In [8]:
# Precision 7 instead of the default 6: per the class docstring this gives
# roughly 100000 geohash features rather than roughly 10000.
geohasher = GeohashBinarizer(geohash_precision=7)

In [9]:
# Vowpal Wabbit regressor with default settings; GeoFeatures fits it on the geohash features.
vwr = VWRegressor()

## Transform

In [11]:
class GeoFeatures:
    """
    Build per-route geo features and write them to ``geofeatures_<label>.csv``.

    Two feature groups are produced for every row with a non-null ``route``:
    the turn counts of ``TurnAngleCounter`` (columns prefixed with ``turn_``)
    and the prediction of a regressor fitted on the geohash features (column
    ``geohash_based_prediction``).

    Parameters
    ----------
    geohasher : object with ``fit_transform(routes)`` and ``transform(routes)``
    vwr : regressor with ``fit(x, y)`` and ``predict(x)``
    """
    def __init__(self, geohasher, vwr):
        self.geohasher = geohasher
        self.vwr = vwr

    def fit_transform(self, df, label=None):
        """
        Fit the geohasher and the regressor on ``df``, then save its features.

        df : DataFrame with ``route``, ``ETA`` and ``RTA`` columns
        label : suffix of the output file name

        The regression target is ``ETA / RTA`` on the rows that have a route.
        NOTE(review): rows with ``RTA == 0`` or missing ETA/RTA give non-finite
        targets - confirm they are filtered upstream.
        """
        decoded = self.get_decoded(df)
        # The geohash vocabulary has to be learned here; calling ``transform``
        # alone would run on a binarizer that was never fitted.
        # A plain list keeps element access positional whatever the Id index is.
        x = self.geohasher.fit_transform(decoded.tolist())
        # Same row selection as ``get_decoded`` so x and y stay aligned.
        mask = df.route.notna()
        y = (df['ETA'][mask] / df['RTA'][mask]).to_numpy()
        self.vwr.fit(x, y)
        self.save(decoded, x, label)

    def transform(self, df, label=None):
        """
        Save the features of ``df`` using the already fitted geohasher and regressor.

        df : DataFrame with a ``route`` column
        label : suffix of the output file name
        """
        decoded = self.get_decoded(df)
        x = self.geohasher.transform(decoded.tolist())
        self.save(decoded, x, label)

    def get_decoded(self, df):
        """Return the decoded routes of ``df`` as a Series; rows without a route are dropped."""
        decoded = decode_polylines_to_pd(df.route)
        return decoded

    def save(self, decoded, x, label=None):
        """
        Write turn counts and regressor predictions to ``geofeatures_<label>.csv``.

        decoded : Series of decoded routes, its index becomes the CSV index
        x : geohash feature matrix with one row per entry of ``decoded``
        label : suffix of the output file name
        """
        # Count on a 0..n-1 index and restore the original labels afterwards,
        # so the counter never depends on which Id labels are present.
        counts = TurnAngleCounter().fit_transform(decoded.reset_index(drop=True))
        counts.index = decoded.index
        pred = self.vwr.predict(x)
        pred = pd.Series(pred, index=decoded.index, name='geohash_based_prediction')
        counts.add_prefix('turn_').join(pred).to_csv(f'geofeatures_{label}.csv')

In [14]:
# Bundle the geohash binarizer and the regressor created above into one feature builder.
geo_features = GeoFeatures(geohasher, vwr)

In [18]:
# Must run before the transform calls: this is the only step that fits the regressor.
geo_features.fit_transform(df_train, label='train')

In [16]:
# Build the same features for the validation split.
geo_features.transform(df_val, label='val')

In [17]:
# Build the same features for the test split.
geo_features.transform(df_test, label='test')

In [14]:
import pickle

In [15]:
# Persist the geohash binarizer under the key 'geohasher' so it can be reloaded later.
# The regressor is not part of this pickle; it is saved with its own save() call.
with open('transforms.pkl', 'wb') as f:
    pickle.dump({'geohasher': geohasher}, f)

In [16]:
# Save the regressor under the name 'vwr' using its own save() method.
vwr.save('vwr')