In [1]:
import pandas as pd
import numpy as np
import pylab as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn import cluster
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn import pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin

%matplotlib inline

In [2]:
# Calculate average speed, and use that 
class AvgSpeed(BaseEstimator, RegressorMixin):
    def fit(self, X, y):
        if 'Trip_distance' not in X.columns:
            raise KeyError('X Dataframe needs to have column "Trip_distance"')
        self.avg = (y / X['Trip_distance']).mean()
        return self
    def predict(self, X):
        if 'Trip_distance' not in X.columns:
            raise KeyError('X Dataframe needs to have column "Trip_distance"')
        return self.avg * X['Trip_distance']

# Load data

In [3]:
# load training
training = pd.read_csv('data/Train.csv').set_index('ID')
training['Timestamp'] = pd.to_datetime(training['Timestamp'])
training.head()

Unnamed: 0_level_0,Timestamp,Origin_lat,Origin_lon,Destination_lat,Destination_lon,Trip_distance,ETA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000FLWA8,2019-12-04 20:01:50+00:00,3.258,36.777,3.003,36.718,39627,2784
000RGOAM,2019-12-10 22:37:09+00:00,3.087,36.707,3.081,36.727,3918,576
001QSGIH,2019-11-23 20:36:10+00:00,3.144,36.739,3.088,36.742,7265,526
002ACV6R,2019-12-01 05:43:21+00:00,3.239,36.784,3.054,36.763,23350,3130
0039Y7A8,2019-12-17 20:30:20+00:00,2.912,36.707,3.207,36.698,36613,2138


In [4]:
# sample set 
sample_set = pd.read_csv('data/SampleSubmission.csv').set_index('ID')
sample_set.head()

Unnamed: 0_level_0,ETA
ID,Unnamed: 1_level_1
000V4BQX,0
003WBC5J,0
004O4X3A,0
006CEI5B,0
009G0M2T,0


In [5]:
# testing data
testing = pd.read_csv('data/Test.csv').set_index('ID')
testing['Timestamp'] = pd.to_datetime(testing['Timestamp'])
testing.head()

Unnamed: 0_level_0,Timestamp,Origin_lat,Origin_lon,Destination_lat,Destination_lon,Trip_distance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000V4BQX,2019-12-21 05:52:37+00:00,2.981,36.688,2.978,36.754,17549
003WBC5J,2019-12-25 21:38:53+00:00,3.032,36.769,3.074,36.751,7532
004O4X3A,2019-12-29 21:30:29+00:00,3.035,36.711,3.01,36.758,10194
006CEI5B,2019-12-31 22:51:57+00:00,2.902,36.738,3.208,36.698,32768
009G0M2T,2019-12-28 21:47:22+00:00,2.86,36.692,2.828,36.696,4513


In [6]:
# weather data
weather_df = pd.read_csv('data/Weather.csv')
weather_df.head()

Unnamed: 0,date,dewpoint_2m_temperature,maximum_2m_air_temperature,mean_2m_air_temperature,mean_sea_level_pressure,minimum_2m_air_temperature,surface_pressure,total_precipitation,u_component_of_wind_10m,v_component_of_wind_10m
0,2019-11-01,290.630524,296.434662,294.125061,101853.617188,292.503998,100806.351562,0.004297,3.561323,0.941695
1,2019-11-02,289.135284,298.432404,295.551666,101225.164062,293.337921,100187.25,0.001767,5.318593,3.258237
2,2019-11-03,287.667694,296.612122,295.182831,100806.617188,293.674316,99771.414062,0.000797,8.447649,3.172982
3,2019-11-04,287.634644,297.173737,294.368134,101240.929688,292.376221,100200.84375,0.000393,5.991428,2.2367
4,2019-11-05,286.413788,294.284851,292.496979,101131.75,289.143066,100088.5,0.004658,6.96273,2.655364


# Make train and OOT set

In [7]:
# make sure sorted by date
training = training.sort_values('Timestamp')
train_df = training.iloc[:-20000]
oot_df = training.iloc[-20000:]
oot_df.shape,train_df.shape

((20000, 7), (63924, 7))

# AVG Speed Model

In [8]:
# baseline model
split = model_selection.TimeSeriesSplit(n_splits=5)
result= model_selection.cross_validate(AvgSpeed(),train_df, train_df['ETA'],cv=split,
                               scoring='neg_root_mean_squared_error')
result, -np.mean(result['test_score']), np.std(result['test_score'])

({'fit_time': array([0.01299405, 0.01099563, 0.0199976 , 0.0240016 , 0.03400373]),
  'score_time': array([0.01000452, 0.00599957, 0.01000047, 0.01199746, 0.00799775]),
  'test_score': array([-608.83239483, -619.45547343, -642.25524582, -602.10699022,
         -628.52682296])},
 620.2353854497933,
 14.23304154386)

In [9]:
# check score on Dev set
reg = AvgSpeed()
reg.fit(train_df, train_df.ETA)
np.sqrt(metrics.mean_squared_error(oot_df.ETA, reg.predict(oot_df)))

632.4052293330318

## Make Submission

In [10]:
# fit model on all data
reg = AvgSpeed()
reg.fit(training, training.ETA)
reg.avg # time / distance

0.10475614041193451

In [11]:
# run test data through
submission = testing.copy()
submission['ETA'] = reg.predict(testing)
submission.head()

Unnamed: 0_level_0,Timestamp,Origin_lat,Origin_lon,Destination_lat,Destination_lon,Trip_distance,ETA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000V4BQX,2019-12-21 05:52:37+00:00,2.981,36.688,2.978,36.754,17549,1838.365508
003WBC5J,2019-12-25 21:38:53+00:00,3.032,36.769,3.074,36.751,7532,789.02325
004O4X3A,2019-12-29 21:30:29+00:00,3.035,36.711,3.01,36.758,10194,1067.884095
006CEI5B,2019-12-31 22:51:57+00:00,2.902,36.738,3.208,36.698,32768,3432.649209
009G0M2T,2019-12-28 21:47:22+00:00,2.86,36.692,2.828,36.696,4513,472.764462


In [12]:
# save
submission[['ETA']].to_csv('baseline_submit_base.csv')

# Do clustering on coordinates

In [13]:
preprocessor = ColumnTransformer(
        transformers=[
            ('Coordninates', cluster.KMeans(n_clusters=8),['Origin_lat','Origin_lon',
                                                           'Destination_lat','Destination_lon']),
            ('Distance', StandardScaler(), ['Trip_distance'])])

    # create model
pipeline = pipeline.Pipeline(steps=[
                          ('preprocessor', preprocessor),
    
                          ('Regression', LinearRegression())])   

In [14]:
split = model_selection.TimeSeriesSplit(n_splits=5)
result= model_selection.cross_validate(pipeline,train_df.drop(columns=['ETA']), train_df['ETA'],cv=split,
                               scoring='neg_root_mean_squared_error')
result, -np.mean(result['test_score']), np.std(result['test_score'])

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


({'fit_time': array([1.69400239, 0.69600654, 0.95799994, 1.29400396, 1.83400869]),
  'score_time': array([0.02100563, 0.01099563, 0.01999903, 0.01699591, 0.01699758]),
  'test_score': array([-249.52950451, -245.11699813, -258.93709838, -238.67766129,
         -238.29852436])},
 246.11195733479002,
 7.661231076028806)

In [15]:
# test on DEV set
pipeline.fit(training.drop(columns=['ETA']), training.ETA)
np.sqrt(metrics.mean_squared_error(oot_df.ETA, pipeline.predict(oot_df)))

  super()._check_params_vs_input(X, default_n_init=10)


229.6707470402815

In [16]:
# make submission
# run test data through
submission = testing.copy()
submission['ETA'] = pipeline.predict(testing)
submission.head()

Unnamed: 0_level_0,Timestamp,Origin_lat,Origin_lon,Destination_lat,Destination_lon,Trip_distance,ETA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000V4BQX,2019-12-21 05:52:37+00:00,2.981,36.688,2.978,36.754,17549,1444.955884
003WBC5J,2019-12-25 21:38:53+00:00,3.032,36.769,3.074,36.751,7532,798.984778
004O4X3A,2019-12-29 21:30:29+00:00,3.035,36.711,3.01,36.758,10194,998.348591
006CEI5B,2019-12-31 22:51:57+00:00,2.902,36.738,3.208,36.698,32768,2086.691574
009G0M2T,2019-12-28 21:47:22+00:00,2.86,36.692,2.828,36.696,4513,431.459254


In [17]:
submission[['ETA']].to_csv('kmeans_submit_base.csv')

In [18]:
# see what pipeline is doing
# extract feature names from pipeline
def get_column_names_from_ColumnTransformer(column_transformer):    
    col_name = []
    for transformer_in_columns in column_transformer.transformers_:#the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        methods = transformer_in_columns[0]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names()
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += [methods + '__'+ str(i) for i in names.tolist()]
        elif isinstance(names,list):
            col_name += [methods + '__'+ str(i) for i in names]    
        elif isinstance(names,str):
            col_name.append(methods + '__'+ names)
    return col_name



In [19]:
cols = get_column_names_from_ColumnTransformer(pipeline.named_steps['preprocessor'])
a = pipeline.named_steps['preprocessor']
a.transformers[0][2]

NameError: name 'Pipeline' is not defined

In [None]:
a.named_transformers_['Coordninates'].predict(train_df[a.transformers[0][2]])

# add weather data and try other models

# make submission
