In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df_riders = pd.read_csv("data/Riders.csv")
df_train = pd.read_csv("data/Train.csv")

df = df_riders.merge(df_train, how='inner', on='Rider Id')
df.columns = df.columns.str.lower().str.replace(' ', '_')
del df['arrival_at_destination_-_time']
df.shape

(21201, 32)

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
rider_id,Rider_Id_396,Rider_Id_396,Rider_Id_479,Rider_Id_479,Rider_Id_479
no_of_orders,2946,2946,360,360,360
age,2298,2298,951,951,951
average_rating,14.0,14.0,13.5,13.5,13.5
no_of_ratings,1159,1159,176,176,176
order_no,Order_No_19506,Order_No_14309,Order_No_5471,Order_No_515,Order_No_12379
user_id,User_Id_2746,User_Id_1792,User_Id_2162,User_Id_1363,User_Id_2622
vehicle_type,Bike,Bike,Bike,Bike,Bike
platform_type,1,3,3,3,2
personal_or_business,Personal,Business,Business,Business,Personal


# modeling

In [4]:
def prepare_model(df):
    #Feature engineering
    df['placement_-_time_hour'] = pd.to_datetime(df['placement_-_time']).dt.hour
    df['placement_-_time_minute'] = pd.to_datetime(df['placement_-_time']).dt.minute
    del df['placement_-_time']

    df['confirmation_-_time_hour'] = pd.to_datetime(df['confirmation_-_time']).dt.hour
    df['confirmation_-_time_minute'] = pd.to_datetime(df['confirmation_-_time']).dt.minute
    del df['confirmation_-_time']

    df['arrival_at_pickup_-_time_hour'] = pd.to_datetime(df['arrival_at_pickup_-_time']).dt.hour
    df['arrival_at_pickup_-_time_minute'] = pd.to_datetime(df['arrival_at_pickup_-_time']).dt.minute
    del df['arrival_at_pickup_-_time']


    df['pickup_-_time_hour'] = pd.to_datetime(df['pickup_-_time']).dt.hour
    df['pickup_-_time_minute'] = pd.to_datetime(df['pickup_-_time']).dt.minute
    del df['pickup_-_time']


    #df['arrival_at_destination_-_time_hour'] = pd.to_datetime(df['arrival_at_destination_-_time']).dt.hour
    #df['arrival_at_destination_-_time_minute'] = pd.to_datetime(df['arrival_at_destination_-_time']).dt.minute
    #del df['arrival_at_destination_-_time']
    
    
    #replace mising values
    df['temperature'] = df['temperature'].fillna(0)
    df['precipitation_in_millimeters'] = df['precipitation_in_millimeters'].fillna(0)
    
    return df    

In [5]:
def apply_log_transform(y):
    return np.log1p(y)

apply_log_transform(2000)

7.601402334583733

In [6]:
df = prepare_model(df)
df.sample(3).T

Unnamed: 0,5424,2557,5864
rider_id,Rider_Id_726,Rider_Id_268,Rider_Id_735
no_of_orders,1337,1714,407
age,875,1980,875
average_rating,14.2,13.2,14.1
no_of_ratings,243,357,108
order_no,Order_No_24260,Order_No_17171,Order_No_23471
user_id,User_Id_1205,User_Id_1342,User_Id_2350
vehicle_type,Bike,Bike,Bike
platform_type,3,3,1
personal_or_business,Business,Business,Personal


In [7]:
df.time_from_pickup_to_arrival = df['time_from_pickup_to_arrival'].apply(apply_log_transform)

In [8]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)


y_train = df_train.time_from_pickup_to_arrival.values
y_val = df_val.time_from_pickup_to_arrival.values
y_test = df_test.time_from_pickup_to_arrival.values


del df_train['time_from_pickup_to_arrival']
del df_val['time_from_pickup_to_arrival']
del df_test['time_from_pickup_to_arrival']


print(df_train.shape[0], df_val.shape[0], df_test.shape[0] )

12720 4240 4241


In [9]:
#Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
train_dict = df_train.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Training and Evaluate model

In [10]:
from sklearn.tree import DecisionTreeRegressor,export_text

In [11]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train,y_train)

y_pred = dt.predict(X_val)
rmse = mean_squared_error(y_val,y_pred,squared=False)
print(round(rmse,3))

1.122


# Evaluate model on Test set

In [20]:
df_riders = pd.read_csv("data/Riders.csv")
df_test = pd.read_csv("data/Test.csv")

df_test = df_riders.merge(df_test, how='inner', on='Rider Id')
df_test.columns = df_test.columns.str.lower().str.replace(' ', '_')
df_test.shape

(7068, 29)

In [21]:
def convert_to_original(x):
    return int(np.exp(x))

In [22]:
df_test.sample(3).T

Unnamed: 0,2479,3632,1698
rider_id,Rider_Id_354,Rider_Id_622,Rider_Id_353
no_of_orders,1130,1465,2173
age,1715,719,1157
average_rating,13.3,14.3,14.5
no_of_ratings,303,130,683
order_no,Order_No_4333,Order_No_8729,Order_No_10040
user_id,User_Id_3201,User_Id_1464,User_Id_2763
vehicle_type,Bike,Bike,Bike
platform_type,3,2,3
personal_or_business,Business,Personal,Business


In [23]:
df_test = prepare_model(df_test)
df_test.sample(3).T

Unnamed: 0,6739,3913,6756
rider_id,Rider_Id_438,Rider_Id_383,Rider_Id_438
no_of_orders,557,2412,557
age,239,787,239
average_rating,14.6,14.3,14.6
no_of_ratings,55,309,55
order_no,Order_No_7583,Order_No_17134,Order_No_24993
user_id,User_Id_733,User_Id_3470,User_Id_2344
vehicle_type,Bike,Bike,Bike
platform_type,3,3,3
personal_or_business,Business,Business,Business


In [24]:
test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

In [25]:
y_pred_test = dt.predict(X_test)

In [31]:
df_sub=df_test[['order_no']]
df_sub['Time from Pickup to Arrival'] = y_pred_test
df_sub['Time from Pickup to Arrival']=df_sub['Time from Pickup to Arrival'].apply(convert_to_original)
df_sub['order_no'] = df_sub['order_no'].str.strip()
df_sub.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Time from Pickup to Arrival'] = y_pred_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['Time from Pickup to Arrival']=df_sub['Time from Pickup to Arrival'].apply(convert_to_original)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['order_no'] = df_sub['order_no'].str.strip(

(7068, 2)

In [32]:
df_sub.sample(5)

Unnamed: 0,order_no,Time from Pickup to Arrival
2356,Order_No_1188,1462
6875,Order_No_2824,1462
2118,Order_No_20284,1462
4819,Order_No_23221,1462
5104,Order_No_19052,564


In [34]:
import time

ts = int(time.time())
df_sub.to_csv('./submissions/{}.csv'.format(str(ts)),index=False)