### 1. Import all the necessary libraries

In [8]:
import torch
import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.optim as optim
import category_encoders as ce
import math
import copy
%matplotlib inline

In [2]:
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from math import sqrt

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

### 2. Loading the Dataset

In [3]:
df_tr = pd.read_csv("../data/train.csv")
df_tr.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


### 3. Get Computed Time from POLYLINE

Our goal is to predict the travel-time of the taxi, which can be derived from the POLYLINE length.

Recall:

```
The travel time of the trip (the prediction target of this project) is defined as the (number of points-1) x 15 seconds. 
For example, a trip with 101 data points in POLYLINE has a length of (101-1) * 15 = 1500 seconds. Some trips have missing data points in POLYLINE, indicated by MISSING_DATA column, and it is part of the challenge how you utilize this knowledge.
```


In [4]:
# Over every single 
def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 1, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15
df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

### 4. Test and Extract the features: (Original Call + HR + WK + MON + TAXI_ID)

In [5]:
# Verify our guesses of the patterns of TAXI_ID such that all the IDs are in the form of 
# 20000xxx by substracting all the numbers by 20000000 and check if they are between the 
# range [0,1000).
def TAXI_ID_pattern_checker(x):
    # Test if the only last 3 digits of the TRIP_ID exhibit a pattern
    for idx in range(len(x)):
        if (x[idx]-20000000) < 0 or (x[idx]-20000000) >= 1000:
            return False
    return True

if TAXI_ID_pattern_checker(df_tr["TAXI_ID"]):
    print("Pattern is found!")

# Note that the only last three digits of the TAXI_ID are nonzero.
def parse_TAXI_ID(x):
    return (x % pow(10,3)) 

df_tr["Unique_TAXI_ID"] = df_tr["TAXI_ID"].apply(parse_TAXI_ID)

Pattern is found!


In [6]:
from datetime import datetime
def parse_time(x):
  # We are using python's builtin datetime library
  # https://docs.python.org/3/library/datetime.html#datetime.date.fromtimestamp

  # Each x is essentially a 1 row, 1 column pandas Series
  dt = datetime.fromtimestamp(x["TIMESTAMP"])
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday(), dt.minute

# Because we are assigning multiple values at a time, we need to "expand" our computed (year, month, day, hour, weekday) tuples on 
# the column axis, or axis 1
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
df_tr[["YR", "MON", "DAY", "HR", "WK", "MIN"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [9]:
df_tr_C = copy.deepcopy(df_tr)
# Dropping the missing data 
df_tr.drop(df_tr[df_tr['MISSING_DATA'] == True].index, inplace = True)
df_tr.drop(df_tr[df_tr['POLYLINE'] =='[]']['POLYLINE'].index, inplace = True)

In [11]:
df_test = pd.read_csv("../test/test_public.csv")
df_test[["YR", "MON", "DAY", "HR", "WK", "MIN"]] = df_test[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
df_test["Unique_TAXI_ID"] = df_tr["TAXI_ID"].apply(parse_TAXI_ID)

df_test_id = np.unique(df_test["Unique_TAXI_ID"])
df_test_min = np.unique(df_test["MIN"])
df_test_hr = np.unique(df_test["HR"])
df_test_wk = np.unique(df_test["WK"])
df_test_day = np.unique(df_test["DAY"])
df_test_mon = np.unique(df_test["MON"])
df_test_stand = np.unique(df_test["ORIGIN_STAND"])
df_test_call = np.unique(df_test["ORIGIN_CALL"])

### 5. Random Forest

In [12]:
df_test = pd.read_csv("../test/test_public.csv")
df_test["Unique_TAXI_ID"] = df_test["TAXI_ID"].apply(parse_TAXI_ID)
df_test[["YR", "MON", "DAY", "HR", "WK","MIN"]] = df_test[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [25]:
df_sample = pd.read_csv("../test/sampleSubmission.csv")

In [13]:
indices = pd.read_csv("./train_indices.csv")
df_tr_copy = df_tr_C.iloc[indices["0"].tolist()]

X_typeC = df_tr_copy[["MON", "WK", "HR"]]

y_typeC = df_tr_copy['LEN']

# Random Forest - Call type C
random_regressor_C = RandomForestRegressor()
random_regressor_C.fit(X_typeC, y_typeC)

y_train_pred = random_regressor_C.predict(X_typeC)

print("Train Results for Random Forest Regressor Model:")
print(50 * '-')
print("Root mean squared error: ", sqrt(mse(y_typeC.values, y_train_pred)))
print("R-squared: ", r2_score(y_typeC.values, y_train_pred))

Train Results for Random Forest Regressor Model:
--------------------------------------------------
Root mean squared error:  844.4090515444652
R-squared:  0.1765662300609453


In [14]:
df_test_C = df_test[df_test["CALL_TYPE"]=="C"]
test_feature = df_test_C[["MON", "WK", "HR"]]
test_pred_C = random_regressor_C.predict(test_feature)

In [15]:
df_tr_copy = copy.deepcopy(df_tr[df_tr["CALL_TYPE"]=="B"])

mask = df_tr_copy['Unique_TAXI_ID'].isin(df_test_id)
df_tr_copy = df_tr_copy[mask]

df_tr_copy = df_tr_copy[["MON","WK","HR", "ORIGIN_STAND","DAY", "LEN"]]
df_tr_copy = df_tr_copy.dropna()

X_typeB = df_tr_copy[["WK", "MON","ORIGIN_STAND"]]

y_typeB = df_tr_copy['LEN']


s = StandardScaler()
X_typeB = s.fit_transform(X_typeB)

# Random Forest - Call type B
random_regressor_B = RandomForestRegressor()
random_regressor_B.fit(X_typeB, y_typeB)

y_train_pred = random_regressor_B.predict(X_typeB)

print("Train Results for Random Forest Regressor Model:")
print(50 * '-')
print("Root mean squared error: ", sqrt(mse(y_typeB.values, y_train_pred)))
print("R-squared: ", r2_score(y_typeB.values, y_train_pred))

Train Results for Random Forest Regressor Model:
--------------------------------------------------
Root mean squared error:  461.24947004729165
R-squared:  0.045237436886157556


In [20]:
df_test_B = df_test[df_test["CALL_TYPE"]=="B"]
test_feature = df_test_B[["WK", "MON","ORIGIN_STAND"]]
test_pred_B = random_regressor_B.predict(test_feature)

In [39]:
df_tr_copy = copy.deepcopy(df_tr[df_tr["CALL_TYPE"]=="A"])

mask = df_tr_copy['Unique_TAXI_ID'].isin(df_test_id)
df_tr_copy = df_tr_copy[mask]
mask = df_tr_copy['HR'].isin(df_test_hr)
df_tr_copy = df_tr_copy[mask]
mask = df_tr_copy['WK'].isin(df_test_wk)
df_tr_copy = df_tr_copy[mask]
mask = df_tr_copy['MON'].isin(df_test_mon)
df_tr_copy = df_tr_copy[mask]
mask = df_tr_copy['ORIGIN_CALL'].isin(df_test_call)
df_tr_copy = df_tr_copy[mask]


df_tr_copy = df_tr_copy[["MON","WK","DAY","HR", "ORIGIN_CALL",'Unique_TAXI_ID', "LEN"]]
df_tr_copy = df_tr_copy.dropna()

X_typeA = df_tr_copy[[ "HR" ,"WK","ORIGIN_CALL"]]

y_typeA = df_tr_copy['LEN']

s = StandardScaler()
X_typeA = s.fit_transform(X_typeA)

# Random Forest - Call type A with id
random_regressor_A = RandomForestRegressor()
random_regressor_A.fit(X_typeA, y_typeA)

y_train_pred = random_regressor_A.predict(X_typeA)

print("Train Results for Random Forest Regressor Model:")
print(50 * '-')
print("Root mean squared error: ", sqrt(mse(y_typeA.values, y_train_pred)))
print("R-squared: ", r2_score(y_typeA.values, y_train_pred))

Train Results for Random Forest Regressor Model:
--------------------------------------------------
Root mean squared error:  575.37617250393
R-squared:  0.08550572173179571


In [42]:
df_test_A = df_test[df_test["CALL_TYPE"]=="A"]
df_train_id = np.unique(df_tr_copy["Unique_TAXI_ID"])
mask = df_test_A['Unique_TAXI_ID'].isin(df_train_id)
df_test_A = df_test_A[mask]

test_feature = df_test_A[[ "HR","WK", "ORIGIN_CALL"]]
test_pred = random_regressor_A.predict(test_feature)

In [45]:
index_A = 0

for index, row in df_sample.iterrows():
    if (row["TRIP_ID"] == df_test["TRIP_ID"].iloc[index_A]):
        df_sample["TRAVEL_TIME"].iloc[index] = test_pred[index_A]
        index_A = index_A + 1
    if (index_A == 21):
        break

In [48]:
df_tr_copy = copy.deepcopy(df_tr[df_tr["CALL_TYPE"]=="A"])

mask = df_tr_copy['HR'].isin(df_test_hr)
df_tr_copy = df_tr_copy[mask]
mask = df_tr_copy['WK'].isin(df_test_wk)
df_tr_copy = df_tr_copy[mask]
mask = df_tr_copy['MON'].isin(df_test_mon)
df_tr_copy = df_tr_copy[mask]
mask = df_tr_copy['ORIGIN_CALL'].isin(df_test_call)
df_tr_copy = df_tr_copy[mask]


df_tr_copy = df_tr_copy[["MON","WK","DAY","HR", "ORIGIN_CALL",'Unique_TAXI_ID', "LEN"]]
df_tr_copy = df_tr_copy.dropna()

X_typeA = df_tr_copy[[ "HR" ,"WK","ORIGIN_CALL"]]

y_typeA = df_tr_copy['LEN']

s = StandardScaler()
X_typeA = s.fit_transform(X_typeA)

# Random Forest - Call type A with no id

random_regressor_A = RandomForestRegressor()
random_regressor_A.fit(X_typeA, y_typeA)

y_train_pred = random_regressor_A.predict(X_typeA)

print("Train Results for Random Forest Regressor Model:")
print(50 * '-')
print("Root mean squared error: ", sqrt(mse(y_typeA.values, y_train_pred)))
print("R-squared: ", r2_score(y_typeA.values, y_train_pred))

Train Results for Random Forest Regressor Model:
--------------------------------------------------
Root mean squared error:  587.374831294293
R-squared:  0.04402123100923694


In [49]:
df_test_A = df_test[df_test["CALL_TYPE"]=="A"]
mask = ~df_test_A['Unique_TAXI_ID'].isin(df_train_id)
df_test_A = df_test_A[mask]

test_feature = df_test_A[[ "HR","WK", "ORIGIN_CALL"]]
test_pred = random_regressor_A.predict(test_feature)

In [50]:
index_A = 0

for index, row in df_sample.iterrows():
    if (row["TRIP_ID"] == df_test["TRIP_ID"].iloc[index_A]):
        df_sample["TRAVEL_TIME"].iloc[index] = test_pred[index_A]
        index_A = index_A + 1
    if (index_A == 51):
        break

### 6. Saving

In [51]:
df_test = pd.read_csv("../test/test_public.csv")

index_B = 0
index_C = 0

for index, row in df_test.iterrows():
    if (row["CALL_TYPE"]=="B"):
        df_sample["TRAVEL_TIME"].iloc[index] = test_pred_B[index_B]
        index_B = index_B + 1
    elif (row["CALL_TYPE"]=="C"):
        df_sample["TRAVEL_TIME"].iloc[index] = test_pred_C[index_C]
        index_C = index_C + 1

print(df_sample)

    TRIP_ID  TRAVEL_TIME
0        T1   400.389167
1        T2   400.389167
2        T3   400.389167
3        T4   400.389167
4        T5   400.389167
..      ...          ...
315    T323   660.000000
316    T324   400.389167
317    T325   805.280227
318    T326   660.000000
319    T327   660.000000

[320 rows x 2 columns]


In [52]:
df_sample.to_csv("./my_pred.csv", index=None)