### 1. Import all the necessary libraries

In [None]:
import torch
import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.optim as optim
import category_encoders as ce
import math
%matplotlib inline

In [None]:
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from math import sqrt

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
import xgboost

### 2. Loading the Dataset

In [None]:
# Read the training trips from a path relative to the current working directory.
df_tr = pd.read_csv("../data/train.csv")
# Last expression of the cell: shows the first rows as the cell output.
df_tr.head()

### 3. Get Computed Time from POLYLINE

Our goal is to predict the travel-time of the taxi, which can be derived from the POLYLINE length.

Recall:

```
The travel time of the trip (the prediction target of this project) is defined as the (number of points-1) x 15 seconds. 
For example, a trip with 101 data points in POLYLINE has a length of (101-1) * 15 = 1500 seconds. Some trips have missing data points in POLYLINE, indicated by MISSING_DATA column, and it is part of the challenge how you utilize this knowledge.
```


In [None]:
def polyline_to_trip_duration(polyline):
    """Return the travel time in seconds encoded by a POLYLINE string.

    The travel time is defined as (number of points - 1) * 15 seconds.

    Assumes the string is a bracketed list of bracketed points, e.g.
    "[[x1,y1],[x2,y2]]", so it contains one "[" per point plus one for the
    outer list. An empty polyline ("[]") or a single point yields 0.
    """
    num_points = polyline.count("[") - 1
    # Clamp at zero so "[]" (zero points) does not produce a negative time.
    return max(num_points - 1, 0) * 15

# Add a "LEN" column to the training frame: for every row it holds the trip
# duration in seconds that polyline_to_trip_duration computes from the row's
# POLYLINE string.
df_tr["LEN"] = df_tr["POLYLINE"].map(polyline_to_trip_duration)

### 4. Test and Extract the features: (Original Call + HR + WK + MON + TAXI_ID)

In [None]:
def TAXI_ID_pattern_checker(x):
    """Check the guess that every TAXI_ID has the form 20000xxx.

    Subtracts 20000000 from every id and verifies that the result lies in
    the range [0, 1000).

    Args:
        x: A sequence of ids (list, NumPy array or pandas Series). A Series
            is read by value, so its index labels do not matter.

    Returns:
        True if no id falls outside the range (also for empty input),
        False otherwise.
    """
    offsets = np.asarray(x) - 20000000
    # Phrase the test as "nothing is out of range" so that values that are
    # neither below nor above the range (e.g. NaN) do not fail the check.
    out_of_range = (offsets < 0) | (offsets >= 1000)
    return not out_of_range.any()

# Run the check over the whole TAXI_ID column and report only on success
# (nothing is printed when the check fails).
if TAXI_ID_pattern_checker(df_tr["TAXI_ID"]):
    print("Pattern is found!")

def parse_TAXI_ID(x):
    """Return the last three decimal digits of a TAXI_ID, i.e. ``x`` modulo 1000."""
    return x % 1000

# Store the shortened taxi id (see parse_TAXI_ID) for every training row.
df_tr["Unique_TAXI_ID"] = df_tr["TAXI_ID"].map(parse_TAXI_ID)

In [None]:
from datetime import datetime
def parse_time(x):
    """Split a row's TIMESTAMP into calendar components.

    Uses Python's builtin datetime library:
    https://docs.python.org/3/library/datetime.html#datetime.datetime.fromtimestamp

    Args:
        x: A row-like object supporting ``x["TIMESTAMP"]``; the value is
            passed to ``datetime.fromtimestamp``.

    Returns:
        A tuple ``(year, month, day, hour, weekday, minute)``.

    Note:
        ``datetime.fromtimestamp`` without a ``tz`` argument converts to the
        local time of the machine running the code, so the components depend
        on the host's timezone setting.
    """
    moment = datetime.fromtimestamp(x["TIMESTAMP"])
    return (
        moment.year,
        moment.month,
        moment.day,
        moment.hour,
        moment.weekday(),
        moment.minute,
    )

# parse_time returns six values per row. Because we are assigning multiple columns at a time, we need to "expand" the
# computed (year, month, day, hour, weekday, minute) tuples on the column axis, or axis 1
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
df_tr[["YR", "MON", "DAY", "HR", "WK", "MIN"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [None]:
# Remove, in place, the training rows that cannot be used:
# first the rows flagged as having missing data ...
flagged_missing = df_tr["MISSING_DATA"] == True
df_tr.drop(index=df_tr.index[flagged_missing.to_numpy()], inplace=True)
# ... then, among the remaining rows, those whose POLYLINE is the empty list.
empty_polyline = df_tr["POLYLINE"] == "[]"
df_tr.drop(index=df_tr.index[empty_polyline.to_numpy()], inplace=True)

In [None]:
import copy

# Load the public test set and derive the same time features as for training.
df_test = pd.read_csv("../test/test_public.csv")
df_test[["YR", "MON", "DAY", "HR", "WK", "MIN"]] = df_test[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
# The shortened taxi id must come from the test rows themselves. Taking it
# from df_tr would align training ids onto test rows by index label.
df_test["Unique_TAXI_ID"] = df_test["TAXI_ID"].apply(parse_TAXI_ID)

# Distinct values of each feature in the test set. Several of these are used
# by the training cells below to keep only training rows whose values also
# occur in the test set.
df_test_id = np.unique(df_test["Unique_TAXI_ID"])
df_test_min = np.unique(df_test["MIN"])
df_test_hr = np.unique(df_test["HR"])
df_test_wk = np.unique(df_test["WK"])
df_test_day = np.unique(df_test["DAY"])
df_test_mon = np.unique(df_test["MON"])
df_test_stand = np.unique(df_test["ORIGIN_STAND"])
df_test_call = np.unique(df_test["ORIGIN_CALL"])

### 5. Random Forest

In [None]:
# Train a random forest on the CALL_TYPE "C" training rows.
indices = pd.read_csv("./train_indices.csv")
df_tr_copy = copy.deepcopy(df_tr[df_tr["CALL_TYPE"]=="C"])
# NOTE(review): `indices` is a DataFrame (the result of read_csv), so this is
# DataFrame-keyed indexing, not a row-position lookup. Whether it selects the
# intended rows depends on the contents of train_indices.csv — TODO confirm.
df_tr_copy = df_tr_copy[indices]

# Features: month, weekday and hour.
X_typeC = df_tr_copy[["MON", "WK", "HR"]]

# Target: the trip duration column.
y_typeC = df_tr_copy['LEN']

# Random Forest - Call type C
random_regressor_C = RandomForestRegressor()
random_regressor_C.fit(X_typeC, y_typeC)

# Predictions on the same rows the model was fitted on, so the metrics below
# are training metrics, not validation metrics.
y_train_pred = random_regressor_C.predict(X_typeC)

print("Train Results for Random Forest Regressor Model:")
print(50 * '-')
print("Root mean squared error: ", sqrt(mse(y_typeC.values, y_train_pred)))
print("R-squared: ", r2_score(y_typeC.values, y_train_pred))

In [None]:
# Keep the current predictions under a call-type-specific name; the saving
# cell reads test_pred_C for CALL_TYPE "C" rows.
# NOTE(review): `test_pred` is not assigned anywhere in the visible notebook —
# presumably it came from a prediction cell that was removed; confirm before
# running top to bottom.
test_pred_C = test_pred

In [None]:
# Train a random forest on the CALL_TYPE "B" training rows.
df_tr_copy = copy.deepcopy(df_tr[df_tr["CALL_TYPE"] == "B"])

# Keep only taxis that also occur in the test set, narrow the frame to the
# columns of interest and discard rows with a missing value in any of them.
mask = df_tr_copy["Unique_TAXI_ID"].isin(df_test_id)
df_tr_copy = df_tr_copy.loc[mask, ["MON", "WK", "HR", "ORIGIN_STAND", "DAY", "LEN"]].dropna()

# Target: the trip duration column.
y_typeB = df_tr_copy["LEN"]

# Features: weekday, month and origin stand, standardized by a scaler that is
# fitted on these same training rows.
s = StandardScaler()
X_typeB = s.fit_transform(df_tr_copy[["WK", "MON", "ORIGIN_STAND"]])

# Random Forest - Call type B
random_regressor_B = RandomForestRegressor()
random_regressor_B.fit(X_typeB, y_typeB)

# Evaluate on the rows the model was fitted on (training metrics only).
y_train_pred = random_regressor_B.predict(X_typeB)

print("Train Results for Random Forest Regressor Model:")
print("-" * 50)
print("Root mean squared error: ", sqrt(mse(y_typeB.values, y_train_pred)))
print("R-squared: ", r2_score(y_typeB.values, y_train_pred))

In [None]:
# Keep the current predictions under a call-type-specific name; the saving
# cell reads test_pred_B for CALL_TYPE "B" rows.
# NOTE(review): `test_pred` is not assigned anywhere in the visible notebook —
# presumably it came from a prediction cell that was removed; confirm before
# running top to bottom.
test_pred_B = test_pred

In [None]:
# Timestamped progress message (the Chinese text reads roughly
# "start training end...").
print(f'{datetime.now()} 开始训练结束...')

# Train a random forest on the CALL_TYPE "A" training rows.
df_tr_copy = copy.deepcopy(df_tr[df_tr["CALL_TYPE"] == "A"])

# Keep only rows whose value in each listed column also occurs in the test
# set. The filters are applied one after another, in this order; this variant
# includes the taxi id filter.
for column, test_values in (
    ("Unique_TAXI_ID", df_test_id),
    ("HR", df_test_hr),
    ("WK", df_test_wk),
    ("MON", df_test_mon),
    ("ORIGIN_CALL", df_test_call),
):
    mask = df_tr_copy[column].isin(test_values)
    df_tr_copy = df_tr_copy[mask]

# Narrow to the columns of interest and drop rows with a missing value in any
# of them.
df_tr_copy = df_tr_copy[["MON", "WK", "DAY", "HR", "ORIGIN_CALL", "Unique_TAXI_ID", "LEN"]].dropna()

# Target: the trip duration column.
y_typeA = df_tr_copy["LEN"]

# Features: hour, weekday and origin call, standardized by a scaler that is
# fitted on these same training rows.
s = StandardScaler()
X_typeA = s.fit_transform(df_tr_copy[["HR", "WK", "ORIGIN_CALL"]])

# Random Forest - Call type A with id
random_regressor_A = RandomForestRegressor()
random_regressor_A.fit(X_typeA, y_typeA)

# Evaluate on the rows the model was fitted on (training metrics only).
y_train_pred = random_regressor_A.predict(X_typeA)

print("Train Results for Random Forest Regressor Model:")
print("-" * 50)
print("Root mean squared error: ", sqrt(mse(y_typeA.values, y_train_pred)))
print("R-squared: ", r2_score(y_typeA.values, y_train_pred))

In [None]:
# Copy predictions into the submission frame, matching rows by TRIP_ID in
# order of appearance.
# NOTE(review): `df_sample` and `test_pred` are not assigned anywhere in the
# visible notebook — confirm where they come from before running this cell.
index_A = 0

# Write through a single positional indexer on the frame itself. The chained
# form df_sample["TRAVEL_TIME"].iloc[i] = ... assigns into an intermediate
# Series and is not guaranteed to update df_sample.
travel_time_col = df_sample.columns.get_loc("TRAVEL_TIME")

for index, row in df_sample.iterrows():
    if row["TRIP_ID"] == df_test["TRIP_ID"].iloc[index_A]:
        df_sample.iloc[index, travel_time_col] = test_pred[index_A]
        index_A = index_A + 1
    # Stop once 21 predictions have been written.
    # NOTE(review): presumably the number of rows this model covers — confirm.
    if index_A == 21:
        break

In [None]:
# Train a random forest on the CALL_TYPE "A" training rows.
df_tr_copy = copy.deepcopy(df_tr[df_tr["CALL_TYPE"] == "A"])

# Keep only rows whose value in each listed column also occurs in the test
# set. The filters are applied one after another, in this order; this variant
# does not filter on the taxi id.
for column, test_values in (
    ("HR", df_test_hr),
    ("WK", df_test_wk),
    ("MON", df_test_mon),
    ("ORIGIN_CALL", df_test_call),
):
    mask = df_tr_copy[column].isin(test_values)
    df_tr_copy = df_tr_copy[mask]

# Narrow to the columns of interest and drop rows with a missing value in any
# of them.
df_tr_copy = df_tr_copy[["MON", "WK", "DAY", "HR", "ORIGIN_CALL", "Unique_TAXI_ID", "LEN"]].dropna()

# Target: the trip duration column.
y_typeA = df_tr_copy["LEN"]

# Features: hour, weekday and origin call, standardized by a scaler that is
# fitted on these same training rows.
s = StandardScaler()
X_typeA = s.fit_transform(df_tr_copy[["HR", "WK", "ORIGIN_CALL"]])

# Random Forest - Call type A with no id
# (rebinds random_regressor_A, replacing any model stored under that name).
random_regressor_A = RandomForestRegressor()
random_regressor_A.fit(X_typeA, y_typeA)

# Evaluate on the rows the model was fitted on (training metrics only).
y_train_pred = random_regressor_A.predict(X_typeA)

print("Train Results for Random Forest Regressor Model:")
print("-" * 50)
print("Root mean squared error: ", sqrt(mse(y_typeA.values, y_train_pred)))
print("R-squared: ", r2_score(y_typeA.values, y_train_pred))

In [None]:
# Copy predictions into the submission frame, matching rows by TRIP_ID in
# order of appearance.
# NOTE(review): `df_sample` and `test_pred` are not assigned anywhere in the
# visible notebook — confirm where they come from before running this cell.
index_A = 0

# Write through a single positional indexer on the frame itself. The chained
# form df_sample["TRAVEL_TIME"].iloc[i] = ... assigns into an intermediate
# Series and is not guaranteed to update df_sample.
travel_time_col = df_sample.columns.get_loc("TRAVEL_TIME")

for index, row in df_sample.iterrows():
    if row["TRIP_ID"] == df_test["TRIP_ID"].iloc[index_A]:
        df_sample.iloc[index, travel_time_col] = test_pred[index_A]
        index_A = index_A + 1
    # Stop once 51 predictions have been written.
    # NOTE(review): presumably the number of rows this model covers — confirm.
    if index_A == 51:
        break

### 6. Saving

In [None]:
# Fill the submission frame row by row from the per-call-type predictions.
# Row i of the test file is written to position i of df_sample.
# NOTE(review): `df_sample` and `test_pred_A` are not assigned anywhere in the
# visible notebook — confirm where they come from before running this cell.
df_test = pd.read_csv("../test/test_public.csv")

# Next unread position in each per-call-type prediction array.
index_A = 0
index_B = 0
index_C = 0

# Write through a single positional indexer on the frame itself. The chained
# form df_sample["TRAVEL_TIME"].iloc[i] = ... assigns into an intermediate
# Series and is not guaranteed to update df_sample.
travel_time_col = df_sample.columns.get_loc("TRAVEL_TIME")

for index, row in df_test.iterrows():
    # The call types are mutually exclusive, so a single if/elif chain is used.
    if row["CALL_TYPE"] == "A":
        df_sample.iloc[index, travel_time_col] = test_pred_A[index_A]
        index_A = index_A + 1
    elif row["CALL_TYPE"] == "B":
        df_sample.iloc[index, travel_time_col] = test_pred_B[index_B]
        index_B = index_B + 1
    elif row["CALL_TYPE"] == "C":
        df_sample.iloc[index, travel_time_col] = test_pred_C[index_C]
        index_C = index_C + 1

print(df_sample)

In [None]:
# Write the filled submission frame to a CSV file in the current working directory.
# NOTE(review): index=None is presumably meant to omit the row index from the
# file; index=False is the documented spelling — confirm the written output.
df_sample.to_csv("./my_pred.csv", index=None)