In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, ensemble, kernel_ridge, svm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

## Load data

In [2]:
# Ignore ORIGIN_CALL and other categorical columns
np.random.seed(42)
# train_arr = np.loadtxt("../data/numeric_trainvf.csv", dtype=np.float32, delimiter=",", skiprows=1)[:, 4:]
df_train = pd.read_csv("../data/combined_trainvf.csv", index_col=0)
df_train.head()
# Prune
df_train = df_train[df_train["DIST"] < 20]
df_train = df_train[df_train["TARGET"] > 30]
df_train = df_train[df_train["TARGET"] < 15000]

# Categorical
df_train["ORIGIN_CALL"] = df_train["ORIGIN_CALL"].astype("category")
df_train["ORIGIN_STAND"] = df_train["ORIGIN_STAND"].astype("category")
df_train["TAXI_ID"] = df_train["TAXI_ID"].astype("category")
# df_train["YEAR"] = df_train["YEAR"].astype("category")
# df_train["WK_OF_YR"] = df_train["WK_OF_YR"].astype("category")
# df_train["WK_DAY"] = df_train["WK_DAY"].astype("category")
# df_train["HR"] = df_train["HR"].astype("category")
# df_train["MONTH"] = df_train["MONTH"].astype("category")
droplist = ["START_LONG", "START_LAT", "DIST"]
df_train.drop(labels=droplist, inplace=True, axis=1)
# Split
train, valid = train_test_split(df_train, test_size=0.1)
train_label = train["TARGET"]
train.drop(labels=["TARGET"], inplace=True, axis=1)
valid_label = valid["TARGET"]
valid.drop(labels=["TARGET"], inplace=True, axis=1)

# Test
# test = np.loadtxt("../data/combined_testvf.csv", delimiter=",", skiprows=1)[:, 3:]
df_test = pd.read_csv("../data/combined_testvf.csv")
df_test["ORIGIN_CALL"] = df_test["ORIGIN_CALL"].astype("category").cat.set_categories(df_train["ORIGIN_CALL"].cat.categories)
df_test["ORIGIN_STAND"] = df_test["ORIGIN_STAND"].astype("category").cat.set_categories(df_train["ORIGIN_STAND"].cat.categories)
df_test["TAXI_ID"] = df_test["TAXI_ID"].astype("category").cat.set_categories(df_train["TAXI_ID"].cat.categories)
# df_test["YEAR"] = df_test["YEAR"].astype("category").cat.set_categories(df_train["YEAR"].cat.categories)
# df_test["WK_OF_YR"] = df_test["WK_OF_YR"].astype("category").cat.set_categories(df_train["WK_OF_YR"].cat.categories)
# df_test["WK_DAY"] = df_test["WK_DAY"].astype("category").cat.set_categories(df_train["WK_DAY"].cat.categories)
# df_test["HR"] = df_test["HR"].astype("category").cat.set_categories(df_train["HR"].cat.categories)
# df_test["MONTH"] = df_test["MONTH"].astype("category")
df_test.drop(labels=droplist, inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(labels=["TARGET"], inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid.drop(labels=["TARGET"], inplace=True, axis=1)


In [95]:
train.head()

Unnamed: 0,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,START_LONG,START_LAT,A,B,C,YEAR,WK_OF_YR,WK_DAY,MONTH,DAY,HR,DIST
1101256,0,5,391,-8.582832,41.180751,0,1,0,2014,9,1,2,25,5,3.910364
626957,0,35,148,-8.649657,41.154345,0,1,0,2013,46,1,11,12,1,2.546728
1566973,0,6,340,-8.639991,41.159646,0,1,0,2014,23,0,6,2,13,1.673552
1517545,0,63,11,-8.6139,41.146884,0,0,1,2014,21,4,5,23,20,1.487536
1504217,0,33,139,-8.615583,41.140845,0,1,0,2014,21,2,5,21,6,2.101389


In [3]:
def score(model):
    pred = model.predict(valid)
    print(f"RMSE: {np.sqrt(np.sum((pred - valid_label)**2) / len(valid))}")

## Models

### XGBoost

In [None]:
reg_full = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=50, enable_categorical=True, max_cat_to_onehot=1)
reg_full.fit(train, train_label)
preds_full = reg_full.predict(df_test)
score(reg_full)

In [114]:
preds = reg_full.predict(df_test) # No taxi id, origin call, origin stand, no coords
df_pred = pd.read_csv("../data/sampleSubmission.csv")
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./combined_xgboost_categorical_3_default.csv", index=False)

In [17]:
reg = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=4, enable_categorical=True, max_cat_to_onehot=1)
reg.fit(train, train_label)
preds = reg.predict(df_test)

In [18]:
print(preds)

[ 692.49756  619.2346   658.2535   605.00226  615.8369   802.2189
  649.2317   719.4376   621.45575  686.0294   619.2346   724.77124
  621.45575 1255.2343   697.8312   777.07184  697.8312  1160.9586
  712.9695   750.1318   686.0294   719.4376   686.0294   711.03845
  629.8076   709.0863   686.0294   712.9695   716.09143  711.03845
  686.0294   619.2346   724.77124  836.103    831.7416   732.84015
  759.7096   625.1414   686.0294   680.0603   619.2346   661.03345
 1562.2695   686.0294   619.2346   619.2346   697.8312   697.8312
  747.8408   619.2346   786.17456  701.23676  619.2346   686.0294
  711.03845  619.2346   658.2535   680.0603  1076.4232   686.0294
  711.03845  697.8312   621.45575  715.59216  621.45575  686.0294
  686.0294   614.29553  856.0419   573.5528   819.90094  856.0419
  686.0294  1053.9149   535.35925  845.46124  535.35925  835.98975
  720.60815 1106.2155   665.9773   699.3855   692.91736  698.80383
  528.48334  698.80383  942.45496  712.788    507.66376  665.9773
  6

In [135]:
preds = reg.predict(df_test) # No taxi id, origin call, origin stand, no coords
df_pred = pd.read_csv("../data/sampleSubmission.csv")
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./combined_xgboost_categorical_4_default_15kmax.csv", index=False)

In [108]:
# E V E R Y T H I N G
pred = reg.predict(valid[valid["A"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['A'] == 1])**2) / len(valid[valid['A'] == 1]))}")
pred = reg.predict(valid[valid["B"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['B'] == 1])**2) / len(valid[valid['B'] == 1]))}")
pred = reg.predict(valid[valid["C"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['C'] == 1])**2) / len(valid[valid['C'] == 1]))}")

RMSE: 531.4848147987065
RMSE: 464.3774480486925
RMSE: 837.7974275920382


In [8]:
# Yes taxi id, origin call, origin stand, no coords
pred = reg.predict(valid[valid["A"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['A'] == 1])**2) / len(valid[valid['A'] == 1]))}")
pred = reg.predict(valid[valid["B"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['B'] == 1])**2) / len(valid[valid['B'] == 1]))}")
pred = reg.predict(valid[valid["C"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['C'] == 1])**2) / len(valid[valid['C'] == 1]))}")

RMSE: 477.84808351169596
RMSE: 428.91400283658095
RMSE: 786.1901650206082


In [29]:
# simple
pred = reg.predict(valid[valid["A"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['A'] == 1])**2) / len(valid[valid['A'] == 1]))}")
pred = reg.predict(valid[valid["B"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['B'] == 1])**2) / len(valid[valid['B'] == 1]))}")
pred = reg.predict(valid[valid["C"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['C'] == 1])**2) / len(valid[valid['C'] == 1]))}")

RMSE: 534.2977151110199
RMSE: 463.64833608675974
RMSE: 827.8829632643224


## Create prediction csv

In [9]:
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./combined_xgboost_4_all_notimecategories.csv", index=False)