In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, ensemble, kernel_ridge, svm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split


## Load data

In [2]:
# Ignore ORIGIN_CALL and other categorical columns
np.random.seed(42)
# train_arr = np.loadtxt("../data/numeric_trainvf.csv", dtype=np.float32, delimiter=",", skiprows=1)[:, 4:]
df_train = pd.read_csv("../data/combined_trainvf.csv", index_col=0)
df_train.head()
# Prune
df_train = df_train[df_train["DIST"] < 20]
df_train = df_train[df_train["TARGET"] > 30]
df_train = df_train[df_train["TARGET"] < 20000]

# Categorical
df_train["ORIGIN_CALL"] = df_train["ORIGIN_CALL"].astype("category")
df_train["ORIGIN_STAND"] = df_train["ORIGIN_STAND"].astype("category")
df_train["TAXI_ID"] = df_train["TAXI_ID"].astype("category")
df_train["YEAR"] = df_train["YEAR"].astype("category")
df_train["WK_OF_YR"] = df_train["WK_OF_YR"].astype("category")
df_train["WK_DAY"] = df_train["WK_DAY"].astype("category")
df_train["HR"] = df_train["HR"].astype("category")
# Split
train, valid = train_test_split(df_train, test_size=0.1)
train_label = train["TARGET"]
train.drop(labels=["TARGET"], inplace=True, axis=1)
valid_label = valid["TARGET"]
valid.drop(labels=["TARGET"], inplace=True, axis=1)

# Test
# test = np.loadtxt("../data/combined_testvf.csv", delimiter=",", skiprows=1)[:, 3:]
df_test = pd.read_csv("../data/combined_testvf.csv")
df_test["ORIGIN_CALL"] = df_test["ORIGIN_CALL"].astype("category").cat.set_categories(df_train["ORIGIN_CALL"].cat.categories)
df_test["ORIGIN_STAND"] = df_test["ORIGIN_STAND"].astype("category").cat.set_categories(df_train["ORIGIN_STAND"].cat.categories)
df_test["TAXI_ID"] = df_test["TAXI_ID"].astype("category").cat.set_categories(df_train["TAXI_ID"].cat.categories)
df_test["YEAR"] = df_test["YEAR"].astype("category").cat.set_categories(df_train["YEAR"].cat.categories)
df_test["WK_OF_YR"] = df_test["WK_OF_YR"].astype("category").cat.set_categories(df_train["WK_OF_YR"].cat.categories)
df_test["WK_DAY"] = df_test["WK_DAY"].astype("category").cat.set_categories(df_train["WK_DAY"].cat.categories)
df_test["HR"] = df_test["HR"].astype("category").cat.set_categories(df_train["HR"].cat.categories)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(labels=["TARGET"], inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid.drop(labels=["TARGET"], inplace=True, axis=1)


In [3]:
df_test["ORIGIN_CALL"]

0          0
1          0
2          0
3          0
4          0
       ...  
315        0
316        0
317        0
318        0
319    15003
Name: ORIGIN_CALL, Length: 320, dtype: category
Categories (29026, int64): [0, 1, 2, 3, ..., 29023, 29024, 29025, 29026]

In [244]:
def score(model):
    pred = model.predict(valid)
    print(f"RMSE: {np.sqrt(np.sum((pred - valid_label)**2) / len(valid))}")

## Models

### XGBoost

In [6]:
reg = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=200, enable_categorical=True, max_cat_to_onehot=1)
reg.fit(train, train_label)
preds = reg.predict(df_test)

In [9]:
preds = reg.predict(df_test) # No taxi id, origin call, origin stand, no coords
df_pred = pd.read_csv("../data/sampleSubmission.csv")
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./combined_xgboost_categorical_200_default.csv", index=False)

In [248]:
# E V E R Y T H I N G
pred = reg.predict(valid[valid["A"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['A'] == 1])**2) / len(valid[valid['A'] == 1]))}")
pred = reg.predict(valid[valid["B"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['B'] == 1])**2) / len(valid[valid['B'] == 1]))}")
pred = reg.predict(valid[valid["C"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['C'] == 1])**2) / len(valid[valid['C'] == 1]))}")

RMSE: 479.1870131209387
RMSE: 429.06258599828124
RMSE: 789.9128936173889


In [8]:
# Yes taxi id, origin call, origin stand, no coords
pred = reg.predict(valid[valid["A"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['A'] == 1])**2) / len(valid[valid['A'] == 1]))}")
pred = reg.predict(valid[valid["B"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['B'] == 1])**2) / len(valid[valid['B'] == 1]))}")
pred = reg.predict(valid[valid["C"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['C'] == 1])**2) / len(valid[valid['C'] == 1]))}")

RMSE: 477.84808351169596
RMSE: 428.91400283658095
RMSE: 786.1901650206082


In [135]:
# simple
pred = reg.predict(valid[valid["A"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['A'] == 1])**2) / len(valid[valid['A'] == 1]))}")
pred = reg.predict(valid[valid["B"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['B'] == 1])**2) / len(valid[valid['B'] == 1]))}")
pred = reg.predict(valid[valid["C"] == 1])
print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[valid['C'] == 1])**2) / len(valid[valid['C'] == 1]))}")

RMSE: 503.21808466721495
RMSE: 430.7721133212752
RMSE: 911.1326239670425


## Create prediction csv

In [16]:
import pandas as pd
test = np.loadtxt("../data/numeric_testvf.csv", delimiter=",", skiprows=1)[:, 3:]
df_pred = pd.read_csv("../data/sampleSubmission.csv")
# preds = lin_reg.predict(test)
# preds = KNN_Regressor.predict(test)
# preds = grad_boost.predict(test)

# df_pred["TRAVEL_TIME"] = preds
# df_pred.to_csv("./grad_boost_numeric_prune.csv", index=False)
# preds = random_forest.predict(test)

# df_pred["TRAVEL_TIME"] = preds
# df_pred.to_csv("./random_forest_numeric_prune.csv", index=False)
# preds = reg.predict(test)

df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./xgboost_numeric_prune.csv", index=False)

In [247]:
print(preds)

[ 957.6611   752.60284  780.598    719.68353  760.0907   882.75604
  823.38434  750.3728   747.1309   837.69635  747.2265   855.37775
 1015.7043  1510.5624   874.60767  942.7616  1114.02    1190.1124
  895.9712  1091.3286   763.89056  820.5083   612.0671   896.9895
  818.8046   820.55896  787.38635  841.4842   868.1567   889.4218
  787.917    825.947    854.2789   999.2707  1435.5496   833.37396
  834.36444  690.33795  829.9881   841.7452   674.4172   752.75635
 2018.8619   779.18036  753.4257   688.04956  763.33813  847.5007
  725.70325  720.7693  1036.3297   754.051    732.0543   892.9372
  925.0432   687.0384   744.0416   790.23975 1259.5393   747.7781
  852.99066  790.5606   949.1034  1032.2911   675.8869   804.9909
  809.86554  683.07526 1060.402    658.7561  1340.2295   991.07043
  819.4713  1270.9761   707.95526  923.86487  681.9069   972.11096
  868.6488  1266.5344   772.0826   873.8238   840.0343   882.0674
  641.65717  831.165   1812.1106  1108.9347   614.73517 1347.1327
  83

In [186]:
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./combined_xgboost_1000_default_all.csv", index=False)