In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import neighbors, ensemble, kernel_ridge, svm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize

## Load data

In [2]:
# Seed NumPy's global RNG so the in-place shuffle below is repeatable.
np.random.seed(42)

# Read the numeric training table, skipping the header row and dropping the
# first four columns (ORIGIN_CALL and other categorical columns are ignored).
train_arr = np.loadtxt(
    "../data/numeric_trainvf.csv",
    dtype=np.float32,
    delimiter=",",
    skiprows=1,
)[:, 4:]

# The last column is used as the label. Keep only rows whose label is below
# mean + 5 standard deviations and above 30.
all_labels = train_arr[:, -1]
mean = all_labels.mean()
std = all_labels.std()
train_arr = train_arr[(all_labels < mean + 5 * std) & (all_labels > 30)]

# Shuffle the rows in place, then hold out the final 10% for validation.
np.random.shuffle(train_arr)
split_idx = int(0.9 * train_arr.shape[0])
train_arr, valid_arr = train_arr[:split_idx], train_arr[split_idx:]

# Features are every column except the last; the last column is the label.
train, train_label = train_arr[:, :-1], train_arr[:, -1]
valid, valid_label = valid_arr[:, :-1], valid_arr[:, -1]

# Per-row sample weights for the weighted fits: the training labels themselves.
weights = train_arr[:, -1]


# train = train[train_label > 50]
# train_label = train_label[train_label > 50]
# train = train[train_label < mean + 5 * std]
# train_label = train_label[train_label < mean + 5 * std]

In [3]:
def score(model, features=None, labels=None):
    """Print the root-mean-squared error of ``model`` on a labelled data set.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(features)``.
    features : array-like, optional
        Inputs handed to ``model.predict``. When omitted, the notebook-level
        ``valid`` array is used.
    labels : array-like, optional
        Targets the predictions are compared against. When omitted, the
        notebook-level ``valid_label`` array is used.

    Returns
    -------
    None
        The result is printed as ``RMSE: <value>``.
    """
    # Fall back to the validation split so existing ``score(model)`` calls
    # keep producing exactly the same output.
    if features is None:
        features = valid
    if labels is None:
        labels = valid_label
    pred = model.predict(features)
    print(f"RMSE: {np.sqrt(np.sum((pred - labels)**2) / len(features))}")

## Models

In [4]:
# Baseline model: linear regression fitted on the training split, followed by
# its RMSE on the validation split.
lin_reg = LinearRegression().fit(train, train_label)
score(lin_reg)

RMSE: 438.2685160161196


### KNN Regressor

In [5]:
# k-nearest-neighbours regressor with n_neighbors=10, fitted on the training split.
KNN_Regressor = neighbors.KNeighborsRegressor(n_neighbors=10)
# fit() stays the final expression so the notebook echoes the fitted estimator.
KNN_Regressor.fit(X=train, y=train_label)

KNeighborsRegressor(n_neighbors=10)

In [6]:
# Print the KNN model's RMSE on the validation split.
score(KNN_Regressor)

RMSE: 445.49759614025527


In [7]:
# Standard deviation of the KNN predictions over the validation features.
KNN_Regressor.predict(valid).std()

187.77357

### Gradient Boosting

In [9]:
# Gradient-boosted trees: 200 boosting stages with a learning rate of 0.001.
# NOTE(review): the validation RMSE recorded for this model is worse than the
# linear baseline's; the very small learning rate may leave it underfit —
# confirm before relying on it.
grad_boost = ensemble.GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.001,
)
# Each training row is weighted by its own label value. fit() stays the final
# expression so the notebook echoes the fitted estimator.
grad_boost.fit(X=train, y=train_label, sample_weight=train_label)

GradientBoostingRegressor(learning_rate=0.001, n_estimators=200)

In [10]:
# Print the gradient-boosting model's RMSE on the validation split.
score(grad_boost)

RMSE: 517.5286449968111


### Random Forest Regression

In [11]:
# Random forest: 200 trees limited to depth 5, trained with two parallel jobs.
random_forest = ensemble.RandomForestRegressor(
    n_estimators=200,
    max_depth=5,
    n_jobs=2,
)
# fit() stays the final expression so the notebook echoes the fitted estimator.
random_forest.fit(X=train, y=train_label)

RandomForestRegressor(max_depth=5, n_estimators=200, n_jobs=2)

In [12]:
# Print the random-forest model's RMSE on the validation split.
score(random_forest)

RMSE: 431.8290644444473


### XGBoost

In [17]:
# XGBoost regressor: 100 estimators, max_depth=5, "gpu_hist" tree method.
# NOTE(review): presumably this requires a CUDA-capable GPU and a GPU-enabled
# XGBoost build, and newer XGBoost releases deprecate "gpu_hist" — confirm
# against the installed version.
reg = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=5,
    tree_method="gpu_hist",
)
# Rows are weighted by `weights` (the training labels). fit() stays the final
# expression so the notebook echoes the fitted estimator.
reg.fit(X=train, y=train_label, sample_weight=weights)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [23]:
# Print the XGBoost model's RMSE on the validation split.
score(reg)

RMSE: 491.54308539623315


In [20]:
# Predict the test set with the weighted XGBoost model and write a submission
# file. pandas (`pd`) is imported with the other dependencies at the top of the
# notebook.
test = np.loadtxt("../data/numeric_testvf.csv", delimiter=",", skiprows=1)[:, 3:]

# NOTE(review): the test file drops 3 leading columns while the training file
# drops 4 and also carries the label as its last column — confirm the remaining
# feature columns line up one-to-one.
assert test.shape[1] == train.shape[1], (
    f"test has {test.shape[1]} feature columns but the models were fitted on {train.shape[1]}"
)

preds = reg.predict(test)

# Reuse the sample submission as a template and overwrite its TRAVEL_TIME column.
df_pred = pd.read_csv("../data/sampleSubmission.csv")
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./xgboost_numeric_weighted.csv", index=False)

In [31]:
# Validation RMSE of the XGBoost model restricted to the rows where feature
# column 5, 6 or 7 (in turn) equals 1.
# NOTE(review): presumably these are indicator columns — confirm which
# categories they encode.
for col in (5, 6, 7):
    mask = valid[:, col] == 1
    # An empty subset would otherwise divide by zero (or fail inside predict).
    if not mask.any():
        print(f"RMSE: n/a (no validation rows with column {col} == 1)")
        continue
    pred = reg.predict(valid[mask])
    print(f"RMSE: {np.sqrt(np.sum((pred - valid_label[mask])**2) / len(valid[mask]))}")

RMSE: 490.63767393194445
RMSE: 492.6402795858481
RMSE: 508.26557999975097


## Create prediction CSV files

In [16]:
# Write one submission file per tree-based model. pandas (`pd`) is imported
# with the other dependencies at the top of the notebook.
test = np.loadtxt("../data/numeric_testvf.csv", delimiter=",", skiprows=1)[:, 3:]
df_pred = pd.read_csv("../data/sampleSubmission.csv")

# Linear-regression and KNN predictions are not generated here: nothing in this
# cell saves them. XGBoost is kept last so that `preds` and `df_pred` finish
# holding its predictions for the cells that follow.
submissions = (
    ("./grad_boost_numeric_prune.csv", grad_boost),
    ("./random_forest_numeric_prune.csv", random_forest),
    ("./xgboost_numeric_prune.csv", reg),
)
for out_path, fitted_model in submissions:
    preds = fitted_model.predict(test)
    df_pred["TRAVEL_TIME"] = preds
    df_pred.to_csv(out_path, index=False)

In [None]:
# Show the most recently computed test-set predictions.
print(preds)

In [None]:
# Save whatever `preds` currently holds under a generic file name.
# NOTE(review): when the notebook is run top to bottom, `preds` was last
# assigned from the XGBoost model `reg` — confirm that is the intended model.
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./predictions.csv", index=False)