In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import neighbors, ensemble, kernel_ridge, svm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize

## Load data

In [18]:
# Parse the training table as float32, skipping the header row.
# ORIGIN_CALL and the other categorical columns are ignored.
np.random.seed(42)  # seed the global RNG so the shuffle below is repeatable
train_arr = np.loadtxt(
    "../data/simple_train.csv",
    dtype=np.float32,
    delimiter=",",
    skiprows=1,
)

# Shuffle the rows in place, then cut once at the 90% mark:
# the first 90% of rows stay for training, the remainder is the validation set.
np.random.shuffle(train_arr)
train_arr, valid_arr = np.split(train_arr, [int(0.9 * train_arr.shape[0])])

# The last column is used as the target; every other column is a feature.
train, train_label = train_arr[:, :-1], train_arr[:, -1]
valid, valid_label = valid_arr[:, :-1], valid_arr[:, -1]

# NOTE(review): `weights` is the very same column as `train_label`; confirm
# that using the target as a per-row weight is intended.
weights = train_arr[:, -1]

# Disabled outlier filter, kept for reference.
# NOTE(review): as written it would filter `train` but leave `train_label`
# unfiltered, so both would have to be masked together before re-enabling.
# train = train[train_label > 50]
# train = train[train_label < 15000]

In [3]:
def score(model, features=None, labels=None):
    """Print the root-mean-squared error of ``model`` on a labelled dataset.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(X)``.
    features : array-like, optional
        Inputs handed to ``model.predict``. When omitted, the notebook-level
        ``valid`` array is used.
    labels : array-like, optional
        Ground-truth targets. When omitted, the notebook-level
        ``valid_label`` array is used.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    """
    if features is None:
        features = valid
    if labels is None:
        labels = valid_label

    # Flatten both sides so an (n, 1) prediction cannot silently broadcast
    # against (n,) labels into an (n, n) error matrix.
    # Work in float64 so the squared-error accumulation does not depend on the
    # (possibly float32) dtype of the inputs.
    pred = np.asarray(model.predict(features), dtype=np.float64).reshape(-1)
    target = np.asarray(labels, dtype=np.float64).reshape(-1)
    if pred.shape != target.shape:
        raise ValueError(
            f"prediction/label length mismatch: {pred.shape[0]} vs {target.shape[0]}"
        )

    print(f"RMSE: {np.sqrt(np.mean((pred - target) ** 2))}")

## Models

In [4]:
# Baseline model: ordinary least squares fitted on the training split,
# then scored on the validation split (score() prints the RMSE).
lin_reg = LinearRegression().fit(train, train_label)
score(lin_reg)

RMSE: 431.803860193862


### KNN Regressor

In [5]:
# k-nearest-neighbours regressor with k = 10, fitted on the training split.
KNN_Regressor = neighbors.KNeighborsRegressor(n_neighbors=10).fit(train, train_label)
# Echo the fitted estimator as the cell output.
KNN_Regressor

KNeighborsRegressor(n_neighbors=10)

In [6]:
# Print the validation RMSE of the fitted KNN model.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# run was stopped before a score was produced.
score(KNN_Regressor)

KeyboardInterrupt: 

In [None]:
# Spread (standard deviation) of the KNN predictions on the validation split.
KNN_Regressor.predict(valid).std()

### Gradient Boosting

In [None]:
# Gradient-boosted trees: 200 boosting stages with a learning rate of 0.001.
# NOTE(review): sample_weight is the target column itself, so each row is
# weighted by its own label value; confirm this is intended.
grad_boost = ensemble.GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.001,
).fit(train, train_label, sample_weight=train_label)
# Echo the fitted estimator as the cell output.
grad_boost

In [None]:
# Print the validation RMSE of the gradient-boosting model.
score(grad_boost)

### Random Forest Regression

In [None]:
# Random forest: 200 trees, each limited to depth 5, trained with 2 parallel jobs.
random_forest = ensemble.RandomForestRegressor(
    max_depth=5,
    n_estimators=200,
    n_jobs=2,
).fit(train, train_label)
# Echo the fitted estimator as the cell output.
random_forest

In [None]:
# Print the validation RMSE of the random-forest model.
score(random_forest)

In [41]:
# XGBoost regressor with 100 boosting rounds, using the "gpu_hist" tree method.
# NOTE(review): newer XGBoost releases appear to deprecate "gpu_hist" in favour
# of tree_method="hist" with device="cuda"; confirm against the installed version.
reg = xgb.XGBRegressor(
    n_estimators=100,
    tree_method="gpu_hist",
).fit(train, train_label)
score(reg)

RMSE: 623.8352430403037


In [None]:
# NOTE(review): repeats the score(reg) call already made at the end of the
# previous cell; it prints the same validation RMSE again.
score(reg)

## Create prediction csv

In [42]:
# Load the test features and the submission template, then predict with the
# fitted XGBoost model (`reg`). pandas is imported in the notebook's import cell.
# ORIGIN_CALL and the other categorical columns are ignored.
test = np.loadtxt(
    "../data/simple_test.csv",
    dtype=np.float32,  # same dtype as the training matrix
    delimiter=",",
    skiprows=1,        # header row
    ndmin=2,           # stay 2-D (rows, features) even if the file has a single row
)
df_pred = pd.read_csv("../data/sampleSubmission.csv")
preds = reg.predict(test)

In [43]:
# Dump the predictions for a visual sanity check.
# NOTE(review): this prints the whole array; a slice such as preds[:10] would
# keep the saved output short.
print(preds)

[ 684.97534  684.97534  684.97534  684.97534  684.97534  785.2732
  684.97534  785.2732   684.97534  684.97534  684.97534  923.75415
  923.75415  923.75415  923.75415  923.75415  923.75415  923.75415
  684.97534  923.75415  684.97534  785.2732   785.2732   684.97534
  684.97534  684.97534  684.97534  684.97534  684.97534  684.97534
  684.97534  684.97534  923.75415  923.75415  923.75415  923.75415
  785.2732   684.97534  684.97534  684.97534  684.97534  923.75415
  923.75415  684.97534  684.97534  684.97534  923.75415  923.75415
  684.97534  684.97534  684.97534  684.97534  684.97534  684.97534
  684.97534  684.97534  684.97534  684.97534  923.75415  785.2732
  684.97534  923.75415  923.75415  785.2732   684.97534  684.97534
  684.97534  785.2732   923.75415  684.97534 1079.8391   923.75415
  684.97534  923.75415  710.2924   937.01624  710.2924   937.01624
  937.01624  937.01624  806.16376  710.2924   806.16376  806.16376
  710.2924   806.16376  937.01624  937.01624  710.2924   806.163

In [37]:
# Put the model predictions into the TRAVEL_TIME column of the submission
# template and write it to CSV without the DataFrame index.
# NOTE(review): this cell's execution count (37) is lower than that of the cells
# that fit the model and compute `preds` (41-42), so the file on disk may come
# from older predictions; re-run after the prediction cell.
# NOTE(review): the file name says "1000_10" while the visible XGBoost model
# uses n_estimators=100; confirm the name matches the model that produced it.
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./simple_xgboost_1000_10.csv", index=False)