In [34]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, ensemble, kernel_ridge, svm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize
import xgboost as xgb

## Load data

In [35]:
# Ignore ORIGIN_CALL and other categorical columns
np.random.seed(42)
train_arr = np.loadtxt("../data/simple_train.csv", dtype=np.float32, delimiter=",", skiprows=1)
# Split into training/validation sets
np.random.shuffle(train_arr)
valid_arr = train_arr[int(0.9 * train_arr.shape[0]):]
train_arr = train_arr[:int(0.9 * train_arr.shape[0])]
weights = train_arr[:, -1]

train_label = train_arr[:, -1]
train = train_arr[:, :-1]

valid_label = valid_arr[:, -1]
valid = valid_arr[:, :-1]

# train = train[train_label > 50]
# train = train[train_label < 15000]

In [36]:
def score(model):
    pred = model.predict(valid)
    print(f"RMSE: {np.sqrt(np.sum((pred - valid_label)**2) / len(valid))}")

## Models

In [11]:
lin_reg = LinearRegression()
lin_reg.fit(train, train_label)
score(lin_reg)

RMSE: 626.0896381277727


### KNN Regressor

In [5]:
KNN_Regressor = neighbors.KNeighborsRegressor(n_neighbors=10)
KNN_Regressor.fit(train, train_label)

KNeighborsRegressor(n_neighbors=10)

In [9]:
# score(KNN_Regressor)

In [None]:
np.std(KNN_Regressor.predict(valid))

### Gradient Boosting

In [None]:
grad_boost = ensemble.GradientBoostingRegressor(learning_rate=0.001, n_estimators=200)
grad_boost.fit(train, train_label, sample_weight=train_label)

In [None]:
score(grad_boost)

### Random Forest Regression

In [None]:
random_forest = ensemble.RandomForestRegressor()
random_forest.fit(train, train_label)

In [None]:
score(random_forest)

### XGBoost

In [38]:
reg = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=10, max_depth=5)
reg.fit(train, train_label)
score(reg)

RMSE: 626.0294442386097


In [39]:
reg_A = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=10)
reg_A.fit(train[train[:, -3] == 1], train_label[train[:, -3] == 1], sample_weight=weights[train[:, -3] == 1])
score(reg_A)
reg_B = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=10)
reg_B.fit(train[train[:, -2] == 1], train_label[train[:, -2] == 1], sample_weight=weights[train[:, -2] == 1])
score(reg_B)
reg_C = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=10)
reg_C.fit(train[train[:, -1] == 1], train_label[train[:, -1] == 1], sample_weight=weights[train[:, -1] == 1])
score(reg_B)

RMSE: 711.4520695232186
RMSE: 668.5472153042178
RMSE: 668.5472153042178


In [40]:
reg_weighted = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=10)
reg_weighted.fit(train, train_label, sample_weight=weights)
score(reg_weighted)

RMSE: 818.6087403796305


In [42]:
def detailed_score(model):
    pred = model.predict(valid[valid[:, -3] == 1])
    print(f"A RMSE: {np.sqrt(np.sum((pred - valid_label[valid[:, -3] == 1])**2) / len(valid_label[valid[:, -3] == 1]))}")
    pred = model.predict(valid[valid[:, -2] == 1])
    print(f"B RMSE: {np.sqrt(np.sum((pred - valid_label[valid[:, -2] == 1])**2) / len(valid_label[valid[:, -2] == 1]))}")
    pred = model.predict(valid[valid[:, -1] == 1])
    print(f"C RMSE: {np.sqrt(np.sum((pred - valid_label[valid[:, -1] == 1])**2) / len(valid_label[valid[:, -1] == 1]))}")
detailed_score(reg)
print("---A expert---")
detailed_score(reg_A)
print("---B expert---")
detailed_score(reg_B)
print("---C expert---")
detailed_score(reg_C)
print("---")
detailed_score(reg_weighted)

A RMSE: 462.3780449503996
B RMSE: 470.302302187638
C RMSE: 891.7508490351014
---A expert---
A RMSE: 556.9294858550196
B RMSE: 607.3162752117859
C RMSE: 932.1894320894835
---B expert---
A RMSE: 506.91515210687817
B RMSE: 547.1156501886231
C RMSE: 906.5777842192924
---C expert---
A RMSE: 1198.168542142165
B RMSE: 1258.658907040706
C RMSE: 1245.7625784032157
---
A RMSE: 543.9072889575858
B RMSE: 543.5884146469983
C RMSE: 1242.1898737532392


## Create prediction csv

In [30]:
import pandas as pd
# Ignore ORIGIN_CALL and other categorical columns
test = np.loadtxt("../data/simple_test.csv", delimiter=",", skiprows=1)
df_pred = pd.read_csv("../data/sampleSubmission.csv")
preds = reg.predict(test)

In [31]:
print(preds)

[ 660.4614   660.4614   660.4614   660.4614   660.4614   690.3557
  660.4614   690.3557   660.4614   660.4614   660.4614   838.57355
  838.57355  838.57355  838.57355  838.57355  838.57355  838.57355
  660.4614   838.57355  660.4614   690.3557   690.3557   660.4614
  660.4614   660.4614   660.4614   660.4614   660.4614   660.4614
  660.4614   660.4614   838.57355  838.57355  838.57355  838.57355
  690.3557   660.4614   660.4614   660.4614   660.4614   838.57355
  838.57355  660.4614   660.4614   660.4614   838.57355  838.57355
  660.4614   660.4614   660.4614   660.4614   660.4614   660.4614
  660.4614   660.4614   660.4614   660.4614   838.57355  690.3557
  660.4614   838.57355  838.57355  690.3557   660.4614   660.4614
  660.4614   690.3557   838.57355  660.4614  1015.5895   838.57355
  660.4614   838.57355  699.0338   811.1405   699.0338   811.1405
  811.1405   811.1405   805.04297  699.0338   805.04297  805.04297
  699.0338   805.04297  811.1405   811.1405   699.0338   805.04297
  

In [33]:
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./simple_xgboost_10000_5.csv", index=False)