In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors, ensemble, kernel_ridge, svm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize
import xgboost as xgb
# in life, there exists only pain

## Load data

In [2]:
# Ignore ORIGIN_CALL and other categorical columns
np.random.seed(42)
train_arr = np.loadtxt("../data/no_coord_train.csv", dtype=np.float32, delimiter=",", skiprows=1)[:, 4:]
mean = train_arr[:, -1].mean()
std = train_arr[:, -1].std()
train_arr = train_arr[train_arr[:, -1] < mean + 5 * std]
train_arr = train_arr[train_arr[:, -1] > 50]
# Split into training/validation sets
np.random.shuffle(train_arr)
valid_arr = train_arr[int(0.9 * train_arr.shape[0]):]
train_arr = train_arr[:int(0.9 * train_arr.shape[0])]
weights = train_arr[:, -1]

train_label = train_arr[:, -1]
train = train_arr[:, :-1]

valid_label = valid_arr[:, -1]
valid = valid_arr[:, :-1]

In [5]:
def score(model):
    pred = model.predict(valid)
    print(f"RMSE: {np.sqrt(np.sum((pred - valid_label)**2) / len(valid))}")

## Models

In [4]:
lin_reg = LinearRegression()
lin_reg.fit(train, train_label)
score(lin_reg)

RMSE: 439.5314321575553


### KNN Regressor

In [5]:
KNN_Regressor = neighbors.KNeighborsRegressor(n_neighbors=10)
KNN_Regressor.fit(train, train_label)

KNeighborsRegressor(n_neighbors=10)

In [6]:
score(KNN_Regressor)

RMSE: 450.6238864979781


### Gradient Boosting

In [7]:
grad_boost = ensemble.GradientBoostingRegressor(learning_rate=0.001, n_estimators=200)
grad_boost.fit(train, train_label, sample_weight=train_label)

GradientBoostingRegressor(learning_rate=0.001, n_estimators=200)

In [8]:
score(grad_boost)

RMSE: 516.1063805958696


### Random Forest Regression

In [9]:
random_forest = ensemble.RandomForestRegressor(n_estimators=200, n_jobs=2, max_depth=5)
random_forest.fit(train, train_label)

RandomForestRegressor(max_depth=5, n_estimators=200, n_jobs=2)

In [10]:
score(random_forest)

RMSE: 434.170656025221


In [11]:
np.mean(train_label)

723.00085

In [12]:
print(f"RMSE: {np.sqrt(np.sum((747.66943 - valid_label)**2) / len(valid))}")

RMSE: 445.6634218868758


### XGBoost

In [13]:
reg = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=100, max_depth=5)
reg.fit(train, train_label, sample_weight=weights)
score(reg)

RMSE: 504.1950399821682


In [17]:
import pandas as pd
test = np.loadtxt("../data/no_coord_test.csv", delimiter=",", skiprows=1)[:, 3:]
df_pred = pd.read_csv("../data/sampleSubmission.csv")
preds = lin_reg.predict(test)
preds = KNN_Regressor.predict(test)
preds = grad_boost.predict(test)

df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./grad_boost_nocoord_prune.csv", index=False)
preds = random_forest.predict(test)

df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./random_forest_nocoord_prune.csv", index=False)
preds = reg.predict(test)

df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./xgboost_nocoord_prune.csv", index=False)

## Create prediction csv

In [None]:
import pandas as pd
# Ignore ORIGIN_CALL and other categorical columns
test = np.loadtxt("../data/no_coord_test.csv", delimiter=",", skiprows=1)[:, 3:]
df_pred = pd.read_csv("../data/sampleSubmission.csv")
preds = reg.predict(test)

In [None]:
print(preds)

In [None]:
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./predictions.csv", index=False)