In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import neighbors, ensemble, kernel_ridge, svm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize

## Load data

In [23]:
# Ignore ORIGIN_CALL and other categorical columns
data_arr = np.loadtxt("../data/numeric_trainvf.csv", dtype=np.float32, delimiter=",", skiprows=1)[:, 4:]

# Shuffle the rows with a dedicated, seeded generator so that the split (and
# therefore every score below) is the same on each Restart & Run All.
rng = np.random.default_rng(42)
rng.shuffle(data_arr)

# Split into training/validation sets: the first 90% of the shuffled rows are
# used for training, the remaining rows for validation.
n_train = int(0.9 * data_arr.shape[0])
train_arr = data_arr[:n_train]
valid_arr = data_arr[n_train:]

# The last column is the regression target; the columns before it are features.
weights = train_arr[:, -1]  # same column as train_label

train_label = train_arr[:, -1]
train = train_arr[:, :-1]

valid_label = valid_arr[:, -1]
valid = valid_arr[:, :-1]

In [None]:
def score(model, features=None, labels=None):
    """Print the root-mean-squared error of ``model`` on a labelled data set.

    Args:
        model: Fitted estimator exposing ``predict(features)``.
        features: Rows to predict on. Defaults to the notebook-level
            validation features ``valid``.
        labels: Target values, one per row of ``features``. Defaults to the
            notebook-level ``valid_label``.

    Returns:
        None. The result is printed as ``RMSE: <value>``.
    """
    # Fall back to the validation split so existing ``score(model)`` calls
    # keep working unchanged.
    if features is None:
        features = valid
    if labels is None:
        labels = valid_label

    pred = model.predict(features)
    # Sum of squared residuals divided by the row count, then square-rooted.
    rmse = np.sqrt(np.sum((pred - labels) ** 2) / len(features))
    print(f"RMSE: {rmse}")

## Models

In [27]:
# Ordinary least-squares baseline: fit on the training split, then report
# the RMSE on the validation split.
lin_reg = LinearRegression().fit(train, train_label)
score(lin_reg)

RMSE: 685.4307853188586


In [None]:
# Standard deviation of the linear model's predictions on the validation rows
lin_reg.predict(valid).std()

### KNN Regressor

In [24]:
# k-nearest-neighbours regression using the 10 closest training rows
KNN_Regressor = neighbors.KNeighborsRegressor(n_neighbors=10)
KNN_Regressor.fit(X=train, y=train_label)

In [26]:
# Print the validation RMSE of the fitted KNN model
score(KNN_Regressor)

RMSE: 702.8292068281938


In [None]:
# Standard deviation of the KNN model's predictions on the validation rows
KNN_Regressor.predict(valid).std()

### Gradient Boosting

In [None]:
# Gradient-boosted trees: 200 boosting stages with a learning rate of 0.001.
# NOTE(review): every training row is weighted by its own target value, and
# the learning rate is very small for 200 stages — confirm both are intended.
grad_boost = ensemble.GradientBoostingRegressor(n_estimators=200, learning_rate=0.001)
grad_boost.fit(X=train, y=train_label, sample_weight=train_label)

In [None]:
# Print the validation RMSE of the fitted gradient-boosting model
score(grad_boost)

### Random Forest Regression

In [None]:
# Random forest of 200 trees, built with 2 parallel jobs.
# random_state is pinned so the fitted forest (and its validation RMSE) is the
# same on every run for a given training split.
# NOTE(review): every training row is weighted by its own target value —
# confirm that this weighting is intended.
random_forest = ensemble.RandomForestRegressor(n_estimators=200, n_jobs=2, random_state=42)
random_forest.fit(train, train_label, sample_weight=train_label)

In [None]:
# Print the validation RMSE of the fitted random forest
score(random_forest)

## Create prediction csv

In [25]:
# Ignore ORIGIN_CALL and other categorical columns
# NOTE(review): the training file is sliced from column 4 but this file from
# column 3 — confirm that the remaining feature columns line up.
test = np.loadtxt("../data/numeric_testvf.csv", delimiter=",", skiprows=1)[:, 3:]
df_pred = pd.read_csv("../data/sampleSubmission.csv")
# Keep the predictions in `preds`: the export cell below writes that variable
# into the submission frame.
preds = KNN_Regressor.predict(test)
preds

array([ 925.5,  649.5,  528. , 1008. ,  468. ,  747. ,  591. ,  793.5,
        528. ,  649.5,  664.5,  505.5,  894. ,  505.5,  591. , 1128. ,
        505.5, 1128. ,  681. ,  735. ,  753. ,  673.5,  816. , 1336.5,
        591. , 1389. ,  775.5,  652.5,  544.5,  759. ,  681. ,  750. ,
        561. ,  577.5,  804. , 1092. ,  696. ,  469.5,  591. ,  664.5,
        711. ,  765. ,  577.5,  798. ,  925.5, 1008. ,  603. ,  765. ,
        561. ,  681. ,  594. ,  759. ,  652.5, 1612.5,  591. , 1008. ,
        681. ,  681. , 1092. ,  673.5,  679.5,  577.5,  883.5,  756. ,
        798. ,  681. ,  664.5,  712.5,  936. ,  711. , 1279.5,  505.5,
        528. ,  559.5,  682.5,  709.5, 1353. ,  747. ,  747. ,  945. ,
        787.5,  672. , 1036.5,  592.5, 1429.5,  538.5, 1962. , 1380. ,
        741. , 2089.5,  555. ,  948. ,  715.5,  822. ,  903. ,  720. ,
        823.5, 2089.5,  750. ,  939. ,  754.5,  538.5,  868.5,  799.5,
        535.5,  727.5, 1081.5,  679.5,  991.5,  921. , 2289. ,  913.5,
      

In [None]:
# Store the predictions in the TRAVEL_TIME column of the submission frame
# (`preds` needs one value per row of df_pred), then write the frame to
# ./predictions.csv without the index column.
df_pred["TRAVEL_TIME"] = preds
df_pred.to_csv("./predictions.csv", index=False)