In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geohash

In [39]:
# Load the engineered features and discard the first CSV column
# (presumably a saved row index — TODO confirm against the CSV writer).
featuredDataset = pd.read_csv('featured-dataset.csv').pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)
featuredDataset.head(5)

Unnamed: 0,year,month,day,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,x_start,y_start,z_start,location_start,location_end
0,2017,3,22,16.833333333333332:49.999999999999886,0.777778,0.173648,-0.984808,Wednesday,0.396825,-0.797133,0.603804,0,-0.460713,-0.120608,0.879316,eyckx6e80,eycs8byzz
1,2017,5,24,12.333333333333334:20.0,0.569444,-0.906308,-0.422618,Wednesday,0.367063,-0.670981,0.741474,0,0.612977,-0.738237,-0.281542,u0qjdqxen,u0qjf9jxq
2,2017,5,24,12.35:21.0,0.570833,-0.902585,-0.430511,Wednesday,0.367262,-0.671905,0.740637,0,0.612977,-0.738237,-0.281542,u0qjdqxen,u0qjf9jxq
3,2017,5,24,12.35:21.0,0.570833,-0.902585,-0.430511,Wednesday,0.367262,-0.671905,0.740637,0,0.612977,-0.738237,-0.281542,u0qjdqxen,u0qjf9jxq
4,2017,5,24,12.35:21.0,0.570833,-0.902585,-0.430511,Wednesday,0.367262,-0.671905,0.740637,0,0.612977,-0.738237,-0.281542,u0qjdqxen,u0qjf9jxq


In [40]:
# (rows, columns) of the frame after the first CSV column was dropped
featuredDataset.shape

(1537, 17)

### Feature extraction

In [41]:

# Get the latitude (which=0) or longitude (which=1) from the geohash
def decodegeo(geo, which):
    """Return one coordinate decoded from a geohash.

    Parameters
    ----------
    geo : str
        Geohash to decode. Values that have no length (for example the
        float NaN or None that pandas uses for a missing cell) are treated
        as unusable instead of raising ``TypeError``.
    which : int
        Index into the tuple returned by ``geohash.decode``. This notebook
        passes 0 for the ``*_lat`` columns and 1 for the ``*_lon`` columns.

    Returns
    -------
    The selected coordinate, or ``0`` when ``geo`` is missing or shorter
    than six characters.
    """
    try:
        long_enough = len(geo) >= 6
    except TypeError:
        # Missing values are not sized; fall through to the same sentinel
        # that short geohashes already get.
        long_enough = False

    if not long_enough:
        return 0
    return geohash.decode(geo)[which]
    
def further_data_prep(df):
    """Add decoded start/end coordinates to ``df`` and return it.

    The frame passed in is modified in place: the geohashes in
    ``location_start`` and ``location_end`` are decoded with ``decodegeo``
    and stored in ``start_lat``, ``start_lon``, ``end_lat`` and ``end_lon``
    (created in that order). The same frame object is returned.
    """
    # (new column, geohash source column, index handed to decodegeo)
    coordinate_plan = (
        ('start_lat', 'location_start', 0),
        ('start_lon', 'location_start', 1),
        ('end_lat', 'location_end', 0),
        ('end_lon', 'location_end', 1),
    )
    for target, source, part in coordinate_plan:
        # Bind `part` as a default so each lambda keeps its own index.
        df[target] = df[source].apply(lambda geo, part=part: decodegeo(geo, part))

    return df

In [42]:
# further_data_prep adds the coordinate columns to the frame it is given and
# returns that same object, so this rebinding does not create a copy.
featuredDataset = further_data_prep(featuredDataset)
featuredDataset.head(5)

Unnamed: 0,year,month,day,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,...,weekend,x_start,y_start,z_start,location_start,location_end,start_lat,start_lon,end_lat,end_lon
0,2017,3,22,16.833333333333332:49.999999999999886,0.777778,0.173648,-0.984808,Wednesday,0.396825,-0.797133,...,0,-0.460713,-0.120608,0.879316,eyckx6e80,eycs8byzz,38.77352,-9.168756,38.765237,-9.098074
1,2017,5,24,12.333333333333334:20.0,0.569444,-0.906308,-0.422618,Wednesday,0.367063,-0.670981,...,0,0.612977,-0.738237,-0.281542,u0qjdqxen,u0qjf9jxq,47.409289,8.546956,47.423751,8.555195
2,2017,5,24,12.35:21.0,0.570833,-0.902585,-0.430511,Wednesday,0.367262,-0.671905,...,0,0.612977,-0.738237,-0.281542,u0qjdqxen,u0qjf9jxq,47.409289,8.546956,47.423751,8.555195
3,2017,5,24,12.35:21.0,0.570833,-0.902585,-0.430511,Wednesday,0.367262,-0.671905,...,0,0.612977,-0.738237,-0.281542,u0qjdqxen,u0qjf9jxq,47.409289,8.546956,47.423751,8.555195
4,2017,5,24,12.35:21.0,0.570833,-0.902585,-0.430511,Wednesday,0.367262,-0.671905,...,0,0.612977,-0.738237,-0.281542,u0qjdqxen,u0qjf9jxq,47.409289,8.546956,47.423751,8.555195


### Train-test split
We split the data into an 80% train set and a 20% test set. Cross-validation (below) runs on the train set only; the test set is held out for the final evaluation.

In [43]:
# Column bookkeeping: every available column, the model inputs, and the
# two regression targets (decoded end coordinates).
columns_all_features = featuredDataset.columns
columns_X = ['day_num', 'x_start', 'y_start', 'z_start']
columns_y = ['end_lat', 'end_lon']

# Feature matrix and target matrix, selected by name.
X, y = featuredDataset[columns_X], featuredDataset[columns_y]

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Report the (rows, columns) shape of the full data and of each split.
# One joined print emits the same six lines as six separate prints, and the
# generator's loop variables do not leak into the notebook namespace.
print('\n'.join(
    template.format(*frame.shape)
    for template, frame in (
        ('X: ({}, {})', X),
        ('y: ({}, {})', y),
        ('X_train: ({}, {})', X_train),
        ('y_train: ({}, {})', y_train),
        ('X_test: ({}, {})', X_test),
        ('y_test: ({}, {})', y_test),
    )
))

X: (1537, 4)
y: (1537, 2)
X_train: (1229, 4)
y_train: (1229, 2)
X_test: (308, 4)
y_test: (308, 2)


### Machine Learning

In [46]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [47]:
# Function for cross-validation over a grid of parameters

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    """Grid-search ``parameters`` for ``clf`` and return the best estimator.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``GridSearchCV``.
    parameters : dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    X, y
        Data the search is fitted on.
    n_jobs : int
        Forwarded to ``GridSearchCV`` as ``n_jobs``.
    n_folds : int
        Forwarded to ``GridSearchCV`` as ``cv``.
    score_func : optional
        Forwarded as ``scoring`` when truthy; otherwise the search uses its
        default scoring.

    Returns
    -------
    The ``best_estimator_`` of the fitted search. The best parameters, best
    score and the full ``cv_results_`` dict are printed as a side effect.
    """
    # scoring=None is GridSearchCV's default, so a falsy score_func behaves
    # exactly as if the argument had been omitted.
    search = GridSearchCV(
        clf,
        param_grid=parameters,
        cv=n_folds,
        n_jobs=n_jobs,
        scoring=score_func or None,
    )
    search.fit(X, y)
    print ("BEST", search.best_params_, search.best_score_, search.cv_results_)
    return search.best_estimator_

#### Grid Search Cross Validation
Grid Search CV implements an exhaustive search over specified parameter values for an estimator.
Important members are fit, predict.

It iterates over a dictionary of hyperparameter values and chooses the combination that achieves the best cross-validated score.

In [48]:
# Create a k-Nearest Neighbors Regression estimator
knn_estimator = KNeighborsRegressor()
#knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
knn_parameters = {"n_neighbors": [1,2,5]}
knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')

BEST {'n_neighbors': 5} -3.15864936119 {'split0_test_score': array([-0.00151617, -0.00060575, -0.00048735]), 'split1_test_score': array([ -4.34287932e-05,  -7.69864045e-05,  -2.25453448e-05]), 'split2_test_score': array([-15.80417493, -15.80246456, -15.77927105]), 'split3_test_score': array([-0.00066441, -0.00045609, -0.0004521 ]), 'split4_test_score': array([-0.00070371, -0.00037731, -0.00017443]), 'mean_test_score': array([-3.16399231, -3.16336768, -3.15864936]), 'std_test_score': array([ 6.32330521,  6.32276204,  6.31351975]), 'rank_test_score': array([3, 2, 1], dtype=int32), 'split0_train_score': array([-0.        , -0.98888218, -2.53179367]), 'split1_train_score': array([-0.        , -0.98889181, -2.53703436]), 'split2_train_score': array([-0.        , -0.00015217, -0.0002098 ]), 'split3_train_score': array([-0.        , -0.99280409, -2.53649995]), 'split4_train_score': array([-0.        , -0.98783378, -2.52992351]), 'mean_train_score': array([ 0.        , -0.7917128 , -2.02709226

#### Model accuracy: R-Squared and Root-mean-squared deviation
R-squared is a statistical measure of how close the data are to the fitted regression line. The best possible value is 1; a model that always predicts the mean scores 0, and scikit-learn's `score` can return negative values for a model that does worse than that.
RMSE is the square root of the mean squared error. In other words, it is the typical distance of a data point from the fitted line, measured along a vertical line.

In [49]:
# Fit the best k-NN regressor (selected by the grid search) and calculate R^2 values for training and test sets
knn_reg=knn_best.fit(X_train, y_train)
# For a scikit-learn regressor, .score() returns R^2, so the "accuracy" in
# these variable names is the coefficient of determination.
knn_training_accuracy = knn_reg.score(X_train, y_train)
knn_test_accuracy = knn_reg.score(X_test, y_test)
# NOTE(review): the recorded output shows a higher R^2 on the test set than
# on the training set — worth checking whether a few outlier trips dominate
# the variance of one split.
print ("############# based on standard predict ################")
print ("R^2 on training data: %0.8f" % (knn_training_accuracy))
print ("R^2 on test data:     %0.8f" % (knn_test_accuracy))

############# based on standard predict ################
R^2 on training data: 0.97231458
R^2 on test data:     0.99999609


In [50]:
# Calculate the Root Mean Squared Error on the held-out test set.
# scikit-learn metrics take (y_true, y_pred) in that order; MSE happens to be
# symmetric, but following the signature keeps this safe to adapt to metrics
# that are not.
np.sqrt(mean_squared_error(y_test, knn_reg.predict(X_test)))

0.020617576356285399

In [51]:
# Draw 10 rows (features followed by targets) for a quick sanity check of
# the model. The seed matches the one used for the train/test split so the
# sample, and the predictions made from it, are the same on every run.
sampleds = featuredDataset[columns_X + columns_y].sample(10, random_state=42)
sampleds

Unnamed: 0,day_num,x_start,y_start,z_start,end_lat,end_lon
645,0.384325,0.612977,-0.738237,-0.281542,47.423751,8.555195
1100,0.3875,0.612977,-0.738237,-0.281542,47.423751,8.555195
270,0.372421,0.612977,-0.738237,-0.281542,47.423751,8.555195
1510,0.401984,0.612977,-0.738237,-0.281542,47.423751,8.555195
858,0.388294,0.612977,-0.738237,-0.281542,47.423751,8.555195
1219,0.390079,0.612977,-0.738237,-0.281542,47.423751,8.555195
123,0.374802,0.612977,-0.738237,-0.281542,47.423751,8.555195
567,0.382738,0.612977,-0.738237,-0.281542,47.423751,8.555195
1391,0.394444,0.612977,-0.738237,-0.281542,47.423751,8.555195
763,0.386508,0.612977,-0.738237,-0.281542,47.423751,8.555195


In [52]:
# Select the feature columns by name rather than by position, so this does
# not depend on how many target columns sit at the end of `sampleds`.
y_pred = knn_reg.predict(sampleds[columns_X])
y_pred

array([[ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533],
       [ 47.42375135,   8.55519533]])

### Save the model
We dump the trained model into a file so that we can later load and use it without having to fit it again.

In [53]:
# sklearn.externals.joblib was deprecated and later removed from
# scikit-learn; prefer the standalone joblib package and fall back to the
# bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted regressor; joblib.dump returns the list of files written.
joblib.dump(knn_reg, 'k_nearest_model.pkl')

['k_nearest_model.pkl']