In [1]:
# %matplotlib inline

In [12]:
import xgboost
from bayes_opt import BayesianOptimization
import pandas as pd
import numpy as np
import sklearn.preprocessing
import sklearn.neighbors
import sklearn.svm
import functools

In [3]:
def mapkprecision(truthvalues, predictions):
    '''
    Vectorised MAP@k for the case of exactly one true label per observation.

    m ~ number of observations
    k ~ number of ranked guesses per observation (the notebook uses k = 3)

    truthvalues : array-like, shape (m,)
        The single correct label of each observation.
    predictions : array-like, shape (m, k)
        Ranked guesses for each observation, best guess in column 0.

    Returns the mean over observations of 1/rank of the matching guess
    (rank counted from 1), where an observation with no matching guess
    contributes 0.
    '''
    # Coerce to arrays so plain lists / tuples are accepted as well as ndarrays.
    truthvalues = np.asarray(truthvalues)
    predictions = np.asarray(predictions)

    # hits[i, j] is 1.0 where guess j of observation i equals the truth.
    hits = (predictions == truthvalues[:, None]).astype(np.float32)
    # Column j is worth 1/(j+1): a hit in the first column scores 1, second 1/2, ...
    weights = 1./(np.arange(predictions.shape[1], dtype=np.float32) + 1.)
    hits = hits * weights[None, :]
    return np.mean(np.sum(hits, axis=1))

In [4]:
# Load the raw training data; the calendar features derived below are added as
# new columns on this same frame.
train_set = pd.read_csv('train.csv')

# The `time` column is treated as a count of minutes elapsed since this
# reference instant. The unit is passed positionally: that is the documented
# np.datetime64(value, unit) form.
initial_date = np.datetime64('2014-01-01T01:01', 'm')

# Add all minute offsets to the reference instant in one vectorised step
# instead of building one np.timedelta64 per row in Python.
minute_offsets = train_set['time'].values.astype(np.int64).astype('timedelta64[m]')
d_times = pd.DatetimeIndex(initial_date + minute_offsets)

train_set['hour'] = (d_times.hour + d_times.minute/60)  # fractional hour of day
train_set['weekday'] = d_times.weekday
train_set['month'] = d_times.month
train_set['year'] = (d_times.year - 2013)  # 2014 -> 1, 2015 -> 2, ...

In [9]:
def validation_map3_xgboost(xlower, xupper, ylower, yupper, cut_threshold=10, 
                            n_estimators=100, gamma=0.01, subsample=0.95, learning_rate=0.1,
                            colsample_bytree = 1., colsample_bylevel=1., reg_alpha=1.,
                            reg_lambda=0., min_child_weight=0.3, max_depth=4,
                            margin=0.
                           ):
    '''
    Mean validation MAP@3 of an XGBoost classifier on one x/y bin of the
    module-level `train_set`.

    Rows whose x/y fall inside [xlower, xupper) x [ylower, yupper), widened by
    `margin` on every side, are sorted by `time`. The first 80% are always
    used for training. The last 20% are reshuffled N_iter times; on each round
    one half is added to the training rows and the other half, restricted to
    the un-widened bin, is scored. The N_iter MAP@3 scores are averaged.

    xlower, xupper, ylower, yupper : bounds of the bin; an upper bound of
        exactly 10. is nudged up by a small epsilon so rows at 10. are kept.
    cut_threshold : place_ids with fewer training rows than this are dropped
        from the training set.
    margin : extra border around the bin from which training rows are taken.
    n_estimators, max_depth : rounded to the nearest integer before use, so
        float values are accepted.
    remaining keyword arguments : forwarded to xgboost.XGBClassifier.

    Returns the mean MAP@3 over the N_iter resampling rounds.
    '''
    feature_columns = 'x y accuracy hour weekday month year'.split()
    epsilon = 1.0e-5
    
    if xupper == 10.:
        xupper += epsilon
    if yupper == 10.:
        yupper += epsilon
    
    in_widened_bin = ((train_set.x >= xlower - margin) & (train_set.x < xupper + margin)
                      & (train_set.y >= ylower - margin) & (train_set.y < yupper + margin))
    bin_rows = train_set[in_widened_bin].sort_values('time')
    
    eighty_percent_mark = int(0.8*bin_rows.shape[0])
    
    # Chronologically first 80%: part of every training set.
    train_set_bin = bin_rows[:eighty_percent_mark]
    # Chronologically last 20%: reshuffled and split on every round.
    validation_set = bin_rows[eighty_percent_mark:]
    
    N_iter = 5 # this randomly reshuffles the data between the training and testing sets and averages
    # reduces overfitting by bayesian optimizer due to choice of train, validation split
    
    map3_values = []
    
    for _ in range(N_iter):
        shuffled_holdout = validation_set.sample(frac=1.)
    
        # Half of the shuffled hold-out joins the training rows, the other
        # half is scored. pd.concat returns a new frame, so the result has to
        # be bound to a name for the extra rows to reach the training set.
        half_mark = int(shuffled_holdout.shape[0] * 0.5)
        new_train_set_bin = pd.concat([train_set_bin, shuffled_holdout[:half_mark]])
        new_validation_set_bin = shuffled_holdout[half_mark:]
        # Score only rows inside the bin proper, without the margin.
        new_validation_set_bin = new_validation_set_bin[(new_validation_set_bin.x >= xlower) 
                                                        & (new_validation_set_bin.x < xupper)
                                                        & (new_validation_set_bin.y >= ylower)
                                                        & (new_validation_set_bin.y < yupper)
                                                       ]
    
        # Drop training rows whose place_id occurs fewer than cut_threshold times.
        place_counts = new_train_set_bin.place_id.value_counts()
        frequent_enough = new_train_set_bin.place_id.map(place_counts).values >= cut_threshold
        new_train_set_bin = new_train_set_bin[frequent_enough]

        X_train = new_train_set_bin[feature_columns].values
        Y_train = new_train_set_bin['place_id'].values

        X_vali = new_validation_set_bin[feature_columns].values
        Y_vali = new_validation_set_bin['place_id'].values

        classifier = xgboost.XGBClassifier(n_estimators=int(round(n_estimators)), objective='multi:softprob',
                                           learning_rate=float(learning_rate),
                                           gamma=gamma, subsample=subsample,
                                           colsample_bytree = colsample_bytree, 
                                           colsample_bylevel = colsample_bylevel, 
                                           reg_alpha=reg_alpha,
                                           reg_lambda=reg_lambda,
                                           min_child_weight=min_child_weight,
                                           max_depth=int(round(max_depth))
                                          )
        classifier.fit(X_train, Y_train)
        predict_y_vali = classifier.predict_proba(X_vali)
        # Column indices of the three highest probabilities, most probable first.
        predicted_vali_idx = np.argsort(
                predict_y_vali, axis=1)[:, -3:][:, ::-1]
        # classes_ translates probability-column indices back into place_ids.
        map3 = mapkprecision(Y_vali, classifier.classes_.take(predicted_vali_idx))
        map3_values.append(map3)
    return np.mean(map3_values)    

In [10]:
def validation_map3_kNN(xlower, xupper, ylower, yupper, cut_threshold=10, 
                        w_x=500, w_y=1000., w_hour=4., w_weekday=3., w_year=10,
                            margin=0., n_neighbors=25, metric='manhattan'
                           ):
    '''
    Mean validation MAP@3 of a k-nearest-neighbours classifier on one x/y bin
    of the module-level `train_set`.

    Rows whose x/y fall inside [xlower, xupper) x [ylower, yupper), widened by
    `margin` on every side, are sorted by `time`. The first 80% are always
    used for training. The last 20% are reshuffled N_iter times; on each round
    one half is added to the training rows and the other half, restricted to
    the un-widened bin, is scored. The N_iter MAP@3 scores are averaged.

    xlower, xupper, ylower, yupper : bounds of the bin; an upper bound of
        exactly 10. is nudged up by a small epsilon so rows at 10. are kept.
    cut_threshold : place_ids with fewer training rows than this are dropped
        from the training set.
    w_x, w_y, w_hour, w_weekday, w_year : multipliers applied to the matching
        feature before distances are computed. `month` is left unscaled.
    margin : extra border around the bin from which training rows are taken.
    n_neighbors : rounded to the nearest integer before use.
    metric : distance metric name passed to KNeighborsClassifier.

    Returns the mean MAP@3 over the N_iter resampling rounds.
    '''
    feature_columns = 'x y hour weekday month year'.split()
    # There is a degree of freedom where all the values can vary simultaneously and maintain
    # the same ratio. To counteract this, I do not set w_month. All other distances must
    # scale to this one.
    feature_weights = np.array([w_x, w_y, w_hour, w_weekday, 1., w_year], dtype=float)

    epsilon = 1.0e-5
    
    if xupper == 10.:
        xupper += epsilon
    if yupper == 10.:
        yupper += epsilon
    
    in_widened_bin = ((train_set.x >= xlower - margin) & (train_set.x < xupper + margin)
                      & (train_set.y >= ylower - margin) & (train_set.y < yupper + margin))
    bin_rows = train_set[in_widened_bin].sort_values('time')
    
    eighty_percent_mark = int(0.8*bin_rows.shape[0])
    
    # Chronologically first 80%: part of every training set.
    train_set_bin = bin_rows[:eighty_percent_mark]
    # Chronologically last 20%: reshuffled and split on every round.
    validation_set = bin_rows[eighty_percent_mark:]
    
    N_iter = 5 # this randomly reshuffles the data between the training and testing sets and averages
    # reduces overfitting by bayesian optimizer due to choice of train, validation split
    
    map3_values = []
    
    for _ in range(N_iter):
        shuffled_holdout = validation_set.sample(frac=1.)
    
        # Half of the shuffled hold-out joins the training rows, the other
        # half is scored. pd.concat returns a new frame, so the result has to
        # be bound to a name for the extra rows to reach the training set.
        half_mark = int(shuffled_holdout.shape[0] * 0.5)
        new_train_set_bin = pd.concat([train_set_bin, shuffled_holdout[:half_mark]])
        new_validation_set_bin = shuffled_holdout[half_mark:]
        # Score only rows inside the bin proper, without the margin.
        new_validation_set_bin = new_validation_set_bin[(new_validation_set_bin.x >= xlower) 
                                                        & (new_validation_set_bin.x < xupper)
                                                        & (new_validation_set_bin.y >= ylower)
                                                        & (new_validation_set_bin.y < yupper)
                                                       ]
    
        # Drop training rows whose place_id occurs fewer than cut_threshold times.
        place_counts = new_train_set_bin.place_id.value_counts()
        frequent_enough = new_train_set_bin.place_id.map(place_counts).values >= cut_threshold
        new_train_set_bin = new_train_set_bin[frequent_enough]

        # Scale the extracted matrices rather than the frame columns, so no
        # slice of train_set is ever written to.
        X_train = new_train_set_bin[feature_columns].values * feature_weights
        Y_train = new_train_set_bin['place_id'].values

        X_vali = new_validation_set_bin[feature_columns].values * feature_weights
        Y_vali = new_validation_set_bin['place_id'].values

        classifier = sklearn.neighbors.KNeighborsClassifier(int(round(n_neighbors)), metric=metric)
        classifier.fit(X_train, Y_train)
        predict_y_vali = classifier.predict_proba(X_vali)
        # Column indices of the three highest probabilities, most probable first.
        predicted_vali_idx = np.argsort(
                predict_y_vali, axis=1)[:, -3:][:, ::-1]
        # classes_ translates probability-column indices back into place_ids.
        map3 = mapkprecision(Y_vali, classifier.classes_.take(predicted_vali_idx))
        map3_values.append(map3)
    return np.mean(map3_values)    

In [22]:
def validation_map3_SVC(xlower, xupper, ylower, yupper, cut_threshold=10, 
                            margin=0., C=1., gamma=0.1
                           ):
    '''
    Mean validation MAP@3 of an RBF-kernel SVC on one x/y bin of the
    module-level `train_set`.

    Rows whose x/y fall inside [xlower, xupper) x [ylower, yupper), widened by
    `margin` on every side, are sorted by `time`. The first 80% are always
    used for training. The last 20% are reshuffled N_iter times; on each round
    one half is added to the training rows and the other half, restricted to
    the un-widened bin, is scored. The N_iter MAP@3 scores are averaged.

    xlower, xupper, ylower, yupper : bounds of the bin; an upper bound of
        exactly 10. is nudged up by a small epsilon so rows at 10. are kept.
    cut_threshold : place_ids with fewer training rows than this are dropped
        from the training set.
    margin : extra border around the bin from which training rows are taken.
    C, gamma : passed to sklearn.svm.SVC.

    Returns the mean MAP@3 over the N_iter resampling rounds.
    '''
    feature_columns = 'x y hour weekday month year'.split()
    # Fixed per-feature divisors, in feature_columns order. They look like
    # each feature's approximate span (x/y up to 10, 24 hours, 7 weekdays,
    # 12 months, 2 years) -- NOTE(review): confirm against the data.
    feature_scales = np.array([10., 10., 24., 7., 12., 2.])

    epsilon = 1.0e-5
    
    if xupper == 10.:
        xupper += epsilon
    if yupper == 10.:
        yupper += epsilon
    
    in_widened_bin = ((train_set.x >= xlower - margin) & (train_set.x < xupper + margin)
                      & (train_set.y >= ylower - margin) & (train_set.y < yupper + margin))
    bin_rows = train_set[in_widened_bin].sort_values('time')
    
    eighty_percent_mark = int(0.8*bin_rows.shape[0])
    
    # Chronologically first 80%: part of every training set.
    train_set_bin = bin_rows[:eighty_percent_mark]
    # Chronologically last 20%: reshuffled and split on every round.
    validation_set = bin_rows[eighty_percent_mark:]
    
    N_iter = 5 # this randomly reshuffles the data between the training and testing sets and averages
    # reduces overfitting by bayesian optimizer due to choice of train, validation split
    
    map3_values = []
    
    for _ in range(N_iter):
        shuffled_holdout = validation_set.sample(frac=1.)
    
        # Half of the shuffled hold-out joins the training rows, the other
        # half is scored. pd.concat returns a new frame, so the result has to
        # be bound to a name for the extra rows to reach the training set.
        half_mark = int(shuffled_holdout.shape[0] * 0.5)
        new_train_set_bin = pd.concat([train_set_bin, shuffled_holdout[:half_mark]])
        new_validation_set_bin = shuffled_holdout[half_mark:]
        # Score only rows inside the bin proper, without the margin.
        new_validation_set_bin = new_validation_set_bin[(new_validation_set_bin.x >= xlower) 
                                                        & (new_validation_set_bin.x < xupper)
                                                        & (new_validation_set_bin.y >= ylower)
                                                        & (new_validation_set_bin.y < yupper)
                                                       ]
    
        # Drop training rows whose place_id occurs fewer than cut_threshold times.
        place_counts = new_train_set_bin.place_id.value_counts()
        frequent_enough = new_train_set_bin.place_id.map(place_counts).values >= cut_threshold
        new_train_set_bin = new_train_set_bin[frequent_enough]

        # Scale the extracted matrices rather than the frame columns, so no
        # slice of train_set is ever written to.
        X_train = new_train_set_bin[feature_columns].values / feature_scales
        Y_train = new_train_set_bin['place_id'].values

        X_vali = new_validation_set_bin[feature_columns].values / feature_scales
        Y_vali = new_validation_set_bin['place_id'].values

        classifier = sklearn.svm.SVC(C=C, kernel='rbf', probability=True, gamma=gamma, 
                                       decision_function_shape='ovo')
        classifier.fit(X_train, Y_train)
        predict_y_vali = classifier.predict_proba(X_vali)
        # Column indices of the three highest probabilities, most probable first.
        predicted_vali_idx = np.argsort(
                predict_y_vali, axis=1)[:, -3:][:, ::-1]
        # classes_ translates probability-column indices back into place_ids.
        map3 = mapkprecision(Y_vali, classifier.classes_.take(predicted_vali_idx))
        map3_values.append(map3)
    return np.mean(map3_values)    

In [11]:
# Tune the kNN feature weights with the 'manhattan' metric on ten randomly
# chosen 0.1 x 0.1 bins. Both random sources are seeded for repeatability.
np.random.seed(9) # for the bayes optimizer
rs = np.random.RandomState(9) # for bin choices
xranges = np.arange(0., 10., 0.1)
yranges = np.arange(0., 10., 0.1)

for i in range(10):
    # Lower-left corner of the bin examined on this round.
    x = rs.choice(xranges)
    y = rs.choice(yranges)
    
    print("X, Y: {}, {}".format(x, y))
    
    # Pin the bin and the metric; the optimizer varies everything else.
    f = functools.partial(validation_map3_kNN,
                          xlower=x, xupper=x + .1,
                          ylower=y, yupper=y +.1,
                          metric='manhattan')

    # (low, high) search interval for each tunable argument of f.
    knn_bounds = { "cut_threshold": (0, 50),
                   "w_x": (1, 1000),
                   "w_y": (1, 2000),
                   "w_hour": (0, 100),
                   "w_weekday": (0, 100),
                   "w_year": (0, 100),
                   "n_neighbors": (3, 100),
                   "margin": (0., 0.04)
                 }
    bo = BayesianOptimization(f=f, pbounds=knn_bounds, verbose=False)

    # Hand-picked starting points around the function defaults
    # (w_x=500, w_y=1000., w_hour=4., w_weekday=3., w_year=10).
    # Presumably the j-th entries of every list form the j-th point;
    # confirm against the installed bayes_opt version.
    knn_start_points = {'w_x': [250, 500, 1000], 'w_y': [500, 1000, 1500], "w_hour": (2, 4, 8), 
                        "w_weekday": (1, 3, 5), "w_year": (5, 10, 20), "n_neighbors": (10, 25, 40),
                        "margin": (0.005, 0.015, 0.03), "cut_threshold": (2, 6, 12)}
    bo.explore(knn_start_points)

    bo.maximize(n_iter=100, acq="ei", xi=0.0)
    print(bo.res['max'])
    print("\n")

X, Y: 9.2, 5.4


KeyboardInterrupt: 

In [None]:
# Same search as the 'manhattan' run, but with the 'minkowski' metric.
# The seeds are identical, so the same sequence of bins is drawn.
np.random.seed(9) # for the bayes optimizer
rs = np.random.RandomState(9) # for bin choices
xranges = np.arange(0., 10., 0.1)
yranges = np.arange(0., 10., 0.1)

for i in range(10):
    # Lower-left corner of the bin examined on this round.
    x = rs.choice(xranges)
    y = rs.choice(yranges)
    
    print("X, Y: {}, {}".format(x, y))
    
    # Pin the bin and the metric; the optimizer varies everything else.
    f = functools.partial(validation_map3_kNN,
                          xlower=x, xupper=x + .1,
                          ylower=y, yupper=y +.1,
                          metric='minkowski')

    # (low, high) search interval for each tunable argument of f.
    knn_bounds = { "cut_threshold": (0, 50),
                   "w_x": (1, 1000),
                   "w_y": (1, 2000),
                   "w_hour": (0, 100),
                   "w_weekday": (0, 100),
                   "w_year": (0, 100),
                   "n_neighbors": (3, 100),
                   "margin": (0., 0.04)
                 }
    bo = BayesianOptimization(f=f, pbounds=knn_bounds, verbose=False)

    # Hand-picked starting points around the function defaults
    # (w_x=500, w_y=1000., w_hour=4., w_weekday=3., w_year=10).
    # Presumably the j-th entries of every list form the j-th point;
    # confirm against the installed bayes_opt version.
    knn_start_points = {'w_x': [250, 500, 1000], 'w_y': [500, 1000, 1500], "w_hour": (2, 4, 8), 
                        "w_weekday": (1, 3, 5), "w_year": (5, 10, 20), "n_neighbors": (10, 25, 40),
                        "margin": (0.005, 0.015, 0.03), "cut_threshold": (2, 6, 12)}
    bo.explore(knn_start_points)

    bo.maximize(n_iter=100, acq="ei", xi=0.0)
    print(bo.res['max'])
    print("\n")

In [None]:
# Tune the XGBoost hyper-parameters on ten randomly chosen 0.1 x 0.1 bins.
# Both random sources are seeded for repeatability.
np.random.seed(9) # for the bayes optimizer
rs = np.random.RandomState(9) # for bin choices
xranges = np.arange(0., 10., 0.1)
yranges = np.arange(0., 10., 0.1)

for i in range(10):
    # Lower-left corner of the bin examined on this round.
    x = rs.choice(xranges)
    y = rs.choice(yranges)
    
    print("X, Y: {}, {}".format(x, y))
    
    # Pin the bin; the optimizer varies the remaining keyword arguments.
    f = functools.partial(validation_map3_xgboost,
                          xlower=x, xupper=x + .1,
                          ylower=y, yupper=y +.1)

    # (low, high) search interval for each tunable argument of f.
    # Arguments not listed here keep the defaults of validation_map3_xgboost.
    xgb_bounds = {"n_estimators": (30, 300), 
                  "learning_rate": (0.01, 0.5),
                  "cut_threshold": (0, 50),
                  "colsample_bytree": (0.4, 1.),
                  "subsample": (0.5, 1.),
                  "gamma": (0.0, 0.3),
                  "reg_alpha": (0.0, 2.0),
                  "reg_lambda": (0.0, 1.0),
                  "max_depth": (3, 10),
                  "margin": (0., 0.04)
                 }
    bo = BayesianOptimization(f=f, pbounds=xgb_bounds, verbose=False)

    bo.maximize(init_points=5, n_iter=50, acq="ei", xi=0.0)
    print(bo.res['max'])
    print("\n")