In [1]:
from ny_utils import *

import datetime
import matplotlib.pyplot as plt
import os
import pandas as pd
from sklearn.mixture import GMM
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

seed = 0
n_folds = 10

# Read Data into Array from One Person

In [24]:
# plot locations for a user, color in terms of the hour of the day and the day of the week
d = read_NY_NY('Megan.csv')

# get datetime objects from timestamps
datetime_labels = [datetime.datetime.fromtimestamp( lab ) for lab in d[:,2]]

# lat long
Y = d[:,:2]

# hour of the day
hours = [h.hour for h in datetime_labels]

# day of the week
days = [h.weekday() for h in datetime_labels]

# time of day
times = [seconds_from_midnight(h) for h in datetime_labels]

# weekend vs. weekday
day_type = [(i > 4).real for i in days]


## Set up CV sets
cross_val = KFold(Y.shape[0],n_folds = n_folds, shuffle=True, random_state = seed)

## Run KNN as baseline score

In [None]:
## Select features

X = np.array((days,times,day_type)).T # this one was the best -> some information is added to day of week by weekend/weekday
# X = np.array((hours,day_type)).T # day of the week and time of day
# X = np.array((day_type,times)).T # this one was much better

# X = np.array(days)*3600*24+np.array(times)
# X.resize((X.shape[0],1)) # days since midnight on monday - most basic and does alright but doesn't account for something exactly 24 hrs apart being more similar than 12 hours apart

## Normalize
scalar = StandardScaler().fit(X)
X = scalar.transform(X)

In [None]:
nn = range(1,70)
weights = ['uniform','distance']
p = [1,2]

cross_val = KFold(Y.shape[0],n_folds = n_folds, shuffle=True, random_state = seed)

In [None]:
pred = GridSearchCV(KNN(),
                    param_grid = {'n_neighbors' : nn,
                                  'weights' : weights,
                                  'p' : p},
                    n_jobs=-1,
                    cv = cross_val)

In [None]:
pred.fit(X,Y)

In [None]:
pred.best_score_

In [None]:
pred.best_score_

In [None]:
pred.best_params_

## Run GMM-style predictor to compare

### Plan is to fit the generative model, then cycle through all of the previously visited points and pick the location that is most probable given the time

THINGS TO TRY:

- week/weekend + hour features vs. single "hours since monday morning" feature

In [23]:
class GMMPredictor(GMM):

    def __init__(self, n_neighbors=1, n_components=1, covariance_type='diag',
                 random_state=None, tol=1e-3, min_covar=1e-3,
                 n_iter=100, n_init=1, params='wmc', init_params='wmc',
                 verbose=0):
        # n_neighbors describes how many of the most likely previous locations are averaged
        # to make a prediction (much like KNN)
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.covariance_type = covariance_type
        self.tol = tol
        self.min_covar = min_covar
        self.random_state = random_state
        self.n_iter = n_iter
        self.n_init = n_init
        self.params = params
        self.init_params = init_params
        self.verbose = verbose
        
    # using same scoring method as with KNN to compare
    def score(self,X,y,sample_weight=None):
        from sklearn.metrics import r2_score
        return r2_score(y, self.predict(X), sample_weight=sample_weight,
                        multioutput='variance_weighted')

    def fit(self,X,Y, sample_weight = None):
        self.generator = GMM(n_components=self.n_components, covariance_type=self.covariance_type,
                 random_state=self.random_state, tol=self.tol, min_covar=self.min_covar,
                 n_iter=self.n_iter, n_init=self.n_init, params=self.params, init_params=self.init_params,
                 verbose=self.verbose)
        
        self.n_prev_locs = Y.shape[0]
        self.n_features = X.shape[1]
        
        all_data = np.hstack((X,Y))
        
        self.generator.fit(all_data)
        self.Y = Y
    
    def predict(self,X):
        n_X = X.shape[0]
        locs = np.zeros((n_X,self.Y.shape[1]))
        for i in range(n_X):
            thisX = X[i,:]
            X_arr = thisX * np.ones((self.n_prev_locs,self.n_features))
            X_and_loc = np.hstack((X_arr,self.Y))
            
            prob = self.generator.score(X_and_loc)
            ix = prob.argsort()[-self.n_neighbors:]
            top_probs = np.exp(prob[ix])
            total_top_prob = top_probs.sum()
            scaled_probs = top_probs / total_top_prob
            
            mean_loc = np.dot(scaled_probs,Y[ix,:])
            locs[i,:] = mean_loc
        
        return locs
                
            

In [25]:
## Select features

# X = np.array((days,times)).T # day of the week and time of day

# X = np.array((hours,day_type)).T# type of day and hour of the day

# X = np.array(days)*3600*24+np.array(times)
# X.resize((X.shape[0],1)) # days since midnight on monday

X = np.array((times,day_type)).T # type of day and time of day

## Normalize
# scalar = StandardScaler().fit(X)
# X = scalar.transform(X)

In [26]:
components = [1,2,3,4]
nn = [5,60]

In [28]:
pred = GridSearchCV(GMMPredictor(random_state = seed, covariance_type = 'full'),
                    param_grid = {'n_components': components,
                                  'n_neighbors': nn},
                    cv = cross_val)


In [29]:
pred.fit(X,Y)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=949, n_folds=10, shuffle=True, random_state=0),
       error_score='raise',
       estimator=GMMPredictor(covariance_type='full', init_params='wmc', min_covar=0.001,
       n_components=1, n_init=1, n_iter=100, n_neighbors=1, params='wmc',
       random_state=0, tol=0.001, verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [5, 60], 'n_components': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [30]:
pred.best_score_

0.0031744305558522646

In [31]:
pred.best_params_

{'n_components': 1, 'n_neighbors': 60}

In [None]:
pred.best_estimator_.generator.weights_

## Now loop through each person, try GMM and KNN, take best, and make predicitons

In [None]:
best_method = {}
best_params = {}
best_score = {}

In [None]:
# For restarting loops...
already_done = ['Ana','Billy','David','Dianne','Donald','Elisabeth','Garland','George','Heather','Hilda','James',
                'Jerrie']

# loop over all training data files in the directory
for filename in os.listdir("./"):
    if filename.endswith(".csv"):
        if not filename.endswith("_quest.csv"):
            name = filename.split('.')[0]
            if name not in already_done:
                print "Starting", name

                ## Data preprocessing
                d = read_NY_NY(filename)

                # lat long
                Y = d[:,:2]

                # get datetime objects from timestamps
                datetime_labels = [datetime.datetime.fromtimestamp( lab ) for lab in d[:,2]]
                # day of the week
                days = [h.weekday() for h in datetime_labels]
                # time of day
                times = [seconds_from_midnight(h) for h in datetime_labels]
                # weekend vs. weekday
                day_type = [(i > 4).real for i in days]

                # make X dataset
                X = np.array((days,times,day_type)).T
                scalar = StandardScaler().fit(X)
                X = scalar.transform(X)

                # Set up CV sets
                cross_val = KFold(Y.shape[0],n_folds = n_folds, shuffle=True, random_state = seed)


                ## Try KNN
                nn_max = 40
                nn_min = 1
                found_best = False
                knn_iter = 1
                while not found_best:
                    print "KNN iteration:", knn_iter
                    print "nn_max =", nn_max
                    print found_best
                    nn = range(nn_min,nn_max+1)
                    weights = ['uniform','distance']
                    p = [1,2]

                    knn_pred = GridSearchCV(KNN(),
                            param_grid = {'n_neighbors' : nn,
                                          'weights' : weights,
                                          'p' : p},
                            n_jobs=-1,
                            cv = cross_val)
                    knn_pred.fit(X,Y)
                    if knn_pred.best_params_['n_neighbors'] == nn_max:
                        nn_min = nn_max
                        nn_max = nn_min + 40
                        knn_iter += 1
                    else:
                        found_best = True
                knn_score = knn_pred.best_score_
                knn_neighbors = knn_pred.best_params_['n_neighbors']


                ## Try GMM Predictor using close to the same # of neighbors to average over as the 
                comp_min = 1
                comp_max = 4
                nn = np.arange(knn_neighbors-10,knn_neighbors+11,5)
                best_gmm_score = -9999

                gmm_iter = 1
                found_best = False
                while not found_best:
                    print "GMM Iteration:", gmm_iter
                    print "comp range = %i - %i" %(comp_min,comp_max)
                    print "nn range =", nn
                    components = range(comp_min,comp_max+1)

                    gmm_pred = GridSearchCV(GMMPredictor(random_state = seed, covariance_type = 'full'),
                            param_grid = {'n_components': components,
                                          'n_neighbors': nn},
                            n_jobs=-1,
                            cv = cross_val)
                    gmm_pred.fit(X,Y)

                    best_neighbors = gmm_pred.best_params_['n_neighbors']
                    best_comp = gmm_pred.best_params_['n_components']

                    if ((best_neighbors != nn[-1]) & (best_neighbors != nn[0]) & (best_comp != components[-1])) or (gmm_pred.best_score_ < best_gmm_score):
                        found_best = True

                    else:
                        gmm_iter += 1
                        if best_neighbors == nn[0]:
                            nn = np.arange(nn.min()-10,nn.max()-9,5)
                        elif best_neighbors == nn[-1]:
                            nn = np.arange(nn.min()+10,nn.max()+11,5)
                        elif best_comp == components[-1]:
                            comp_max += 4
                            comp_min += 4
                        elif best_comp == components[0]:
                            comp_min -= 4
                            comp_max -= 4
                    best_gmm_score = gmm_pred.best_score_

                gmm_score = gmm_pred.best_score_

                if gmm_score > knn_score:
                    best_method[filename] = 'GMM'
                    final_predictor = gmm_pred
                else:
                    best_method[filename] = 'KNN'
                    final_predictor = knn_pred

                best_params[filename] = final_predictor.best_params_
                best_score[filename] = final_predictor.best_score_
                print "Best Params:", final_predictor.best_params_

                ## make predictions
                test = read_test_data('../new_york_quest/' + name + '_quest.csv')
                # get datetime objects from timestamps
                datetime_labels = [datetime.datetime.fromtimestamp( lab ) for lab in test]
                # day of the week
                days = [h.weekday() for h in datetime_labels]
                # time of day
                times = [seconds_from_midnight(h) for h in datetime_labels]
                # weekend vs. weekday
                day_type = [(i > 4).real for i in days]

                # make X dataset
                X = np.array((days,times,day_type)).T
                X = scalar.transform(X)

                # predict
                predicted = final_predictor.predict(X)

                # save
                name = filename.split('.')[0]
                save_name = name + '_'
                with open('../new_york_ans/' + name + '_quest_ans.csv','w') as f:
                    np.savetxt('../new_york_ans/' + name + '_quest_ans.csv', predicted, delimiter = ',')

                print "Done:", name

Starting John
KNN iteration: 1
nn_max = 40
False




KNN iteration: 2
nn_max = 80
False
KNN iteration: 3
nn_max = 120
False
GMM Iteration: 1
comp range = 1 - 4
nn range = [108 113 118 123 128]
GMM Iteration: 2
comp range = 1 - 4
nn range = [118 123 128 133 138]
GMM Iteration: 3
comp range = 1 - 4
nn range = [128 133 138 143 148]
GMM Iteration: 4
comp range = 1 - 4
nn range = [138 143 148 153 158]
GMM Iteration: 5
comp range = 1 - 4
nn range = [148 153 158 163 168]
GMM Iteration: 6
comp range = 1 - 4
nn range = [158 163 168 173 178]




Best Params: {'n_neighbors': 118, 'weights': 'distance', 'p': 1}
Done: John
Starting Latasha
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [-3  2  7 12 17]
GMM Iteration: 2
comp range = 1 - 4
nn range = [ 7 12 17 22 27]
GMM Iteration: 3
comp range = 5 - 8
nn range = [ 7 12 17 22 27]
Best Params: {'n_neighbors': 7, 'weights': 'uniform', 'p': 1}
Done: Latasha
Starting Lee
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [23 28 33 38 43]
GMM Iteration: 2
comp range = 1 - 4
nn range = [33 38 43 48 53]
GMM Iteration: 3
comp range = 1 - 4
nn range = [43 48 53 58 63]
GMM Iteration: 4
comp range = 1 - 4
nn range = [53 58 63 68 73]
GMM Iteration: 5
comp range = 1 - 4
nn range = [63 68 73 78 83]
GMM Iteration: 6
comp range = 1 - 4
nn range = [73 78 83 88 93]
GMM Iteration: 7
comp range = 1 - 4
nn range = [ 83  88  93  98 103]




Best Params: {'n_neighbors': 33, 'weights': 'uniform', 'p': 2}
Done: Lee
Starting Martin
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [ 2  7 12 17 22]
GMM Iteration: 2
comp range = 1 - 4
nn range = [12 17 22 27 32]
Best Params: {'n_neighbors': 12, 'weights': 'uniform', 'p': 2}
Done: Martin
Starting Mary
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [-7 -2  3  8 13]




Best Params: {'n_neighbors': 3, 'weights': 'distance', 'p': 1}
Done: Mary
Starting Megan
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [22 27 32 37 42]
GMM Iteration: 2
comp range = 1 - 4
nn range = [32 37 42 47 52]
GMM Iteration: 3
comp range = 1 - 4
nn range = [42 47 52 57 62]
GMM Iteration: 4
comp range = 1 - 4
nn range = [52 57 62 67 72]




Best Params: {'n_neighbors': 32, 'weights': 'distance', 'p': 1}
Done: Megan
Starting Mildred
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [26 31 36 41 46]
GMM Iteration: 2
comp range = 1 - 4
nn range = [16 21 26 31 36]
GMM Iteration: 3
comp range = 1 - 4
nn range = [ 6 11 16 21 26]
GMM Iteration: 4
comp range = 1 - 4
nn range = [-4  1  6 11 16]
GMM Iteration: 5
comp range = 1 - 4
nn range = [-14  -9  -4   1   6]
GMM Iteration: 6
comp range = 1 - 4
nn range = [-24 -19 -14  -9  -4]
GMM Iteration: 7
comp range = 1 - 4
nn range = [-34 -29 -24 -19 -14]




Best Params: {'n_neighbors': -24, 'n_components': 3}
Done: Mildred
Starting Patricia
KNN iteration: 1
nn_max = 40
False




KNN iteration: 2
nn_max = 80
False
GMM Iteration: 1
comp range = 1 - 4
nn range = [67 72 77 82 87]
GMM Iteration: 2
comp range = 1 - 4
nn range = [57 62 67 72 77]
GMM Iteration: 3
comp range = 1 - 4
nn range = [47 52 57 62 67]
Best Params: {'n_neighbors': 57, 'n_components': 3}
Done:



 Patricia
Starting Philip
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [18 23 28 33 38]
GMM Iteration: 2
comp range = 1 - 4
nn range = [ 8 13 18 23 28]
Best Params: {'n_neighbors': 28, 'weights': 'uniform', 'p': 2}
Done: Philip
Starting Ramiro
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [26 31 36 41 46]
GMM Iteration: 2
comp range = 1 - 4
nn range = [36 41 46 51 56]
GMM Iteration: 3
comp range = 1 - 4
nn range = [46 51 56 61 66]
GMM Iteration: 4
comp range = 1 - 4
nn range = [56 61 66 71 76]
GMM Iteration: 5
comp range = 1 - 4
nn range = [66 71 76 81 86]
GMM Iteration: 6
comp range = 1 - 4
nn range = [76 81 86 91 96]
GMM Iteration: 7
comp range = 1 - 4
nn range = [ 86  91  96 101 106]




Best Params: {'n_neighbors': 36, 'weights': 'distance', 'p': 2}
Done: Ramiro
Starting Robert
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [-2  3  8 13 18]
GMM Iteration: 2
comp range = 1 - 4
nn range = [-12  -7  -2   3   8]




Best Params: {'n_neighbors': 8, 'weights': 'uniform', 'p': 2}
Done: Robert
Starting Rosalie
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [29 34 39 44 49]
GMM Iteration: 2
comp range = 1 - 4
nn range = [39 44 49 54 59]
GMM Iteration: 3
comp range = 1 - 4
nn range = [49 54 59 64 69]
GMM Iteration: 4
comp range = 1 - 4
nn range = [59 64 69 74 79]
GMM Iteration: 5
comp range = 1 - 4
nn range = [69 74 79 84 89]
GMM Iteration: 6
comp range = 1 - 4
nn range = [79 84 89 94 99]
GMM Iteration: 7
comp range = 1 - 4
nn range = [ 89  94  99 104 109]
GMM Iteration: 8
comp range = 1 - 4
nn range = [ 99 104 109 114 119]
GMM Iteration: 9
comp range = 1 - 4
nn range = [109 114 119 124 129]




Best Params: {'n_neighbors': 39, 'weights': 'uniform', 'p': 2}
Done: Rosalie
Starting Rose
KNN iteration: 1
nn_max = 40
False




KNN iteration: 2
nn_max = 80
False
GMM Iteration: 1
comp range = 1 - 4
nn range = [30 35 40 45 50]
GMM Iteration: 2
comp range = 1 - 4
nn range = [40 45 50 55 60]
GMM Iteration: 3
comp range = 1 - 4
nn range = [50 55 60 65 70]
GMM Iteration: 4
comp range = 1 - 4
nn range = [60 65 70 75 80]
GMM Iteration: 5
comp range = 1 - 4
nn range = [70 75 80 85 90]




Best Params: {'n_neighbors': 40, 'weights': 'uniform', 'p': 2}
Done: Rose
Starting Ruth
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [18 23 28 33 38]
GMM Iteration: 2
comp range = 1 - 4
nn range = [28 33 38 43 48]




Best Params: {'n_neighbors': 28, 'weights': 'uniform', 'p': 1}
Done: Ruth
Starting Ryan
KNN iteration: 1
nn_max = 40
False




GMM Iteration: 1
comp range = 1 - 4
nn range = [21 26 31 36 41]
GMM Iteration: 2
comp range = 1 - 4
nn range = [31 36 41 46 51]
GMM Iteration: 3
comp range = 1 - 4
nn range = [41 46 51 56 61]
GMM Iteration: 4
comp range = 5 - 8
nn range = [41 46 51 56 61]
