In [1]:
import pandas as pd
import scipy as sp
import matplotlib as plt
import numpy as np
import json
from random import shuffle, random
from sklearn.cross_validation import ShuffleSplit, train_test_split

In [2]:
def HaversineDistance(c1, c2):
    """Great-circle distance in kilometres between two points.

    Each point is indexable as ``point[0]`` (longitude) and ``point[1]``
    (latitude), both in degrees.  The haversine formula is evaluated on a
    sphere of radius 6371 km.
    """
    earth_diameter_km = 2*6371
    # degrees * pi/360 is *half* the angular difference in radians, which is
    # exactly what the haversine terms need.
    half_dlon = np.abs(c1[0]-c2[0])*np.pi/360.0
    half_dlat = np.abs(c1[1]-c2[1])*np.pi/360.0
    cos_lat_product = np.cos(c1[1]*np.pi/180.0) * np.cos(c2[1]*np.pi/180.0)
    a = np.sin(half_dlat)**2 + cos_lat_product * np.sin(half_dlon)**2
    return earth_diameter_km*np.arctan2(np.sqrt(a), np.sqrt(1-a))

def EuclidDistance(c1, c2):
    """Straight-line distance between two 2-D points, in the points' own units."""
    delta_first = c1[0]-c2[0]
    delta_second = c1[1]-c2[1]
    return np.sqrt(delta_first**2 + delta_second**2)


def load_data(num_records_to_load=10,
              train_path='/home/tony/ML/taxi/taxi2_time/train.csv',
              test_path='/home/tony/ML/taxi/taxi2_time/test.csv'):
    """Load the training and submission trips and derive per-trip path features.

    Parameters
    ----------
    num_records_to_load : int
        Number of rows read from the training CSV (``nrows``).  The
        submission CSV is always read in full.
    train_path, test_path : str
        Locations of the two CSV files.  The defaults are the original
        hard-coded locations, so existing callers are unaffected.

    Returns
    -------
    (taxi_df, submission_df)
        ``taxi_df`` keeps only trips with more than 10 recorded points and
        has ``POLYLINE`` (decoded from JSON), ``COORDS_LEN``, ``START`` and
        ``END`` columns.  ``submission_df`` has ``POLYLINE``, ``COORDS_LEN``
        and ``START``.
    """
    def _decode_paths(frame):
        # POLYLINE is stored as a JSON string; decode it and record its length.
        frame['POLYLINE'] = frame['POLYLINE'].apply(json.loads)
        frame['COORDS_LEN'] = frame['POLYLINE'].apply(len)
        return frame

    submission_df = _decode_paths(pd.read_csv(test_path))
    submission_df['START'] = submission_df['POLYLINE'].apply(lambda x: x[0])

    taxi_df = _decode_paths(pd.read_csv(train_path, nrows=num_records_to_load))
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered view (avoids SettingWithCopyWarning).
    taxi_df = taxi_df[taxi_df.COORDS_LEN > 10].copy()
    taxi_df['START'] = taxi_df['POLYLINE'].apply(lambda x: x[0])
    taxi_df['END'] = taxi_df['POLYLINE'].apply(lambda x: x[-1])

    return taxi_df, submission_df


def createTrainTestSplit(df, percent_test=0.1, random_state=None):
    """Randomly split trips into train/test and truncate the test paths.

    Parameters
    ----------
    df : DataFrame
        Must contain ``POLYLINE``, ``COORDS_LEN`` and ``END`` columns.
    percent_test : float
        Probability (a fraction in [0, 1], not a percentage) that any given
        row lands in the test set; the realised test size is therefore only
        approximately ``percent_test * len(df)``.
    random_state : int or None
        Seed for a reproducible split.  ``None`` (default) draws from
        NumPy's global random state, as before.

    Returns
    -------
    (train_df, test_df, train_time, test_time, train_end, test_end)
        ``*_time`` is ``15 * COORDS_LEN`` of the *full* path (presumably 15 s
        per recorded point -- confirm against the dataset description) and
        ``*_end`` is the ``END`` column.  In ``test_df`` the ``POLYLINE`` is
        cut to its first half, the full path is kept in ``POLYLINE_ACTUAL``,
        ``END`` is dropped and ``COORDS_LEN`` is recomputed for the cut path.
    """
    num_rows = len(df)
    if random_state is None:
        draws = np.random.rand(num_rows)
    else:
        draws = np.random.RandomState(random_state).rand(num_rows)
    mask = draws > percent_test

    # Copy both halves so the edits below can never touch (or warn about) `df`.
    train_df, test_df = df[mask].copy(), df[~mask].copy()

    # Targets are taken from the complete paths, before any truncation.
    train_time, test_time = 15*train_df['COORDS_LEN'].values, 15*test_df['COORDS_LEN'].values
    train_end, test_end = train_df['END'].values, test_df['END'].values

    # Save reference to complete path for analytical purposes
    test_df['POLYLINE_ACTUAL'] = test_df['POLYLINE'].values[:]

    # Create partial paths for the test data: keep the first half of each trip.
    test_df['POLYLINE'] = [coord[:int(round(0.5*len(coord)))] for coord in test_df['POLYLINE'].values]

    # Drop all data that we shouldn't have at prediction time, then recompute
    # the length so it describes the truncated path.
    test_df = test_df.drop(['COORDS_LEN', 'END'], axis=1)
    test_df['COORDS_LEN'] = test_df['POLYLINE'].apply(len)

    return train_df, test_df, train_time, test_time, train_end, test_end

def travelTimeScore(pred_times, actual_times):
    """Root mean squared logarithmic error between predicted and actual times.

    Both arguments must support element-wise ``+ 1`` (e.g. NumPy arrays).
    """
    log_residuals = np.log(pred_times+1)-np.log(actual_times+1)
    mean_squared = np.mean(log_residuals**2)
    return np.sqrt(mean_squared)

def travelEndScore(pred_ends, actual_ends):
    """Mean haversine distance between predicted and actual end points.

    The two sequences are compared position by position, over
    ``len(pred_ends)`` entries.
    """
    errors = []
    for position in range(len(pred_ends)):
        errors.append(HaversineDistance(pred_ends[position], actual_ends[position]))
    return np.mean(errors)
    
def submitTravelTime(validation_df, filename):
    """Write the TRIP_ID and TRAVEL_TIME columns to ``filename`` as CSV, without the index."""
    submission_columns = ['TRIP_ID', 'TRAVEL_TIME']
    submission = validation_df[submission_columns]
    submission.to_csv(filename, index=False)
    
def submitTravelDestination(validation_df, filename):
    """Write TRIP_ID, LATITUDE and LONGITUDE to ``filename`` as CSV, without the index.

    ``TRAVEL_END`` entries are read as ``[longitude, latitude]``.  Note that
    the ``LATITUDE`` and ``LONGITUDE`` columns are added to ``validation_df``
    itself as a side effect.
    """
    ends = validation_df['TRAVEL_END']
    validation_df['LATITUDE'] = ends.apply(lambda point: point[1])
    validation_df['LONGITUDE'] = ends.apply(lambda point: point[0])
    submission = validation_df[['TRIP_ID', 'LATITUDE', 'LONGITUDE']]
    submission.to_csv(filename, index=False)

In [35]:
def weightedMeanTravelTimes(train, test, num_trips=100):
    """Predict travel times from the training trips with the nearest start points.

    For every test trip the ``num_trips`` training trips whose ``START`` is
    closest (haversine) are selected, the longest ~5% of them are discarded,
    and the remaining ``COORDS_LEN`` values are averaged with weights
    ``1 / distance**2``.  The prediction is never shorter than the part of
    the trip already observed (``test['COORDS_LEN']``) and is scaled by 15.

    Parameters
    ----------
    train : DataFrame with ``START`` and ``COORDS_LEN`` columns.
    test : DataFrame with ``START`` and ``COORDS_LEN`` columns.  A
        ``TRAVEL_TIME`` column is written to it as a side effect.
    num_trips : int
        Neighbourhood size; must be smaller than ``len(train)`` because it is
        passed to ``np.argpartition`` as the pivot position.

    Returns
    -------
    ndarray of int travel times, one per test row.
    """
    train_starts = train['START'].values
    train_lengths = train['COORDS_LEN'].values
    observed_lengths = test['COORDS_LEN'].values
    num_kept = int(num_trips*.95)

    # Work positionally: `test` usually comes from a boolean-mask split, so
    # its index labels are not 0..n-1 and label-based .loc[idx] would hit the
    # wrong (or a brand-new) row.
    travel_times = np.zeros(len(test))
    for pos, start_coord in enumerate(test['START'].values):
        dists = np.array([HaversineDistance(x, start_coord) for x in train_starts])
        smallest_dist_indexes = np.argpartition(dists, num_trips)[0:num_trips]
        # Floor the distance so identical start points do not divide by zero.
        w = np.maximum(dists[smallest_dist_indexes], 0.01)
        path_lengths = train_lengths[smallest_dist_indexes]
        # Positions (within the neighbour subset) of the shortest ~95% of paths.
        kept = np.argpartition(path_lengths, num_kept)[0:num_kept]
        expected_length = np.average(path_lengths[kept], weights=1/w[kept]**2)
        travel_times[pos] = 15*np.maximum(observed_lengths[pos], expected_length)

    test['TRAVEL_TIME'] = travel_times.astype(int)
    return test['TRAVEL_TIME'].values


def weightedMeanTravelTimes3(train, test, num_trips=100):
    """Array-based nearest-start travel-time predictor.

    For each test trip: take the ``num_trips`` training trips with the
    closest ``START``, keep the ``int(num_trips*.95)`` shortest of them,
    average their ``COORDS_LEN`` with weights ``1 / distance**2``, floor the
    result at the test trip's own ``COORDS_LEN`` and multiply by 15.

    Writes the integer-cast predictions to ``test['TRAVEL_TIME']`` and
    returns the un-cast float array.
    """
    kept_count = int(num_trips*.95)
    predicted = np.zeros(len(test))
    for row, origin in enumerate(test['START'].values):
        gaps = np.array([HaversineDistance(start, origin) for start in train['START'].values])
        neighbours = np.argpartition(gaps, num_trips)[0:num_trips]
        # distances are floored at 0.01 before being inverted into weights
        neighbour_gaps = np.maximum(gaps[neighbours], 0.01)
        neighbour_lengths = train['COORDS_LEN'].values[neighbours]
        shortest = np.argpartition(neighbour_lengths, kept_count)[0:kept_count]
        typical_length = np.average(neighbour_lengths[shortest], weights=1/neighbour_gaps[shortest]**2)
        predicted[row] = 15*np.maximum(test['COORDS_LEN'].values[row], typical_length)

    test['TRAVEL_TIME'] = predicted.astype(int)
    return predicted


# good score! 0.385 locally, 0.556 on leaderboard for 20k examples, 4 num_trips
def weightedMeanTravelTimes_2_Points(train, test, num_trips=100):
    """Predict travel times by matching the first and last observed points.

    A training trip's distance to a (partial) test trip is the haversine
    distance between their first points plus the distance between the test
    trip's last point and the training trip's point at the same position.
    Training trips that are not longer than the partial test trip get a
    fixed penalty of 100 for the second term.

    From the ``num_trips`` closest training trips the ``int(num_trips*.95)``
    shortest are kept; their ``COORDS_LEN`` values are averaged with weights
    ``1 / distance**2``, floored at the test trip's own ``COORDS_LEN`` and
    multiplied by 15.

    Writes integer predictions to ``test['TRAVEL_TIME']`` and returns the
    float prediction array.  ``num_trips`` must be smaller than
    ``len(train)`` (it is used as the ``np.argpartition`` pivot).
    """
    # Column extraction is loop-invariant, so do it once.
    train_paths = train['POLYLINE'].values
    train_lengths = train['COORDS_LEN'].values
    observed_lengths = test['COORDS_LEN'].values
    num_kept = int(num_trips*.95)

    test_times = np.zeros(len(test))
    for idx, test_coord in enumerate(test['POLYLINE'].values):
        partial_len = len(test_coord)
        last = partial_len-1
        dists_starts = np.array([HaversineDistance(x[0], test_coord[0]) for x in train_paths])
        dists_ends = np.array([(HaversineDistance(x[last], test_coord[-1]) if len(x) > partial_len else 100) for x in train_paths])
        dists = dists_starts + dists_ends
        smallest_dist_indexes = np.argpartition(dists, num_trips)[0:num_trips]
        # Floor the distance so exact matches do not divide by zero.
        w = np.maximum(dists[smallest_dist_indexes], 0.01)
        path_lengths = train_lengths[smallest_dist_indexes]
        # Positions (within the neighbour subset) of the shortest ~95% of paths.
        kept = np.argpartition(path_lengths, num_kept)[0:num_kept]
        expected_length = np.average(path_lengths[kept], weights=1/w[kept]**2)
        test_times[idx] = 15*np.maximum(observed_lengths[idx], expected_length)

    test['TRAVEL_TIME'] = test_times.astype(int)
    return test_times
        
def weightedMeanTravelTimes_3_Points(train, test, num_trips=100):
    """Predict travel times by matching first, middle and last observed points.

    A training trip's distance to a (partial) test trip is the sum of three
    haversine distances, taken at the test trip's first, middle
    (``len // 2``) and last positions.  For training trips that are not
    longer than the partial test trip, the middle and last terms are each
    replaced by a fixed penalty of 100.

    From the ``num_trips`` closest training trips the ``int(num_trips*.95)``
    shortest are kept; their ``COORDS_LEN`` values are averaged with weights
    ``1 / distance**2``, floored at the test trip's own ``COORDS_LEN`` and
    multiplied by 15.

    Writes integer predictions to ``test['TRAVEL_TIME']`` and returns the
    float prediction array.  ``num_trips`` must be smaller than
    ``len(train)`` (it is used as the ``np.argpartition`` pivot).
    """
    # Column extraction is loop-invariant, so do it once.
    train_paths = train['POLYLINE'].values
    train_lengths = train['COORDS_LEN'].values
    observed_lengths = test['COORDS_LEN'].values
    num_kept = int(num_trips*.95)

    test_times = np.zeros(len(test))
    for idx, test_coord in enumerate(test['POLYLINE'].values):
        partial_len = len(test_coord)
        mid, last = partial_len // 2, partial_len-1
        dists_starts = np.array([HaversineDistance(x[0], test_coord[0]) for x in train_paths])
        dists_mids = np.array([(HaversineDistance(x[mid], test_coord[mid]) if len(x) > partial_len else 100) for x in train_paths])
        dists_ends = np.array([(HaversineDistance(x[last], test_coord[-1]) if len(x) > partial_len else 100) for x in train_paths])
        dists = dists_starts + dists_mids + dists_ends
        smallest_dist_indexes = np.argpartition(dists, num_trips)[0:num_trips]
        # Floor the distance so exact matches do not divide by zero.
        w = np.maximum(dists[smallest_dist_indexes], 0.01)
        path_lengths = train_lengths[smallest_dist_indexes]
        # Positions (within the neighbour subset) of the shortest ~95% of paths.
        kept = np.argpartition(path_lengths, num_kept)[0:num_kept]
        expected_length = np.average(path_lengths[kept], weights=1/w[kept]**2)
        test_times[idx] = 15*np.maximum(observed_lengths[idx], expected_length)

    test['TRAVEL_TIME'] = test_times.astype(int)
    return test_times


def weightedMeanTravelEnd_3_Points(train, test, num_trips=10):
    """Predict trip destinations from the most similar training trips.

    Similarity is the sum of three haversine distances, taken at the test
    trip's first, middle (``len // 2``) and last observed positions; for
    training trips that are not longer than the partial test trip the middle
    and last terms are each replaced by a fixed penalty of 100.

    Of the ``num_trips`` most similar training trips the
    ``int(num_trips*.95)`` shortest are kept, and the prediction is the
    ``END`` of the kept trip lying in the densest cluster of kept end points
    (largest sum of inverse squared Euclidean distances to the others).
    Despite the function name, no averaging takes place.

    Writes the predictions to ``test['TRAVEL_END']`` and returns them as a
    list.  ``num_trips`` must be smaller than ``len(train)``.
    """
    train_paths = train['POLYLINE'].values
    train_ends = train['END'].values
    train_lengths = train['COORDS_LEN'].values
    num_kept = int(num_trips*.95)

    pred_ends = [[0,0] for _ in range(len(test['POLYLINE'].values))]
    for idx, test_coord in enumerate(test['POLYLINE'].values):
        partial_len = len(test_coord)
        mid, last = partial_len // 2, partial_len-1
        dists_starts = np.array([HaversineDistance(x[0], test_coord[0]) for x in train_paths])
        dists_mids = np.array([(HaversineDistance(x[mid], test_coord[mid]) if len(x) > partial_len else 100) for x in train_paths])
        dists_ends = np.array([(HaversineDistance(x[last], test_coord[-1]) if len(x) > partial_len else 100) for x in train_paths])
        dists = dists_starts + dists_mids + dists_ends
        smallest_dist_indexes = np.argpartition(dists, num_trips)[0:num_trips]

        path_lengths = train_lengths[smallest_dist_indexes]
        kept = np.argpartition(path_lengths, num_kept)[0:num_kept]

        # `kept` holds positions inside the neighbour subset, so it has to be
        # mapped back through `smallest_dist_indexes` before indexing the
        # full training array.  Indexing `train_ends` with it directly would
        # always pick from the first `num_trips` training rows.
        points = train_ends[smallest_dist_indexes[kept]]

        density = [
            np.sum([1/max(0.001, EuclidDistance(points[c], points[k])**2)
                    for c in range(len(points)) if c != k])
            for k in range(len(points))
        ]
        pred_ends[idx] = points[np.argmax(density)]

    test['TRAVEL_END'] = pred_ends
    return pred_ends

def weightedMeanTravelEnd_3_Points_focusOnEnd(train, test, num_trips=10):
    """Predict trip destinations by matching the last few observed points.

    Similarity to a training trip is a weighted sum of haversine distances
    at the test trip's last four observed positions (weights 0.9, 0.8, 0.7,
    0.6 from the last point backwards; positions are clamped at 0 for very
    short trips).  Training trips that are not longer than the partial test
    trip get a fixed penalty of 100 for every term.  Despite the name, four
    points are compared, not three.

    Of the ``num_trips`` most similar training trips the
    ``int(num_trips*.95)`` shortest are kept, and the prediction is the
    ``END`` of the kept trip lying in the densest cluster of kept end points
    (largest sum of inverse squared Euclidean distances to the others).

    Writes the predictions to ``test['TRAVEL_END']`` and returns them as a
    list.  ``num_trips`` must be smaller than ``len(train)``.
    """
    train_paths = train['POLYLINE'].values
    train_ends = train['END'].values
    train_lengths = train['COORDS_LEN'].values
    num_kept = int(num_trips*.95)
    # (how many points back from the end, weight of that comparison)
    lookback_weights = ((1, 0.9), (2, 0.8), (3, 0.7), (4, 0.6))

    pred_ends = [[0,0] for _ in range(len(test['POLYLINE'].values))]
    for idx, test_coord in enumerate(test['POLYLINE'].values):
        partial_len = len(test_coord)
        dists = None
        for lookback, weight in lookback_weights:
            pos = partial_len - min(partial_len, lookback)
            term = weight*np.array([(HaversineDistance(x[pos], test_coord[pos]) if len(x) > partial_len else 100) for x in train_paths])
            dists = term if dists is None else dists + term

        smallest_dist_indexes = np.argpartition(dists, num_trips)[0:num_trips]

        path_lengths = train_lengths[smallest_dist_indexes]
        kept = np.argpartition(path_lengths, num_kept)[0:num_kept]

        # `kept` holds positions inside the neighbour subset, so it has to be
        # mapped back through `smallest_dist_indexes` before indexing the
        # full training array.  Indexing `train_ends` with it directly would
        # always pick from the first `num_trips` training rows.
        points = train_ends[smallest_dist_indexes[kept]]

        density = [
            np.sum([1/max(0.01, EuclidDistance(points[c], points[k])**2)
                    for c in range(len(points)) if c != k])
            for k in range(len(points))
        ]
        pred_ends[idx] = points[np.argmax(density)]

    test['TRAVEL_END'] = pred_ends
    return pred_ends


def weightedMeanTravelEnd_3_Points_with_w(train, test, num_trips=10):
    """Predict trip destinations, weighting candidates by path similarity.

    Similarity is the sum of three haversine distances, taken at the test
    trip's first, middle (``len // 2``) and last observed positions; for
    training trips that are not longer than the partial test trip the middle
    and last terms are each replaced by a fixed penalty of 100.

    Of the ``num_trips`` most similar training trips the
    ``int(num_trips*.95)`` shortest are kept.  Each kept end point is scored
    by the density of the other kept end points around it (sum of inverse
    squared Euclidean distances) divided by its own path distance (floored
    at 0.01), and the highest-scoring end point is the prediction.

    Writes the predictions to ``test['TRAVEL_END']`` and returns them as a
    list.  ``num_trips`` must be smaller than ``len(train)``.
    """
    train_paths = train['POLYLINE'].values
    train_ends = train['END'].values
    train_lengths = train['COORDS_LEN'].values
    num_kept = int(num_trips*.95)

    pred_ends = [[0,0] for _ in range(len(test['POLYLINE'].values))]
    for idx, test_coord in enumerate(test['POLYLINE'].values):
        partial_len = len(test_coord)
        mid, last = partial_len // 2, partial_len-1
        dists_starts = np.array([HaversineDistance(x[0], test_coord[0]) for x in train_paths])
        dists_mids = np.array([(HaversineDistance(x[mid], test_coord[mid]) if len(x) > partial_len else 100) for x in train_paths])
        dists_ends = np.array([(HaversineDistance(x[last], test_coord[-1]) if len(x) > partial_len else 100) for x in train_paths])
        dists = dists_starts + dists_mids + dists_ends
        smallest_dist_indexes = np.argpartition(dists, num_trips)[0:num_trips]

        w = np.maximum(dists[smallest_dist_indexes], 0.01)
        path_lengths = train_lengths[smallest_dist_indexes]
        kept = np.argpartition(path_lengths, num_kept)[0:num_kept]

        # `kept` holds positions inside the neighbour subset, so it has to be
        # mapped back through `smallest_dist_indexes` before indexing the
        # full training array.  Indexing `train_ends` with it directly would
        # always pick from the first `num_trips` training rows.
        points = train_ends[smallest_dist_indexes[kept]]
        w_points = w[kept]

        # `w` is a distance, so a closer (more similar) trip must count for
        # more: divide by it.  Multiplying would favour the least similar
        # neighbours.
        scores = [
            np.sum([1/max(0.01, EuclidDistance(points[c], points[k])**2)
                    for c in range(len(points)) if c != k]) / w_points[k]
            for k in range(len(points))
        ]
        pred_ends[idx] = points[np.argmax(scores)]

    test['TRAVEL_END'] = pred_ends
    return pred_ends

class PointClassifier:
    """Thin wrapper that holds a train/test split and scores travel-time predictions.

    ``fit`` splits the supplied trips with ``createTrainTestSplit`` and keeps
    the result on the instance; ``predict`` runs
    ``weightedMeanTravelTimes_3_Points`` on that split and returns its
    ``travelTimeScore``.
    """

    def __init__(self):
        # Populated by fit(); None means "not fitted yet".
        self.train_df = None
        self.test_df = None
        self.train_time = None
        self.test_time = None
        self.train_end = None
        self.test_end = None

    def fit(self, X, y, num_records=4000, num_test=200):
        """Split the trips in ``X`` and remember the split.

        Parameters
        ----------
        X : DataFrame of trips accepted by ``createTrainTestSplit``.
        y : unused; kept so the signature stays unchanged.
        num_records, num_test : int
            The test fraction passed to the split is
            ``num_test / float(num_records)``.

        Returns
        -------
        self
        """
        (self.train_df, self.test_df,
         self.train_time, self.test_time,
         self.train_end, self.test_end) = createTrainTestSplit(X, num_test/float(num_records))
        return self

    def predict(self, X, y, k_neighbors=5):
        """Score travel-time predictions on the split held out by ``fit``.

        ``X`` and ``y`` are accepted for signature compatibility but are not
        used: predictions are always made for the held-out test split.

        Returns the ``travelTimeScore`` of the predictions.
        Raises ``RuntimeError`` if ``fit`` has not been called.
        """
        if self.train_df is None or self.test_df is None:
            raise RuntimeError("PointClassifier.predict() called before fit()")
        pred_time = weightedMeanTravelTimes_3_Points(self.train_df, self.test_df, k_neighbors)
        return travelTimeScore(pred_time, self.test_time)
    
    

In [8]:
# Scratch check: np.maximum broadcasts the scalar, raising every element below 2 up to 2.
np.maximum(np.array([1,2,3,4,5]), 2)

array([2, 2, 3, 4, 5])

In [208]:
# Scratch check: %d truncates a float argument to its integer part (used for the submission file names).
'%d_%d' % (11,23.3)

'11_23'

In [None]:

def train(num_records=1000, num_averaged=10, cv_folds=10, cv_size=20, submit=False, verbose=False):
    """Cross-validate the 3-point travel-time predictor and optionally write a submission.

    Parameters
    ----------
    num_records : int
        Number of training rows to load.
    num_averaged : int
        Neighbourhood size passed to ``weightedMeanTravelTimes_3_Points``.
    cv_folds : int
        Number of independent random train/test splits to score.
    cv_size : int
        Target number of test trips per split; the split fraction is
        ``cv_size / float(num_records)``.
    submit : bool
        If true, predict the submission set and write it to
        ``sub_time__<num_records>_<num_averaged>_<score*100>.csv``.
    verbose : bool
        If true, print the score after every fold.
    """
    taxi_df, submission_df = load_data(num_records_to_load=num_records)

    scores = []
    for _ in range(cv_folds):
        train_df, test_df, train_time, test_time, train_end, test_end = createTrainTestSplit(taxi_df, cv_size/float(num_records))

        pred_time = weightedMeanTravelTimes_3_Points(train_df, test_df, num_averaged)
        score = travelTimeScore(pred_time, test_time)
        scores.append(score)
        if verbose:
            # Single-argument print(...) produces the same text under Python 2 and 3.
            print("score: %s average: %s" % (score, np.average(scores)))

    print("score: %s , num_records: %s , num_averaged: %s" % (np.average(scores), num_records, num_averaged))

    if submit:
        # Predict from every loaded training trip rather than from whatever
        # subset the last CV fold happened to leave in `train_df`.
        weightedMeanTravelTimes_3_Points(taxi_df, submission_df, num_averaged)
        submitTravelTime(submission_df, 'sub_time__%d_%d_%d.csv' % (num_records, num_averaged, np.average(scores)*100))

# Grid over neighbourhood size (outer loop) and training-set size (inner loop);
# every combination is cross-validated on 5 folds and written out as a submission.
for num_averaged in (5, 25, 100, 1000):
    for num_records in (500000, 1000000):
        train(num_records=num_records, num_averaged=num_averaged,
              cv_folds=5, cv_size=100, submit=True, verbose=False)

In [None]:

def train_dest(num_records=1000, num_averaged=10, cv_folds=10, cv_size=20, submit=False, verbose=False):
    """Cross-validate the end-focused destination predictor and optionally write a submission.

    Parameters
    ----------
    num_records : int
        Number of training rows to load.
    num_averaged : int
        Neighbourhood size passed to
        ``weightedMeanTravelEnd_3_Points_focusOnEnd``.
    cv_folds : int
        Number of independent random train/test splits to score.
    cv_size : int
        Target number of test trips per split; the split fraction is
        ``cv_size / float(num_records)``.
    submit : bool
        If true, predict the submission set and write it to
        ``sub_dest__<num_records>_<num_averaged>_<score*100>.csv``.
    verbose : bool
        If true, print the score after every fold.
    """
    taxi_df, submission_df = load_data(num_records_to_load=num_records)

    scores = []
    for _ in range(cv_folds):
        train_df, test_df, train_time, test_time, train_end, test_end = createTrainTestSplit(taxi_df, cv_size/float(num_records))

        # Scoring belongs inside the loop: one score per fold, not just one
        # for the final split.
        pred_ends = weightedMeanTravelEnd_3_Points_focusOnEnd(train_df, test_df, num_averaged)
        score = travelEndScore(pred_ends, test_end)
        scores.append(score)
        if verbose:
            # Single-argument print(...) produces the same text under Python 2 and 3.
            print("score:  %s avg:  %s" % (score, np.average(scores)))

    print("avg:  %s min: %s max: %s num_records:  %s num_averaged: %s" % (
        np.average(scores), np.min(scores), np.max(scores), num_records, num_averaged))

    if submit:
        # Predict from every loaded training trip, using the same
        # neighbourhood size that was cross-validated above.
        weightedMeanTravelEnd_3_Points_focusOnEnd(taxi_df, submission_df, num_averaged)
        submitTravelDestination(submission_df, 'sub_dest__%d_%d_%d.csv' % (num_records, num_averaged, np.average(scores)*100))
        
# Grid over training-set size (outer loop) and neighbourhood size (inner loop);
# every combination is cross-validated on 5 folds and written out as a submission.
for num_records in (2000, 20000, 100000, 500000, 1000000):
    for num_averaged in (5, 15, 30, 80):
        train_dest(num_records=num_records, num_averaged=num_averaged,
                   cv_folds=5, cv_size=100, submit=True, verbose=False)

In [34]:
# Largest number of recorded points in any submission trip.
# NOTE(review): in the visible cells `submission_df` is only ever created inside
# functions, so this cell relies on kernel state from elsewhere -- confirm it
# survives Restart & Run All.
np.max(submission_df['COORDS_LEN'].values)

612

In [18]:
# Inspect predicted end points.
# NOTE(review): in the visible cells `pred_ends` is only ever a function-local
# name, so this cell relies on kernel state from elsewhere -- confirm it
# survives Restart & Run All.
pred_ends

[[-8.604594, 41.134158],
 [-8.604594, 41.134158],
 [-8.604594, 41.134158],
 [-8.66574, 41.170671],
 [-8.604594, 41.134158],
 [-8.6247, 41.161554],
 [-8.589402, 41.163309],
 [-8.603973, 41.142816],
 [-8.604594, 41.134158],
 [-8.604594, 41.134158],
 [-8.604594, 41.134158]]