In [3]:
import pandas as pd
import scipy as sp
import matplotlib as plt
import numpy as np
import json
from random import shuffle, random
from sklearn.cross_validation import ShuffleSplit, train_test_split

In [41]:
def HaversineDistance(c1, c2): 
  lon_diff = np.abs(c1[0]-c2[0])*np.pi/360.0
  lat_diff = np.abs(c1[1]-c2[1])*np.pi/360.0
  a = np.sin(lat_diff)**2 + np.cos(c1[1]*np.pi/180.0) * np.cos(c2[1]*np.pi/180.0) * np.sin(lon_diff)**2
  d = 2*6371*np.arctan2(np.sqrt(a), np.sqrt(1-a))
  return d


def load_data(num_records_to_load=10):
    submission_df = pd.read_csv('/home/tony/ML/taxi/taxi2_time/test.csv')
    submission_df['POLYLINE'] = submission_df['POLYLINE'].apply(json.loads)
    submission_df['COORDS_LEN'] = submission_df['POLYLINE'].apply(len)
    submission_df['START'] = submission_df['POLYLINE'].apply(lambda x: x[0])

    # read train
    taxi_df = pd.read_csv('/home/tony/ML/taxi/taxi2_time/train.csv', nrows=num_records_to_load)
    taxi_df['POLYLINE'] = taxi_df['POLYLINE'].apply(json.loads)
    taxi_df['COORDS_LEN'] = taxi_df['POLYLINE'].apply(len)
    taxi_df = taxi_df[taxi_df.COORDS_LEN > 10]
    taxi_df['START'] = taxi_df['POLYLINE'].apply(lambda x: x[0])
    taxi_df['END'] = taxi_df['POLYLINE'].apply(lambda x: x[-1])
    
    return taxi_df, submission_df


def createTrainTestSplit(df, percent_test=0.1):
    num_rows = len(df)
    num_train = num_rows - int(num_rows*percent_test)
    mask = np.random.rand(num_rows) > percent_test
    
    train_df, test_df = df[mask], df[~mask]
    train_time, test_time = 15*train_df['COORDS_LEN'].values, 15*test_df['COORDS_LEN'].values
    train_end, test_end = train_df['END'].values, test_df['END'].values

    # Save reference to complete path for analytical purposes
    test_df['POLYLINE_ACTUAL'] = test_df['POLYLINE'].values[:]
    
    #Create partial paths for the test data
    #coords = test_df['POLYLINE'].values[:]
    #partial_lengths = [round(0.5*len(coord)) for coord in coords]
    #test_df['POLYLINE'] = [coords[:n] for n in partial_lengths]
    test_df['POLYLINE'] = [coord[:int(round(0.5*len(coord)))] for coord in test_df['POLYLINE'].values]
    
    
    #Drop all data that we shouldn't have during training
    test_df = test_df.drop(['COORDS_LEN', 'END'], axis=1)
    test_df['COORDS_LEN'] = test_df['POLYLINE'].apply(len)
    
    return train_df, test_df, train_time, test_time, train_end, test_end

def travelTimeScore(pred_times, actual_times):
    score = np.sqrt(np.mean((np.log(pred_times+1)-np.log(actual_times+1))**2))
    return score

def travelEndScore(pred_ends, actual_ends):
    num_points = len(pred_ends)
    preds, actuals = pred_ends.values, actual_ends.values
    np.mean([HaversineDistance(preds[i], actuals[i]) for i in range(num_points)])

def submitTravelTime(validation_df, filename):
    validation_df[['TRIP_ID', 'TRAVEL_TIME']].to_csv(filename, index=False)
    
def submitTravelDestination(validation_df, filename):
    validation_df[['TRIP_ID', 'LATITUDE', 'LONGITUDE']].to_csv(filename, index=False)

In [55]:
t1, t2 = np.array([1,2,3]), np.array([12,13,14])
np.sqrt(np.mean((np.log(t1+1)-np.log(t2+1))**2)), travelTimeScore(t1, t2)
#travelEndScore(train_df['END'][0:4], train_df['END'][5:9])
c1, c2 = train_df['END'][0:4].values, train_df['END'][5:9].values
num_points = len(c1)
num_points
[c1[i] for i in range(num_points)]


[[-8.66574, 41.170671],
 [-8.61597, 41.14053],
 [-8.607996, 41.142915],
 [-8.578224, 41.160717]]

In [22]:
def weightedMeanTravelTimes(train, test, num_trips=100):
    test['TRAVEL_TIME'] = 0
    for idx, start_coord in enumerate(test['START']):
        dists = train['START'].apply(lambda x: HaversineDistance(x, start_coord))
        smallest_dist_indexes = np.argpartition(dists, num_trips)[0:num_trips]
        w = np.maximum(dists.iloc[smallest_dist_indexes], 0.01)
        path_lengths = train.iloc[smallest_dist_indexes]['COORDS_LEN']
        no_path_lengths_indexes = np.argpartition(path_lengths, int(num_trips*.95))[0:int(num_trips*.95)]
        print idx
        #print "test[idx, travel_time]: ", test[idx, 'TRAVEL_TIME']
        print "test.loc(idx, coords_len): ", test.loc[idx, "COORDS_LEN"]
        print "avg: ", np.average(s.iloc[no_path_lengths_indexes], weights=1/w.iloc[no_path_lengths_indexes]**2)
        #test.loc[idx, 'TRAVEL_TIME'] = 15*np.maximum(test.loc[idx, 'COORDS_LEN'], np.average(s.iloc[no_path_lengths_indexes], weights=1/w.iloc[no_path_lengths_indexes]**2))
        test.loc[idx, 'TRAVEL_TIME'] = 15*np.maximum(test.loc[idx, 'COORDS_LEN'], np.average(path_lengths.iloc[no_path_lengths_indexes], weights=1/w.iloc[no_path_lengths_indexes]**2))

        
    test['TRAVEL_TIME'] = test['TRAVEL_TIME'].astype(int)
    return test['TRAVEL_TIME'].values


def weightedMeanTravelTimes3(train, test, num_trips=100):
    test_times = np.zeros(len(test))
    for idx, start_coord in enumerate(test['START'].values):
        dists = np.array([HaversineDistance(x, start_coord) for x in train['START'].values])
        smallest_dist_indexes = np.argpartition(dists, num_trips)[0:num_trips]
        w = np.maximum(dists[smallest_dist_indexes], 0.01)
        path_lengths = train['COORDS_LEN'].values[smallest_dist_indexes]
        no_path_lengths_indexes = np.argpartition(path_lengths, int(num_trips*.95))[0:int(num_trips*.95)]
        test_times[idx] = 15*np.maximum(test['COORDS_LEN'].values[idx], np.average(path_lengths[no_path_lengths_indexes], weights=1/w[no_path_lengths_indexes]**2))

        
    test['TRAVEL_TIME'] = test_times.astype(int)
    return test_times

In [28]:
taxi_df, submission_df = load_data(num_records_to_load=40)
#weightedMeanTravelTimes(taxi_df, submission_df, num_trips=10)

train_df, test_df, train_time, test_time, train_end, test_end = createTrainTestSplit(taxi_df, 0.5)
print "num training:", len(train_df), "   num test:", len(test_df)

pred_time = weightedMeanTravelTimes3(train_df, test_df, 4)
pred_time


#submit
#submitTravelTime(validation_df, 'inverse_distance_kernel.csv')

num training: 18    num test: 20


Try using .loc[row_index,col_indexer] = value instead


array([ 415.77714249,  332.05184966,  587.36860893,  392.35600861,
        252.57242007,  510.77219689,  240.        ,  255.        ,
        480.        ,  312.04912694,  214.72612846,  460.39705776,
        308.81688407,  435.        ,  285.76286789,  221.75059314,
        345.        ,  502.2493432 ,  315.        ,  525.        ])

In [25]:
test_df

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,START,POLYLINE_ACTUAL,COORDS_LEN
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847, 41.159826], [-8.640351, 41.159871...","[-8.639847, 41.159826]","[[-8.639847, 41.159826], [-8.640351, 41.159871...",10
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994, 41.18049], [-8.645949, 41.180517]...","[-8.645994, 41.18049]","[[-8.645994, 41.18049], [-8.645949, 41.180517]...",15
5,1372636965620000231,C,,,20000231,1372636965,A,False,"[[-8.615502, 41.140674], [-8.614854, 41.140926...","[-8.615502, 41.140674]","[[-8.615502, 41.140674], [-8.614854, 41.140926...",13
7,1372637299620000011,C,,,20000011,1372637299,A,False,"[[-8.617563, 41.146182], [-8.617527, 41.145849...","[-8.617563, 41.146182]","[[-8.617563, 41.146182], [-8.617527, 41.145849...",17
9,1372637905620000320,C,,,20000320,1372637905,A,False,"[[-8.615907, 41.140557], [-8.614449, 41.141088...","[-8.615907, 41.140557]","[[-8.615907, 41.140557], [-8.614449, 41.141088...",10
10,1372636875620000233,C,,,20000233,1372636875,A,False,"[[-8.619894, 41.148009], [-8.620164, 41.14773]...","[-8.619894, 41.148009]","[[-8.619894, 41.148009], [-8.620164, 41.14773]...",11
11,1372637984620000520,C,,,20000520,1372637984,A,False,"[[-8.56242, 41.168403], [-8.562429, 41.168358]...","[-8.56242, 41.168403]","[[-8.56242, 41.168403], [-8.562429, 41.168358]...",22
12,1372637343620000571,A,31508.0,,20000571,1372637343,A,False,"[[-8.618868, 41.155101], [-8.6175, 41.154912],...","[-8.618868, 41.155101]","[[-8.618868, 41.155101], [-8.6175, 41.154912],...",16
13,1372638595620000233,C,,,20000233,1372638595,A,False,"[[-8.608716, 41.153499], [-8.607627, 41.153481...","[-8.608716, 41.153499]","[[-8.608716, 41.153499], [-8.607627, 41.153481...",17
14,1372638151620000231,C,,,20000231,1372638151,A,False,"[[-8.612208, 41.14053], [-8.612235, 41.140521]...","[-8.612208, 41.14053]","[[-8.612208, 41.14053], [-8.612235, 41.140521]...",14


In [300]:
pred_time = weightedMeanTravelTimes(train_df, test_df,10)



0
test.loc(idx, coords_len):  23
avg:  36.066944722
1
test.loc(idx, coords_len): 

KeyError: 'the label [1] is not in the [index]'

In [309]:
#train_df['POLYLINE'].apply(len)
test_df["COORDS_LEN"][2]

65

In [188]:
coords = test_df['POLYLINE'].values
partial_lengths = [round(0.5*len(coord)) for coord in coords]
partial_coords = [coords[:n] for n in partial_lengths]


[array([ [[-8.597817, 41.162517], [-8.598006, 41.161725], [-8.598915, 41.161419], [-8.600112, 41.161455], [-8.600517, 41.160591], [-8.600886, 41.159898], [-8.600904, 41.15988], [-8.600985, 41.1597], [-8.601228, 41.15889], [-8.601705, 41.157756], [-8.602353, 41.15628], [-8.602758, 41.155083], [-8.602875, 41.154921], [-8.60292, 41.154885], [-8.602929, 41.154867], [-8.603127, 41.154633], [-8.604099, 41.154786], [-8.604324, 41.154786], [-8.604567, 41.154804], [-8.604657, 41.15421], [-8.605278, 41.153697], [-8.607249, 41.153625], [-8.609103, 41.153742], [-8.609382, 41.153688], [-8.609814, 41.153283], [-8.610057, 41.152041], [-8.610633, 41.151033], [-8.610858, 41.150637], [-8.611659, 41.150493], [-8.612919, 41.150061], [-8.613549, 41.15007], [-8.614017, 41.149728], [-8.614143, 41.149188], [-8.614296, 41.148576], [-8.614215, 41.147739], [-8.613396, 41.147235], [-8.613729, 41.145966], [-8.613693, 41.145894], [-8.613621, 41.145903], [-8.613495, 41.14593]],
        [[-8.597124, 41.1822], [-8.596

In [247]:
'''
	__author__:		 Willie Liao
	__description__: Get the trips in train with the closest starting location.
					 Use the weighted average of train trips to estimate trip duration.
					 I could not get R to read lines fast enough so here's the Python version.
					 It's been several years since I've used Python, 
					 	so please fork and make it more efficient!
    __edit__:        Multiply lonlat1[1] and lonlat2[1] by pi/180
'''					 

import json
import zipfile
import numpy as np
import pandas as pd
# from __future__ import division

### Control the number of trips read for training 
### Control the number of closest trips used to calculate trip duration
N_read = 20
N_trips = 10

### Get Haversine distance
def get_dist(lonlat1, lonlat2):
  lon_diff = np.abs(lonlat1[0]-lonlat2[0])*np.pi/360.0
  lat_diff = np.abs(lonlat1[1]-lonlat2[1])*np.pi/360.0
  a = np.sin(lat_diff)**2 + np.cos(lonlat1[1]*np.pi/180.0) * np.cos(lonlat2[1]*np.pi/180.0) * np.sin(lon_diff)**2  
  d = 2*6371*np.arctan2(np.sqrt(a), np.sqrt(1-a))
  return(d)

# read test
#zf = zipfile.ZipFile('../input/test.csv.zip')
test = pd.read_csv('/home/tony/ML/taxi/taxi2_time/test.csv')
test['POLYLINE'] = test['POLYLINE'].apply(json.loads)
test['snapshots'] = test['POLYLINE'].apply(len)
test['lonlat'] = test['POLYLINE'].apply(lambda x: x[0])
test.drop('POLYLINE', axis=1, inplace=True)

# read train
#zf = zipfile.ZipFile('../input/train.csv.zip')
train = pd.read_csv('/home/tony/ML/taxi/taxi2_time/train.csv', nrows=N_read)
train['POLYLINE'] = train['POLYLINE'].apply(json.loads)
train['snapshots'] = train['POLYLINE'].apply(len)
train = train[train.snapshots>10]
train['lonlat'] = train['POLYLINE'].apply(lambda x: x[0])
train.drop('POLYLINE', axis=1, inplace=True)

test['TRAVEL_TIME'] = 0
for row, ll in enumerate(test['lonlat']):	
    ### Weighted mean of trip duration
    ### Bound below by 10 meters since we use 1/distance^2 as weight
    ### Treat 10% of longest lasting trips as outliers  	
    #print "start_coord:", ll
    #print "train[START]:", train['lonlat']
    d = train['lonlat'].apply(lambda x: get_dist(x, ll))
    #print d
    i = np.argpartition(d, N_trips)[0:N_trips]
    w = np.maximum(d.iloc[i], 0.01)
    #print w
    s = train.iloc[i]['snapshots']
    j = np.argpartition(s, int(N_trips*.95))[0:int(N_trips*.95)]
    #print j
    test.loc[row, 'TRAVEL_TIME'] = 15*np.maximum(test.loc[row, 'snapshots'], np.average(s.iloc[j], weights=1/w.iloc[j]**2))
    print test.loc[row, 'TRAVEL_TIME']
    
test['TRAVEL_TIME'] = test['TRAVEL_TIME'].astype(int)
test['TRAVEL_TIME']
#test[['TRIP_ID', 'TRAVEL_TIME']].to_csv('submission.csv', index=False)


546.881277619
600.0
600.0
405.619103822
330.29383806
2055.0
548.975161177
673.148496361
645.0
1185.0
945.0
547.072885592
728.970920478
1140.0
810.0
1455.0
405.0
945.0
547.125496569
2460.0
546.68897825
426.944847685
402.567058341
546.870050386
387.75789816
571.151802866
698.296315687
548.415691225
765.0
600.0
675.0
426.114589638
570.0
666.864275211
807.352009186
2355.0
1050.0
474.847876988
546.928240933
382.296576264
720.0
845.216629877
600.0
885.0
403.538606324
255.081842752
465.0
345.211069439
931.516253997
855.0
960.0
442.080667329
476.161924654
660.0
585.0
443.508929386
795.0
343.749767886
2010.0
570.0
546.248415379
573.266461189
680.532052235
576.575236979
415.509435007
885.0
546.939644162
441.900328562
615.0
401.925490991
7050.0
1680.0
547.495718203
1170.0
1020.0
390.968883251
427.399307172
495.0
520.738016632
1200.0
495.0
289.577487351
735.0
1455.0
375.0
630.0
645.0
555.0
525.0
630.0
1050.0
630.0
435.0
382.227741985
3375.0
2070.0
347.065741708
1005.0
508.851483017
388.558575497
9

0      546
1      600
2      600
3      405
4      330
5     2055
6      548
7      673
8      645
9     1185
10     945
11     547
12     728
13    1140
14     810
...
305     390
306     418
307     397
308     430
309     375
310     334
311    1080
312     675
313    4005
314     705
315     720
316    1410
317     340
318     673
319     780
Name: TRAVEL_TIME, Length: 320, dtype: int64