In [1]:
using DataFrames
using JSON
using Iterators
using taxis
using HDF5, JLD
using Stats
using kNN
using sequenceCompare
#reload("taxis")
#reload("sequenceCompare")
nprocs()

8

In [2]:
println("Begin")

println("loading csv files")
taxi_df = readtable("/home/tony/ML/taxi/taxi2_time/train_100k.csv")
taxi_validation_df = readtable("/home/tony/ML/taxi/taxi2_time/test.csv")

println("loading coords")
taxi_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in  taxi_df[:POLYLINE]]
taxi_validation_df[:COORDS] = [float(hcat(JSON.parse(x)...)) for x in taxi_validation_df[:POLYLINE]]

println("getting coords counts")
taxi_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_df[:COORDS]]
taxi_validation_df[:NUM_COORDS] = [length(x)::Int64 for x in taxi_validation_df[:COORDS]]

println("deleting unneeded data rows/columns")
delete!(taxi_validation_df, :POLYLINE)
delete!(taxi_df, :POLYLINE)

println("adding start/end point columns")
taxi_df[:START] = [x[:,1] for x in taxi_df[:COORDS]]
taxi_validation_df[:START] = [x[:,1] for x in taxi_validation_df[:COORDS]]

taxi_df[:END] = [x[:,end] for x in taxi_df[:COORDS]]
taxi_validation_df[:END] = [x[:,end] for x in taxi_validation_df[:COORDS]]

println("deleting training examples with no coords!")
#These examples are not going to be useful!
deleterows!(taxi_df, find(taxi_df[:NUM_COORDS] .== 0))

println("generating test coords column")
taxi_df[:COORDS_TEST] = [x[1:2,1:round(rand(1)[1]*size(x,2))] for x in taxi_df[:COORDS]]

println("done!")

Begin
loading csv files
loading coords
getting coords counts
deleting unneeded data rows/columns
adding start/end point columns
deleting training examples with no coords!
generating test coords column
done!


In [3]:
println("finding unique number of coords")
all_coords_val = hcat(taxi_validation_df[:COORDS]...)
all_coords = hcat(taxi_df[:COORDS]...)

finding unique number of coords


2x4774976 Array{Float64,2}:
 -8.61864  -8.6185  -8.62033  -8.62215  …  -8.6304  -8.63042  -8.63041
 41.1414   41.1414  41.1425   41.1438      41.1579  41.158    41.1579 

# Creating coord dict

In [58]:
#small_taxi_df = GetTableOrderedSubset(taxi_df, 10000)
#coordsDB = ConstructCoordsDatabase(small_taxi_df, 4)

#Creating new features


In [None]:
function GetDateInfo(df)
    if haskey(df, :DAYOFWEEK)
        return df
end

function GetDistanceData(df)
    if haskey(df, :DISTANCE)
        return df
end

# Training models

In [58]:
all_train_coords = taxi_df[:COORDS][1:200]
all_validation_coords = taxi_validation_df[:COORDS]
test_guess_paths = findClosestTrainingExampleForTestSet(all_train_coords, all_validation_coords, 2)
taxi_validation_df[:GUESS_PATHS] = test_guess_paths

20/320 for 200 train path examples
40/320 for 200 train path examples
60/320 for 200 train path examples
80/320 for 200 train path examples
100/320 for 200 train path examples
120/320 for 200 train path examples
140/320 for 200 train path examples
160/320 for 200 train path examples
180/320 for 200 train path examples
200/320 for 200 train path examples
220/320 for 200 train path examples
240/320 for 200 train path examples
260/320 for 200 train path examples
280/320 for 200 train path examples
300/320 for 200 train path examples
320/320 for 200 train path examples


320-element Array{Any,1}:
 2x4 Array{Float64,2}:
 -8.59926  -8.59849  -8.5967  -8.59458
 41.1492   41.1484   41.1494  41.1507                                                                      
 2x25 Array{Float64,2}:
 -8.6108  -8.6108  -8.6108  -8.61088  …  -8.60394  -8.60395  -8.60395
 41.1449  41.1449  41.1449  41.1452      41.1615   41.1615   41.1616       
 2x31 Array{Float64,2}:
 -8.58581  -8.58576  -8.58664  -8.58831  …  -8.5805  -8.58051  -8.58049
 41.1487   41.149    41.149    41.1494      41.1649  41.1649   41.1649   
 2x17 Array{Float64,2}:
 -8.61248  -8.6123  -8.61161  -8.61086  …  -8.61542  -8.61538  -8.61536
 41.1461   41.146   41.1461   41.1457      41.1426   41.1426   41.1426   
 2x1 Array{Float64,2}:
 -8.61766
 41.1464                                                                                                                                
 2x42 Array{Float64,2}:
 -8.62834  -8.62834  -8.62806  -8.62793  …  -8.62623  -8.62623  -8.62623
 41.1577   41.1578   41.157

In [59]:
all_train_coords = taxi_df[:COORDS][1000:1200]
test_df = tail(taxi_df, 100)
all_test_coords = test_df[:COORDS_TEST]
test_guess_paths = findClosestTrainingExampleForTestSet(all_train_coords, all_test_coords, 2)
test_df[:GUESS_PATHS] = test_guess_paths

20/100 for 201 train path examples
40/100 for 201 train path examples
60/100 for 201 train path examples
80/100 for 201 train path examples
100/100 for 201 train path examples


100-element Array{Any,1}:
 2x36 Array{Float64,2}:
 -8.61116  -8.6112  -8.61087  -8.61088  …  -8.60263  -8.6026  -8.60257
 41.1493   41.1493  41.1492   41.1493      41.1616   41.1617  41.1617     
 2x1 Array{Float64,2}:
 -8.621 
 41.1501                                                                                                                                 
 2x43 Array{Float64,2}:
 -8.60153  -8.60109  -8.59983  -8.59957  …  -8.56193  -8.56193  -8.56189
 41.1457   41.1458   41.1459   41.1459      41.1413   41.1413   41.1413 
 2x7 Array{Float64,2}:
 -8.61071  -8.61082  -8.61055  -8.61039  -8.61008  -8.61191  -8.61199
 41.1457   41.1458   41.146    41.1461   41.1464   41.1486   41.1486        
 2x34 Array{Float64,2}:
 -8.61405  -8.61403  -8.61404  -8.61465  …  -8.61293  -8.61295  -8.61297
 41.1412   41.1412   41.1412   41.1411      41.1335   41.1335   41.1335 
 2x33 Array{Float64,2}:
 -8.60787  -8.60783  -8.60781  -8.60783  …  -8.60202  -8.60202  -8.60201
 41.1675   41.1674   41.167

In [88]:


function score_path_guess(test_df)
    pred_paths = test_df[:GUESS_PATHS]
    actual_paths = test_df[:COORDS]
    pred_times = [(length(x)*15)::Int64 for x in pred_paths]
    actual_times = [(length(x)*15)::Int64 for x in actual_paths]
    score = sqrt(mean((log(max(1100, pred_times)+1)-log(actual_times-1)).^2))
    println("time score: ", score) 
    for k=1:length(pred_times)
        println("pred: ", pred_times[k], ", actual: ", actual_times[k]) 
    end
end

score_path_guess(test_df)


time score: 0.5085077313659201
pred: 1080, actual: 1290
pred: 30, actual: 1530
pred: 1290, actual: 2250
pred: 210, actual: 810
pred: 1020, actual: 390
pred: 990, actual: 1230
pred: 720, actual: 810
pred: 720, actual: 780
pred: 600, actual: 270
pred: 900, actual: 1020
pred: 1410, actual: 1200
pred: 210, actual: 780
pred: 30, actual: 1830
pred: 30, actual: 750
pred: 1920, actual: 8130
pred: 720, actual: 1140
pred: 600, actual: 1140
pred: 1620, actual: 1680
pred: 720, actual: 1980
pred: 600, actual: 1230
pred: 870, actual: 780
pred: 600, actual: 1500
pred: 2220, actual: 1500
pred: 900, actual: 690
pred: 1470, actual: 1770
pred: 600, actual: 960
pred: 1350, actual: 1290
pred: 900, actual: 2490
pred: 30, actual: 2370
pred: 840, actual: 510
pred: 630, actual: 1950
pred: 30, actual: 1950
pred: 30, actual: 2550
pred: 810, actual: 1800
pred: 600, actual: 1680
pred: 210, actual: 960
pred: 210, actual: 1170
pred: 720, actual: 1020
pred: 210, actual: 1440
pred: 1380, actual: 720
pred: 2070, actual

In [126]:
log([1 2 3])

1x3 Array{Float64,2}:
 0.0  0.693147  1.09861

In [62]:
taxi_validation_df[:GUESS_PATHS] = test_guess_paths
#guess_times = [length(x)*15 for x in test_guess_paths]
num_test_examples = length(test_guess_paths)

guess_times = Array(Int64, num_test_examples)
dest_coords = cell(num_test_examples)

all_test_paths = taxi_validation_df[:COORDS]
for k=1:num_test_examples
    test_path = all_test_paths[k]
    best_guess_path = test_guess_paths[k]
    
    test_path_time = length(test_path)*15
    best_guess_time = length(best_guess_path)*15
    
    if test_path_time > best_guess_time
        println(k, ":  guessing ", best_guess_time, " but existing time is ", test_path_time)
        best_guess_time = test_path_time + 100
    end
    guess_times[k] = best_guess_time
end

submission_validation = guess_times

1:  guessing 30 but existing time is 330
2:  guessing 750 but existing time is 1200
3:  guessing 1050 but existing time is 1200
5:  guessing 30 but existing time is 60
6:  guessing 1170 but existing time is 4110
7:  guessing 300 but existing time is 720
8:  guessing 390 but existing time is 510
10:  guessing 1050 but existing time is 2370
11:  guessing 600 but existing time is 1890
12:  guessing 330 but existing time is 510
13:  guessing 30 but existing time is 150
14:  guessing 1200 but existing time is 2280
15:  guessing 930 but existing time is 1620
16:  guessing 1920 but existing time is 2910
18:  guessing 840 but existing time is 1890
19:  guessing 780 but existing time is 930
20:  guessing 600 but existing time is 4920
21:  guessing 30 but existing time is 180
23:  guessing 450 but existing time is 540
24:  guessing 30 but existing time is 120
30:  guessing 960 but existing time is 1200
31:  guessing 960 but existing time is 1350
32:  guessing 510 but existing time is 810
33:  gu

320-element Array{Int64,1}:
  430
 1300
 1300
  510
  160
 4210
  820
  610
 1470
 2470
 1990
  610
  250
    ⋮
 1080
  850
  690
 3210
 1450
 8110
 1510
 1540
 2920
  390
  870
 1660

#Generating Submission

In [53]:
# beat the benchmark example
#mean_time = mean(times_validation)
#submission_validation = [max(x, mean_time) for x in times_validation]

#submission_validation

df_submission = DataFrame()
df_submission[:TRIP_ID] = taxi_validation_df[:TRIP_ID]
df_submission[:TRAVEL_TIME] = submission_validation
writetable("second_submission.csv", df_submission)

LoadError: `!` has no method matching !(::Function)
while loading In[50], in expression starting on line 1

In [388]:
immutable Point2{T}
  x::T
  y::T
end

D = [Point2(1.,2.) => 42]
haskey(D, Point2(1., 2.))  #False!


true

LoadError: ArgumentError("setindex!(::DataFrame, ...) only broadcasts scalars, not arrays")
while loading In[42], in expression starting on line 2

In [39]:
taxi_validation_df[:COORDS]

320-element DataArray{Any,1}:
 2x11 Array{Float64,2}:
 -8.58568  -8.58571  -8.58568  -8.58573  …  -8.587   -8.58658  -8.58488
 41.1485   41.1486   41.1489   41.1489      41.1475  41.1472   41.1466    
 2x40 Array{Float64,2}:
 -8.61088  -8.61086  -8.6109  -8.61044  …  -8.60293  -8.60255  -8.60189
 41.1456   41.1456   41.1458  41.1462      41.1628   41.1631   41.1636    
 2x40 Array{Float64,2}:
 -8.58574  -8.58573  -8.58572  -8.58629  …  -8.57695  -8.5759  -8.5749
 41.1486   41.1488   41.149    41.149       41.1664   41.1672  41.1677     
 2x8 Array{Float64,2}:
 -8.61396  -8.61412  -8.61509  -8.61528  …  -8.61524  -8.61505  -8.61464
 41.1412   41.1411   41.1409   41.1408      41.1408   41.1408   41.141    
 2x2 Array{Float64,2}:
 -8.6199  -8.61989
 41.148   41.148                                                                                                                
 2x137 Array{Float64,2}:
 -8.63061  -8.63061  -8.63074  -8.63151  …  -8.62639  -8.6264  -8.62641
 41.1782   41.1782