In [2]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [3]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_data(sqlContext, filepath):
    df = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferSchema", "true") \
        .option("nullValue", "-")\
        .load(filepath)

    return df

def read_file(filepath, sqlContext):
    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True,nullValue="-")
    while len(data_frame.columns) < 16:
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [
            ("_c0", "route"),
            ("_c1", "tripNum"),
            ("_c2", "shapeId"),
            ("_c3", "shapeSequence"),
            ("_c4", "shapeLat"),
            ("_c5", "shapeLon"),
            ("_c6", "distanceTraveledShape"),
            ("_c7", "busCode"),
            ("_c8", "gpsPointId"),
            ("_c9", "gpsLat"),
            ("_c10", "gpsLon"),
            ("_c11", "distanceToShapePoint"),
            ("_c12", "timestamp"),
            ("_c13", "busStopId"),
            ("_c14", "problem"),
            ("_c15", "userBirthdate"),
            ("_c16", "cardTimestamp"),
            ("_c17", "lineName"),
            ("_c18", "cardNum"),
            ("_c19", "userGender"),
        ]
    )

    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"),1))
    
    return data_frame

def print_df(df,l=10):
    return df.limit(l).toPandas()

### Read Pre-OD Data

In [5]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [64]:
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
pre_od_data = read_data(sqlContext,exp_data_folder_path + 'pre_od_matrix')

In [65]:
print_df(pre_od_data)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_id,o_shape_seq,o_shape_lat,o_shape_lon,o_stop_id,...,d_timestamp,d_shape_id,d_shape_seq,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id,o_unixtimestamp,d_unixtimestamp,leg_duration
0,50,JB605,2017-05-10,7,19:12:26,1720,5304847,-25.455303,-49.267471,29142,...,08:16:08,2073,6491312,-25.551371,-49.274145,38165,919123001529,79946,40568,-1.0
1,40,GR110,2017-05-10,1,07:18:48,1717,3366595,-25.404101,-49.33544,34157,...,14:11:17,2236,4887443,-25.449524,-49.358804,40623,927712935938,37128,61877,412.483333
2,829,BA011,2017-05-10,22,14:11:17,2236,4887443,-25.449524,-49.358804,40623,...,07:18:48,1717,3366595,-25.404101,-49.33544,34157,927712935937,61877,37128,-1.0
3,801,LC072,2017-05-10,9,13:37:38,2202,5825427,-25.44423,-49.291248,30157,...,14:25:48,2816,6244828,-25.439749,-49.280987,30481,927712935940,59858,62748,48.166667
4,365,BC928,2017-05-10,6,14:25:48,2816,6244828,-25.439749,-49.280987,30481,...,13:37:38,2202,5825427,-25.44423,-49.291248,30157,927712935939,62748,59858,-1.0
5,684,HA019,2017-05-10,12,05:03:05,2148,6636916,-25.513097,-49.295246,31053,...,06:10:45,3669,6592154,-25.42822,-49.246843,3377,927712935942,28985,33045,67.666667
6,370,LC016,2017-05-10,4,06:10:45,3669,6592154,-25.42822,-49.246843,3377,...,16:23:55,2816,6244727,-25.422411,-49.252535,29903,927712935943,33045,69835,613.166667
7,365,BC946,2017-05-10,7,16:23:55,2816,6244727,-25.422411,-49.252535,29903,...,16:26:16,2816,6244745,-25.424402,-49.259234,29898,927712935944,69835,69976,2.35
8,365,BC946,2017-05-10,7,16:26:16,2816,6244745,-25.424402,-49.259234,29898,...,05:03:05,2148,6636916,-25.513097,-49.295246,31053,927712935941,69976,28985,-1.0
9,701,JC311,2017-05-10,3,08:24:39,2161,6354906,-25.469638,-49.316661,31646,...,18:34:30,2162,6355287,-25.456744,-49.303425,33057,927712935946,41079,77670,609.85


### Reading Bus Trip Data

In [14]:
old_exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/back-bulma-output'
bus_trip_data = read_file(old_exp_data_folder_path + '/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [15]:
bus_trip_data = bus_trip_data.withColumn("date", F.date_sub(F.col("date"), 1))
bus_trip_data = bus_trip_data.withColumn("gps_timestamp", F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
bus_trip_data = bus_trip_data.withColumn("gps_timestamp_in_secs", F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))

In [16]:
bus_trip_data = bus_trip_data.na.drop(subset=["route","busCode","busStopId","gps_timestamp_in_secs","tripNum"]) \
                             .dropDuplicates(['route','busCode','tripNum','busStopId']) \
                             .orderBy(['route','busCode','tripNum','gps_timestamp_in_secs'])

In [44]:
print_df(bus_trip_data, l=40)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,date,gps_timestamp,gps_timestamp_in_secs
0,1,1,2938,6451385,-25.428133,-49.264531,338.081,BN997,,-25.428198,-49.264428,12.657534,06:21:08,29887,NO_PROBLEM,0,2017-05-09,2017-05-09 06:21:08,1494321668
1,1,1,2938,6451394,-25.430622,-49.263426,635.815,BN997,,-25.43083,-49.263316,25.616758,06:22:10,31453,NO_PROBLEM,0,2017-05-09,2017-05-09 06:22:10,1494321730
2,1,1,2938,6451403,-25.433455,-49.262219,972.378,BN997,,,,,06:22:44,31454,BETWEEN,0,2017-05-09,2017-05-09 06:22:44,1494321764
3,1,1,2938,6451415,-25.435452,-49.265462,1428.729,BN997,,-25.435406,-49.265455,5.110665,06:24:42,30748,NO_PROBLEM,0,2017-05-09,2017-05-09 06:24:42,1494321882
4,1,1,2938,6451418,-25.435893,-49.266571,1550.5,BN997,,-25.435816,-49.266521,9.893764,06:24:52,30749,NO_PROBLEM,0,2017-05-09,2017-05-09 06:24:52,1494321892
5,1,1,2938,6451424,-25.436991,-49.269392,1859.32,BN997,,-25.43696,-49.269536,14.841262,06:26:15,35216,NO_PROBLEM,0,2017-05-09,2017-05-09 06:26:15,1494321975
6,1,1,2938,6451461,-25.435749,-49.274886,2652.419,BN997,,-25.435708,-49.274871,4.83996,06:29:21,26166,NO_PROBLEM,0,2017-05-09,2017-05-09 06:29:21,1494322161
7,1,1,2938,6451475,-25.433142,-49.276703,3090.819,BN997,,,,,06:31:19,35219,BETWEEN,0,2017-05-09,2017-05-09 06:31:19,1494322279
8,1,1,2938,6451490,-25.430831,-49.276405,3473.319,BN997,,,,,06:33:21,29420,BETWEEN,0,2017-05-09,2017-05-09 06:33:21,1494322401
9,1,1,2938,6451495,-25.429963,-49.274031,3730.817,BN997,,,,,06:34:09,29082,BETWEEN,0,2017-05-09,2017-05-09 06:34:09,1494322449


In [43]:
bus_trip_data = bus_trip_data.withColumn('route', F.col('route').cast(T.IntegerType()))

### Testing with new BUSTE data

In [None]:
#bus_trip_data_new = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext) \
#                    .na.drop(subset=["route","busCode","busStopId","timestamp","tripNum"])

In [None]:
#print_df(bus_trip_data_new.select(['route','busCode','tripNum','busStopId','timestamp']) \
#         .filter('route == 022') \
#         .orderBy(['route','busCode','tripNum','busStopId','timestamp']), l=40)

### Building stops-distance table

In [18]:
busStops = bus_trip_data.select(['busStopId','shapeLat','shapeLon']).dropDuplicates(['busStopId'])

In [13]:
#print busStops.count()
#print_df(busStops)

In [19]:
o_busStops = rename_columns(
        busStops,
        [
            ("busStopId", "o_busStopId"),
            ("shapeLat", "o_shapeLat"),
            ("shapeLon", "o_shapeLon")
        ]
    )

d_busStops = rename_columns(
        busStops,
        [
            ("busStopId", "d_busStopId"),
            ("shapeLat", "d_shapeLat"),
            ("shapeLon", "d_shapeLon")
        ]
    )

In [20]:
#print_df(o_busStops)

In [21]:
#print_df(d_busStops)

In [22]:
stops_dist_table = o_busStops.crossJoin(d_busStops)

In [23]:
#print_df(stops_dist_table)

In [None]:
#stops_dist_table.count()

In [24]:
def dist(long_x, lat_x, long_y, lat_y):
    return F.acos(
        F.sin(F.toRadians(lat_x)) * F.sin(F.toRadians(lat_y)) + 
        F.cos(F.toRadians(lat_x)) * F.cos(F.toRadians(lat_y)) * 
            F.cos(F.toRadians(long_x) - F.toRadians(long_y))
    ) * F.lit(6371.0)

In [25]:
stops_dist_table = stops_dist_table.withColumn("dist", dist('o_shapeLon', 'o_shapeLat','d_shapeLon','d_shapeLat'))

In [None]:
#print_df(stops_dist_table)

In [26]:
def get_stops_dist(stops_dist_df,stopA,stopB):
    return stops_dist_df.where(
        (F.col('o_busStopId') == stopA) &
        (F.col('d_busStopId') == stopB)).select('dist').collect()[0][0]

get_stops_dist_udf = F.udf(get_stops_dist)

In [27]:
#get_stops_dist(stops_dist_table,31261,33717)

### Finding the closest alighting stop to next trip boarding

In [95]:
trips_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/trips.txt')

In [97]:
print_df(trips_data)

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id
0,34,1,3151613,Terminal Boa Vista,,1,,1800
1,34,1,3151614,Abaeté,,0,,1799
2,34,1,3151615,Terminal Boa Vista,,1,,1800
3,34,1,3151616,Abaeté,,0,,1799
4,34,1,3151617,Terminal Boa Vista,,1,,1800
5,34,1,3151618,Abaeté,,0,,1799
6,34,1,3151619,Terminal Boa Vista,,1,,1800
7,34,1,3151620,Abaeté,,0,,1799
8,34,1,3151621,Terminal Boa Vista,,1,,1800
9,34,1,3151622,Abaeté,,0,,1799


In [151]:
stops_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/stops.txt')

In [152]:
print_df(stops_data)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,70,104505,Terminal Campina do Siqueira - 303 - Centenári...,Terminal Campina do Siqueira - Campo Comprido,-25.435724,-49.306998,,,0,14506.0,,
1,270,104905,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501341,-49.237597,,,0,14485.0,,
2,276,105606,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.45155,-49.214917,,,0,14481.0,,
3,299,105603,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.451665,-49.215086,,,0,14481.0,,
4,308,104907,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501311,-49.237825,,,0,14485.0,,
5,568,190836,"R. Dep. José Hoffmann, 80 - Vista Alegre",150 - C. Música / V.Alegre (Ponto Final),-25.408609,-49.29986,,,0,,,
6,581,110312,Praça Santos Andrade - 150 - C. da Música / Vi...,Praça Santos Andrade 150 - C.Música / V. Alegre .,-25.42821,-49.265846,,,0,,,
7,597,190896,"R. Eng. Agro. Lauro Klas, 106 - Pilarzinho",160 - Jd. Mercês / Guanabara (Ponto Final),-25.39841,-49.293255,,,0,,,
8,616,150689,"Rua Rio de Janeiro, 1293 - Água Verde",Ponto Final 160 - Jd. Mercês / Guanabara (Sent...,-25.462635,-49.27792,,,0,,,
9,662,190600,"Rua São Francisco Xavier, 132 - Pilarzinho",166 - Vila Nori (Ponto Final) 167 - Fredolin W...,-25.389211,-49.302757,,,0,,,


In [106]:
stop_times_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/stop_times.txt')

In [108]:
print_df(stop_times_data)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type
0,3151613,05:05:00,05:05:00,1899,1,,0,0
1,3151613,05:05:43,05:05:43,27220,2,,0,0
2,3151613,05:06:34,05:06:34,27221,3,,0,0
3,3151613,05:07:15,05:07:15,27222,4,,0,0
4,3151613,05:08:11,05:08:11,34486,5,,0,0
5,3151613,05:08:48,05:08:48,27223,6,,0,0
6,3151613,05:09:41,05:09:41,27224,7,,0,0
7,3151613,05:10:36,05:10:36,27179,8,,0,0
8,3151613,05:11:27,05:11:27,27226,9,,0,0
9,3151613,05:12:08,05:12:08,31888,10,,0,0


In [153]:
shapes_stops = trips_data.join(stop_times_data,'trip_id', 'inner') \
                    .join(stops_data, 'stop_id', 'inner') \
                    .select(['shape_id','stop_id','stop_sequence', 'stop_lat','stop_lon']) \
                    .dropDuplicates(['shape_id','stop_id','stop_sequence']) \
                    .orderBy(['shape_id','stop_sequence'])

In [119]:
shapes_stops = trips_shapes.join(stop_times_data, 'trip_id', 'inner') \
                            .select(['shape_id','stop_id','shape_pt_sequence'])

In [154]:
print_df(shapes_stops)

Unnamed: 0,shape_id,stop_id,stop_sequence,stop_lat,stop_lon
0,1708,33156,1,-25.413929,-49.279703
1,1708,33157,2,-25.410517,-49.276479
2,1708,33159,3,-25.411726,-49.270902
3,1708,33158,4,-25.415285,-49.270134
4,1708,30150,5,-25.416733,-49.267863
5,1708,28637,6,-25.414184,-49.265917
6,1708,33161,7,-25.41271,-49.265298
7,1708,33162,8,-25.413442,-49.261645
8,1708,33163,9,-25.415827,-49.259532
9,1708,28641,10,-25.417083,-49.256173


In [128]:
def get_trip_next_stops(shape_stops_df,o_shape_id,o_stop_id):
    shape_stops = shape_stops_df.filter((F.col('shape_id') == o_shape_id))
    o_stop_seq = shape_stops.filter(F.col('stop_id') == o_stop_id).first()['stop_sequence']
    return shape_stops.filter(F.col('stop_sequence') > o_stop_seq)

In [148]:
def get_closest_next_stop(stops_dist_df,next_stops,d_stop_id):
    shape_next_stops = next_stops.withColumn('d_stopId',F.lit(d_stop_id))
    return shape_next_stops.join(stops_dist_df, (shape_next_stops.stop_id == stops_dist_df.o_busStopId) & (shape_next_stops.d_stopId == stops_dist_df.d_busStopId), 'inner') \
                        .select(['o_busStopId','o_shapeLat','o_shapeLon','d_busStopId','d_shapeLat','d_shapeLon','dist']) \
                        .orderBy('dist')

In [83]:
print_df(pre_od_data.limit(2))

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_id,o_shape_seq,o_shape_lat,o_shape_lon,o_stop_id,...,d_timestamp,d_shape_id,d_shape_seq,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id,o_unixtimestamp,d_unixtimestamp,leg_duration
0,50,JB605,2017-05-10,7,19:12:26,1720,5304847,-25.455303,-49.267471,29142,...,08:16:08,2073,6491312,-25.551371,-49.274145,38165,919123001529,79946,40568,-1.0
1,40,GR110,2017-05-10,1,07:18:48,1717,3366595,-25.404101,-49.33544,34157,...,14:11:17,2236,4887443,-25.449524,-49.358804,40623,927712935938,37128,61877,412.483333


In [150]:
for row in pre_od_data.limit(2).rdd.collect():
    next_stops = get_trip_next_stops(shapes_stops,row['o_shape_id'],row['o_stop_id'])
    closest_next_stop = get_closest_next_stop(stops_dist_table,next_stops,row['d_stop_id'])
    closest_next_stop.show()

KeyboardInterrupt: 

In [134]:
print_df(shapes_stops.filter((F.col('shape_id') == 1720)),l=50)

Unnamed: 0,shape_id,stop_id,stop_sequence
0,1720,32454,1
1,1720,32453,2
2,1720,31771,3
3,1720,32416,4
4,1720,32417,5
5,1720,32410,6
6,1720,30943,7
7,1720,31477,8
8,1720,31464,9
9,1720,31461,10


In [None]:
def df_loop(df):
    i = 0
    for row in df.rdd.collect():
        if i > 10:
            break
        print row['d_busStopId']
        i+=1

In [None]:
df_loop(d_busStops)

In [None]:
next_stops = get_trip_next_stops(bus_trip_data,'022','BL307',1,1494324183)
print_df(next_stops)

In [None]:
8 	022 	BL307 	1 	25821 	1494324731 	07:12:11
9 	022 	BL307 	1 	26246 	1494325361 	07:22:41
10 	022 	BL307 	1 	26240 	1494325564 	07:26:04
11 	022 	BL307 	1 	25681 	1494326340 	07:39:00
12 	022 	BL307 	1 	26284 	1494326896 	07:48:16

In [None]:
next_stops = next_stops.withColumn('d_stopId',F.lit(33717))

In [None]:
print_df(next_stops)

In [None]:
#next_stops = next_stops.withColumn('dist',get_stops_dist(stops_dist_table,next_stops.busStopId,next_stops.d_stopid))

In [None]:
next_stops_dist = next_stops.join(stops_dist_table, (next_stops.busStopId == stops_dist_table.o_busStopId) & (next_stops.d_stopId == stops_dist_table.d_busStopId), 'inner') \
                    .select(['route','busCode','tripNum','o_busStopId','o_shapeLat','o_shapeLon','d_busStopId','d_shapeLat','d_shapeLon','timestamp','dist']) \
                    .orderBy('dist').first()

In [None]:
next_stops_dist['dist']