In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np
import time

In [2]:
def rename_columns(df, list_of_tuples):
    """Rename several columns of a DataFrame in a single call.

    Args:
        df: object exposing ``withColumnRenamed(old, new)``.
        list_of_tuples: iterable of ``(old_col, new_col)`` pairs, applied
            in the order given.

    Returns:
        The object obtained after applying every rename; ``df`` itself
        when no pairs are given.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_data(sqlContext, filepath):
    """Load a CSV file through the ``com.databricks.spark.csv`` reader.

    The reader is configured with a header row, schema inference, and
    ``"-"`` as the null marker.

    Args:
        sqlContext: context whose ``read`` attribute provides the reader.
        filepath: location passed to the reader's ``load``.

    Returns:
        Whatever the reader's ``load(filepath)`` returns.
    """
    reader = sqlContext.read.format("com.databricks.spark.csv")
    reader_options = (
        ("header", "true"),
        ("inferSchema", "true"),
        ("nullValue", "-"),
    )
    for option_name, option_value in reader_options:
        reader = reader.option(option_name, option_value)
    return reader.load(filepath)

def read_buste_data_v3(filepath, sqlContext):
    """Read a header-less BUSTE (v3) CSV and give its columns meaningful names.

    The file is read with schema inference and ``"-"`` as the null marker.
    Files with fewer columns than the v3 layout are padded with null
    columns so that every expected column exists after renaming.

    A ``date`` column is added, derived from the first three ``_``-separated
    tokens of the parent directory name of ``filepath`` and shifted back by
    one day.
    NOTE(review): assumes a path such as
    ``.../2017_05_11_veiculos.csv/part-00000`` -- confirm against callers.

    Args:
        filepath: path of the CSV part file.
        sqlContext: SQL context used to read the file.

    Returns:
        DataFrame with the renamed columns plus ``date``.
    """
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "userBirthdate",
        "cardTimestamp",
        "lineName",
        "cardNum",
        "userGender",
    ]

    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True, nullValue="-")

    # Pad up to the full layout (20 columns). Padding only to 16 would leave
    # the last four names without a column, because withColumnRenamed is a
    # no-op when the source column does not exist.
    while len(data_frame.columns) < len(column_names):
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [("_c" + str(position), name)
         for position, name in enumerate(column_names)]
    )

    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"), 1))

    return data_frame

def read_buste_data_old(filepath, sqlContext):
    """Read a header-less BUSTE CSV in the older 16-column layout.

    The file is read with schema inference and ``"-"`` as the null marker,
    padded with null columns up to 16 columns, and its positional columns
    are renamed. A ``date`` column is added from the first three
    ``_``-separated tokens of the parent directory name of ``filepath``,
    shifted back by one day.

    Args:
        filepath: path of the CSV part file.
        sqlContext: SQL context used to read the file.

    Returns:
        DataFrame with the renamed columns plus ``date``.
    """
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "numPassengers",
    ]

    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True,nullValue="-")

    # Add null columns for any trailing positions the file does not have.
    for missing_position in range(len(data_frame.columns), 16):
        data_frame = data_frame.withColumn("_c" + str(missing_position), F.lit(None))

    positional_pairs = [("_c" + str(position), name)
                        for position, name in enumerate(column_names)]
    data_frame = rename_columns(data_frame, positional_pairs)

    folder_name = filepath.split("/")[-2]
    date = "-".join(folder_name.split("_")[:3])

    return data_frame.withColumn("date", F.date_sub(F.lit(date), 1))

def print_df(df,l=10):
    """Return the first ``l`` rows of ``df`` converted with ``toPandas()``.

    Args:
        df: object exposing ``limit(n)`` whose result exposes ``toPandas()``.
        l: maximum number of rows to keep (default 10).
    """
    preview = df.limit(l)
    return preview.toPandas()

### Read Pre-OD Data

In [3]:
# Reuse the running SparkContext if there is one, otherwise create it,
# then wrap it in the SQLContext used by every reader in this notebook.
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [4]:
# Base folder of the experiment sample data (absolute, machine-specific path).
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'

# Pre-OD matrix, sorted by card and then by boarding date and time.
pre_od_data = (
    read_data(sqlContext,exp_data_folder_path + 'pre_od_matrix')
    .orderBy(['cardNum','o_date','o_timestamp'])
)

In [None]:
#print_df(pre_od_data)

In [None]:
#pre_od_data.columns

### Reading Bus Trip Data

In [5]:
# Folder holding the older BUSTE output (absolute, machine-specific path).
old_exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/back-bulma-output'
# The reader derives the `date` column from the parent directory name
# ("2017_05_11_veiculos.csv") and shifts it back by one day.
bus_trip_data = read_buste_data_old(old_exp_data_folder_path + '/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [6]:
# Build a full GPS timestamp ("<date> <timestamp>") and its value in epoch seconds.
# NOTE(review): read_buste_data_old() has already shifted `date` back by one
# day, so this subtracts a second day -- confirm that is intended. Re-running
# this cell shifts `date` again, because it overwrites bus_trip_data.
bus_trip_data = (
    bus_trip_data
    .withColumn("date", F.date_sub(F.col("date"), 1))
    .withColumn("gps_timestamp", F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
    .withColumn("gps_timestamp_in_secs", F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))
)

In [7]:
# Keep only rows with the key fields present, keep one row per
# (route, bus, trip, stop), sort each trip by time and tag rows with an id.
# NOTE(review): the integer cast on `route` runs after the null drop, and the
# saved preview at the end of the notebook shows rows with an empty route --
# presumably non-numeric route codes turned into null; confirm this is intended.
bus_trip_data = (
    bus_trip_data
    .na.drop(subset=["route","busCode","busStopId","gps_timestamp_in_secs","tripNum"])
    .dropDuplicates(['route','busCode','tripNum','busStopId'])
    .orderBy(['route','busCode','tripNum','gps_timestamp_in_secs'])
    .withColumn('id',F.monotonically_increasing_id())
    .withColumn('route', F.col('route').cast(T.IntegerType()))
)

In [None]:
print_df(bus_trip_data)

### Testing with new BUSTE data

In [None]:
#bus_trip_data_new = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext) \
#                    .na.drop(subset=["route","busCode","busStopId","timestamp","tripNum"])

In [None]:
#print_df(bus_trip_data_new.select(['route','busCode','tripNum','busStopId','timestamp']) \
#         .filter('route == 022') \
#         .orderBy(['route','busCode','tripNum','busStopId','timestamp']), l=40)

### Building stops-distance table

In [None]:
#busStops = bus_trip_data.select(['busStopId','shapeLat','shapeLon']).dropDuplicates(['busStopId'])

In [None]:
#print busStops.count()
#print_df(busStops)

In [None]:
#o_busStops = rename_columns(
#        busStops,
#        [
#            ("busStopId", "o_busStopId"),
#            ("shapeLat", "o_shapeLat"),
#            ("shapeLon", "o_shapeLon")
#        ]
#    )
#
#d_busStops = rename_columns(
#        busStops,
#        [
#            ("busStopId", "d_busStopId"),
#            ("shapeLat", "d_shapeLat"),
#            ("shapeLon", "d_shapeLon")
#        ]
#    )

In [None]:
#print_df(o_busStops)

In [None]:
#print_df(d_busStops)

In [None]:
#stops_dist_table = o_busStops.crossJoin(d_busStops)

In [None]:
#print_df(stops_dist_table)

In [None]:
#stops_dist_table.count()

In [None]:
def dist(long_x, lat_x, long_y, lat_y):
    """Build a Column with the spherical law-of-cosines distance between two points.

    Note the argument order: longitude first, then latitude, for each point.
    Coordinates are converted from degrees to radians, and the resulting
    angle is multiplied by 6371.0 (presumably Earth's mean radius in km,
    which would make the result kilometres).

    NOTE(review): the value passed to ``acos`` is not clamped to [-1, 1];
    floating-point rounding for (near-)identical points may yield NaN.

    Args:
        long_x, lat_x: longitude / latitude of the first point (column or
            column name).
        long_y, lat_y: longitude / latitude of the second point.

    Returns:
        A Column expression with the distance.
    """
    lat_x_rad = F.toRadians(lat_x)
    lat_y_rad = F.toRadians(lat_y)
    lon_delta = F.toRadians(long_x) - F.toRadians(long_y)

    sine_term = F.sin(lat_x_rad) * F.sin(lat_y_rad)
    cosine_term = F.cos(lat_x_rad) * F.cos(lat_y_rad) * F.cos(lon_delta)

    return F.acos(sine_term + cosine_term) * F.lit(6371.0)

In [None]:
#stops_dist_table = stops_dist_table.withColumn("dist", dist('o_shapeLon', 'o_shapeLat','d_shapeLon','d_shapeLat'))

In [None]:
#print_df(stops_dist_table)

In [None]:
def get_stops_dist(stops_dist_df,stopA,stopB):
    """Look up the precomputed distance between two stops.

    Args:
        stops_dist_df: DataFrame with ``o_busStopId``, ``d_busStopId`` and
            ``dist`` columns.
        stopA: origin stop id, matched against ``o_busStopId``.
        stopB: destination stop id, matched against ``d_busStopId``.

    Returns:
        The ``dist`` value of the first matching row, or ``None`` when the
        pair is not in the table (instead of raising ``IndexError``).
    """
    match = stops_dist_df.where(
        (F.col('o_busStopId') == stopA) &
        (F.col('d_busStopId') == stopB)).select('dist').first()

    # first() yields None when nothing matches, like the other lookup helpers.
    if match is None:
        return None
    return match[0]

# NOTE(review): get_stops_dist takes a DataFrame as its first argument, and a
# DataFrame cannot be passed to or used inside a UDF, so this wrapper is
# presumably unusable as written -- prefer a join against the distance table.
get_stops_dist_udf = F.udf(get_stops_dist)

In [None]:
#get_stops_dist(stops_dist_table,31261,33717)

### Finding the closest alighting stop to next trip boarding

In [None]:
# GTFS trips table (absolute, machine-specific path); joined on trip_id below.
trips_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/trips.txt')

In [None]:
#print_df(trips_data)

In [None]:
# GTFS stops table (absolute, machine-specific path); joined on stop_id below.
stops_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/stops.txt')

In [None]:
#print_df(stops_data)

In [None]:
# GTFS stop_times table (absolute, machine-specific path); links trips to stops.
stop_times_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/stop_times.txt')

In [None]:
#print_df(stop_times_data)

In [None]:
# One row per (shape, stop, sequence) with the stop coordinates,
# sorted by shape and then by stop order within the shape.
shapes_stops = (
    trips_data
    .join(stop_times_data, 'trip_id', 'inner')
    .join(stops_data, 'stop_id', 'inner')
    .select(['shape_id', 'stop_id', 'stop_sequence', 'stop_lat', 'stop_lon'])
    .dropDuplicates(['shape_id', 'stop_id', 'stop_sequence'])
    .orderBy(['shape_id', 'stop_sequence'])
)

In [None]:
def get_trip_next_stops(shape_stops_df,o_shape_id,o_stop_id):
    """Return the stops that come after a given stop along a shape.

    Args:
        shape_stops_df: DataFrame with ``shape_id``, ``stop_id`` and
            ``stop_sequence`` columns.
        o_shape_id: shape to look at.
        o_stop_id: stop from which to look ahead.

    Returns:
        DataFrame with the rows of the shape whose ``stop_sequence`` is
        greater than that of ``o_stop_id``, or ``None`` (after printing a
        message) when the stop is not part of the shape.
    """
    shape_stops = shape_stops_df.filter((F.col('shape_id') == o_shape_id))
    o_shape_stop = shape_stops.filter(F.col('stop_id') == o_stop_id).first()

    if o_shape_stop is None:
        # Single formatted string: prints the same text on Python 2 and 3.
        print("Stop %s could not be found in shape %s" % (o_stop_id, o_shape_id))
        return None

    o_stop_seq = o_shape_stop['stop_sequence']
    return shape_stops.filter(F.col('stop_sequence') > o_stop_seq)

In [None]:
def get_stop_lat_long(shape_stops_df,stop_id):
    """Return the coordinates of a stop.

    Args:
        shape_stops_df: DataFrame with ``stop_id``, ``stop_lat`` and
            ``stop_lon`` columns.
        stop_id: stop to look up.

    Returns:
        ``(stop_lat, stop_lon)`` of the first matching row, or ``None``
        (after printing a message) when the stop is not found.
    """
    stop_row = shape_stops_df.filter(F.col('stop_id') == stop_id).first()

    if stop_row is None:
        # The message previously referenced `o_stop_id`, which is not a
        # parameter of this function.
        print("Stop %s could not be found" % (stop_id,))
        return None

    return (stop_row['stop_lat'],stop_row['stop_lon'])

In [None]:
def get_closest_next_stop(next_stops,d_stop_lat,d_stop_lon):
    """Pick the candidate stop closest to a target location.

    Args:
        next_stops: DataFrame of candidate stops with ``stop_id``,
            ``stop_lat`` and ``stop_lon`` columns.
        d_stop_lat: latitude of the target location.
        d_stop_lon: longitude of the target location.

    Returns:
        The Row ``(stop_lat, stop_lon, d_stop_lat, d_stop_lon, stop_id, dist)``
        with the smallest ``dist``, or ``None`` when ``next_stops`` is empty.
    """
    shape_next_stops = next_stops.withColumn('d_stop_lat',F.lit(d_stop_lat)) \
                                 .withColumn('d_stop_lon',F.lit(d_stop_lon))
    # dist() expects (longitude, latitude) for each point, in that order.
    return shape_next_stops.withColumn('dist',dist('stop_lon', 'stop_lat','d_stop_lon','d_stop_lat')) \
                        .orderBy('dist') \
                        .select(['stop_lat', 'stop_lon','d_stop_lat','d_stop_lon','stop_id','dist']) \
                        .first()
    

In [None]:
#tuple(value for value in pre_od_data.first()) + (1,)

In [None]:
#(None,)*6 + (-2,)

In [None]:
size_limit = 1

# One (d_stop_id, dist_to_next_o, result) tuple per sampled pre-OD row, in the
# same shape as the schema below. Result codes:
#    1  closest following stop is within the distance threshold
#    0  closest following stop is farther than the threshold
#   -1  boarding stop not found in its shape
#   -2  next boarding stop not found
#   -3  no following stop on the shape
destinations = []

for row in pre_od_data.limit(size_limit).rdd.collect():
    next_stops = get_trip_next_stops(shapes_stops,row['o_shape_id'],row['o_stop_id'])

    if next_stops is None:
        destinations.append((None, None, -1))
        continue

    # The alighting stop is searched near the NEXT boarding stop; the pre-OD
    # data has no 'd_stop_id' column, the next boarding is 'next_o_stop_id'.
    dest_lat_long = get_stop_lat_long(shapes_stops,row['next_o_stop_id'])

    if dest_lat_long is None:
        destinations.append((None, None, -2))
        continue

    closest_next_stop = get_closest_next_stop(next_stops,dest_lat_long[0],dest_lat_long[1])

    if closest_next_stop is None:
        destinations.append((None, None, -3))
        continue

    print(closest_next_stop)
    # Threshold of 1 in the unit returned by dist() (presumably kilometres).
    result = 1 if closest_next_stop['dist'] <= 1 else 0
    destinations.append((closest_next_stop['stop_id'], closest_next_stop['dist'], result))

schema = T.StructType([
    T.StructField("d_stop_id", T.IntegerType(), True),
    T.StructField("dist_to_next_o", T.DoubleType(), True),
    T.StructField("result", T.IntegerType(), True)
])

destinations_df = sqlContext.createDataFrame(destinations, schema)

In [None]:
print_df(destinations_df)

In [None]:
destinations_df = rename_columns(destinations_df,[("d_stop_id", "est_d_stop_id")])

In [None]:
# monotonically_increasing_id() is only guaranteed to be unique and increasing;
# its values depend on partitioning, so ids generated on two different
# DataFrames need not line up. Turn them into consecutive row numbers
# (1, 2, 3, ...) so that the n-th pre-OD row joins the n-th destination.
# The un-partitioned window moves all rows to a single partition, which is
# acceptable for a sample of this size.
row_position = F.row_number().over(Window.orderBy(F.monotonically_increasing_id()))

pre_od_data_sample = pre_od_data.limit(200).withColumn('id', row_position)
destinations_sample = destinations_df.withColumn('id', row_position)

od_matrix_sample = pre_od_data_sample.join(destinations_sample, 'id', 'inner')

In [None]:
print_df(od_matrix_sample)

In [None]:
pre_od_data.count()

### Trying with BUSTE trip lookup

In [None]:
def get_buste_trip_next_stops(trips_df,route,bus_code,trip_num,o_stop_id,o_timestamp):
    """Select the BUSTE rows of one bus trip recorded after a boarding time.

    Args:
        trips_df: DataFrame with ``route``, ``busCode``, ``tripNum`` and
            ``gps_timestamp_in_secs`` columns.
        route, bus_code, trip_num: values identifying the trip.
        o_stop_id: boarding stop id (not used by the filter).
        o_timestamp: boarding time; only rows with a strictly greater
            ``gps_timestamp_in_secs`` are kept.

    Returns:
        The filtered DataFrame.
    """
    same_vehicle = (F.col('route') == route) & (F.col('busCode') == bus_code)
    later_in_trip = (F.col('tripNum') == trip_num) & (F.col('gps_timestamp_in_secs') > o_timestamp)
    return trips_df.filter(same_vehicle & later_in_trip)

In [None]:
def get_buste_trip_next_stops2(trips_df,route,bus_code,trip_num,o_stop_id,o_timestamp):
    """Select the BUSTE rows of one bus trip after a boarding time, in time order.

    The earlier version applied ``.over(window)`` to a boolean predicate
    inside ``filter``; a predicate is not a window function, so that
    expression cannot be analysed. The apparent intent (rows of the trip
    ordered by GPS time) is expressed here with a plain filter and sort.

    Args:
        trips_df: DataFrame with ``route``, ``busCode``, ``tripNum`` and
            ``gps_timestamp_in_secs`` columns.
        route, bus_code, trip_num: values identifying the trip.
        o_stop_id: boarding stop id (not used by the filter).
        o_timestamp: boarding time; only rows with a strictly greater
            ``gps_timestamp_in_secs`` are kept.

    Returns:
        The filtered DataFrame sorted by ``gps_timestamp_in_secs``.
    """
    on_same_trip = ((F.col('route') == route) & (F.col('busCode') == bus_code) &
                    (F.col('tripNum') == trip_num))
    after_boarding = F.col('gps_timestamp_in_secs') > o_timestamp
    return trips_df.filter(on_same_trip & after_boarding) \
                   .orderBy(F.col('gps_timestamp_in_secs'))

In [None]:
def get_closest_buste_trip_next_stop(next_stops,next_o_stop_lat,next_o_stop_lon):
    """Pick the BUSTE trip row closest to the next boarding location.

    Args:
        next_stops: DataFrame of candidate rows with ``id``, ``shapeLat``
            and ``shapeLon`` columns.
        next_o_stop_lat: latitude of the next boarding stop.
        next_o_stop_lon: longitude of the next boarding stop.

    Returns:
        The Row ``(id, dist)`` with the smallest ``dist``, or ``None`` when
        ``next_stops`` is empty.
    """
    trip_next_stops = next_stops.withColumn('next_o_stop_lat',F.lit(next_o_stop_lat)) \
                                 .withColumn('next_o_stop_lon',F.lit(next_o_stop_lon))
    # dist() expects (longitude, latitude) for each point, in that order.
    return trip_next_stops.withColumn('dist',dist('shapeLon', 'shapeLat','next_o_stop_lon','next_o_stop_lat')) \
                        .orderBy('dist') \
                        .select(['id','dist']) \
                        .first()
    

In [None]:
print_df(pre_od_data.limit(size_limit))

In [None]:
size_limit = 4

destinations = [None] * size_limit
index = 0

# Time the BUSTE next-stops lookup for the first few pre-OD rows.
for row in pre_od_data.limit(size_limit).rdd.collect():
    st1 = time.time()
    #next_stops = get_buste_trip_next_stops(bus_trip_data,row['o_route'],row['o_bus_code'],row['o_tripNum'],row['o_stop_id'],row['o_unixtimestamp'])
    next_stops = get_buste_trip_next_stops2(bus_trip_data,row['o_route'],row['o_bus_code'],row['o_tripNum'],row['o_stop_id'],row['o_unixtimestamp'])

    # Count once and reuse it for both the emptiness check and the message.
    num_next_stops = next_stops.count()

    if num_next_stops == 0:
        # Single formatted strings: same output on Python 2 and 3.
        print("No next stops found")
        print("Time to run next stops: %s" % (time.time() - st1,))
#         destinations[index] = (None,)*6 + (-1,)
    else:
        print("%s next stops found!" % (num_next_stops,))
        next_stops.show()
        print("Time to run next stops: %s" % (time.time() - st1,))

#         st2 = time.time()
#         closest_next_stop = get_closest_buste_trip_next_stop(next_stops,row['next_o_shape_lat'],row['next_o_shape_lon'])
#         closest_next_stop['id']
#         print "Time to run closest stop:", time.time() - st2
        
        #if closest_next_stop == None:
        #    print "No Stop found!"
            #destinations[index] = (None,)*6 + (-3,)
        #else:
            #print closest_next_stop
#                 if closest_next_stop['dist'] <= 1:
#                     #print "OK!"
#                     #destinations[index] = (closest_next_stop['stop_id'],closest_next_stop['dist'],1)
#                     destinations[index] = tuple(v for v in closest_next_stop) + (1,)
#                 else:
#                     #print "Too far!"
#                     destinations[index] = tuple(v for v in closest_next_stop) + (0,)
                    
                
            
#     index+=1
    

    
# schema = T.StructType([
#     T.StructField("d_stop_id", T.IntegerType(), True),
#     T.StructField("dist_to_next_o", T.DoubleType(), True),
#     T.StructField("result", T.IntegerType(), True)
# ])

# destinations_df = sqlContext.createDataFrame([((tup[0]), (tup[1]), (tup[2])) for tup in destinations],schema)

In [None]:
get_buste_trip_next_stops(bus_trip_data,175,'BC289',10,31749,72165).show()

In [None]:
pre_od_data.columns

In [None]:
bus_trip_data.columns

In [None]:
# get_trip_next_stops() takes (shape_stops_df, o_shape_id, o_stop_id); a lookup
# by route, bus, trip and timestamp is what get_buste_trip_next_stops()
# provides. The boarding stop id is not used by that filter, hence None.
# `route` was cast to an integer earlier in the notebook, so route "022" is 22.
next_stops = get_buste_trip_next_stops(bus_trip_data,22,'BL307',1,None,1494324183)
print_df(next_stops)

In [None]:
8 	022 	BL307 	1 	25821 	1494324731 	07:12:11
9 	022 	BL307 	1 	26246 	1494325361 	07:22:41
10 	022 	BL307 	1 	26240 	1494325564 	07:26:04
11 	022 	BL307 	1 	25681 	1494326340 	07:39:00
12 	022 	BL307 	1 	26284 	1494326896 	07:48:16

In [None]:
next_stops = next_stops.withColumn('d_stopId',F.lit(33717))

In [None]:
print_df(next_stops)

In [None]:
#next_stops = next_stops.withColumn('dist',get_stops_dist(stops_dist_table,next_stops.busStopId,next_stops.d_stopid))

In [None]:
# Attach the precomputed stop-to-stop distance to every candidate stop and keep
# the row closest to the destination stop.
# NOTE(review): `stops_dist_table` is only built in cells that are currently
# commented out, so on a fresh kernel this cell fails with a NameError unless
# those cells are restored first.
next_stops_dist = next_stops.join(stops_dist_table, (next_stops.busStopId == stops_dist_table.o_busStopId) & (next_stops.d_stopId == stops_dist_table.d_busStopId), 'inner') \
                    .select(['route','busCode','tripNum','o_busStopId','o_shapeLat','o_shapeLon','d_busStopId','d_shapeLat','d_shapeLon','timestamp','dist']) \
                    .orderBy('dist').first()

In [None]:
next_stops_dist['dist']

### Trying a different approach

In [8]:
pre_od_data.columns

['o_route',
 'o_bus_code',
 'o_date',
 'o_tripNum',
 'o_timestamp',
 'o_shape_id',
 'o_shape_seq',
 'o_shape_lat',
 'o_shape_lon',
 'o_stop_id',
 'o_boarding_id',
 'cardNum',
 'next_o_route',
 'next_o_bus_code',
 'next_o_date',
 'next_o_tripNum',
 'next_o_timestamp',
 'next_o_shape_id',
 'next_o_shape_seq',
 'next_o_shape_lat',
 'next_o_shape_lon',
 'next_o_stop_id',
 'next_o_boarding_id',
 'o_unixtimestamp',
 'next_o_unixtimestamp',
 'leg_duration']

In [9]:
bus_trip_data.columns

['route',
 'tripNum',
 'shapeId',
 'shapeSequence',
 'shapeLat',
 'shapeLon',
 'distanceTraveledShape',
 'busCode',
 'gpsPointId',
 'gpsLat',
 'gpsLon',
 'distanceToShapePoint',
 'timestamp',
 'busStopId',
 'problem',
 'numPassengers',
 'date',
 'gps_timestamp',
 'gps_timestamp_in_secs',
 'id']

In [10]:
pre_od_data_sample = pre_od_data.limit(10)

In [11]:
# Match each bus trip row with the boarding (if any) that happened on the same
# route, bus, date and trip, at the same stop.
cond = [bus_trip_data.route == pre_od_data_sample.o_route, 
        bus_trip_data.busCode == pre_od_data_sample.o_bus_code, 
        bus_trip_data.date == pre_od_data_sample.o_date,
        bus_trip_data.tripNum == pre_od_data_sample.o_tripNum,
        bus_trip_data.busStopId == pre_od_data_sample.o_stop_id]
# Left join: every bus trip row is kept; the pre-OD columns are null for rows
# without a matching boarding.
od_trips = bus_trip_data.join(pre_od_data_sample, cond, 'left')

In [12]:
print_df(od_trips.orderBy(['route','busCode','tripNum','gps_timestamp_in_secs']))

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,next_o_timestamp,next_o_shape_id,next_o_shape_seq,next_o_shape_lat,next_o_shape_lon,next_o_stop_id,next_o_boarding_id,o_unixtimestamp,next_o_unixtimestamp,leg_duration
0,,1,2934,6446501,-25.452183,-49.22404,0.0,DC852,,-25.45222,...,,,,,,,,,,
1,,1,2934,6447496,-25.450932,-49.21997,432.312,DC852,,-25.450976,...,,,,,,,,,,
2,,1,2934,6447510,-25.44871,-49.22002,739.771,DC852,,-25.448616,...,,,,,,,,,,
3,,1,2934,6447525,-25.446724,-49.221252,1001.839,DC852,,-25.446655,...,,,,,,,,,,
4,,1,2934,6447538,-25.444639,-49.223163,1393.901,DC852,,-25.444756,...,,,,,,,,,,
5,,1,2934,6447574,-25.4402,-49.221386,2247.402,DC852,,-25.440206,...,,,,,,,,,,
6,1.0,1,2938,6451385,-25.428133,-49.264531,338.081,BN997,,-25.428198,...,,,,,,,,,,
7,1.0,1,2938,6451394,-25.430622,-49.263426,635.815,BN997,,-25.43083,...,,,,,,,,,,
8,1.0,1,2938,6451403,-25.433455,-49.262219,972.378,BN997,,,...,,,,,,,,,,
9,1.0,1,2938,6451415,-25.435452,-49.265462,1428.729,BN997,,-25.435406,...,,,,,,,,,,


In [18]:
print_df(bus_trip_data.orderBy(['route','busCode','tripNum','gps_timestamp_in_secs']))

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,date,gps_timestamp,gps_timestamp_in_secs,id
0,,1,2934,6446501,-25.452183,-49.22404,0.0,DC852,,-25.45222,-49.224016,4.743753,07:39:06,14404,NO_PROBLEM,0,2017-05-09,2017-05-09 07:39:06,1494326346,1709396986661
1,,1,2934,6447496,-25.450932,-49.21997,432.312,DC852,,-25.450976,-49.220008,6.208857,07:40:11,32415,NO_PROBLEM,0,2017-05-09,2017-05-09 07:40:11,1494326411,1709396986662
2,,1,2934,6447510,-25.44871,-49.22002,739.771,DC852,,-25.448616,-49.220005,10.545214,07:40:58,32413,NO_PROBLEM,0,2017-05-09,2017-05-09 07:40:58,1494326458,1709396986663
3,,1,2934,6447525,-25.446724,-49.221252,1001.839,DC852,,-25.446655,-49.221266,7.777327,07:41:54,32412,NO_PROBLEM,0,2017-05-09,2017-05-09 07:41:54,1494326514,1709396986664
4,,1,2934,6447538,-25.444639,-49.223163,1393.901,DC852,,-25.444756,-49.22312,13.68669,07:42:55,33024,NO_PROBLEM,0,2017-05-09,2017-05-09 07:42:55,1494326575,1709396986665
5,,1,2934,6447574,-25.4402,-49.221386,2247.402,DC852,,-25.440206,-49.221395,1.111246,07:46:37,27959,NO_PROBLEM,0,2017-05-09,2017-05-09 07:46:37,1494326797,1709396986666
6,1.0,1,2938,6451385,-25.428133,-49.264531,338.081,BN997,,-25.428198,-49.264428,12.657534,06:21:08,29887,NO_PROBLEM,0,2017-05-09,2017-05-09 06:21:08,1494321668,0
7,1.0,1,2938,6451394,-25.430622,-49.263426,635.815,BN997,,-25.43083,-49.263316,25.616758,06:22:10,31453,NO_PROBLEM,0,2017-05-09,2017-05-09 06:22:10,1494321730,1
8,1.0,1,2938,6451403,-25.433455,-49.262219,972.378,BN997,,,,,06:22:44,31454,BETWEEN,0,2017-05-09,2017-05-09 06:22:44,1494321764,2
9,1.0,1,2938,6451415,-25.435452,-49.265462,1428.729,BN997,,-25.435406,-49.265455,5.110665,06:24:42,30748,NO_PROBLEM,0,2017-05-09,2017-05-09 06:24:42,1494321882,3
