In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [2]:
def rename_columns(df, list_of_tuples):
    """Rename several columns of a DataFrame in one call.

    Args:
        df: object exposing ``withColumnRenamed(old, new)`` which returns the
            renamed frame (a Spark DataFrame in this notebook).
        list_of_tuples: iterable of ``(old_col, new_col)`` pairs, applied in order.

    Returns:
        The frame obtained by chaining every rename onto ``df``; ``df`` itself
        when no pairs are given.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_data(sqlContext, filepath):
    """Load a CSV file that has a header row into a DataFrame.

    The reader uses the "com.databricks.spark.csv" format with schema
    inference enabled and treats the string "-" as a null value.

    Args:
        sqlContext: context object whose ``read`` attribute provides the reader.
        filepath: path handed to the reader's ``load``.

    Returns:
        Whatever the configured reader's ``load(filepath)`` returns.
    """
    reader = sqlContext.read.format("com.databricks.spark.csv")
    csv_options = [
        ("header", "true"),
        ("inferSchema", "true"),
        ("nullValue", "-"),
    ]
    for option_name, option_value in csv_options:
        reader = reader.option(option_name, option_value)
    return reader.load(filepath)

def read_file(filepath, sqlContext):
    """Load a headerless bus-trip CSV part file and give its columns names.

    The file is read with schema inference and "-" treated as null. Files with
    fewer than 16 columns are padded with null columns up to 16, then the
    positional ``_cN`` columns are renamed to the names listed below. A "date"
    column is added, derived from the second-to-last path component minus one
    day.

    NOTE(review): padding stops at 16 columns while 20 names are listed, so the
    last four names are only applied when the file itself has those columns --
    confirm this is intended.

    Args:
        filepath: path of the CSV part file; its parent directory name is
            assumed to start with ``<year>_<month>_<day>`` -- TODO confirm.
        sqlContext: context object whose ``read.csv`` loads the file.

    Returns:
        The renamed DataFrame with the extra "date" column.
    """
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "userBirthdate",
        "cardTimestamp",
        "lineName",
        "cardNum",
        "userGender",
    ]

    trips = sqlContext.read.csv(filepath, header=False,
                                inferSchema=True, nullValue="-")

    # Append null columns named like Spark's positional defaults until 16 exist.
    for position in range(len(trips.columns), 16):
        trips = trips.withColumn("_c" + str(position), F.lit(None))

    trips = rename_columns(
        trips,
        [("_c" + str(position), name) for position, name in enumerate(column_names)]
    )

    # First three "_"-separated tokens of the parent directory name, joined with "-".
    date_tokens = filepath.split("/")[-2].split("_")[:3]
    date = "-".join(date_tokens)

    # Store the literal date, then move it back one day.
    return trips.withColumn("date", F.lit(date)) \
                .withColumn("date", F.date_sub(F.col("date"), 1))

def print_df(df,l=10):
    """Return the first ``l`` rows of ``df`` converted with ``toPandas()``.

    Despite the name nothing is printed; the notebook displays the returned
    value when the call is the last expression of a cell.

    Args:
        df: object exposing ``limit(n)`` whose result exposes ``toPandas()``.
        l: maximum number of rows to keep (default 10).
    """
    head_rows = df.limit(l)
    return head_rows.toPandas()

### Read Pre-OD Data

In [3]:
# Reuse the running SparkContext if there is one (otherwise create it) and wrap it in
# the SQLContext that every reader function in this notebook takes as an argument.
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [4]:
# Root folder of the experiment sample data.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
# Load the pre-OD matrix (CSV with header) stored under that folder.
pre_od_data = read_data(sqlContext,exp_data_folder_path + 'pre_od_matrix')

In [5]:
#print_df(pre_od_data)

### Reading Bus Trip Data

In [6]:
# Folder holding the older "back-bulma-output" results (absolute, machine-specific path).
old_exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/back-bulma-output'
# Read a single part file; read_file derives the "date" column from the
# "2017_05_11_veiculos.csv" directory name in this path.
bus_trip_data = read_file(old_exp_data_folder_path + '/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [7]:
# Move "date" back one day.
# NOTE(review): read_file() already applies date_sub(date, 1), so after this line the date
# is two days before the one in the file path -- confirm the second shift is intended.
bus_trip_data = bus_trip_data.withColumn("date", F.date_sub(F.col("date"), 1))
# Build "<date> <timestamp>" so it can be parsed with the "yyyy-MM-dd HH:mm:ss" pattern below.
bus_trip_data = bus_trip_data.withColumn("gps_timestamp", F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
# Numeric (epoch seconds) version of the GPS timestamp, used later for ordering.
bus_trip_data = bus_trip_data.withColumn("gps_timestamp_in_secs", F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))

In [8]:
# Drop rows missing any key field, keep one row per (route, busCode, tripNum, busStopId),
# then sort by route, bus, trip and GPS time.
# NOTE(review): dropDuplicates runs before orderBy, so which of several duplicate rows
# survives is not tied to the timestamp ordering -- confirm that is acceptable.
bus_trip_data = bus_trip_data.na.drop(subset=["route","busCode","busStopId","gps_timestamp_in_secs","tripNum"]) \
                             .dropDuplicates(['route','busCode','tripNum','busStopId']) \
                             .orderBy(['route','busCode','tripNum','gps_timestamp_in_secs'])

In [9]:
#print_df(bus_trip_data, l=40)

In [10]:
# Replace "route" with its integer cast.
# NOTE(review): a non-numeric route code would presumably become null here -- confirm all
# route codes are numeric.
bus_trip_data = bus_trip_data.withColumn('route', F.col('route').cast(T.IntegerType()))

### Testing with new BUSTE data

In [None]:
#bus_trip_data_new = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext) \
#                    .na.drop(subset=["route","busCode","busStopId","timestamp","tripNum"])

In [None]:
#print_df(bus_trip_data_new.select(['route','busCode','tripNum','busStopId','timestamp']) \
#         .filter('route == 022') \
#         .orderBy(['route','busCode','tripNum','busStopId','timestamp']), l=40)

### Building stops-distance table

In [11]:
#busStops = bus_trip_data.select(['busStopId','shapeLat','shapeLon']).dropDuplicates(['busStopId'])

In [12]:
#print busStops.count()
#print_df(busStops)

In [13]:
#o_busStops = rename_columns(
#        busStops,
#        [
#            ("busStopId", "o_busStopId"),
#            ("shapeLat", "o_shapeLat"),
#            ("shapeLon", "o_shapeLon")
#        ]
#    )
#
#d_busStops = rename_columns(
#        busStops,
#        [
#            ("busStopId", "d_busStopId"),
#            ("shapeLat", "d_shapeLat"),
#            ("shapeLon", "d_shapeLon")
#        ]
#    )

In [14]:
#print_df(o_busStops)

In [15]:
#print_df(d_busStops)

In [16]:
#stops_dist_table = o_busStops.crossJoin(d_busStops)

In [17]:
#print_df(stops_dist_table)

In [18]:
#stops_dist_table.count()

In [19]:
def dist(long_x, lat_x, long_y, lat_y):
    """Build a Column expression for the great-circle distance between two points.

    Uses the spherical law of cosines scaled by 6371.0 (presumably the Earth's
    mean radius in kilometres, which would make the result kilometres).

    Args:
        long_x, lat_x: longitude and latitude of the first point, in degrees
            (column names or Column expressions).
        long_y, lat_y: longitude and latitude of the second point, in degrees.

    Returns:
        A Column holding the distance; null when the cosine term is null.
    """
    cos_angle = (
        F.sin(F.toRadians(lat_x)) * F.sin(F.toRadians(lat_y)) +
        F.cos(F.toRadians(lat_x)) * F.cos(F.toRadians(lat_y)) *
            F.cos(F.toRadians(long_x) - F.toRadians(long_y))
    )

    # Floating-point rounding can push the cosine a hair outside [-1, 1] (typically for
    # identical points), and acos would then yield NaN instead of 0. Clamp it first;
    # `when` is used rather than least/greatest so a null cosine stays null.
    clamped = F.when(cos_angle > 1.0, F.lit(1.0)) \
               .when(cos_angle < -1.0, F.lit(-1.0)) \
               .otherwise(cos_angle)

    return F.acos(clamped) * F.lit(6371.0)

In [20]:
#stops_dist_table = stops_dist_table.withColumn("dist", dist('o_shapeLon', 'o_shapeLat','d_shapeLon','d_shapeLat'))

In [22]:
#print_df(stops_dist_table)

In [23]:
def get_stops_dist(stops_dist_df,stopA,stopB):
    """Look up the precomputed distance between two stops.

    Args:
        stops_dist_df: DataFrame with ``o_busStopId``, ``d_busStopId`` and
            ``dist`` columns.
        stopA: origin stop id, matched against ``o_busStopId``.
        stopB: destination stop id, matched against ``d_busStopId``.

    Returns:
        The ``dist`` value of the first matching row, or None when the pair is
        not in the table (previously this raised IndexError).
    """
    # first() fetches a single row instead of collecting every match to the driver.
    match = stops_dist_df.where(
        (F.col('o_busStopId') == stopA) &
        (F.col('d_busStopId') == stopB)).select('dist').first()

    if match is None:
        return None
    return match[0]

# NOTE(review): get_stops_dist takes a DataFrame as its first argument and runs a Spark
# query itself; wrapping it as a column UDF is unlikely to work -- confirm before using.
# No visible cell in this notebook calls this UDF.
get_stops_dist_udf = F.udf(get_stops_dist)

In [24]:
#get_stops_dist(stops_dist_table,31261,33717)

### Finding the closest alighting stop to next trip boarding

In [31]:
# GTFS trips table (joined on trip_id and used for shape_id further down).
trips_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/trips.txt')

In [32]:
#print_df(trips_data)

In [33]:
# GTFS stops table (joined on stop_id and used for stop_lat / stop_lon further down).
stops_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/stops.txt')

In [34]:
#print_df(stops_data)

In [35]:
# GTFS stop_times table (links trip_id to stop_id and stop_sequence further down).
stop_times_data = read_data(sqlContext,'/local/tarciso/data/gtfs/curitiba/stop_times.txt')

In [36]:
#print_df(stop_times_data)

In [37]:
# Join trips -> stop_times -> stops and keep, per shape, each stop with its sequence number
# and coordinates. Duplicate (shape_id, stop_id, stop_sequence) rows are dropped and the
# result is ordered by shape and stop sequence.
shapes_stops = trips_data.join(stop_times_data,'trip_id', 'inner') \
                    .join(stops_data, 'stop_id', 'inner') \
                    .select(['shape_id','stop_id','stop_sequence', 'stop_lat','stop_lon']) \
                    .dropDuplicates(['shape_id','stop_id','stop_sequence']) \
                    .orderBy(['shape_id','stop_sequence'])

In [67]:
def get_trip_next_stops(shape_stops_df,o_shape_id,o_stop_id):
    """Return the stops that come after a given stop on a given shape.

    Args:
        shape_stops_df: DataFrame with ``shape_id``, ``stop_id`` and
            ``stop_sequence`` columns.
        o_shape_id: shape to restrict the search to.
        o_stop_id: stop whose successors are wanted.

    Returns:
        A DataFrame with the rows of that shape whose ``stop_sequence`` is
        greater than the given stop's, or None (after printing a message) when
        the stop does not appear in the shape.
    """
    shape_stops = shape_stops_df.filter((F.col('shape_id') == o_shape_id))
    # If the stop appears more than once in the shape, the first matching row is used.
    o_shape_stop = shape_stops.filter(F.col('stop_id') == o_stop_id).first()

    if o_shape_stop is None:
        # Single pre-formatted string: prints the same text under Python 2 and Python 3.
        print("Stop %s could not be found in shape %s" % (o_stop_id, o_shape_id))
        return None

    o_stop_seq = o_shape_stop['stop_sequence']
    return shape_stops.filter(F.col('stop_sequence') > o_stop_seq)

In [68]:
def get_stop_lat_long(shape_stops_df,stop_id):
    """Look up the coordinates of a stop.

    Args:
        shape_stops_df: DataFrame with ``stop_id``, ``stop_lat`` and
            ``stop_lon`` columns.
        stop_id: stop to look up.

    Returns:
        ``(stop_lat, stop_lon)`` of the first row matching ``stop_id``, or None
        (after printing a message) when no row matches.
    """
    d_stop_row = shape_stops_df.filter(F.col('stop_id') == stop_id).first()

    if d_stop_row is None:
        # The message previously referenced the undefined name `o_stop_id`, which raised
        # NameError on this branch; report the id that was actually looked up.
        print("Stop %s could not be found" % (stop_id,))
        return None

    return (d_stop_row['stop_lat'],d_stop_row['stop_lon'])

In [69]:
def get_closest_next_stop(next_stops,d_stop_lat,d_stop_lon):
    """Find the candidate stop closest to a destination point.

    Args:
        next_stops: DataFrame of candidate stops with ``stop_id``, ``stop_lat``
            and ``stop_lon`` columns.
        d_stop_lat: latitude of the destination point.
        d_stop_lon: longitude of the destination point.

    Returns:
        The row (``stop_id``, ``dist``) with the smallest distance, or None
        when ``next_stops`` has no rows.
    """
    shape_next_stops = next_stops.withColumn('d_stop_lat',F.lit(d_stop_lat)) \
                                 .withColumn('d_stop_lon',F.lit(d_stop_lon))
    # dist() is declared as (long_x, lat_x, long_y, lat_y): longitude first. The previous
    # call passed latitude first, which swapped the coordinates inside the formula.
    return shape_next_stops.withColumn('dist',dist('stop_lon', 'stop_lat','d_stop_lon','d_stop_lat')) \
                        .orderBy('dist').select(['stop_id','dist']).first()
    

In [70]:
# Preview two pre-OD rows (the explicit limit(2) caps it below print_df's default of 10).
print_df(pre_od_data.limit(2))

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_id,o_shape_seq,o_shape_lat,o_shape_lon,o_stop_id,...,d_timestamp,d_shape_id,d_shape_seq,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id,o_unixtimestamp,d_unixtimestamp,leg_duration
0,50,JB605,2017-05-10,7,19:12:26,1720,5304847,-25.455303,-49.267471,29142,...,08:16:08,2073,6491312,-25.551371,-49.274145,38165,919123001529,79946,40568,-1.0
1,40,GR110,2017-05-10,1,07:18:48,1717,3366595,-25.404101,-49.33544,34157,...,14:11:17,2236,4887443,-25.449524,-49.358804,40623,927712935938,37128,61877,412.483333


In [None]:
# Number of pre-OD rows to process. Every row triggers several Spark queries through the
# helper functions, so keep this small.
size_limit = 200

# One (d_stop_id, dist_to_next_o, result) tuple per processed row. Result codes:
#    1 -> closest following stop is at distance <= 1 from the next boarding stop
#         (presumably kilometres, given dist()'s 6371.0 factor)
#    0 -> a following stop was found, but farther away than that
#   -1 -> no stop follows the boarding stop on the origin shape
#   -2 -> boarding stop not found in the origin shape
#   -3 -> coordinates of the next boarding stop not found
# The list is grown per row instead of being pre-sized with None, so a table with fewer
# than size_limit rows no longer leaves None entries that break the DataFrame creation.
destinations = []

for row in pre_od_data.limit(size_limit).collect():
    next_stops = get_trip_next_stops(shapes_stops,row['o_shape_id'],row['o_stop_id'])
    if next_stops is None:
        destinations.append((None,None,-2))
        continue

    dest_lat_long = get_stop_lat_long(shapes_stops,row['d_stop_id'])
    if dest_lat_long is None:
        destinations.append((None,None,-3))
        continue

    closest_next_stop = get_closest_next_stop(next_stops,dest_lat_long[0],dest_lat_long[1])
    if closest_next_stop is None:
        destinations.append((None,None,-1))
        continue

    result_code = 1 if closest_next_stop['dist'] <= 1 else 0
    destinations.append((closest_next_stop['stop_id'],closest_next_stop['dist'],result_code))

schema = T.StructType([
    T.StructField("d_stop_id", T.IntegerType(), True),
    T.StructField("dist_to_next_o", T.DoubleType(), True),
    T.StructField("result", T.IntegerType(), True)
])

destinations_df = sqlContext.createDataFrame(destinations,schema)

In [None]:
# Show the first inferred destinations (print_df returns 10 rows by default).
print_df(destinations_df)

In [None]:
# Inspect up to 50 stops of shape 1720 (the o_shape_id of the first pre-OD row shown above).
print_df(shapes_stops.filter((F.col('shape_id') == 1720)),l=50)

In [None]:
def df_loop(df):
    """Print the ``d_busStopId`` value of the first 11 rows of ``df``.

    Args:
        df: DataFrame whose rows can be indexed by ``'d_busStopId'``.
    """
    # take() fetches only the rows that get printed, instead of collecting the whole
    # DataFrame to the driver and stopping after the 11th row.
    for row in df.take(11):
        # A single parenthesised value prints identically under Python 2 and Python 3.
        print(row['d_busStopId'])

In [None]:
# NOTE(review): d_busStops is only assigned in a commented-out cell above, so this call
# raises NameError on a fresh kernel unless that cell is restored.
df_loop(d_busStops)

In [None]:
# NOTE(review): get_trip_next_stops is defined above with three parameters
# (shape_stops_df, o_shape_id, o_stop_id); this call passes five arguments and raises
# TypeError as written. Looks like a leftover from an earlier signature -- TODO update or remove.
next_stops = get_trip_next_stops(bus_trip_data,'022','BL307',1,1494324183)
print_df(next_stops)

In [None]:
8 	022 	BL307 	1 	25821 	1494324731 	07:12:11
9 	022 	BL307 	1 	26246 	1494325361 	07:22:41
10 	022 	BL307 	1 	26240 	1494325564 	07:26:04
11 	022 	BL307 	1 	25681 	1494326340 	07:39:00
12 	022 	BL307 	1 	26284 	1494326896 	07:48:16

In [None]:
# Attach a constant destination stop id (33717) to every candidate row so it can be used
# as a join key against the stops distance table.
next_stops = next_stops.withColumn('d_stopId',F.lit(33717))

In [None]:
# Check that the d_stopId column was added.
print_df(next_stops)

In [None]:
#next_stops = next_stops.withColumn('dist',get_stops_dist(stops_dist_table,next_stops.busStopId,next_stops.d_stopid))

In [None]:
# Pair every candidate stop with the fixed destination stop through the distance table and
# keep the row with the smallest distance.
# NOTE(review): stops_dist_table is only assigned in commented-out cells above, so this
# raises NameError on a fresh kernel unless those cells are restored.
next_stops_dist = next_stops.join(stops_dist_table, (next_stops.busStopId == stops_dist_table.o_busStopId) & (next_stops.d_stopId == stops_dist_table.d_busStopId), 'inner') \
                    .select(['route','busCode','tripNum','o_busStopId','o_shapeLat','o_shapeLon','d_busStopId','d_shapeLat','d_shapeLon','timestamp','dist']) \
                    .orderBy('dist').first()

In [None]:
# Distance of the closest candidate stop found by the join above.
next_stops_dist['dist']