In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [2]:
def rename_columns(df, list_of_tuples):
    """Rename several columns of a DataFrame in a single call.

    Args:
        df: object exposing ``withColumnRenamed(old, new)`` (a Spark DataFrame).
        list_of_tuples: iterable of ``(old_name, new_name)`` pairs, applied
            in the order given.

    Returns:
        The DataFrame obtained after chaining every rename; ``df`` itself
        when the iterable is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_data(sqlContext, filepath):
    """Load a CSV that has a header row into a DataFrame.

    The reader uses the ``com.databricks.spark.csv`` format, infers column
    types and reads cells containing ``-`` as null.

    Args:
        sqlContext: SQL context whose ``read`` attribute provides the reader.
        filepath: location handed to the reader's ``load``.

    Returns:
        Whatever the reader's ``load(filepath)`` returns (the DataFrame).
    """
    reader = sqlContext.read.format("com.databricks.spark.csv")
    reader_options = (
        ("header", "true"),
        ("inferSchema", "true"),
        ("nullValue", "-"),
    )
    for option_name, option_value in reader_options:
        reader = reader.option(option_name, option_value)
    return reader.load(filepath)

def read_file(filepath, sqlContext):
    """Load a header-less bus-trip CSV file and give its columns real names.

    The file is read with type inference and ``-`` treated as null. Files
    with fewer than 16 columns are padded with null columns up to 16, then
    the positional ``_cN`` columns are renamed (names beyond the columns
    actually present are simply not applied). A ``date`` column is added:
    the first three ``_``-separated parts of the parent directory name
    joined with ``-``, minus one day.

    Args:
        filepath: path such as ``.../2017_05_11_veiculos.csv/part-00000``;
            the second-to-last path component must start with the date.
        sqlContext: SQL context used to read the CSV.

    Returns:
        The renamed DataFrame with the extra ``date`` column.
    """
    # Name for each positional column, in file order (_c0, _c1, ...).
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "userBirthdate",
        "cardTimestamp",
        "lineName",
        "cardNum",
        "userGender",
    ]
    minimum_columns = 16

    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True, nullValue="-")

    # Pad narrow files with null columns so the first 16 names always apply.
    for position in range(len(data_frame.columns), minimum_columns):
        data_frame = data_frame.withColumn("_c" + str(position), F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [("_c" + str(position), name)
         for position, name in enumerate(column_names)]
    )

    # e.g. ".../2017_05_11_veiculos.csv/part-00000" -> "2017-05-11"
    parent_dir = filepath.split("/")[-2]
    date = "-".join(parent_dir.split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))
    # The stored date is the day before the one in the directory name.
    data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"), 1))

    return data_frame

def print_df(df,l=10):
    """Return the first ``l`` rows of ``df`` as a pandas DataFrame for display.

    Args:
        df: Spark DataFrame (anything with ``limit`` and ``toPandas``).
        l: maximum number of rows to bring back (default 10).
    """
    preview = df.limit(l)
    return preview.toPandas()

### Read Pre-OD Data

In [3]:
# Reuse the running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# SQL entry point passed to read_data() / read_file() for loading CSV files.
sqlContext = pyspark.SQLContext(sc)

In [4]:
# Root folder of the preliminary-experiment sample data (absolute local path).
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
# Pre-OD matrix: CSV with a header row, "-" read as null (see read_data).
pre_od_data = read_data(sqlContext,exp_data_folder_path + 'pre_od_matrix')

In [5]:
# Preview the first 10 rows (print_df default) of the pre-OD matrix.
print_df(pre_od_data)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_lat,o_shape_lon,o_stop_id,o_boarding_id,cardNum,...,d_date,d_tripNum,d_timestamp,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id,o_unixtimestamp,d_unixtimestamp,leg_duration
0,30,KB603,2017-05-10,2,18:09:11,-25.388545,-49.21149,32495,1279900254208,3659830,...,2017-05-10,1,06:49:06,-25.480276,-49.268259,32014,1279900254209,76151,35346,-1.0
1,561,EC001,2017-05-10,1,06:49:06,-25.480276,-49.268259,32014,1279900254209,3659830,...,2017-05-10,2,18:09:11,-25.388545,-49.21149,32495,1279900254208,35346,76151,680.083333
2,561,EC006,2017-05-10,12,16:26:23,-25.435352,-49.273778,26152,1279900254210,3659926,...,2017-05-10,1,06:30:23,-25.513591,-49.300009,35745,1279900254211,69983,34223,-1.0
3,40,MB604,2017-05-10,1,06:30:23,-25.513591,-49.300009,35745,1279900254211,3659926,...,2017-05-10,12,16:26:23,-25.435352,-49.273778,26152,1279900254210,34223,69983,596.0
4,471,EC003,2017-05-10,7,13:15:25,-25.435529,-49.271583,26178,1279900254212,3659999,...,2017-05-10,3,07:43:30,-25.524062,-49.234259,31212,1279900254213,58525,38610,-1.0
5,535,EA600,2017-05-10,3,07:43:30,-25.524062,-49.234259,31212,1279900254213,3659999,...,2017-05-10,7,13:15:25,-25.435529,-49.271583,26178,1279900254212,38610,58525,331.916667
6,535,EA600,2017-05-10,3,07:43:32,-25.524062,-49.234259,31212,1279900254214,3660010,...,2017-05-10,12,15:09:48,-25.408467,-49.276251,27290,1279900254215,38612,65388,446.266667
7,176,BC939,2017-05-10,12,15:09:48,-25.408467,-49.276251,27290,1279900254215,3660010,...,2017-05-10,3,07:43:32,-25.524062,-49.234259,31212,1279900254214,65388,38612,-1.0
8,176,BC939,2017-05-10,12,15:09:56,-25.408467,-49.276251,27290,1279900254216,3660122,...,2017-05-10,7,18:42:04,-25.430091,-49.271777,26334,1279900254217,65396,78124,212.133333
9,207,BC033,2017-05-10,7,18:42:04,-25.430091,-49.271777,26334,1279900254217,3660122,...,2017-05-10,12,15:09:56,-25.408467,-49.276251,27290,1279900254216,78124,65396,-1.0


### Reading Bus Trip Data

In [77]:
# Folder with the bus-trip output files (absolute local path).
old_exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/back-bulma-output'
# read_file() derives the `date` column from the "2017_05_11_..." directory name (minus one day).
bus_trip_data = read_file(old_exp_data_folder_path + '/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [78]:
# NOTE(review): read_file() already applies date_sub(date, 1); this subtracts a
# second day, so `date` ends up two days before the date in the directory name
# (the pre-OD rows shown earlier carry 2017-05-10) — confirm this is intended.
# The cell is also not idempotent: every re-run moves `date` back one more day.
bus_trip_data = bus_trip_data.withColumn("date", F.date_sub(F.col("date"), 1))
# Full timestamp string "<date> <HH:mm:ss>" built from the date and the time-of-day column.
bus_trip_data = bus_trip_data.withColumn("gps_timestamp", F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
# Same instant as epoch seconds, used later for "after boarding" comparisons.
bus_trip_data = bus_trip_data.withColumn("gps_timestamp_in_secs", F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))

In [79]:
# Keep complete records only, one per (route, bus, trip, stop), restricted to
# route 022 and ordered by time within each trip.
bus_trip_data = (
    bus_trip_data
    .na.drop(subset=["route","busCode","busStopId","gps_timestamp_in_secs","tripNum"])
    .dropDuplicates(['route','busCode','tripNum','busStopId'])
    .select(['route','busCode','tripNum','busStopId','gps_timestamp_in_secs', 'timestamp'])
    .filter('route == 022')
    .orderBy(['route','busCode','tripNum','gps_timestamp_in_secs'])
)

In [80]:
# Show up to 40 stop passages of route 022.
print_df(bus_trip_data, l=40)

Unnamed: 0,route,busCode,tripNum,busStopId,gps_timestamp_in_secs,timestamp
0,22,BL307,1,25683,1494322081,06:28:01
1,22,BL307,1,25684,1494322350,06:32:30
2,22,BL307,1,25685,1494322629,06:37:09
3,22,BL307,1,25415,1494322859,06:40:59
4,22,BL307,1,26276,1494323164,06:46:04
5,22,BL307,1,25699,1494323558,06:52:38
6,22,BL307,1,25698,1494323878,06:57:58
7,22,BL307,1,26267,1494324183,07:03:03
8,22,BL307,1,25821,1494324731,07:12:11
9,22,BL307,1,26246,1494325361,07:22:41


### Testing with new BUSTE data

In [53]:
#bus_trip_data_new = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext) \
#                    .na.drop(subset=["route","busCode","busStopId","timestamp","tripNum"])

In [54]:
#print_df(bus_trip_data_new.select(['route','busCode','tripNum','busStopId','timestamp']) \
#         .filter('route == 022') \
#         .orderBy(['route','busCode','tripNum','busStopId','timestamp']), l=40)

### Building stops-distance table

In [7]:
# One row per stop id together with a shape coordinate for it.
# NOTE(review): `user_boardings` is not defined in any cell shown in this
# notebook — presumably a DataFrame with the read_file() column layout;
# confirm where it is loaded so the notebook survives Restart & Run All.
busStops = user_boardings.select(['busStopId','shapeLat','shapeLon']).dropDuplicates(['busStopId'])

In [8]:
# Number of distinct stops, followed by a 10-row preview.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the print statement.
print(busStops.count())
print_df(busStops)

5326


Unnamed: 0,busStopId,shapeLat,shapeLon
0,31261,-25.545885,-49.263491
1,32396,-25.468642,-49.299404
2,31951,-25.494953,-49.20858
3,29993,-25.541054,-49.294021
4,33868,-25.517789,-49.324665
5,33722,-25.407783,-49.339019
6,26583,-25.439577,-49.269716
7,30903,-25.467922,-49.230765
8,33717,-25.424029,-49.279715
9,7554,-25.504891,-49.356074


In [9]:
def _prefix_stop_columns(stops_df, prefix):
    """Return ``stops_df`` with busStopId/shapeLat/shapeLon renamed to ``prefix`` + name."""
    renames = [(name, prefix + name)
               for name in ("busStopId", "shapeLat", "shapeLon")]
    return rename_columns(stops_df, renames)

# Two copies of the stop table, prefixed o_ (origin) and d_ (destination),
# so they can be paired without column-name clashes.
o_busStops = _prefix_stop_columns(busStops, "o_")
d_busStops = _prefix_stop_columns(busStops, "d_")

In [10]:
# Preview the origin-prefixed stop table.
print_df(o_busStops)

Unnamed: 0,o_busStopId,o_shapeLat,o_shapeLon
0,31261,-25.545885,-49.263491
1,32396,-25.468642,-49.299404
2,31951,-25.494953,-49.20858
3,29993,-25.541054,-49.294021
4,33868,-25.517789,-49.324665
5,33722,-25.407783,-49.339019
6,26583,-25.439577,-49.269716
7,30903,-25.467922,-49.230765
8,33717,-25.424029,-49.279715
9,7554,-25.504891,-49.356074


In [11]:
# Preview the destination-prefixed stop table.
print_df(d_busStops)

Unnamed: 0,d_busStopId,d_shapeLat,d_shapeLon
0,31261,-25.545885,-49.263491
1,32396,-25.468642,-49.299404
2,31951,-25.494953,-49.20858
3,29993,-25.541054,-49.294021
4,33868,-25.517789,-49.324665
5,33722,-25.407783,-49.339019
6,26583,-25.439577,-49.269716
7,30903,-25.467922,-49.230765
8,33717,-25.424029,-49.279715
9,7554,-25.504891,-49.356074


In [12]:
# Every (origin stop, destination stop) combination, including a stop paired with itself.
stops_dist_table = o_busStops.crossJoin(d_busStops)

In [13]:
# Preview the stop pairs before the distance column is added.
print_df(stops_dist_table)

Unnamed: 0,o_busStopId,o_shapeLat,o_shapeLon,d_busStopId,d_shapeLat,d_shapeLon
0,31261,-25.545885,-49.263491,31261,-25.545885,-49.263491
1,31261,-25.545885,-49.263491,32396,-25.468642,-49.299404
2,31261,-25.545885,-49.263491,31951,-25.494953,-49.20858
3,31261,-25.545885,-49.263491,29993,-25.541054,-49.294021
4,31261,-25.545885,-49.263491,33868,-25.517789,-49.324665
5,31261,-25.545885,-49.263491,33722,-25.407783,-49.339019
6,31261,-25.545885,-49.263491,26583,-25.439577,-49.269716
7,31261,-25.545885,-49.263491,30903,-25.467922,-49.230765
8,31261,-25.545885,-49.263491,33717,-25.424029,-49.279715
9,31261,-25.545885,-49.263491,7554,-25.504891,-49.356074


In [14]:
# Sanity check: the cross join should hold (number of stops)^2 rows,
# i.e. 5326 * 5326 = 28366276 for the stop count printed earlier.
stops_dist_table.count()

28366276

In [15]:
def dist(long_x, lat_x, long_y, lat_y):
    """Great-circle distance between two points (spherical law of cosines).

    Args:
        long_x, lat_x: longitude / latitude of the first point, in degrees,
            given as column names or Column expressions.
        long_y, lat_y: longitude / latitude of the second point, same form.

    Returns:
        Column with the central angle multiplied by 6371.0 (Earth's mean
        radius in km, so the result is presumably in kilometres). Null or
        NaN coordinates still yield null / NaN.
    """
    lat_x_rad = F.radians(lat_x)
    lat_y_rad = F.radians(lat_y)
    long_delta = F.radians(long_x) - F.radians(long_y)

    cos_angle = (
        F.sin(lat_x_rad) * F.sin(lat_y_rad) +
        F.cos(lat_x_rad) * F.cos(lat_y_rad) * F.cos(long_delta)
    )

    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # for identical or nearly identical points, which makes acos return NaN.
    # Clamp it; the isnan branch comes first so genuine NaN input is not
    # silently turned into a distance (Spark orders NaN above every number),
    # and a null cosine falls through to otherwise() and stays null.
    cos_angle = (
        F.when(F.isnan(cos_angle), cos_angle)
        .when(cos_angle > 1.0, F.lit(1.0))
        .when(cos_angle < -1.0, F.lit(-1.0))
        .otherwise(cos_angle)
    )

    return F.acos(cos_angle) * F.lit(6371.0)

In [16]:
# Add the distance between the origin and destination stop of every pair.
stops_dist_table = stops_dist_table.withColumn("dist", dist('o_shapeLon', 'o_shapeLat','d_shapeLon','d_shapeLat'))

In [17]:
# Preview the stop pairs with their computed distance.
print_df(stops_dist_table)

Unnamed: 0,o_busStopId,o_shapeLat,o_shapeLon,d_busStopId,d_shapeLat,d_shapeLon,dist
0,31261,-25.545885,-49.263491,31261,-25.545885,-49.263491,0.0
1,31261,-25.545885,-49.263491,32396,-25.468642,-49.299404,9.314479
2,31261,-25.545885,-49.263491,31951,-25.494953,-49.20858,7.901535
3,31261,-25.545885,-49.263491,29993,-25.541054,-49.294021,3.109689
4,31261,-25.545885,-49.263491,33868,-25.517789,-49.324665,6.887185
5,31261,-25.545885,-49.263491,33722,-25.407783,-49.339019,17.125818
6,31261,-25.545885,-49.263491,26583,-25.439577,-49.269716,11.837379
7,31261,-25.545885,-49.263491,30903,-25.467922,-49.230765,9.270331
8,31261,-25.545885,-49.263491,33717,-25.424029,-49.279715,13.647174
9,31261,-25.545885,-49.263491,7554,-25.504891,-49.356074,10.347941


In [20]:
def get_stops_dist(stops_dist_df,stopA,stopB):
    """Look up the precomputed distance from stop ``stopA`` to stop ``stopB``.

    This runs a Spark job and returns a plain Python value on the driver.

    Args:
        stops_dist_df: DataFrame with ``o_busStopId``, ``d_busStopId`` and
            ``dist`` columns.
        stopA: origin stop id.
        stopB: destination stop id.

    Returns:
        The ``dist`` value of the first matching row, or ``None`` when the
        pair is not in the table.
    """
    match = (
        stops_dist_df
        .where((F.col('o_busStopId') == stopA) & (F.col('d_busStopId') == stopB))
        .select('dist')
        # first() fetches a single row and yields None on an empty result,
        # instead of collecting every match and indexing into the list.
        .first()
    )
    return None if match is None else match[0]

# NOTE(review): get_stops_dist takes a DataFrame as its first argument and runs
# a query on it, so this UDF wrapper is presumably unusable per row — confirm
# whether it can be removed in favour of the join used further down.
get_stops_dist_udf = F.udf(get_stops_dist)

In [21]:
# Spot check: distance from stop 31261 to stop 33717 (matches the dist shown for that pair in the table preview).
get_stops_dist(stops_dist_table,31261,33717)

13.647173507372901

### Finding the closest alighting stop to next trip boarding

In [51]:
def find_closest_alighting_stop(pre_od_df,stops_dist_df,route,bus,trip_num,o_stop_id,next_trip_o_stop_id):
    """Unfinished placeholder: the lookup logic is not implemented yet.

    As written the function never uses most of its arguments and returns
    ``None``.

    TODO: implement the lookup, or remove this stub.
    """
    # NOTE(review): where() is called without a condition and its result is
    # discarded — presumably a leftover from drafting; confirm and remove.
    stops_dist_df.where()
    
    return # Some lookup logic

# NOTE(review): wraps the stub above, which expects DataFrames as arguments;
# presumably not usable as a per-row UDF — confirm.
find_closest_alighting_stop_udf = F.udf(find_closest_alighting_stop)

In [None]:
# NOTE(review): unexecuted scratch cell — neither `df` nor `iplookup` is
# defined in the cells shown in this notebook.
df.withColumn("foo", iplookup("c0"))

In [25]:
def df_loop(df):
    """Print the ``d_busStopId`` value of the first 11 rows of ``df``.

    Args:
        df: Spark DataFrame with a ``d_busStopId`` column.
    """
    # take() brings only the rows that get printed to the driver; collecting
    # the whole DataFrame just to stop after 11 rows is wasteful on large tables.
    for row in df.take(11):
        # print(...) with one argument works on both Python 2 and Python 3.
        print(row['d_busStopId'])

In [26]:
# Print the first destination stop ids as a quick iteration test.
df_loop(d_busStops)

31261
32396
31951
29993
33868
33722
26583
30903
33717
7554
37146


In [68]:
def get_trip_next_stops(bus_trips_df,route,bus,trip_num,o_timestamp):
    """Select the stop passages of one trip that happen after a given instant.

    Args:
        bus_trips_df: DataFrame with ``route``, ``busCode``, ``tripNum`` and
            ``gps_timestamp_in_secs`` columns.
        route: route to match.
        bus: bus code to match.
        trip_num: trip number to match.
        o_timestamp: only rows with ``gps_timestamp_in_secs`` strictly
            greater than this value are kept.

    Returns:
        The filtered DataFrame.
    """
    same_vehicle = (F.col('route') == route) & (F.col('busCode') == bus)
    later_in_trip = (F.col('tripNum') == trip_num) & (F.col('gps_timestamp_in_secs') > o_timestamp)
    return bus_trips_df.filter(same_vehicle & later_in_trip)

In [82]:
# Stops of bus BL307, route 022, trip 1 reached after epoch second 1494324183
# (the 07:03:03 passage in the bus-trip preview above).
next_stops = get_trip_next_stops(bus_trip_data,'022','BL307',1,1494324183)
print_df(next_stops)

Unnamed: 0,route,busCode,tripNum,busStopId,gps_timestamp_in_secs,timestamp
0,22,BL307,1,25821,1494324731,07:12:11
1,22,BL307,1,26246,1494325361,07:22:41
2,22,BL307,1,26240,1494325564,07:26:04
3,22,BL307,1,25681,1494326340,07:39:00
4,22,BL307,1,26284,1494326896,07:48:16


In [None]:
8 	022 	BL307 	1 	25821 	1494324731 	07:12:11
9 	022 	BL307 	1 	26246 	1494325361 	07:22:41
10 	022 	BL307 	1 	26240 	1494325564 	07:26:04
11 	022 	BL307 	1 	25681 	1494326340 	07:39:00
12 	022 	BL307 	1 	26284 	1494326896 	07:48:16

In [86]:
# Tag every candidate stop with the same target stop id (33717), hard-coded for this experiment.
next_stops = next_stops.withColumn('d_stopId',F.lit(33717))

In [107]:
# Check that the d_stopId column was added.
print_df(next_stops)

Unnamed: 0,route,busCode,tripNum,busStopId,gps_timestamp_in_secs,timestamp,d_stopId
0,22,BL307,1,25821,1494324731,07:12:11,33717
1,22,BL307,1,26246,1494325361,07:22:41,33717
2,22,BL307,1,26240,1494325564,07:26:04,33717
3,22,BL307,1,25681,1494326340,07:39:00,33717
4,22,BL307,1,26284,1494326896,07:48:16,33717


In [106]:
# NOTE(review): this cell fails as recorded below — the column was created as
# 'd_stopId' (capital I), so `next_stops.d_stopid` raises AttributeError.
# Fixing the case alone would presumably not be enough: get_stops_dist()
# collects a single value on the driver, so it cannot act as a per-row column
# expression. The join in the next cell derives the distances instead.
next_stops = next_stops.withColumn('dist',get_stops_dist(stops_dist_table,next_stops.busStopId,next_stops.d_stopid))

AttributeError: 'DataFrame' object has no attribute 'd_stopid'

In [100]:
# Attach the precomputed stop-to-stop distance to every remaining stop of the
# trip and order the candidates from closest to farthest from the target stop.
# The result is kept as a DataFrame (no trailing .first()): the following cells
# call print_df() and .first() on it, and both need a DataFrame, not a Row.
next_stops_dist = next_stops.join(stops_dist_table, (next_stops.busStopId == stops_dist_table.o_busStopId) & (next_stops.d_stopId == stops_dist_table.d_busStopId), 'inner') \
                    .select(['route','busCode','tripNum','o_busStopId','o_shapeLat','o_shapeLon','d_busStopId','d_shapeLat','d_shapeLon','timestamp','dist']) \
                    .orderBy('dist')

In [101]:
# Candidate alighting stops, closest first (print_df needs next_stops_dist to be a DataFrame).
print_df(next_stops_dist)

Unnamed: 0,route,busCode,tripNum,o_busStopId,o_shapeLat,o_shapeLon,d_busStopId,d_shapeLat,d_shapeLon,timestamp,dist
0,22,BL307,1,26284,-25.436276,-49.307493,33717,-25.424029,-49.279715,07:48:16,3.104156
1,22,BL307,1,25681,-25.45856,-49.302303,33717,-25.424029,-49.279715,07:39:00,4.45951
2,22,BL307,1,26240,-25.475982,-49.292311,33717,-25.424029,-49.279715,07:26:04,5.913663
3,22,BL307,1,26246,-25.491782,-49.293027,33717,-25.424029,-49.279715,07:22:41,7.65141
4,22,BL307,1,25821,-25.496312,-49.270431,33717,-25.424029,-49.279715,07:12:11,8.091291


In [105]:
# The closest candidate comes back as a single Row object.
type(next_stops_dist.first())

pyspark.sql.types.Row