In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np
import time

In [2]:
def rename_columns(df, list_of_tuples):
    """Rename several columns of a DataFrame in one call.

    Args:
        df: object exposing `withColumnRenamed(old, new)` that returns a new
            frame (a Spark DataFrame in this notebook).
        list_of_tuples: iterable of `(old_name, new_name)` pairs, applied in
            the order given.

    Returns:
        The frame obtained after applying every rename; `df` itself when the
        iterable is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_data(sqlContext, filepath):
    """Load a CSV file that has a header row into a DataFrame.

    Column types are inferred and cells equal to "-" are read as null.

    Args:
        sqlContext: object whose `read` attribute is a DataFrame reader.
        filepath: location of the CSV data to load.

    Returns:
        Whatever the reader's `load(filepath)` returns (a DataFrame).
    """
    reader = sqlContext.read.format("com.databricks.spark.csv")
    reader = reader.option("header", "true")
    reader = reader.option("inferSchema", "true")
    reader = reader.option("nullValue", "-")
    return reader.load(filepath)

def read_buste_data_v3(filepath, sqlContext):
    """Read a header-less bus trip CSV file in the v3 layout and name its columns.

    Cells equal to "-" are read as null. A file with fewer columns than the
    v3 layout is padded with null columns, so every name listed in
    `column_names` is present in the result. (The padding used to stop at 16
    columns, which left the last four v3 columns missing for short files.)

    The `date` column is derived from the second-to-last component of
    `filepath`: its first three "_"-separated tokens are joined with "-"
    (".../2017_05_11_veiculos.csv/part-00000" gives "2017-05-11"), one day is
    subtracted, and the result is stored as the value of `F.unix_timestamp`.

    Args:
        filepath: path of the CSV part file; must contain at least two
            "/"-separated components.
        sqlContext: object exposing `read.csv(...)`.

    Returns:
        DataFrame with the named v3 columns plus `date`.
    """
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "userBirthdate",
        "cardTimestamp",
        "lineName",
        "cardNum",
        "userGender",
    ]

    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True, nullValue="-")

    # Pad up to the full v3 width so that none of the renames below is a
    # silent no-op on a missing column.
    for position in range(len(data_frame.columns), len(column_names)):
        data_frame = data_frame.withColumn("_c" + str(position), F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [("_c" + str(position), name) for position, name in enumerate(column_names)]
    )

    # The folder name starts with yyyy_MM_dd; turn it into yyyy-MM-dd.
    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))
    # The data belongs to the day before the one in the folder name.
    data_frame = data_frame.withColumn("date", F.unix_timestamp(F.date_sub(F.col("date"),1),'yyyy-MM-dd'))

    return data_frame

def read_buste_data_old(filepath, sqlContext):
    """Read a header-less bus trip CSV file in the old 16-column layout.

    Cells equal to "-" are read as null and files with fewer than 16 columns
    are padded with null columns before the columns are renamed.

    The `date` column is a "yyyy-MM-dd" string: the first three
    "_"-separated tokens of the second-to-last path component, joined with
    "-", minus one day.

    Args:
        filepath: path of the CSV part file; must contain at least two
            "/"-separated components.
        sqlContext: object exposing `read.csv(...)`.

    Returns:
        DataFrame with the 16 named columns plus `date`.
    """
    old_layout = [
        "route", "tripNum", "shapeId", "shapeSequence",
        "shapeLat", "shapeLon", "distanceTraveledShape", "busCode",
        "gpsPointId", "gpsLat", "gpsLon", "distanceToShapePoint",
        "timestamp", "busStopId", "problem", "numPassengers",
    ]

    trips = sqlContext.read.csv(filepath, header=False,
                                inferSchema=True, nullValue="-")

    # Append null columns until the frame is as wide as the layout (16).
    for missing_index in range(len(trips.columns), len(old_layout)):
        trips = trips.withColumn("_c" + str(missing_index), F.lit(None))

    renames = [("_c" + str(index), name) for index, name in enumerate(old_layout)]
    trips = rename_columns(trips, renames)

    # Folder name starts with yyyy_MM_dd; rebuild it as yyyy-MM-dd.
    folder_name = filepath.split("/")[-2]
    date = "-".join(folder_name.split("_")[:3])

    previous_day = F.date_sub(F.col("date"), 1)
    previous_day_text = F.from_unixtime(
        F.unix_timestamp(previous_day, 'yyyy-MM-dd'), 'yyyy-MM-dd')

    trips = trips.withColumn("date", F.lit(date))
    trips = trips.withColumn("date", previous_day_text)

    return trips

def print_df(df,l=10):
    """Return the first `l` rows of `df` converted with `toPandas()`.

    Args:
        df: object exposing `limit(n)` whose result exposes `toPandas()`.
        l: maximum number of rows to keep (default 10).
    """
    head_rows = df.limit(l)
    return head_rows.toPandas()

### Read Pre-OD Data

In [3]:
# Reuse the active SparkContext when there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# SQL entry point handed to the read_* helpers defined in this notebook.
sqlContext = pyspark.SQLContext(sc)

In [4]:
# NOTE(review): absolute local path; adjust it when running on another machine.
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'

# Load the pre-OD matrix, rewrite both date columns as 'yyyy-MM-dd' strings
# and sort each card's boardings chronologically.
pre_od_data = (
    read_data(sqlContext, exp_data_folder_path + 'pre_od_matrix')
    .withColumn(
        'o_date',
        F.from_unixtime(F.unix_timestamp(F.col('o_date'), 'yyyy-MM-dd'), 'yyyy-MM-dd'))
    .withColumn(
        'next_o_date',
        F.from_unixtime(F.unix_timestamp(F.col('next_o_date'), 'yyyy-MM-dd'), 'yyyy-MM-dd'))
    .orderBy(['cardNum', 'o_date', 'o_timestamp'])
)

In [5]:
# Preview the first 10 rows (print_df's default) of the pre-OD matrix.
print_df(pre_od_data)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_id,o_shape_seq,o_shape_lat,o_shape_lon,o_stop_id,...,next_o_timestamp,next_o_shape_id,next_o_shape_seq,next_o_shape_lat,next_o_shape_lon,next_o_stop_id,next_o_boarding_id,o_unixtimestamp,next_o_unixtimestamp,leg_duration
0,175,BC289,2017-05-10,10,17:02:45,1743,5444509,-25.451133,-49.25293,31749,...,17:46:02,2789,5510463,-25.432528,-49.272411,26355,1,72165,74762,43.283333
1,370,BC032,2017-05-10,9,17:46:02,2789,5510463,-25.432528,-49.272411,26355,...,17:02:45,1743,5444509,-25.451133,-49.25293,31749,0,74762,72165,-1.0
2,372,BC312,2017-05-10,2,06:55:21,1891,6136884,-25.427901,-49.263238,29914,...,17:09:27,1719,5305706,-25.452114,-49.223827,14404,3,35721,72567,614.1
3,50,LB602,2017-05-10,6,17:09:27,1719,5305706,-25.452114,-49.223827,14404,...,06:55:21,1891,6136884,-25.427901,-49.263238,29914,2,72567,35721,-1.0
4,542,GA124,2017-05-10,6,07:01:40,1988,6403964,-25.550431,-49.265282,31256,...,17:05:57,1785,5950930,-25.466362,-49.279723,29165,5,36100,72357,604.283333
5,216,CA600,2017-05-10,7,17:05:57,1785,5950930,-25.466362,-49.279723,29165,...,07:01:40,1988,6403964,-25.550431,-49.265282,31256,4,72357,36100,-1.0
6,533,EA166,2017-05-10,3,07:50:18,2872,6247890,-25.516804,-49.230914,30037,...,16:47:25,3260,6011921,-25.447553,-49.225671,32704,7,39018,71245,537.116667
7,20,KB605,2017-05-10,6,16:47:25,3260,6011921,-25.447553,-49.225671,32704,...,07:50:18,2872,6247890,-25.516804,-49.230914,30037,6,71245,39018,-1.0
8,779,JC865,2017-05-10,1,05:55:17,2904,5602222,-25.459817,-49.319788,33567,...,15:02:51,2194,4299859,-25.43515,-49.273297,26149,9,32117,64971,547.566667
9,777,JC008,2017-05-10,8,15:02:51,2194,4299859,-25.43515,-49.273297,26149,...,05:55:17,2904,5602222,-25.459817,-49.319788,33567,8,64971,32117,-1.0


In [6]:
#pre_od_data.columns

### Reading Bus Trip Data

In [43]:
# NOTE(review): absolute local path; adjust it when running on another machine.
old_exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/back-bulma-output'
# read_buste_data_old derives `date` from the folder name (2017_05_11) minus one day.
bus_trip_data = read_buste_data_old(old_exp_data_folder_path + '/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [44]:
# Build a full "date time" string for each GPS record and its value in
# seconds as returned by F.unix_timestamp.
bus_trip_data = (
    bus_trip_data
    .withColumn(
        "gps_timestamp",
        F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
    .withColumn(
        "gps_timestamp_in_secs",
        F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))
)

In [45]:
# Clean the bus trip records:
#   1. drop rows missing any of the key fields,
#   2. keep a single row per (route, busCode, tripNum, busStopId),
#   3. sort by trip and GPS time, then attach an increasing id,
#   4. cast `route` to integer.
bus_trip_data = (
    bus_trip_data
    .na.drop(subset=["route", "busCode", "busStopId", "gps_timestamp_in_secs", "tripNum"])
    .dropDuplicates(['route', 'busCode', 'tripNum', 'busStopId'])
    .orderBy(['route', 'busCode', 'tripNum', 'gps_timestamp_in_secs'])
    .withColumn('id', F.monotonically_increasing_id())
    .withColumn('route', F.col('route').cast(T.IntegerType()))
)

In [10]:
#bus_trip_data = bus_trip_data.na.drop(subset=["route","busCode","busStopId","gps_timestamp_in_secs","tripNum"]) \
#                             .withColumn('route', F.col('route').cast(T.IntegerType()))

In [11]:
#print_df(bus_trip_data)

### Trying a different approach

In [13]:
# Inspect the column types of the pre-OD matrix before joining.
pre_od_data.printSchema()

root
 |-- o_route: integer (nullable = true)
 |-- o_bus_code: string (nullable = true)
 |-- o_date: string (nullable = true)
 |-- o_tripNum: integer (nullable = true)
 |-- o_timestamp: string (nullable = true)
 |-- o_shape_id: integer (nullable = true)
 |-- o_shape_seq: integer (nullable = true)
 |-- o_shape_lat: double (nullable = true)
 |-- o_shape_lon: double (nullable = true)
 |-- o_stop_id: integer (nullable = true)
 |-- o_boarding_id: long (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- next_o_route: integer (nullable = true)
 |-- next_o_bus_code: string (nullable = true)
 |-- next_o_date: string (nullable = true)
 |-- next_o_tripNum: integer (nullable = true)
 |-- next_o_timestamp: string (nullable = true)
 |-- next_o_shape_id: integer (nullable = true)
 |-- next_o_shape_seq: integer (nullable = true)
 |-- next_o_shape_lat: double (nullable = true)
 |-- next_o_shape_lon: double (nullable = true)
 |-- next_o_stop_id: integer (nullable = true)
 |-- next_o_boarding_i

In [14]:
# Inspect the column types of the bus trip data before joining.
bus_trip_data.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- gps_timestamp: string (nullable = true)
 |-- gps_timestamp_in_secs: long (nullable = true)



In [46]:
# Work on the first `sample_size` rows of pre_od_data only.
sample_size = 10
pre_od_data_sample = pre_od_data.limit(sample_size)

In [55]:
# A bus trip record matches a boarding when route, vehicle, date and trip
# number all agree.
cond = [
    bus_trip_data['route'] == pre_od_data_sample['o_route'],
    bus_trip_data['busCode'] == pre_od_data_sample['o_bus_code'],
    bus_trip_data['date'] == pre_od_data_sample['o_date'],
    bus_trip_data['tripNum'] == pre_od_data_sample['o_tripNum'],
]
# Left outer join: every bus trip record is kept, with null boarding
# columns where no sampled boarding matches.
od_trips = bus_trip_data.join(pre_od_data_sample, on=cond, how='left_outer')

In [56]:
# Preview the joined rows sorted by route, vehicle, trip and GPS time.
print_df(od_trips.orderBy(['route','busCode','tripNum','gps_timestamp_in_secs']))

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,next_o_timestamp,next_o_shape_id,next_o_shape_seq,next_o_shape_lat,next_o_shape_lon,next_o_stop_id,next_o_boarding_id,o_unixtimestamp,next_o_unixtimestamp,leg_duration
0,,1,2934,6446501,-25.452183,-49.22404,0.0,DC852,,-25.45222,...,,,,,,,,,,
1,,1,2934,6447496,-25.450932,-49.21997,432.312,DC852,,-25.450976,...,,,,,,,,,,
2,,1,2934,6447510,-25.44871,-49.22002,739.771,DC852,,-25.448616,...,,,,,,,,,,
3,,1,2934,6447525,-25.446724,-49.221252,1001.839,DC852,,-25.446655,...,,,,,,,,,,
4,,1,2934,6447538,-25.444639,-49.223163,1393.901,DC852,,-25.444756,...,,,,,,,,,,
5,,1,2934,6447574,-25.4402,-49.221386,2247.402,DC852,,-25.440206,...,,,,,,,,,,
6,1.0,1,2938,6451385,-25.428133,-49.264531,338.081,BN997,,-25.428198,...,,,,,,,,,,
7,1.0,1,2938,6451394,-25.430622,-49.263426,635.815,BN997,,-25.43083,...,,,,,,,,,,
8,1.0,1,2938,6451403,-25.433455,-49.262219,972.378,BN997,,,...,,,,,,,,,,
9,1.0,1,2938,6451415,-25.435452,-49.265462,1428.729,BN997,,-25.435406,...,,,,,,,,,,


In [57]:
# Number of rows produced by the join. The call form of print behaves the
# same on Python 2 and Python 3 (the bare statement is a SyntaxError on 3).
print(od_trips.count())

533757


In [66]:
# Stops of one specific trip (route 20, trip 6, vehicle KB605) together
# with the location of the card holder's next boarding.
trips_with_boardings = (
    od_trips
    .filter((od_trips.route == 20) & (od_trips.tripNum == 6) & (od_trips.busCode == 'KB605'))
    .select([
        'cardNum', 'route', 'tripNum', 'busStopId',
        'shapeLat', 'shapeLon', 'timestamp',
        'next_o_stop_id', 'next_o_shape_lat', 'next_o_shape_lon',
    ])
)
print_df(trips_with_boardings, l=100)

Unnamed: 0,cardNum,route,tripNum,busStopId,shapeLat,shapeLon,timestamp,next_o_stop_id,next_o_shape_lat,next_o_shape_lon
0,321530,20,6,32703,-25.444614,-49.223128,16:43:13,30037,-25.516804,-49.230914
1,321530,20,6,32704,-25.447553,-49.225671,16:44:40,30037,-25.516804,-49.230914
2,321530,20,6,30943,-25.452297,-49.230028,16:47:54,30037,-25.516804,-49.230914
3,321530,20,6,30946,-25.455187,-49.231111,16:48:22,30037,-25.516804,-49.230914
4,321530,20,6,30949,-25.457847,-49.232321,16:49:24,30037,-25.516804,-49.230914
5,321530,20,6,32707,-25.461880,-49.234023,16:52:41,30037,-25.516804,-49.230914
6,321530,20,6,32708,-25.465981,-49.234664,16:53:13,30037,-25.516804,-49.230914
7,321530,20,6,28120,-25.475859,-49.239960,16:56:30,30037,-25.516804,-49.230914
8,321530,20,6,28121,-25.478030,-49.243555,16:58:14,30037,-25.516804,-49.230914
9,321530,20,6,28122,-25.479222,-49.246492,16:58:41,30037,-25.516804,-49.230914


In [74]:
def dist(lat_x, long_x, lat_y, long_y):
    """Great-circle distance between two points (spherical law of cosines).

    Args:
        lat_x, long_x: Column expressions with the first point's latitude
            and longitude in degrees.
        lat_y, long_y: Column expressions with the second point's latitude
            and longitude in degrees.

    Returns:
        Column holding the central angle multiplied by 6371.0 (the Earth's
        mean radius in kilometres, so the result is in km). Null inputs
        yield null.
    """
    # `toRadians` is deprecated in favour of `radians`; use the newer name
    # when this Spark version provides it.
    to_radians = F.radians if hasattr(F, "radians") else F.toRadians

    lat_x_rad = to_radians(lat_x)
    lat_y_rad = to_radians(lat_y)
    delta_long = to_radians(long_x) - to_radians(long_y)

    cos_angle = (
        F.sin(lat_x_rad) * F.sin(lat_y_rad) +
        F.cos(lat_x_rad) * F.cos(lat_y_rad) * F.cos(delta_long)
    )

    # Rounding can push the cosine marginally outside [-1, 1] for identical
    # or antipodal points, which would make acos return NaN. when/otherwise
    # is used (rather than least/greatest) so that null stays null.
    cos_angle = (
        F.when(cos_angle > 1.0, F.lit(1.0))
        .when(cos_angle < -1.0, F.lit(-1.0))
        .otherwise(cos_angle)
    )

    return F.acos(cos_angle) * F.lit(6371.0)

In [75]:
# Add `dist`: the dist() value between each stop (shapeLat, shapeLon) and the
# next boarding location (next_o_shape_lat, next_o_shape_lon).
trips_with_boardings = trips_with_boardings.withColumn('dist',dist(F.col('shapeLat'),F.col('shapeLon'),F.col('next_o_shape_lat'),F.col('next_o_shape_lon')))

In [76]:
# Show up to 100 rows, now including the `dist` column.
print_df(trips_with_boardings,l=100)

Unnamed: 0,cardNum,route,tripNum,busStopId,shapeLat,shapeLon,timestamp,next_o_stop_id,next_o_shape_lat,next_o_shape_lon,dist
0,321530,20,6,32703,-25.444614,-49.223128,16:43:13,30037,-25.516804,-49.230914,8.065129
1,321530,20,6,32704,-25.447553,-49.225671,16:44:40,30037,-25.516804,-49.230914,7.718385
2,321530,20,6,30943,-25.452297,-49.230028,16:47:54,30037,-25.516804,-49.230914,7.173394
3,321530,20,6,30946,-25.455187,-49.231111,16:48:22,30037,-25.516804,-49.230914,6.851498
4,321530,20,6,30949,-25.457847,-49.232321,16:49:24,30037,-25.516804,-49.230914,6.557319
5,321530,20,6,32707,-25.461880,-49.234023,16:52:41,30037,-25.516804,-49.230914,6.115269
6,321530,20,6,32708,-25.465981,-49.234664,16:53:13,30037,-25.516804,-49.230914,5.663797
7,321530,20,6,28120,-25.475859,-49.239960,16:56:30,30037,-25.516804,-49.230914,4.642533
8,321530,20,6,28121,-25.478030,-49.243555,16:58:14,30037,-25.516804,-49.230914,4.494297
9,321530,20,6,28122,-25.479222,-49.246492,16:58:41,30037,-25.516804,-49.230914,4.461865


In [None]:
# For each (cardNum, route, tripNum) keep only the stop with the smallest
# `dist` to the next boarding location.
# NOTE(review): the original cell was an unfinished snippet (syntax error,
# undefined `cnts`/`col`/`rowNumber`); the partition keys and the ordering
# by `dist` are an assumption about its intent - confirm before relying on it.
w = Window.partitionBy('cardNum', 'route', 'tripNum').orderBy(F.col('dist').asc())

(trips_with_boardings
  .withColumn("rn", F.row_number().over(w))
  .where(F.col("rn") == 1)
  .drop("rn"))

In [20]:
# Look up the bus trip records for the first sampled boarding
# (route 175, vehicle BC289, trip 10, stop 31749).
print_df(
    bus_trip_data.filter(
        (bus_trip_data.route == 175) &
        (bus_trip_data.busCode == 'BC289') &
        (bus_trip_data.tripNum == 10) &
        (bus_trip_data.busStopId == 31749)
    )
)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,date,gps_timestamp,gps_timestamp_in_secs
0,175,10,1743,5444509,-25.451133,-49.25293,8002.63,BC289,,-25.451213,-49.252883,10.023218,17:01:16,31749,NO_PROBLEM,15,2017-05-10,2017-05-10 17:01:16,1494446476
1,175,10,2743,5404834,-25.451146,-49.252929,0.0,BC289,,-25.450663,-49.252911,53.752724,17:05:54,31749,NO_PROBLEM,0,2017-05-10,2017-05-10 17:05:54,1494446754


In [21]:
# pre_od_data_sample was already built with .limit(sample_size), so this
# second limit does not reduce it further.
print_df(pre_od_data_sample.limit(sample_size))

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_id,o_shape_seq,o_shape_lat,o_shape_lon,o_stop_id,...,next_o_timestamp,next_o_shape_id,next_o_shape_seq,next_o_shape_lat,next_o_shape_lon,next_o_stop_id,next_o_boarding_id,o_unixtimestamp,next_o_unixtimestamp,leg_duration
0,175,BC289,2017-05-10,10,17:02:45,1743,5444509,-25.451133,-49.25293,31749,...,17:46:02,2789,5510463,-25.432528,-49.272411,26355,1,72165,74762,43.283333
1,370,BC032,2017-05-10,9,17:46:02,2789,5510463,-25.432528,-49.272411,26355,...,17:02:45,1743,5444509,-25.451133,-49.25293,31749,0,74762,72165,-1.0
2,372,BC312,2017-05-10,2,06:55:21,1891,6136884,-25.427901,-49.263238,29914,...,17:09:27,1719,5305706,-25.452114,-49.223827,14404,3,35721,72567,614.1
3,50,LB602,2017-05-10,6,17:09:27,1719,5305706,-25.452114,-49.223827,14404,...,06:55:21,1891,6136884,-25.427901,-49.263238,29914,2,72567,35721,-1.0
4,542,GA124,2017-05-10,6,07:01:40,1988,6403964,-25.550431,-49.265282,31256,...,17:05:57,1785,5950930,-25.466362,-49.279723,29165,5,36100,72357,604.283333
5,216,CA600,2017-05-10,7,17:05:57,1785,5950930,-25.466362,-49.279723,29165,...,07:01:40,1988,6403964,-25.550431,-49.265282,31256,4,72357,36100,-1.0
6,533,EA166,2017-05-10,3,07:50:18,2872,6247890,-25.516804,-49.230914,30037,...,16:47:25,3260,6011921,-25.447553,-49.225671,32704,7,39018,71245,537.116667
7,20,KB605,2017-05-10,6,16:47:25,3260,6011921,-25.447553,-49.225671,32704,...,07:50:18,2872,6247890,-25.516804,-49.230914,30037,6,71245,39018,-1.0
8,779,JC865,2017-05-10,1,05:55:17,2904,5602222,-25.459817,-49.319788,33567,...,15:02:51,2194,4299859,-25.43515,-49.273297,26149,9,32117,64971,547.566667
9,777,JC008,2017-05-10,8,15:02:51,2194,4299859,-25.43515,-49.273297,26149,...,05:55:17,2904,5602222,-25.459817,-49.319788,33567,8,64971,32117,-1.0
