In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [2]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_data(sqlContext, filepath):
    df = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferSchema", "true") \
        .option("nullValue", "-")\
        .load(filepath)

    return df

def print_df(df,l=10):
    return df.limit(l).toPandas()

### Read Boarding Data

In [3]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [4]:
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
boarding_data = read_data(sqlContext,exp_data_folder_path + 'clean_boardings')

In [5]:
boarding_data.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- userGender: string (nullable = true)
 |-- date: timestamp (nullable = true)



In [6]:
print_df(boarding_data)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,386,11,2802,5577654,-25.432806,-49.270301,9827.101,BN611,,-25.432878,...,8.001811,18:25:34,26342,NO_PROBLEM,20/12/78,18:27:58,CAJURU,413674,F,2017-05-10
1,633,4,2065,6487165,-25.526466,-49.270083,550.27,GA212,,-25.526411,...,17.169727,07:23:35,36066,NO_PROBLEM,01/07/85,07:24:25,PARIGOT DE SOUZA,550073,F,2017-05-10
2,901,8,2257,6191372,-25.428454,-49.272983,8195.073,MC076,,-25.428393,...,8.328148,13:40:38,29598,NO_PROBLEM,14/08/80,13:42:41,STA. FELICIDADE,597648,F,2017-05-10
3,547,7,1994,6165297,-25.540439,-49.267179,4399.381,GA138,,-25.540383,...,6.269928,07:31:54,31269,NO_PROBLEM,11/04/86,07:32:28,OP. CONTIGENCIA,623748,F,2017-05-10
4,216,7,1785,5950904,-25.46056,-49.273292,10413.488,CA600,,-25.4606,...,5.880426,17:02:09,29145,NO_PROBLEM,21/10/86,17:02:25,CABRAL / PORTÃO,654450,F,2017-05-10
5,630,7,2060,6483927,-25.536582,-49.318214,4623.365,HA260,,-25.536581,...,1.36423,17:17:10,35159,NO_PROBLEM,02/10/76,17:18:01,VITÓRIA RÉGIA,942814,M,2017-05-10
6,535,1,1979,6393361,-25.553457,-49.251038,1220.197,EA601,,-25.553476,...,4.830964,06:34:54,36874,NO_PROBLEM,05/07/82,06:35:25,OSTERNACK/BOQ.,975700,F,2017-05-10
7,614,7,2031,5830806,-25.477055,-49.296942,6993.931,GR405,,-25.477413,...,39.8981,19:01:15,31800,NO_PROBLEM,29/06/63,19:02:54,FAZENDINHA/PUC,1034205,F,2017-05-10
8,650,5,4166,6543269,-25.565282,-49.333792,1742.133,HA031,,-25.565225,...,6.928054,10:07:30,36299,NO_PROBLEM,05/08/85,10:07:37,STA.RITA/PINHEIRINHO,1049145,F,2017-05-10
9,703,4,2165,5889283,-25.448696,-49.281666,9226.879,JC309,,-25.448716,...,9.136992,12:39:17,33480,NO_PROBLEM,27/08/85,12:39:57,CAIUÁ,1051414,F,2017-05-10


In [7]:
boarding_data = boarding_data.orderBy(['cardNum','cardTimestamp'])
print_df(boarding_data.select(['cardNum','cardTimestamp','route','busCode']), l=20)

Unnamed: 0,cardNum,cardTimestamp,route,busCode
0,304127,17:02:45,175,BC289
1,304127,17:46:02,370,BC032
2,313992,06:55:21,372,BC312
3,313992,17:09:27,50,LB602
4,317896,07:01:40,542,GA124
5,317896,17:05:57,216,CA600
6,321530,07:50:18,533,EA166
7,321530,16:47:25,20,KB605
8,321916,05:55:17,779,JC865
9,321916,15:02:51,777,JC008


### Analyzing Boarding data

In [8]:
boarding_data = boarding_data.withColumn('boarding_id',F.monotonically_increasing_id())
print_df(boarding_data)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date,boarding_id
0,175,10,1743,5444509,-25.451133,-49.25293,8002.63,BC289,,-25.451213,...,17:01:16,31749,NO_PROBLEM,09/06/54,17:02:45,BOM RETIRO / PUC,304127,M,2017-05-10,0
1,370,9,2789,5510463,-25.432528,-49.272411,3061.094,BC032,,-25.43251,...,17:45:02,26355,NO_PROBLEM,09/06/54,17:46:02,RUA XV / BARIGUI,304127,M,2017-05-10,1
2,372,2,1891,6136884,-25.427901,-49.263238,8989.204,BC312,,-25.427911,...,06:54:19,29914,NO_PROBLEM,29/07/54,06:55:21,TARUMÃ,313992,M,2017-05-10,2
3,50,6,1719,5305706,-25.452114,-49.223827,14883.248,LB602,,-25.452128,...,17:08:39,14404,NO_PROBLEM,29/07/54,17:09:27,INTERBAIRROS V,313992,M,2017-05-10,3
4,542,6,1988,6403964,-25.550431,-49.265282,3509.356,GA124,,-25.550511,...,07:01:18,31256,NO_PROBLEM,16/09/60,07:01:40,BAIRRO NOVO B,317896,F,2017-05-10,4
5,216,7,1785,5950930,-25.466362,-49.279723,11325.265,CA600,,-25.466531,...,17:05:48,29165,NO_PROBLEM,16/09/60,17:05:57,CABRAL / PORTÃO,317896,F,2017-05-10,5
6,533,3,2872,6247890,-25.516804,-49.230914,2645.339,EA166,,-25.516953,...,07:30:20,30037,NO_PROBLEM,12/06/81,07:50:18,OP. CONTIGENCIA,321530,F,2017-05-10,6
7,20,6,3260,6011921,-25.447553,-49.225671,1047.327,KB605,,-25.447411,...,16:44:40,32704,NO_PROBLEM,12/06/81,16:47:25,OP. CONTIGENCIA,321530,F,2017-05-10,7
8,779,1,2904,5602222,-25.459817,-49.319788,2787.974,JC865,,-25.459835,...,05:54:43,33567,NO_PROBLEM,17/11/52,05:55:17,V.VELHA / BURITI,321916,F,2017-05-10,8
9,777,8,2194,4299859,-25.43515,-49.273297,8945.611,JC008,,-25.435151,...,15:01:33,26149,NO_PROBLEM,17/11/52,15:02:51,V. VELHA,321916,F,2017-05-10,9


In [9]:
user_boarding_w = Window.partitionBy(boarding_data.cardNum, boarding_data.date).orderBy(boarding_data.cardTimestamp)
od_matrix_ids = boarding_data.select(F.col('cardNum'),
                     F.col('boarding_id'), 
                     F.lead('boarding_id',default=-1).over(user_boarding_w).alias('alighting_id'),
                     F.first('boarding_id',True).over(user_boarding_w).alias('first_boarding'))

print_df(od_matrix_ids)

Unnamed: 0,cardNum,boarding_id,alighting_id,first_boarding
0,634049,8589934649,8589934650,8589934649
1,634049,8589934650,-1,8589934649
2,740288,17179869242,17179869243,17179869242
3,740288,17179869243,-1,17179869242
4,824709,25769803915,25769803916,25769803915
5,824709,25769803916,-1,25769803915
6,825966,25769803919,25769803920,25769803919
7,825966,25769803920,-1,25769803919
8,938570,42949672981,42949672982,42949672981
9,938570,42949672982,-1,42949672981


In [10]:
od_matrix_ids = od_matrix_ids.withColumn('alighting_id', 
                     F.when(F.col('alighting_id') == -1,F.col('first_boarding')). \
                                         otherwise(F.col('alighting_id'))).drop('first_boarding')

print_df(od_matrix_ids)

Unnamed: 0,cardNum,boarding_id,alighting_id
0,634049,8589934693,8589934694
1,634049,8589934694,8589934693
2,740288,17179869256,17179869257
3,740288,17179869257,17179869256
4,824709,25769803888,25769803889
5,824709,25769803889,25769803888
6,825966,25769803892,25769803893
7,825966,25769803893,25769803892
8,938570,42949673066,42949673067
9,938570,42949673067,42949673066


In [11]:
boarding_data.printSchema()

root
 |-- route: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- userGender: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- boarding_id: long (nullable = false)



In [14]:
origin_matrix = boarding_data.select(F.col("route").alias("o_route"),
                                    F.col("busCode").alias("o_bus_code"),
                                    F.col("date").alias("o_date"),
                                    F.col("tripNum").alias("o_tripNum"),
                                    F.col("cardTimestamp").alias("o_timestamp"),
                                    F.col("shapeId").alias("o_shape_id"),
                                    F.col("shapeSequence").alias("o_shape_seq"),
                                    F.col("shapeLat").alias("o_shape_lat"),
                                    F.col("shapeLon").alias("o_shape_lon"),
                                    F.col("busStopId").alias("o_stop_id"),
                                    F.col("boarding_id").alias("o_boarding_id"))

In [15]:
dest_matrix = boarding_data.select(F.col("route").alias("d_route"),
                                    F.col("busCode").alias("d_bus_code"),
                                    F.col("date").alias("d_date"),
                                    F.col("tripNum").alias("d_tripNum"),
                                    F.col("cardTimestamp").alias("d_timestamp"),
                                    F.col("shapeId").alias("d_shape_id"),
                                    F.col("shapeSequence").alias("d_shape_seq"),
                                    F.col("shapeLat").alias("d_shape_lat"),
                                    F.col("shapeLon").alias("d_shape_lon"),
                                    F.col("busStopId").alias("d_stop_id"),
                                    F.col("boarding_id").alias("d_boarding_id"))

In [16]:
print_df(origin_matrix)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_id,o_shape_seq,o_shape_lat,o_shape_lon,o_stop_id,o_boarding_id
0,175,BC289,2017-05-10,10,17:02:45,1743,5444509,-25.451133,-49.25293,31749,0
1,370,BC032,2017-05-10,9,17:46:02,2789,5510463,-25.432528,-49.272411,26355,1
2,372,BC312,2017-05-10,2,06:55:21,1891,6136884,-25.427901,-49.263238,29914,2
3,50,LB602,2017-05-10,6,17:09:27,1719,5305706,-25.452114,-49.223827,14404,3
4,542,GA124,2017-05-10,6,07:01:40,1988,6403964,-25.550431,-49.265282,31256,4
5,216,CA600,2017-05-10,7,17:05:57,1785,5950930,-25.466362,-49.279723,29165,5
6,533,EA166,2017-05-10,3,07:50:18,2872,6247890,-25.516804,-49.230914,30037,6
7,20,KB605,2017-05-10,6,16:47:25,3260,6011921,-25.447553,-49.225671,32704,7
8,779,JC865,2017-05-10,1,05:55:17,2904,5602222,-25.459817,-49.319788,33567,8
9,777,JC008,2017-05-10,8,15:02:51,2194,4299859,-25.43515,-49.273297,26149,9


In [17]:
print_df(dest_matrix)

Unnamed: 0,d_route,d_bus_code,d_date,d_tripNum,d_timestamp,d_shape_id,d_shape_seq,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id
0,175,BC289,2017-05-10,10,17:02:45,1743,5444509,-25.451133,-49.25293,31749,0
1,370,BC032,2017-05-10,9,17:46:02,2789,5510463,-25.432528,-49.272411,26355,1
2,372,BC312,2017-05-10,2,06:55:21,1891,6136884,-25.427901,-49.263238,29914,2
3,50,LB602,2017-05-10,6,17:09:27,1719,5305706,-25.452114,-49.223827,14404,3
4,542,GA124,2017-05-10,6,07:01:40,1988,6403964,-25.550431,-49.265282,31256,4
5,216,CA600,2017-05-10,7,17:05:57,1785,5950930,-25.466362,-49.279723,29165,5
6,533,EA166,2017-05-10,3,07:50:18,2872,6247890,-25.516804,-49.230914,30037,6
7,20,KB605,2017-05-10,6,16:47:25,3260,6011921,-25.447553,-49.225671,32704,7
8,779,JC865,2017-05-10,1,05:55:17,2904,5602222,-25.459817,-49.319788,33567,8
9,777,JC008,2017-05-10,8,15:02:51,2194,4299859,-25.43515,-49.273297,26149,9


In [18]:
origin_dest_matrix = origin_matrix.join(od_matrix_ids, 
                                        origin_matrix.o_boarding_id == od_matrix_ids.boarding_id, 'inner') \
                                    .join(dest_matrix, od_matrix_ids.alighting_id == dest_matrix.d_boarding_id, 'inner')

In [19]:
print_df(origin_dest_matrix)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_id,o_shape_seq,o_shape_lat,o_shape_lon,o_stop_id,...,d_bus_code,d_date,d_tripNum,d_timestamp,d_shape_id,d_shape_seq,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id
0,175,BC289,2017-05-10,10,17:02:45,1743,5444509,-25.451133,-49.25293,31749,...,BC032,2017-05-10,9,17:46:02,2789,5510463,-25.432528,-49.272411,26355,1
1,370,BC032,2017-05-10,9,17:46:02,2789,5510463,-25.432528,-49.272411,26355,...,BC289,2017-05-10,10,17:02:45,1743,5444509,-25.451133,-49.25293,31749,0
2,372,BC312,2017-05-10,2,06:55:21,1891,6136884,-25.427901,-49.263238,29914,...,LB602,2017-05-10,6,17:09:27,1719,5305706,-25.452114,-49.223827,14404,3
3,50,LB602,2017-05-10,6,17:09:27,1719,5305706,-25.452114,-49.223827,14404,...,BC312,2017-05-10,2,06:55:21,1891,6136884,-25.427901,-49.263238,29914,2
4,542,GA124,2017-05-10,6,07:01:40,1988,6403964,-25.550431,-49.265282,31256,...,CA600,2017-05-10,7,17:05:57,1785,5950930,-25.466362,-49.279723,29165,5
5,216,CA600,2017-05-10,7,17:05:57,1785,5950930,-25.466362,-49.279723,29165,...,GA124,2017-05-10,6,07:01:40,1988,6403964,-25.550431,-49.265282,31256,4
6,533,EA166,2017-05-10,3,07:50:18,2872,6247890,-25.516804,-49.230914,30037,...,KB605,2017-05-10,6,16:47:25,3260,6011921,-25.447553,-49.225671,32704,7
7,20,KB605,2017-05-10,6,16:47:25,3260,6011921,-25.447553,-49.225671,32704,...,EA166,2017-05-10,3,07:50:18,2872,6247890,-25.516804,-49.230914,30037,6
8,779,JC865,2017-05-10,1,05:55:17,2904,5602222,-25.459817,-49.319788,33567,...,JC008,2017-05-10,8,15:02:51,2194,4299859,-25.43515,-49.273297,26149,9
9,777,JC008,2017-05-10,8,15:02:51,2194,4299859,-25.43515,-49.273297,26149,...,JC865,2017-05-10,1,05:55:17,2904,5602222,-25.459817,-49.319788,33567,8


In [20]:
print_df(origin_dest_matrix.select(['cardNum','o_route','o_bus_code','o_timestamp','o_stop_id','d_route','d_bus_code','d_timestamp','d_stop_id']).
        orderBy(['cardNum','o_timestamp',]), l=30)

Unnamed: 0,cardNum,o_route,o_bus_code,o_timestamp,o_stop_id,d_route,d_bus_code,d_timestamp,d_stop_id
0,304127,175,BC289,17:02:45,31749,370,BC032,17:46:02,26355
1,304127,370,BC032,17:46:02,26355,175,BC289,17:02:45,31749
2,313992,372,BC312,06:55:21,29914,50,LB602,17:09:27,14404
3,313992,50,LB602,17:09:27,14404,372,BC312,06:55:21,29914
4,317896,542,GA124,07:01:40,31256,216,CA600,17:05:57,29165
5,317896,216,CA600,17:05:57,29165,542,GA124,07:01:40,31256
6,321530,533,EA166,07:50:18,30037,20,KB605,16:47:25,32704
7,321530,20,KB605,16:47:25,32704,533,EA166,07:50:18,30037
8,321916,779,JC865,05:55:17,33567,777,JC008,15:02:51,26149
9,321916,777,JC008,15:02:51,26149,779,JC865,05:55:17,33567


In [21]:
origin_dest_matrix.count()

40480

In [22]:
origin_dest_matrix = origin_dest_matrix.withColumn('o_unixtimestamp',F.unix_timestamp(F.col('o_timestamp'), 'HH:mm:ss')) \
                                        .withColumn('d_unixtimestamp',F.unix_timestamp(F.col('d_timestamp'), 'HH:mm:ss'))
                                        

In [23]:
print_df(origin_dest_matrix)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_id,o_shape_seq,o_shape_lat,o_shape_lon,o_stop_id,...,d_tripNum,d_timestamp,d_shape_id,d_shape_seq,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id,o_unixtimestamp,d_unixtimestamp
0,175,BC289,2017-05-10,10,17:02:45,1743,5444509,-25.451133,-49.25293,31749,...,9,17:46:02,2789,5510463,-25.432528,-49.272411,26355,1,72165,74762
1,370,BC032,2017-05-10,9,17:46:02,2789,5510463,-25.432528,-49.272411,26355,...,10,17:02:45,1743,5444509,-25.451133,-49.25293,31749,0,74762,72165
2,372,BC312,2017-05-10,2,06:55:21,1891,6136884,-25.427901,-49.263238,29914,...,6,17:09:27,1719,5305706,-25.452114,-49.223827,14404,3,35721,72567
3,50,LB602,2017-05-10,6,17:09:27,1719,5305706,-25.452114,-49.223827,14404,...,2,06:55:21,1891,6136884,-25.427901,-49.263238,29914,2,72567,35721
4,542,GA124,2017-05-10,6,07:01:40,1988,6403964,-25.550431,-49.265282,31256,...,7,17:05:57,1785,5950930,-25.466362,-49.279723,29165,5,36100,72357
5,216,CA600,2017-05-10,7,17:05:57,1785,5950930,-25.466362,-49.279723,29165,...,6,07:01:40,1988,6403964,-25.550431,-49.265282,31256,4,72357,36100
6,533,EA166,2017-05-10,3,07:50:18,2872,6247890,-25.516804,-49.230914,30037,...,6,16:47:25,3260,6011921,-25.447553,-49.225671,32704,7,39018,71245
7,20,KB605,2017-05-10,6,16:47:25,3260,6011921,-25.447553,-49.225671,32704,...,3,07:50:18,2872,6247890,-25.516804,-49.230914,30037,6,71245,39018
8,779,JC865,2017-05-10,1,05:55:17,2904,5602222,-25.459817,-49.319788,33567,...,8,15:02:51,2194,4299859,-25.43515,-49.273297,26149,9,32117,64971
9,777,JC008,2017-05-10,8,15:02:51,2194,4299859,-25.43515,-49.273297,26149,...,1,05:55:17,2904,5602222,-25.459817,-49.319788,33567,8,64971,32117


In [24]:
origin_dest_matrix = origin_dest_matrix.withColumn('leg_duration',
                                                   F.when(F.col('d_unixtimestamp') > F.col('o_unixtimestamp'),
                                                       ((F.col('d_unixtimestamp') - F.col('o_unixtimestamp'))/60.0)). \
                                                   otherwise(-1))                                        

In [25]:
print_df(origin_dest_matrix.select(['cardNum','o_timestamp','o_unixtimestamp','d_timestamp','d_unixtimestamp','leg_duration']),l=30)

Unnamed: 0,cardNum,o_timestamp,o_unixtimestamp,d_timestamp,d_unixtimestamp,leg_duration
0,304127,17:02:45,72165,17:46:02,74762,43.283333
1,304127,17:46:02,74762,17:02:45,72165,-1.0
2,313992,06:55:21,35721,17:09:27,72567,614.1
3,313992,17:09:27,72567,06:55:21,35721,-1.0
4,317896,07:01:40,36100,17:05:57,72357,604.283333
5,317896,17:05:57,72357,07:01:40,36100,-1.0
6,321530,07:50:18,39018,16:47:25,71245,537.116667
7,321530,16:47:25,71245,07:50:18,39018,-1.0
8,321916,05:55:17,32117,15:02:51,64971,547.566667
9,321916,15:02:51,64971,05:55:17,32117,-1.0


In [26]:
print_df(origin_dest_matrix.filter('leg_duration < -1 or leg_duration > 15').describe(['leg_duration']))

Unnamed: 0,summary,leg_duration
0,count,17707.0
1,mean,375.8280632894711
2,stddev,220.45251361904772
3,min,15.033333333333331
4,max,963.9


In [27]:
origin_dest_matrix.filter('leg_duration > 0').approxQuantile("leg_duration", [0.5], 0)

[309.28333333333336]

In [29]:
origin_dest_matrix.write.csv(exp_data_folder_path + '/pre_od_matrix', header=True, mode='overwrite')