In [33]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [34]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_data(sqlContext, filepath):
    df = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferSchema", "true") \
        .option("nullValue", "-")\
        .load(filepath)

    return df

def print_df(df,l=10):
    return df.limit(l).toPandas()

### Read GPS Data

In [35]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [36]:
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
boarding_data = read_data(sqlContext,exp_data_folder_path + 'clean_boardings')

In [37]:
boarding_data.printSchema()

root
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- sec_group: integer (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- card_timestamp_in_secs: integer (nullable = true)
 |-- card_date_in_secs: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem

In [38]:
print_df(boarding_data)

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,654,HA014,2017-05-10,1037,02/10/64,"10/05/17 17:17:26,000000",CAMPO ALEGRE,1033826,F,1494447446,...,-25.50024,-49.324503,10.657221,17:17:11,32429,NO_PROBLEM,6,2017-05-10 17:17:11,1494447431,1494385200
1,654,HA014,2017-05-10,705,02/10/64,"10/05/17 11:45:59,000000",CAMPO ALEGRE,1033826,F,1494427559,...,,,,11:45:29,32429,BETWEEN,3,2017-05-10 11:45:29,1494427529,1494385200
2,654,HA014,2017-05-10,706,02/10/64,"10/05/17 11:46:04,000000",CAMPO ALEGRE,1033826,F,1494427564,...,-25.503165,-49.325878,9.850805,11:46:21,36100,NO_PROBLEM,6,2017-05-10 11:46:21,1494427581,1494385200
3,216,CA602,2017-05-10,1156,31/05/78,"10/05/17 19:16:55,000000",CABRAL / PORTÃO,1137936,M,1494454615,...,-25.457998,-49.27044,4.359058,19:16:15,29144,NO_PROBLEM,0,2017-05-10 19:16:15,1494454575,1494385200
4,685,HA619,2017-05-10,470,31/05/78,"10/05/17 07:50:34,000000",RIO BONITO/CIC,1137936,M,1494413434,...,-25.585283,-49.337953,3.388541,07:50:30,36354,NO_PROBLEM,2,2017-05-10 07:50:30,1494413430,1494385200
5,386,BN611,2017-05-10,821,01/03/74,"10/05/17 13:41:01,000000",CAJURU,1197880,F,1494434461,...,-25.432758,-49.270313,5.518969,13:41:44,26342,NO_PROBLEM,0,2017-05-10 13:41:44,1494434504,1494385200
6,701,JC315,2017-05-10,785,01/03/74,"10/05/17 13:05:02,000000",FAZENDINHA,1197880,F,1494432302,...,-25.462178,-49.305425,4.164158,13:05:54,34960,NO_PROBLEM,0,2017-05-10 13:05:54,1494432354,1494385200
7,380,LC303,2017-05-10,1139,03/08/59,"10/05/17 18:59:13,000000",DETRAN/V.MACHADO,1212238,F,1494453553,...,-25.440286,-49.294715,7.842351,18:59:41,30161,NO_PROBLEM,2,2017-05-10 18:59:41,1494453581,1494385200
8,233,BA142,2017-05-10,668,03/08/59,"10/05/17 11:08:37,000000",OLARIA,1212238,F,1494425317,...,,,,11:08:45,34994,BETWEEN,0,2017-05-10 11:08:45,1494425325,1494385200
9,20,BB607,2017-05-10,657,07/03/54,"10/05/17 10:57:50,000000",INTERBAIRR II H,1456735,F,1494424670,...,-25.494483,-49.282535,41.95102,10:57:20,29863,NO_PROBLEM,2,2017-05-10 10:57:20,1494424640,1494385200


In [39]:
boarding_data = boarding_data.orderBy(['cardNum','cardTimestamp'])
print_df(boarding_data.select(['cardNum','cardTimestamp','route','busCode']), l=20)

Unnamed: 0,cardNum,cardTimestamp,route,busCode
0,313992,"10/05/17 06:55:21,000000",372,BC312
1,313992,"10/05/17 17:09:27,000000",50,LB602
2,317896,"10/05/17 07:01:40,000000",542,GA124
3,317896,"10/05/17 17:05:57,000000",216,CA600
4,327103,"10/05/17 07:51:17,000000",777,JC012
5,327103,"10/05/17 16:23:50,000000",777,JC008
6,327455,"10/05/17 12:36:08,000000",657,EA184
7,327455,"10/05/17 13:05:19,000000",657,EA180
8,334492,"10/05/17 08:28:45,000000",777,LC027
9,334492,"10/05/17 17:40:55,000000",701,JC604


### Analyzing Boarding data

In [40]:
boarding_data = boarding_data.withColumn('boarding_id',F.monotonically_increasing_id())
print_df(boarding_data)

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs,boarding_id
0,372,BC312,2017-05-10,415,29/07/54,"10/05/17 06:55:21,000000",TARUMÃ,313992,M,1494410121,...,-49.267136,2.304582,06:55:53,26376,NO_PROBLEM,0,2017-05-10 06:55:53,1494410153,1494385200,0
1,50,LB602,2017-05-10,1029,29/07/54,"10/05/17 17:09:27,000000",INTERBAIRROS V,313992,M,1494446967,...,-49.219595,28.89999,17:09:44,32415,NO_PROBLEM,0,2017-05-10 17:09:44,1494446984,1494385200,1
2,542,GA124,2017-05-10,421,16/09/60,"10/05/17 07:01:40,000000",BAIRRO NOVO B,317896,F,1494410500,...,-49.2652,12.107932,07:01:18,31256,NO_PROBLEM,4,2017-05-10 07:01:18,1494410478,1494385200,2
3,216,CA600,2017-05-10,1025,16/09/60,"10/05/17 17:05:57,000000",CABRAL / PORTÃO,317896,F,1494446757,...,-49.279881,24.603973,17:05:48,29165,NO_PROBLEM,3,2017-05-10 17:05:48,1494446748,1494385200,3
4,777,JC012,2017-05-10,471,23/06/66,"10/05/17 07:51:17,000000",V. VELHA,327103,F,1494413477,...,-49.336766,8.566314,07:51:52,33626,NO_PROBLEM,4,2017-05-10 07:51:52,1494413512,1494385200,4
5,777,JC008,2017-05-10,983,23/06/66,"10/05/17 16:23:50,000000",V. VELHA,327103,F,1494444230,...,-49.287671,15.687315,16:23:55,32105,NO_PROBLEM,0,2017-05-10 16:23:55,1494444235,1494385200,5
6,657,EA184,2017-05-10,756,14/11/69,"10/05/17 12:36:08,000000",XAXIM/CAPÃO RASO,327455,F,1494430568,...,-49.272696,10.025153,12:36:50,38546,NO_PROBLEM,0,2017-05-10 12:36:50,1494430610,1494385200,6
7,657,EA180,2017-05-10,785,14/11/69,"10/05/17 13:05:19,000000",XAXIM/CAPÃO RASO,327455,F,1494432319,...,-49.285306,5.071379,13:05:23,30235,NO_PROBLEM,2,2017-05-10 13:05:23,1494432323,1494385200,7
8,777,LC027,2017-05-10,508,17/07/73,"10/05/17 08:28:45,000000",V. VELHA,334492,F,1494415725,...,-49.290488,23.01042,08:28:48,32110,NO_PROBLEM,1,2017-05-10 08:28:48,1494415728,1494385200,8
9,701,JC604,2017-05-10,1060,17/07/73,"10/05/17 17:40:55,000000",FAZENDINHA,334492,F,1494448855,...,-49.297051,10.848413,17:40:02,32827,NO_PROBLEM,0,2017-05-10 17:40:02,1494448802,1494385200,9


In [41]:
user_boarding_w = Window.partitionBy(boarding_data.cardNum, boarding_data.date).orderBy(boarding_data.card_timestamp_in_secs)
od_matrix_ids = boarding_data.select(F.col('cardNum'),
                     F.col('boarding_id'), 
                     F.lead('boarding_id',default=-1).over(user_boarding_w).alias('alighting_id'),
                     F.first('boarding_id',True).over(user_boarding_w).alias('first_boarding'))

print_df(od_matrix_ids)

Unnamed: 0,cardNum,boarding_id,alighting_id,first_boarding
0,878939,34359738396,34359738397,34359738396
1,878939,34359738397,-1,34359738396
2,1548177,154618822674,154618822675,154618822674
3,1548177,154618822675,-1,154618822674
4,1634438,171798691882,171798691883,171798691882
5,1634438,171798691883,-1,171798691882
6,1642997,180388626435,180388626436,180388626435
7,1642997,180388626436,-1,180388626435
8,1664594,180388626489,180388626490,180388626489
9,1664594,180388626490,-1,180388626489


In [42]:
od_matrix_ids = od_matrix_ids.withColumn('alighting_id', 
                     F.when(F.col('alighting_id') == -1,F.col('first_boarding')). \
                                         otherwise(F.col('alighting_id'))).drop('first_boarding')

print_df(od_matrix_ids)

Unnamed: 0,cardNum,boarding_id,alighting_id
0,878939,34359738394,34359738395
1,878939,34359738395,34359738394
2,1548177,154618822667,154618822668
3,1548177,154618822668,154618822667
4,1634438,171798691882,171798691883
5,1634438,171798691883,171798691882
6,1642997,180388626433,180388626434
7,1642997,180388626434,180388626433
8,1664594,180388626487,180388626488
9,1664594,180388626488,180388626487


In [43]:
#od_matrix = boarding_data.join(od_matrix_ids, ['boarding_id'], 'inner')

In [44]:
#print_df(od_matrix.select(['cardNum','cardTimestamp','boarding_id','alighting_id']), l=30)

In [45]:
od_matrix.printSchema()

root
 |-- boarding_id: long (nullable = false)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- sec_group: integer (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- card_timestamp_in_secs: integer (nullable = true)
 |-- card_date_in_secs: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStop

In [46]:
origin_matrix = od_matrix.select(F.col("route").alias("o_route"),
                                    F.col("busCode").alias("o_bus_code"),
                                    F.col("date").alias("o_date"),
                                    F.col("tripNum").alias("o_tripNum"),
                                    F.col("cardTimestamp").alias("o_timestamp"),
                                    F.col("shapeLat").alias("o_shape_lat"),
                                    F.col("shapeLon").alias("o_shape_lon"),
                                    F.col("busStopId").alias("o_stop_id"),
                                    F.col("boarding_id").alias("o_boarding_id"))

In [47]:
dest_matrix = od_matrix.select(F.col("route").alias("d_route"),
                                    F.col("busCode").alias("d_bus_code"),
                                    F.col("date").alias("d_date"),
                                    F.col("tripNum").alias("d_tripNum"),
                                    F.col("cardTimestamp").alias("d_timestamp"),
                                    F.col("shapeLat").alias("d_shape_lat"),
                                    F.col("shapeLon").alias("d_shape_lon"),
                                    F.col("busStopId").alias("d_stop_id"),
                                    F.col("boarding_id").alias("d_boarding_id"))

In [48]:
print_df(origin_matrix)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_lat,o_shape_lon,o_stop_id,o_boarding_id
0,372,BC312,2017-05-10,2,"10/05/17 06:55:21,000000",-25.430325,-49.26712,26376,0
1,50,LB602,2017-05-10,6,"10/05/17 17:09:27,000000",-25.4509,-49.219871,32415,1
2,542,GA124,2017-05-10,6,"10/05/17 07:01:40,000000",-25.550431,-49.265282,31256,2
3,216,CA600,2017-05-10,7,"10/05/17 17:05:57,000000",-25.466362,-49.279723,29165,3
4,777,JC012,2017-05-10,3,"10/05/17 07:51:17,000000",-25.460767,-49.336851,33626,4
5,777,JC008,2017-05-10,9,"10/05/17 16:23:50,000000",-25.449024,-49.287549,32105,5
6,657,EA184,2017-05-10,11,"10/05/17 12:36:08,000000",-25.499064,-49.272755,38546,6
7,657,EA180,2017-05-10,12,"10/05/17 13:05:19,000000",-25.494636,-49.285331,30235,7
8,777,LC027,2017-05-10,2,"10/05/17 08:28:45,000000",-25.450028,-49.290266,32110,8
9,701,JC604,2017-05-10,10,"10/05/17 17:40:55,000000",-25.454563,-49.296987,32827,9


In [49]:
print_df(dest_matrix)

Unnamed: 0,d_route,d_bus_code,d_date,d_tripNum,d_timestamp,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id
0,372,BC312,2017-05-10,2,"10/05/17 06:55:21,000000",-25.430325,-49.26712,26376,0
1,50,LB602,2017-05-10,6,"10/05/17 17:09:27,000000",-25.4509,-49.219871,32415,1
2,542,GA124,2017-05-10,6,"10/05/17 07:01:40,000000",-25.550431,-49.265282,31256,2
3,216,CA600,2017-05-10,7,"10/05/17 17:05:57,000000",-25.466362,-49.279723,29165,3
4,777,JC012,2017-05-10,3,"10/05/17 07:51:17,000000",-25.460767,-49.336851,33626,4
5,777,JC008,2017-05-10,9,"10/05/17 16:23:50,000000",-25.449024,-49.287549,32105,5
6,657,EA184,2017-05-10,11,"10/05/17 12:36:08,000000",-25.499064,-49.272755,38546,6
7,657,EA180,2017-05-10,12,"10/05/17 13:05:19,000000",-25.494636,-49.285331,30235,7
8,777,LC027,2017-05-10,2,"10/05/17 08:28:45,000000",-25.450028,-49.290266,32110,8
9,701,JC604,2017-05-10,10,"10/05/17 17:40:55,000000",-25.454563,-49.296987,32827,9


In [50]:
origin_dest_matrix = origin_matrix.join(od_matrix_ids, 
                                        origin_matrix.o_boarding_id == od_matrix_ids.boarding_id, 'inner') \
                                    .join(dest_matrix, od_matrix_ids.alighting_id == dest_matrix.d_boarding_id, 'inner')

In [51]:
print_df(origin_dest_matrix)

Unnamed: 0,o_route,o_bus_code,o_date,o_tripNum,o_timestamp,o_shape_lat,o_shape_lon,o_stop_id,o_boarding_id,cardNum,...,alighting_id,d_route,d_bus_code,d_date,d_tripNum,d_timestamp,d_shape_lat,d_shape_lon,d_stop_id,d_boarding_id
0,625,GA148,2017-05-10,1,"10/05/17 06:17:33,000000",-25.52445,-49.26915,36062,25,372350,...,26,633,GA128,2017-05-10,14,"10/05/17 17:23:38,000000",-25.522962,-49.270309,36059,26
1,40,BB615,2017-05-10,4,"10/05/17 15:08:08,000000",-25.407783,-49.339019,33722,30,386400,...,29,901,MC305,2017-05-10,2,"10/05/17 06:55:47,000000",-25.413462,-49.313217,33666,29
2,821,MA005,2017-05-10,2,"10/05/17 07:35:09,000000",-25.416093,-49.358629,34066,8589934657,712466,...,8589934658,821,MA005,2017-05-10,12,"10/05/17 18:43:03,000000",-25.407058,-49.336972,34153,8589934658
3,40,LB603,2017-05-10,5,"10/05/17 16:01:03,000000",-25.450025,-49.351437,34117,34359738399,878939,...,34359738398,638,GA120,2017-05-10,1,"10/05/17 05:29:07,000000",-25.552042,-49.26981,38169,34359738398
4,475,EC304,2017-05-10,10,"10/05/17 19:04:44,000000",-25.47059,-49.246411,29692,60129542214,1120026,...,60129542213,475,EC301,2017-05-10,2,"10/05/17 06:52:35,000000",-25.495564,-49.224429,33550,60129542213
5,685,HA619,2017-05-10,3,"10/05/17 07:50:34,000000",-25.585284,-49.337919,36354,68719476786,1149971,...,68719476787,216,CA602,2017-05-10,8,"10/05/17 19:16:55,000000",-25.457964,-49.270418,29144,68719476787
6,561,EC012,2017-05-10,14,"10/05/17 17:40:41,000000",-25.480355,-49.268323,32015,68719476799,1155351,...,68719476798,561,EC006,2017-05-10,3,"10/05/17 08:01:17,000000",-25.480276,-49.268259,32014,68719476798
7,685,HA619,2017-05-10,6,"10/05/17 17:03:39,000000",-25.544357,-49.316016,35164,77309411360,1186687,...,77309411361,821,MA006,2017-05-10,2,"10/05/17 06:44:46,000000",-25.42355,-49.358639,34037,77309411361
8,370,LC016,2017-05-10,4,"10/05/17 11:38:27,000000",-25.449311,-49.299678,30162,137438953477,1473049,...,137438953476,701,JC315,2017-05-10,5,"10/05/17 11:03:03,000000",-25.434807,-49.272409,26141,137438953476
9,30,KB602,2017-05-10,1,"10/05/17 07:11:13,000000",-25.45079,-49.208483,32544,146028888067,1494065,...,146028888066,811,BA018,2017-05-10,16,"10/05/17 17:19:48,000000",-25.425567,-49.342664,30499,146028888066


In [52]:
print_df(origin_dest_matrix.select(['cardNum','o_route','o_bus_code','o_timestamp','o_stop_id','d_route','d_bus_code','d_timestamp','d_stop_id']).
        orderBy(['cardNum','o_timestamp',]), l=30)

Unnamed: 0,cardNum,o_route,o_bus_code,o_timestamp,o_stop_id,d_route,d_bus_code,d_timestamp,d_stop_id
0,313992,372,BC312,"10/05/17 06:55:21,000000",26376,50,LB602,"10/05/17 17:09:27,000000",32415
1,313992,50,LB602,"10/05/17 17:09:27,000000",32415,372,BC312,"10/05/17 06:55:21,000000",26376
2,317896,542,GA124,"10/05/17 07:01:40,000000",31256,216,CA600,"10/05/17 17:05:57,000000",29165
3,317896,216,CA600,"10/05/17 17:05:57,000000",29165,542,GA124,"10/05/17 07:01:40,000000",31256
4,327103,777,JC012,"10/05/17 07:51:17,000000",33626,777,JC008,"10/05/17 16:23:50,000000",32105
5,327103,777,JC008,"10/05/17 16:23:50,000000",32105,777,JC012,"10/05/17 07:51:17,000000",33626
6,327455,657,EA184,"10/05/17 12:36:08,000000",38546,657,EA180,"10/05/17 13:05:19,000000",30235
7,327455,657,EA180,"10/05/17 13:05:19,000000",30235,657,EA184,"10/05/17 12:36:08,000000",38546
8,334492,777,LC027,"10/05/17 08:28:45,000000",32110,701,JC604,"10/05/17 17:40:55,000000",32827
9,334492,701,JC604,"10/05/17 17:40:55,000000",32827,777,LC027,"10/05/17 08:28:45,000000",32110


In [53]:
origin_dest_matrix.count()

14862