In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [2]:
def rename_columns(df, list_of_tuples):
    """Rename several columns of a DataFrame in one call.

    Args:
        df: object exposing ``withColumnRenamed(old, new)`` (a Spark DataFrame).
        list_of_tuples: iterable of ``(old_name, new_name)`` pairs, applied in order.

    Returns:
        The result of applying every rename in sequence; ``df`` itself when
        ``list_of_tuples`` is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_data(sqlContext, filepath):
    """Load a CSV dataset into a DataFrame.

    The reader is configured to treat the first row as a header, to infer
    column types, and to interpret the literal "-" as a null value.

    Args:
        sqlContext: object whose ``read`` attribute is a DataFrame reader.
        filepath: location passed to the reader's ``load``.

    Returns:
        Whatever the configured reader's ``load(filepath)`` returns.
    """
    reader = sqlContext.read.format("com.databricks.spark.csv")
    reader = reader.option("header", "true")
    reader = reader.option("inferSchema", "true")
    reader = reader.option("nullValue", "-")
    return reader.load(filepath)

def print_df(df,l=10):
    """Return the first ``l`` rows of ``df`` converted to pandas (default 10).

    Despite the name nothing is printed; the pandas object is returned so the
    notebook can render it as the cell's output.
    """
    head = df.limit(l)
    return head.toPandas()

### Read Boarding Data (ticketing records matched to GPS)

In [3]:
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# SQL entry point built on that context; handed to read_data() to load the input.
sqlContext = pyspark.SQLContext(sc)

In [4]:
# Folder containing the sample input for this experiment.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
# Load the 'clean_boardings' dataset from that folder (header row, inferred types).
boarding_data = read_data(sqlContext,exp_data_folder_path + 'clean_boardings')

In [5]:
# Show the column names and the types inferred while reading the CSV.
boarding_data.printSchema()

root
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- sec_group: integer (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- card_timestamp_in_secs: integer (nullable = true)
 |-- card_date_in_secs: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem

In [6]:
# Preview the first rows (print_df defaults to 10) as a pandas table.
print_df(boarding_data)

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,654,HA014,2017-05-10,1037,02/10/64,"10/05/17 17:17:26,000000",CAMPO ALEGRE,1033826,F,1494447446,...,-25.50024,-49.324503,10.657221,17:17:11,32429,NO_PROBLEM,6,2017-05-10 17:17:11,1494447431,1494385200
1,654,HA014,2017-05-10,705,02/10/64,"10/05/17 11:45:59,000000",CAMPO ALEGRE,1033826,F,1494427559,...,,,,11:45:29,32429,BETWEEN,3,2017-05-10 11:45:29,1494427529,1494385200
2,654,HA014,2017-05-10,706,02/10/64,"10/05/17 11:46:04,000000",CAMPO ALEGRE,1033826,F,1494427564,...,-25.503165,-49.325878,9.850805,11:46:21,36100,NO_PROBLEM,6,2017-05-10 11:46:21,1494427581,1494385200
3,216,CA602,2017-05-10,1156,31/05/78,"10/05/17 19:16:55,000000",CABRAL / PORTÃO,1137936,M,1494454615,...,-25.457998,-49.27044,4.359058,19:16:15,29144,NO_PROBLEM,0,2017-05-10 19:16:15,1494454575,1494385200
4,685,HA619,2017-05-10,470,31/05/78,"10/05/17 07:50:34,000000",RIO BONITO/CIC,1137936,M,1494413434,...,-25.585283,-49.337953,3.388541,07:50:30,36354,NO_PROBLEM,2,2017-05-10 07:50:30,1494413430,1494385200
5,386,BN611,2017-05-10,821,01/03/74,"10/05/17 13:41:01,000000",CAJURU,1197880,F,1494434461,...,-25.432758,-49.270313,5.518969,13:41:44,26342,NO_PROBLEM,0,2017-05-10 13:41:44,1494434504,1494385200
6,701,JC315,2017-05-10,785,01/03/74,"10/05/17 13:05:02,000000",FAZENDINHA,1197880,F,1494432302,...,-25.462178,-49.305425,4.164158,13:05:54,34960,NO_PROBLEM,0,2017-05-10 13:05:54,1494432354,1494385200
7,380,LC303,2017-05-10,1139,03/08/59,"10/05/17 18:59:13,000000",DETRAN/V.MACHADO,1212238,F,1494453553,...,-25.440286,-49.294715,7.842351,18:59:41,30161,NO_PROBLEM,2,2017-05-10 18:59:41,1494453581,1494385200
8,233,BA142,2017-05-10,668,03/08/59,"10/05/17 11:08:37,000000",OLARIA,1212238,F,1494425317,...,,,,11:08:45,34994,BETWEEN,0,2017-05-10 11:08:45,1494425325,1494385200
9,20,BB607,2017-05-10,657,07/03/54,"10/05/17 10:57:50,000000",INTERBAIRR II H,1456735,F,1494424670,...,-25.494483,-49.282535,41.95102,10:57:20,29863,NO_PROBLEM,2,2017-05-10 10:57:20,1494424640,1494385200


In [7]:
# Sort each card's boardings chronologically.
# 'cardTimestamp' is a 'dd/MM/yy HH:mm:ss,ffffff' string, so sorting on it is
# lexical and puts e.g. '01/06/17' before '10/05/17'. The integer
# 'card_timestamp_in_secs' column orders correctly across dates and is the same
# key the per-card window uses further down.
boarding_data = boarding_data.orderBy(['cardNum','card_timestamp_in_secs'])
# Display a narrow view of the sorted data; 'route' and 'busCode' only break ties
# so that the preview is stable.
print_df(boarding_data.orderBy(['cardNum','card_timestamp_in_secs','route','busCode'])
                      .select(['cardNum','cardTimestamp','route','busCode']), l=20)

Unnamed: 0,cardNum,cardTimestamp,route,busCode
0,313992,"10/05/17 06:55:21,000000",372,BC312
1,313992,"10/05/17 17:09:27,000000",50,LB602
2,317896,"10/05/17 07:01:40,000000",542,GA124
3,317896,"10/05/17 17:05:57,000000",216,CA600
4,327103,"10/05/17 07:51:17,000000",777,JC012
5,327103,"10/05/17 16:23:50,000000",777,JC008
6,327455,"10/05/17 12:36:08,000000",657,EA184
7,327455,"10/05/17 13:05:19,000000",657,EA180
8,334492,"10/05/17 08:28:45,000000",777,LC027
9,334492,"10/05/17 17:40:55,000000",701,JC604


### Analyzing Boarding data

In [8]:
# Tag every boarding with a unique id so a card's consecutive boardings can be
# paired up later.
# monotonically_increasing_id() yields unique, increasing -- but not consecutive --
# values that depend on how the rows are partitioned, and Spark recomputes them each
# time the lineage is evaluated. This frame is later used both as the source of the
# window computation and as the other side of the join on 'boarding_id', so cache it
# to keep the ids consistent between those evaluations (best effort: an evicted
# partition would still be recomputed).
boarding_data = boarding_data.withColumn('boarding_id',F.monotonically_increasing_id()).cache()
print_df(boarding_data)

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs,boarding_id
0,372,BC312,2017-05-10,415,29/07/54,"10/05/17 06:55:21,000000",TARUMÃ,313992,M,1494410121,...,-49.267136,2.304582,06:55:53,26376,NO_PROBLEM,0,2017-05-10 06:55:53,1494410153,1494385200,0
1,50,LB602,2017-05-10,1029,29/07/54,"10/05/17 17:09:27,000000",INTERBAIRROS V,313992,M,1494446967,...,-49.219595,28.89999,17:09:44,32415,NO_PROBLEM,0,2017-05-10 17:09:44,1494446984,1494385200,1
2,542,GA124,2017-05-10,421,16/09/60,"10/05/17 07:01:40,000000",BAIRRO NOVO B,317896,F,1494410500,...,-49.2652,12.107932,07:01:18,31256,NO_PROBLEM,4,2017-05-10 07:01:18,1494410478,1494385200,2
3,216,CA600,2017-05-10,1025,16/09/60,"10/05/17 17:05:57,000000",CABRAL / PORTÃO,317896,F,1494446757,...,-49.279881,24.603973,17:05:48,29165,NO_PROBLEM,3,2017-05-10 17:05:48,1494446748,1494385200,3
4,777,JC012,2017-05-10,471,23/06/66,"10/05/17 07:51:17,000000",V. VELHA,327103,F,1494413477,...,-49.336766,8.566314,07:51:52,33626,NO_PROBLEM,4,2017-05-10 07:51:52,1494413512,1494385200,4
5,777,JC008,2017-05-10,983,23/06/66,"10/05/17 16:23:50,000000",V. VELHA,327103,F,1494444230,...,-49.287671,15.687315,16:23:55,32105,NO_PROBLEM,0,2017-05-10 16:23:55,1494444235,1494385200,5
6,657,EA184,2017-05-10,756,14/11/69,"10/05/17 12:36:08,000000",XAXIM/CAPÃO RASO,327455,F,1494430568,...,-49.272696,10.025153,12:36:50,38546,NO_PROBLEM,0,2017-05-10 12:36:50,1494430610,1494385200,6
7,657,EA180,2017-05-10,785,14/11/69,"10/05/17 13:05:19,000000",XAXIM/CAPÃO RASO,327455,F,1494432319,...,-49.285306,5.071379,13:05:23,30235,NO_PROBLEM,2,2017-05-10 13:05:23,1494432323,1494385200,7
8,777,LC027,2017-05-10,508,17/07/73,"10/05/17 08:28:45,000000",V. VELHA,334492,F,1494415725,...,-49.290488,23.01042,08:28:48,32110,NO_PROBLEM,1,2017-05-10 08:28:48,1494415728,1494385200,8
9,701,JC604,2017-05-10,1060,17/07/73,"10/05/17 17:40:55,000000",FAZENDINHA,334492,F,1494448855,...,-49.297051,10.848413,17:40:02,32827,NO_PROBLEM,0,2017-05-10 17:40:02,1494448802,1494385200,9


In [12]:
# Per-card window: each card's boardings ordered by card timestamp (in seconds).
user_boarding_w = Window.partitionBy(boarding_data.cardNum).orderBy(boarding_data.card_timestamp_in_secs)
# For every boarding keep:
#   alighting_id   - boarding_id of the same card's next boarding, or -1 when none follows
#   first_boarding - first non-null boarding_id of the card in window order
od_matrix_ids = boarding_data.select(F.col('boarding_id'), 
                     F.lead('boarding_id',default=-1).over(user_boarding_w).alias('alighting_id'),
                     F.first('boarding_id',True).over(user_boarding_w).alias('first_boarding'))

# Preview the id pairs.
print_df(od_matrix_ids)

Unnamed: 0,boarding_id,alighting_id,first_boarding
0,59,60,59
1,60,-1,59
2,34359738373,34359738374,34359738373
3,34359738374,-1,34359738373
4,51539607618,51539607619,51539607618
5,51539607619,-1,51539607618
6,77309411343,77309411344,77309411343
7,77309411344,-1,77309411343
8,180388626435,180388626436,180388626435
9,180388626436,-1,180388626435


In [13]:
# Close the loop for each card: where 'alighting_id' holds the -1 sentinel (no later
# boarding for that card), substitute the card's first boarding id instead, then
# discard the helper column.
# NOTE(review): a card with a single boarding ends up paired with itself.
od_matrix_ids = (
    od_matrix_ids
    .withColumn(
        'alighting_id',
        F.when(F.col('alighting_id') == -1, F.col('first_boarding'))
         .otherwise(F.col('alighting_id')))
    .drop('first_boarding')
)

print_df(od_matrix_ids)

Unnamed: 0,boarding_id,alighting_id
0,59,60
1,60,59
2,34359738379,34359738380
3,34359738380,34359738379
4,51539607613,51539607614
5,51539607614,51539607613
6,77309411344,77309411345
7,77309411345,77309411344
8,180388626495,180388626496
9,180388626496,180388626495


In [14]:
# Attach each boarding's paired 'alighting_id' to the full boarding record; the
# inner join on 'boarding_id' keeps only ids present in both frames.
od_matrix = boarding_data.join(od_matrix_ids, ['boarding_id'], 'inner')

In [15]:
# Inspect the boarding/alighting id pairing next to the card number and timestamp.
print_df(od_matrix.select(['cardNum','cardTimestamp','boarding_id','alighting_id']), l=30)

Unnamed: 0,cardNum,cardTimestamp,boarding_id,alighting_id
0,313992,"10/05/17 06:55:21,000000",0,1
1,313992,"10/05/17 17:09:27,000000",1,0
2,317896,"10/05/17 07:01:40,000000",2,3
3,317896,"10/05/17 17:05:57,000000",3,2
4,327103,"10/05/17 07:51:17,000000",4,5
5,327103,"10/05/17 16:23:50,000000",5,4
6,327455,"10/05/17 12:36:08,000000",6,7
7,327455,"10/05/17 13:05:19,000000",7,6
8,334492,"10/05/17 08:28:45,000000",8,9
9,334492,"10/05/17 17:40:55,000000",9,8


In [16]:
# Show the schema of the joined frame.
od_matrix.printSchema()

root
 |-- boarding_id: long (nullable = false)
 |-- route: integer (nullable = true)
 |-- busCode: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- sec_group: integer (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- card_timestamp_in_secs: integer (nullable = true)
 |-- card_date_in_secs: integer (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStop

In [None]:
# Disabled draft, kept for reference. The whole statement is commented out: with only
# its first line commented, the two continuation lines were left as orphan indented
# code and the cell failed with an IndentationError.
#od_matrix = od_matrix.withColumn('d_lat',F.when(F.col('d_lat') == -1.0,
#                                                F.first(F.col('gpsLat'))).
#                                                 otherwise(F.col('d_lat')).over(user_boarding_w))

In [None]:
# Fill a missing 'd_lat' (presumably the destination latitude -- TODO confirm) with
# the first 'gpsLat' found in the card's window.
# NOTE(review): neither `clean_boardings` nor a 'd_lat' column is created in the
# cells shown in this notebook -- confirm where they come from before running.
od_matrix = clean_boardings.withColumn(
    'd_lat',
    # `column == None` is never true in Spark SQL; NULLs must be tested with isNull().
    F.when(clean_boardings.d_lat.isNull(),
           # .over() has to wrap the aggregate itself, not the raw column inside it.
           F.first(F.col('gpsLat')).over(user_boarding_w))
     .otherwise(clean_boardings.d_lat))

In [None]:
#ticketing_data.filter(F.col('cardNum') == '0001080534').toPandas()

In [None]:
#clean_boardings.count()

In [None]:
#clean_boardings.select('cardNum').distinct().count()

In [None]:
#Taking a look at a sample:
#user_boardings.filter(F.col('cardNum') == '0002986469').limit(20).toPandas()

In [None]:
#Taking a look at a sample:
#user_boardings.filter(F.col('cardNum') == '0003372920').limit(20).toPandas()

In [None]:
#Taking a look at a sample:
#user_boardings.filter(F.col('cardNum') == '0002986469').limit(20).toPandas()

In [None]:
#Taking a look at a sample:
#user_boardings.filter(F.col('cardNum') == '0003372920').limit(20).toPandas()

In [None]:
#Checking if there are any duplicate boarding entries
#duplicate_board_entries = user_boardings.groupby(['cardNum','date','sec_group']).count().filter('count > 1')
#print duplicate_board_entries.count()
#print_df(duplicate_board_entries)

In [None]:
#Taking a look at a sample:
#print_df(user_boardings.filter(F.col('cardNum') == '0001884144'))

In [None]:
#print_df(user_boardings.filter(F.col('cardNum') == '0001884144').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

In [None]:
#print_df(user_boardings.filter(F.col('cardNum') == '0003826824').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

In [None]:
#print_df(user_boardings.filter(F.col('cardNum') == '0002195541').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

As we can see above, the duplicated boarding records are caused by a problem in the BULMA output: GPS records that occurred in the same trip are being associated with different trips. We will exclude such entries from our analysis.