In [5]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [6]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_file(filepath, sqlContext):
    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True,nullValue="-")
    while len(data_frame.columns) < 16:
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [
            ("_c0", "route"),
            ("_c1", "tripNum"),
            ("_c2", "shapeId"),
            ("_c3", "shapeSequence"),
            ("_c4", "shapeLat"),
            ("_c5", "shapeLon"),
            ("_c6", "distanceTraveledShape"),
            ("_c7", "busCode"),
            ("_c8", "gpsPointId"),
            ("_c9", "gpsLat"),
            ("_c10", "gpsLon"),
            ("_c11", "distanceToShapePoint"),
            ("_c12", "timestamp"),
            ("_c13", "busStopId"),
            ("_c14", "problem"),
            ("_c15", "numPassengers")
        ]
    )

    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))

    return data_frame

def print_df(df,l=10):
    return df.limit(l).toPandas()

### Read GPS Data

In [7]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [8]:
exp_data_folder_path = '/local/tarciso/data/sample-data/'
trips_data = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [9]:
trips_data.head(3)

[Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136916, shapeLat=-25.432724990605614, shapeLon=-49.27218701780396, distanceTraveledShape=10149.88, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:13:43', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136915, shapeLat=-25.432470122014415, shapeLon=-49.27231020615159, distanceTraveledShape=10119.046, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:45:14', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=1, shapeId=1891, shapeSequence=6136650, shapeLat=-25.413362156585787, shapeLon=-49.20592429766663, distanceTraveledShape=79.983, busCode=u'CC170', gpsPointId=None, gpsLat=-25.413378, gpsLon=-49.205836, distanceToShapePoint=9.041484, timestamp=u'05:41:14', busStopId=None, problem=

In [10]:
trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = false)



### Read Bus Card Data

In [11]:
ticketing_data = sqlContext.read.json(exp_data_folder_path + '/ticketing-sample/doc1-2017051115.txt')

In [12]:
#Renaming columns to english
ticketing_data = ticketing_data.select(F.col("CODLINHA").alias("route"),
                                       F.col("CODVEICULO").alias("busCode"),
                                       F.col("DATANASCIMENTO").alias("userBirthdate"),
                                       F.col("DATAUTILIZACAO").alias("cardTimestamp"),
                                       F.col("NOMELINHA").alias("lineName"),
                                       F.col("NUMEROCARTAO").alias("cardNum"),
                                       F.col("SEXO").alias("gender"))

In [13]:
ticketing_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)



In [14]:
ticketing_data.head(5)

[Row(route=u'021', busCode=u'08046', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 20:15:16,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'021', busCode=u'08027', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 13:10:24,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'623', busCode=u'HA022', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 08:23:45,000000', lineName=u'PQ.INDUSTRIAL', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'000', busCode=u'03023', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 11:54:19,000000', lineName=u'OPER S/LINHA', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'TPH', busCode=u'03019', userBirthdate=u'23/11/79', cardTimestamp=u'10/05/17 13:30:10,000000', lineName=u'TERMINAL PINHEIRINHO', cardNum=u'0002425635', gender=u'F')]

### Pre-processing GPS data

In [15]:
#Warning: Both data sources dates refer to previous day, not to the day in the file name. Fixing this issue here to match bus card data.
trips_data = trips_data.withColumn("date", F.date_sub(F.col("date"), 1))
trips_data = trips_data.withColumn("gps_timestamp", F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
trips_data = trips_data.withColumn("gps_timestamp_in_secs", F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))
trips_data = trips_data.withColumn("gps_date_in_secs", F.unix_timestamp(F.col("date"), "yyyy-MM-dd"))

In [16]:
#trips_data.select(["date","gps_date_in_secs","timestamp","gps_timestamp_in_secs"]).show()

In [17]:
def get_N_sec_group(timestamp_in_secs,date_in_secs,N):
    secs_since_midnight = timestamp_in_secs-date_in_secs
    return(F.floor(secs_since_midnight/N))

In [18]:
trips_data = trips_data.withColumn("sec_group",get_N_sec_group(F.col("gps_timestamp_in_secs"),F.col("gps_date_in_secs"),60))
#trips_data.select(["gps_timestamp","gps_timestamp_in_secs","sec_group"]).show()

## Pre-processing Bus Card data

In [19]:
ticketing_data = ticketing_data.withColumn("card_timestamp_in_secs", F.unix_timestamp(F.col("cardTimestamp"), "dd/MM/yy HH:mm:ss"))
ticketing_data = ticketing_data.withColumn("date",F.from_unixtime(F.col("card_timestamp_in_secs"), "yyyy-MM-dd"))
ticketing_data = ticketing_data.withColumn("card_date_in_secs",F.unix_timestamp(F.col("date"),"yyyy-MM-dd"))
#ticketing_data.select(["cardTimestamp","card_timestamp_in_secs","date","card_date_in_secs"]).show()

In [20]:
ticketing_data = ticketing_data.withColumn("sec_group",get_N_sec_group(F.col("card_timestamp_in_secs"),F.col("card_date_in_secs"),60))
#ticketing_data.select(["cardTimestamp","card_timestamp_in_secs","sec_group"]).orderBy("card_timestamp_in_secs").show()

In [21]:
trips_data.count()

7900150

In [22]:
ticketing_data.count()

306906

### Removing duplicate GPS records (occurred in the same time period)

In [23]:
filtered_trips_data = trips_data.na.drop(subset=["route","busCode","busStopId","sec_group"]).dropDuplicates(["route","busCode", "tripNum", "date","sec_group"])
print filtered_trips_data.count()
#filtered_trips_data.select(["route","busCode", "tripNum","busStopId", "sec_group"]).orderBy("route","busCode","tripNum","busStopId").limit(20).toPandas()

414149


### Removing Duplicate entries in ticketing data

In [24]:
ticketing_data = ticketing_data.dropDuplicates(["route","busCode","cardNum","date","sec_group"])

In [25]:
ticketing_data.count()

292993

### Merging GPS and ticketing data 

In [26]:
#ticketing_data.select(['route','busCode','date','sec_group']).orderBy(['route','busCode','sec_group']).limit(20).toPandas()

In [27]:
#filtered_trips_data.select(['route','busCode','date','sec_group']).orderBy(['route','busCode','sec_group']).limit(20).toPandas()

In [28]:
user_boardings = ticketing_data.join(filtered_trips_data, ['route','busCode','date','sec_group'], 'inner')

In [29]:
user_boardings.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- date: string (nullable = true)
 |-- sec_group: long (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- card_timestamp_in_secs: long (nullable = true)
 |-- card_date_in_secs: long (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (null

In [30]:
#user_boardings.select(['route','busCode','sec_group','timestamp','cardTimestamp']).limit(20).toPandas()

In [31]:
user_boardings.count()

74187

In [32]:
user_boardings.select('cardNum').distinct().count()

65960

In [33]:
#user_boardings.select(['route','busCode','tripNum','busStopId','sec_group','cardNum','cardTimestamp','gps_timestamp']).orderBy('cardNum','cardTimestamp').limit(20).toPandas()

### Removing duplicated boarding data

In [62]:
filtered_boardings = user_boardings.dropDuplicates(['cardNum','date','sec_group'])

In [63]:
print filtered_boardings.count()
print_df(filtered_boardings)

74031


Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,779,JC864,2017-05-10,440,05/12/77,"10/05/17 07:20:43,000000",V.VELHA / BURITI,429183,F,1494411643,...,-25.457356,-49.325236,6.807708,07:20:36,33558,NO_PROBLEM,2,2017-05-10 07:20:36,1494411636,1494385200
1,654,HA014,2017-05-10,1130,01/04/71,"10/05/17 18:50:17,000000",CAMPO ALEGRE,448669,F,1494453017,...,,,,18:50:18,36082,BETWEEN,1,2017-05-10 18:50:18,1494453018,1494385200
2,654,HA240,2017-05-10,481,01/07/75,"10/05/17 08:01:32,000000",CAMPO ALEGRE,559711,F,1494414092,...,-25.497476,-49.318153,20.220858,08:01:43,36086,NO_PROBLEM,1,2017-05-10 08:01:43,1494414103,1494385200
3,629,KA698,2017-05-10,779,15/08/52,"10/05/17 12:59:01,000000",ALTO BOQUEIRÃO,616545,F,1494431941,...,,,,12:59:01,29825,BETWEEN,0,2017-05-10 12:59:01,1494431941,1494385200
4,658,JA022,2017-05-10,816,09/10/55,"10/05/17 13:36:51,000000",C.RASO/CAIUÁ,719912,F,1494434211,...,,,,13:36:26,35888,BETWEEN,3,2017-05-10 13:36:26,1494434186,1494385200
5,40,LB603,2017-05-10,320,03/09/64,"10/05/17 05:20:24,000000",INTERBAIRROS IV,832937,M,1494404424,...,-25.420408,-49.348453,12.099618,05:20:00,34173,NO_PROBLEM,1,2017-05-10 05:20:00,1494404400,1494385200
6,20,BB696,2017-05-10,1079,26/07/85,"10/05/17 17:59:29,000000",INTERBAIRR II H,833222,F,1494449969,...,-25.420213,-49.239881,5.035887,17:59:39,29094,NO_PROBLEM,2,2017-05-10 17:59:39,1494449979,1494385200
7,542,GA144,2017-05-10,723,15/10/64,"10/05/17 12:03:11,000000",BAIRRO NOVO B,833735,F,1494428591,...,-25.543178,-49.268458,0.695652,12:03:08,31250,NO_PROBLEM,4,2017-05-10 12:03:08,1494428588,1494385200
8,522,EA188,2017-05-10,488,05/03/76,"10/05/17 08:08:17,000000",MARINGÁ,859535,M,1494414497,...,-25.507096,-49.246703,5.863058,08:08:34,32597,NO_PROBLEM,0,2017-05-10 08:08:34,1494414514,1494385200
9,515,EA198,2017-05-10,502,01/10/86,"10/05/17 08:22:17,000000",IGUAPE II,868864,F,1494415337,...,-25.51526,-49.223325,57.406258,08:22:10,33521,NO_PROBLEM,1,2017-05-10 08:22:10,1494415330,1494385200


### Removing single-trip users records

In [64]:
boarding_count = filtered_boardings.groupby('cardNum').count()

In [65]:
#Filtering only users with more than one ride per day
multiple_boardings = boarding_count.filter(F.col('count') > 1)

In [66]:
total_passengers = user_boardings.select('cardNum').distinct().count()
print "Total #Passengers:", total_passengers

Total #Passengers: 65960


In [67]:
passengers_mult_boardings = multiple_boardings.count()
prop_mult_boardings = 100*(passengers_mult_boardings)/total_passengers
print "Passengers with Multiple Boardings:", passengers_mult_boardings, "(", prop_mult_boardings, "%)" 
multiple_boardings.show()

Passengers with Multiple Boardings: 7467 ( 11 %)
+----------+-----+
|   cardNum|count|
+----------+-----+
|0001600147|    2|
|0003359882|    2|
|0002699865|    2|
|0003598037|    2|
|0002633338|    2|
|0002786520|    2|
|0002986469|    3|
|0003605113|    2|
|0002167105|    2|
|0003372920|    4|
|0002578507|    2|
|0003397992|    2|
|0003269568|    2|
|0003801759|    2|
|0003408486|    2|
|0003759461|    2|
|0002602692|    2|
|0003637275|    2|
|0003205994|    2|
|0002651765|    2|
+----------+-----+
only showing top 20 rows



In [68]:
multiple_boardings = multiple_boardings.select(F.col("cardNum").alias("cardNum1"),
                                                 F.col("count").alias("count1"))

In [69]:
clean_boardings = filtered_boardings.join(multiple_boardings, filtered_boardings.cardNum == multiple_boardings.cardNum1, 'leftsemi')

In [70]:
clean_boardings.count()

15538

In [71]:
print_df(clean_boardings)

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,625,GA165,2017-05-10,460,01/08/77,"10/05/17 07:40:11,000000",GRAMADOS,1080534,F,1494412811,...,-25.514443,-49.274506,19.57775,07:40:51,35747,NO_PROBLEM,0,2017-05-10 07:40:51,1494412851,1494385200
1,21,DR102,2017-05-10,507,01/08/77,"10/05/17 08:27:29,000000",INTERB II ANTI H,1080534,F,1494415649,...,-25.428238,-49.230296,39.798256,08:27:55,30192,NO_PROBLEM,0,2017-05-10 08:27:55,1494415675,1494385200
2,777,JC003,2017-05-10,747,24/08/01,"10/05/17 12:27:28,000000",V. VELHA,1600147,F,1494430048,...,-25.460755,-49.336276,3.516852,12:27:32,33624,NO_PROBLEM,3,2017-05-10 12:27:32,1494430052,1494385200
3,777,JC008,2017-05-10,1252,24/08/01,"10/05/17 20:52:13,000000",V. VELHA,1600147,F,1494460333,...,,,,20:52:37,32903,BETWEEN,0,2017-05-10 20:52:37,1494460357,1494385200
4,614,GR403,2017-05-10,1071,01/06/71,"10/05/17 17:51:49,000000",FAZENDINHA/PUC,2083421,F,1494449509,...,-25.477115,-49.323938,16.106558,17:51:15,31636,NO_PROBLEM,0,2017-05-10 17:51:15,1494449475,1494385200
5,828,LA001,2017-05-10,414,01/06/71,"10/05/17 06:54:54,000000",C.COMP/C.RASO,2083421,F,1494410094,...,-25.479908,-49.32405,6.740887,06:54:15,33811,NO_PROBLEM,0,2017-05-10 06:54:15,1494410055,1494385200
6,30,KB699,2017-05-10,527,07/08/91,"10/05/17 08:47:49,000000",INTERBAIRROS III,2167105,M,1494416869,...,-25.5135,-49.264663,32.41715,08:47:10,32609,TRIP_PROBLEM,0,2017-05-10 08:47:10,1494416830,1494385200
7,30,GR123,2017-05-10,704,07/08/91,"10/05/17 11:44:57,000000",INTERBAIRROS III,2167105,M,1494427497,...,-25.51372,-49.26571,75.32579,11:44:32,32848,NO_PROBLEM,0,2017-05-10 11:44:32,1494427472,1494385200
8,338,EA014,2017-05-10,1097,26/05/71,"10/05/17 18:17:10,000000",CENTENÁRIO/HAUER,2277554,F,1494451030,...,,,,18:17:18,30051,BETWEEN,0,2017-05-10 18:17:18,1494451038,1494385200
9,338,EA193,2017-05-10,418,26/05/71,"10/05/17 06:58:30,000000",CENTENÁRIO/HAUER,2277554,F,1494410310,...,-25.486555,-49.205793,15.684446,06:58:30,30183,NO_PROBLEM,5,2017-05-10 06:58:30,1494410310,1494385200


In [72]:
clean_boardings.write.csv(path=exp_data_folder_path+'/clean_boardings')

### Analyzing Boarding data

In [83]:
user_boarding_w = Window.partitionBy(clean_boardings.cardNum).orderBy(clean_boardings.card_timestamp_in_secs)

In [93]:
od_matrix = clean_boardings.withColumn('d_lat',F.lead(F.col('gpsLat')).over(user_boarding_w)). \
                            withColumn('d_lat',F.lead(F.col('gpsLat')).over(user_boarding_w)). \

In [94]:
print_df(od_matrix)

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs,d_lat
0,625,GA165,2017-05-10,460,01/08/77,"10/05/17 07:40:11,000000",GRAMADOS,1080534,F,1494412811,...,-49.274506,19.57775,07:40:51,35747,NO_PROBLEM,0,2017-05-10 07:40:51,1494412851,1494385200,-25.428238
1,21,DR102,2017-05-10,507,01/08/77,"10/05/17 08:27:29,000000",INTERB II ANTI H,1080534,F,1494415649,...,-49.230296,39.798256,08:27:55,30192,NO_PROBLEM,0,2017-05-10 08:27:55,1494415675,1494385200,
2,777,JC003,2017-05-10,747,24/08/01,"10/05/17 12:27:28,000000",V. VELHA,1600147,F,1494430048,...,-49.336276,3.516852,12:27:32,33624,NO_PROBLEM,3,2017-05-10 12:27:32,1494430052,1494385200,
3,777,JC008,2017-05-10,1252,24/08/01,"10/05/17 20:52:13,000000",V. VELHA,1600147,F,1494460333,...,,,20:52:37,32903,BETWEEN,0,2017-05-10 20:52:37,1494460357,1494385200,
4,828,LA001,2017-05-10,414,01/06/71,"10/05/17 06:54:54,000000",C.COMP/C.RASO,2083421,F,1494410094,...,-49.32405,6.740887,06:54:15,33811,NO_PROBLEM,0,2017-05-10 06:54:15,1494410055,1494385200,-25.477115
5,614,GR403,2017-05-10,1071,01/06/71,"10/05/17 17:51:49,000000",FAZENDINHA/PUC,2083421,F,1494449509,...,-49.323938,16.106558,17:51:15,31636,NO_PROBLEM,0,2017-05-10 17:51:15,1494449475,1494385200,
6,30,KB699,2017-05-10,527,07/08/91,"10/05/17 08:47:49,000000",INTERBAIRROS III,2167105,M,1494416869,...,-49.264663,32.41715,08:47:10,32609,TRIP_PROBLEM,0,2017-05-10 08:47:10,1494416830,1494385200,-25.51372
7,30,GR123,2017-05-10,704,07/08/91,"10/05/17 11:44:57,000000",INTERBAIRROS III,2167105,M,1494427497,...,-49.26571,75.32579,11:44:32,32848,NO_PROBLEM,0,2017-05-10 11:44:32,1494427472,1494385200,
8,338,EA193,2017-05-10,418,26/05/71,"10/05/17 06:58:30,000000",CENTENÁRIO/HAUER,2277554,F,1494410310,...,-49.205793,15.684446,06:58:30,30183,NO_PROBLEM,5,2017-05-10 06:58:30,1494410310,1494385200,
9,338,EA014,2017-05-10,1097,26/05/71,"10/05/17 18:17:10,000000",CENTENÁRIO/HAUER,2277554,F,1494451030,...,,,18:17:18,30051,BETWEEN,0,2017-05-10 18:17:18,1494451038,1494385200,


In [96]:
od_matrix = clean_boardings.withColumn('d_lat',F.when(clean_boardings.d_lat == None,
                                                      F.first(F.col('gpsLat').over(user_boarding_w))). \
                                                       otherwise(clean_boardings.d_lat))

AttributeError: 'DataFrame' object has no attribute 'd_lat'

In [None]:
#ticketing_data.filter(F.col('cardNum') == '0001080534').toPandas()

In [None]:
#clean_boardings.count()

In [None]:
#clean_boardings.select('cardNum').distinct().count()

In [None]:
#Taking a look at a sample:
#user_boardings.filter(F.col('cardNum') == '0002986469').limit(20).toPandas()

In [None]:
#Taking a look at a sample:
#user_boardings.filter(F.col('cardNum') == '0003372920').limit(20).toPandas()

In [None]:
#Taking a look at a sample:
#user_boardings.filter(F.col('cardNum') == '0002986469').limit(20).toPandas()

In [None]:
#Taking a look at a sample:
#user_boardings.filter(F.col('cardNum') == '0003372920').limit(20).toPandas()

In [None]:
#Checking if there are any duplicate boarding entries
#duplicate_board_entries = user_boardings.groupby(['cardNum','date','sec_group']).count().filter('count > 1')
#print duplicate_board_entries.count()
#print_df(duplicate_board_entries)

In [None]:
#Taking a look at a sample:
#print_df(user_boardings.filter(F.col('cardNum') == '0001884144'))

In [None]:
#print_df(user_boardings.filter(F.col('cardNum') == '0001884144').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

In [None]:
#print_df(user_boardings.filter(F.col('cardNum') == '0003826824').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

In [None]:
#print_df(user_boardings.filter(F.col('cardNum') == '0002195541').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

As we can see above, the duplicated boarding records are due to a problem with the BULMA output, as GPS records which occurred in the same trip are being associated to different trips. We will exclude such entries from our analysis.