In [73]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [74]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_file(filepath, sqlContext):
    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True,nullValue="-")
    while len(data_frame.columns) < 16:
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [
            ("_c0", "route"),
            ("_c1", "tripNum"),
            ("_c2", "shapeId"),
            ("_c3", "shapeSequence"),
            ("_c4", "shapeLat"),
            ("_c5", "shapeLon"),
            ("_c6", "distanceTraveledShape"),
            ("_c7", "busCode"),
            ("_c8", "gpsPointId"),
            ("_c9", "gpsLat"),
            ("_c10", "gpsLon"),
            ("_c11", "distanceToShapePoint"),
            ("_c12", "timestamp"),
            ("_c13", "busStopId"),
            ("_c14", "problem"),
            ("_c15", "numPassengers")
        ]
    )

    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))

    return data_frame

def print_df(df,l=10):
    return df.limit(l).toPandas()

### Read GPS Data

In [75]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [76]:
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
trips_data = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [77]:
trips_data.head(3)

[Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136916, shapeLat=-25.432724990605614, shapeLon=-49.27218701780396, distanceTraveledShape=10149.88, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:13:43', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136915, shapeLat=-25.432470122014415, shapeLon=-49.27231020615159, distanceTraveledShape=10119.046, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:45:14', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=1, shapeId=1891, shapeSequence=6136650, shapeLat=-25.413362156585787, shapeLon=-49.20592429766663, distanceTraveledShape=79.983, busCode=u'CC170', gpsPointId=None, gpsLat=-25.413378, gpsLon=-49.205836, distanceToShapePoint=9.041484, timestamp=u'05:41:14', busStopId=None, problem=

In [78]:
trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = false)



### Read Bus Card Data

In [79]:
ticketing_data = sqlContext.read.json(exp_data_folder_path + '/ticketing-sample/doc1-2017051115.txt')

In [80]:
#Renaming columns to english
ticketing_data = ticketing_data.select(F.col("CODLINHA").alias("route"),
                                       F.col("CODVEICULO").alias("busCode"),
                                       F.col("DATANASCIMENTO").alias("userBirthdate"),
                                       F.col("DATAUTILIZACAO").alias("cardTimestamp"),
                                       F.col("NOMELINHA").alias("lineName"),
                                       F.col("NUMEROCARTAO").alias("cardNum"),
                                       F.col("SEXO").alias("gender"))

In [81]:
ticketing_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)



In [82]:
ticketing_data.head(5)

[Row(route=u'021', busCode=u'08046', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 20:15:16,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'021', busCode=u'08027', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 13:10:24,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'623', busCode=u'HA022', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 08:23:45,000000', lineName=u'PQ.INDUSTRIAL', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'000', busCode=u'03023', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 11:54:19,000000', lineName=u'OPER S/LINHA', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'TPH', busCode=u'03019', userBirthdate=u'23/11/79', cardTimestamp=u'10/05/17 13:30:10,000000', lineName=u'TERMINAL PINHEIRINHO', cardNum=u'0002425635', gender=u'F')]

### Pre-processing GPS data

In [83]:
#Warning: Both data sources dates refer to previous day, not to the day in the file name. Fixing this issue here to match bus card data.
trips_data = trips_data.withColumn("date", F.date_sub(F.col("date"), 1))
trips_data = trips_data.withColumn("gps_timestamp", F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
trips_data = trips_data.withColumn("gps_timestamp_in_secs", F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))
trips_data = trips_data.withColumn("gps_date_in_secs", F.unix_timestamp(F.col("date"), "yyyy-MM-dd"))

In [84]:
#trips_data.select(["date","gps_date_in_secs","timestamp","gps_timestamp_in_secs"]).show()

In [85]:
def get_N_sec_group(timestamp_in_secs,date_in_secs,N):
    secs_since_midnight = timestamp_in_secs-date_in_secs
    return(F.floor(secs_since_midnight/N))

In [86]:
trips_data = trips_data.withColumn("sec_group",get_N_sec_group(F.col("gps_timestamp_in_secs"),F.col("gps_date_in_secs"),60))
#trips_data.select(["gps_timestamp","gps_timestamp_in_secs","sec_group"]).show()

## Pre-processing Bus Card data

In [87]:
ticketing_data = ticketing_data.withColumn("card_timestamp_in_secs", F.unix_timestamp(F.col("cardTimestamp"), "dd/MM/yy HH:mm:ss"))
ticketing_data = ticketing_data.withColumn("date",F.from_unixtime(F.col("card_timestamp_in_secs"), "yyyy-MM-dd"))
ticketing_data = ticketing_data.withColumn("card_date_in_secs",F.unix_timestamp(F.col("date"),"yyyy-MM-dd"))
#ticketing_data.select(["cardTimestamp","card_timestamp_in_secs","date","card_date_in_secs"]).show()

In [88]:
ticketing_data = ticketing_data.withColumn("sec_group",get_N_sec_group(F.col("card_timestamp_in_secs"),F.col("card_date_in_secs"),60))
#ticketing_data.select(["cardTimestamp","card_timestamp_in_secs","sec_group"]).orderBy("card_timestamp_in_secs").show()

In [89]:
trips_data.count()

7900150

In [90]:
ticketing_data.count()

306906

### Removing duplicate GPS records (occurred in the same time period)

In [91]:
filtered_trips_data = trips_data.na.drop(subset=["route","busCode","busStopId","sec_group"]).dropDuplicates(["route","busCode", "tripNum", "date","sec_group"])
print filtered_trips_data.count()
#filtered_trips_data.select(["route","busCode", "tripNum","busStopId", "sec_group"]).orderBy("route","busCode","tripNum","busStopId").limit(20).toPandas()

414149


### Removing Duplicate entries in ticketing data

In [92]:
ticketing_data = ticketing_data.dropDuplicates(["route","busCode","cardNum","date","sec_group"])

In [93]:
ticketing_data.count()

292993

### Merging GPS and ticketing data 

In [94]:
#ticketing_data.select(['route','busCode','date','sec_group']).orderBy(['route','busCode','sec_group']).limit(20).toPandas()

In [95]:
#filtered_trips_data.select(['route','busCode','date','sec_group']).orderBy(['route','busCode','sec_group']).limit(20).toPandas()

In [96]:
user_boardings = ticketing_data.join(filtered_trips_data, ['route','busCode','date','sec_group'], 'inner')

In [97]:
user_boardings.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- date: string (nullable = true)
 |-- sec_group: long (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- card_timestamp_in_secs: long (nullable = true)
 |-- card_date_in_secs: long (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (null

In [98]:
#user_boardings.select(['route','busCode','sec_group','timestamp','cardTimestamp']).limit(20).toPandas()

In [99]:
user_boardings.count()

74187

In [100]:
user_boardings.select('cardNum').distinct().count()

65960

In [101]:
#user_boardings.select(['route','busCode','tripNum','busStopId','sec_group','cardNum','cardTimestamp','gps_timestamp']).orderBy('cardNum','cardTimestamp').limit(20).toPandas()

### Removing duplicated boarding data

In [102]:
#Keeping only non-duplicated boarding data
non_duplicated_boardings = user_boardings.groupby(['cardNum','date','sec_group']).count().filter('count == 1')

In [126]:
non_duplicated_boardings = non_duplicated_boardings.select(F.col("cardNum").alias("cardNum1"),
                                       F.col("date").alias("date1"),
                                       F.col("sec_group").alias("sec_group1"),
                                       F.col("count").alias("count1"))

In [127]:
non_duplicated_boardings.show()

+----------+----------+----------+------+
|  cardNum1|     date1|sec_group1|count1|
+----------+----------+----------+------+
|0002793259|2017-05-10|       800|     1|
|0003655283|2017-05-10|       495|     1|
|0003588558|2017-05-10|       431|     1|
|0002401786|2017-05-10|       417|     1|
|0002066650|2017-05-10|       457|     1|
|0003654850|2017-05-10|       342|     1|
|0002650979|2017-05-10|       850|     1|
|0003386754|2017-05-10|       403|     1|
|0002266612|2017-05-10|       342|     1|
|0003813755|2017-05-10|      1152|     1|
|0003806918|2017-05-10|       371|     1|
|0003098051|2017-05-10|       458|     1|
|0003149580|2017-05-10|      1076|     1|
|0002935536|2017-05-10|       834|     1|
|0003680105|2017-05-10|       432|     1|
|0003779782|2017-05-10|       777|     1|
|0002082975|2017-05-10|       570|     1|
|0001045338|2017-05-10|       562|     1|
|0003037502|2017-05-10|       387|     1|
|0001578775|2017-05-10|       412|     1|
+----------+----------+----------+

In [103]:
non_duplicated_boardings.select('cardNum').distinct().count()

65833

In [104]:
user_boardings.select('cardNum').distinct().count()

65960

In [129]:
filtered_boardings = user_boardings.join(non_duplicated_boardings, user_boardings.cardNum == non_duplicated_boardings.cardNum1, 'leftsemi')

In [131]:
print filtered_boardings.count()
print_df(filtered_boardings)

73931


Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,643,GA164,2017-05-10,715,27/04/84,"10/05/17 11:55:20,000000",UMBARÁ,328918,F,1494428120,...,-25.537648,-49.294951,10.533369,11:55:17,32046,NO_PROBLEM,1,2017-05-10 11:55:17,1494428117,1494385200
1,548,GA205,2017-05-10,380,21/01/82,"10/05/17 06:20:30,000000",OSTERN/S.CERCADO,353208,F,1494408030,...,-25.563181,-49.253741,7.823349,06:20:23,48762,NO_PROBLEM,9,2017-05-10 06:20:23,1494408023,1494385200
2,876,BC299,2017-05-10,487,12/12/79,"10/05/17 08:07:11,000000",SAVÓIA,425429,M,1494414431,...,,,,08:07:36,34223,BETWEEN,0,2017-05-10 08:07:36,1494414456,1494385200
3,332,DA019,2017-05-10,370,28/02/72,"10/05/17 06:10:07,000000",ACRÓPOLE,430891,F,1494407407,...,-25.46651,-49.195546,8.007099,06:10:50,32836,NO_PROBLEM,0,2017-05-10 06:10:50,1494407450,1494385200
4,650,HA287,2017-05-10,430,03/05/74,"10/05/17 07:10:25,000000",STA.RITA/PINHEIRINHO,456908,F,1494411025,...,-25.566778,-49.332958,4.608405,07:10:14,36294,NO_PROBLEM,3,2017-05-10 07:10:14,1494411014,1494385200
5,366,BC908,2017-05-10,491,11/06/53,"10/05/17 08:11:50,000000",ITUPAV/H.MILITAR,537146,F,1494414710,...,-25.430113,-49.267255,29.698534,08:11:12,26376,NO_PROBLEM,3,2017-05-10 08:11:12,1494414672,1494385200
6,20,JB601,2017-05-10,778,09/11/66,"10/05/17 12:58:41,000000",INTERBAIRR II H,570345,M,1494431921,...,-25.389128,-49.263821,28.195213,12:58:46,32685,NO_PROBLEM,1,2017-05-10 12:58:46,1494431926,1494385200
7,535,EA076,2017-05-10,391,26/10/60,"10/05/17 06:31:08,000000",OSTERNACK/BOQ.,594815,F,1494408668,...,-25.54914,-49.246665,8.406152,06:31:08,36724,NO_PROBLEM,7,2017-05-10 06:31:08,1494408668,1494385200
8,972,MC491,2017-05-10,484,29/11/41,"10/05/17 08:04:55,000000",JD. ITÁLIA,607182,M,1494414295,...,-25.423941,-49.290993,9.131085,08:04:05,32797,NO_PROBLEM,1,2017-05-10 08:04:05,1494414245,1494385200
9,703,JC301,2017-05-10,1032,26/01/56,"10/05/17 17:12:00,000000",CAIUÁ,743319,F,1494447120,...,-25.450145,-49.290473,26.893307,17:12:32,32110,NO_PROBLEM,2,2017-05-10 17:12:32,1494447152,1494385200


### Removing single-trip users records

In [132]:
boarding_count = filtered_boardings.groupby('cardNum').count()

In [133]:
#Filtering only users with more than one ride per day
multiple_boardings = boarding_count.filter(F.col('count') > 1)

In [134]:
passengers_mult_boardings = multiple_boardings.count()
prop_mult_boardings = 100*(passengers_mult_boardings)/total_passengers
print "Passengers with Multiple Boardings:", passengers_mult_boardings, "(", prop_mult_boardings, "%)" 
multiple_boardings.show()

Passengers with Multiple Boardings: 7466 ( 11 %)
+----------+-----+
|   cardNum|count|
+----------+-----+
|0001080534|    2|
|0001600147|    2|
|0002083421|    2|
|0002167105|    2|
|0002277554|    2|
|0002306754|    2|
|0002578507|    2|
|0002602692|    2|
|0002633338|    2|
|0002651765|    2|
|0002699865|    2|
|0002752794|    2|
|0002786520|    2|
|0002986469|    3|
|0003118767|    2|
|0003205994|    2|
|0003269568|    2|
|0003276005|    2|
|0003359882|    2|
|0003372920|    4|
+----------+-----+
only showing top 20 rows



### Analyzing Boarding data

In [135]:
total_passengers = user_boardings.select('cardNum').distinct().count()
print "Total #Passengers:", total_passengers

Total #Passengers: 65960


In [136]:
ticketing_data.filter(F.col('cardNum') == '0001080534').toPandas()

Unnamed: 0,route,busCode,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,date,card_date_in_secs,sec_group
0,625,GA165,01/08/77,"10/05/17 07:40:11,000000",GRAMADOS,1080534,F,1494412811,2017-05-10,1494385200,460
1,21,DR102,01/08/77,"10/05/17 08:27:29,000000",INTERB II ANTI H,1080534,F,1494415649,2017-05-10,1494385200,507


In [49]:
clean_boardings.count()

73875

In [50]:
clean_boardings.select('cardNum').distinct().count()

65833

In [137]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0002986469').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,182,BC310,2017-05-10,774,18/05/73,"10/05/17 12:54:19,000000",ABRANCHES,2986469,M,1494431659,...,,,,12:54:05,30800,BETWEEN,2,2017-05-10 12:54:05,1494431645,1494385200
1,243,BA299,2017-05-10,431,18/05/73,"10/05/17 07:11:45,000000",STA. TEREZINHA,2986469,M,1494411105,...,-25.350926,-49.26875,45.49196,07:11:14,34522,NO_PROBLEM,1,2017-05-10 07:11:14,1494411074,1494385200
2,243,BA299,2017-05-10,810,18/05/73,"10/05/17 13:30:54,000000",STA. TEREZINHA,2986469,M,1494433854,...,-25.350905,-49.268806,39.894855,13:30:24,34522,NO_PROBLEM,0,2017-05-10 13:30:24,1494433824,1494385200


In [138]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0003372920').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,777,JC006,2017-05-10,1049,28/02/70,"10/05/17 17:29:47,000000",V. VELHA,3372920,F,1494448187,...,-25.45985,-49.319845,2.983394,17:29:08,33567,NO_PROBLEM,1,2017-05-10 17:29:08,1494448148,1494385200
1,703,JC314,2017-05-10,733,28/02/70,"10/05/17 12:13:31,000000",CAIUÁ,3372920,F,1494429211,...,-25.451515,-49.294311,15.038858,12:13:42,32829,NO_PROBLEM,0,2017-05-10 12:13:42,1494429222,1494385200
2,777,JC008,2017-05-10,1257,28/02/70,"10/05/17 20:57:15,000000",V. VELHA,3372920,F,1494460635,...,,,,20:57:33,32829,BETWEEN,0,2017-05-10 20:57:33,1494460653,1494385200
3,776,JC002,2017-05-10,384,28/02/70,"10/05/17 06:24:52,000000",CARMELA DUTRA,3372920,F,1494408292,...,,,,06:24:38,33567,BETWEEN,1,2017-05-10 06:24:38,1494408278,1494385200


In [40]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0002986469').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,182,BC310,2017-05-10,774,18/05/73,"10/05/17 12:54:19,000000",ABRANCHES,2986469,M,1494431659,...,,,,12:54:05,30800,BETWEEN,2,2017-05-10 12:54:05,1494431645,1494385200
1,243,BA299,2017-05-10,431,18/05/73,"10/05/17 07:11:45,000000",STA. TEREZINHA,2986469,M,1494411105,...,-25.350926,-49.26875,45.49196,07:11:14,34522,NO_PROBLEM,1,2017-05-10 07:11:14,1494411074,1494385200
2,243,BA299,2017-05-10,810,18/05/73,"10/05/17 13:30:54,000000",STA. TEREZINHA,2986469,M,1494433854,...,-25.350905,-49.268806,39.894855,13:30:24,34522,NO_PROBLEM,0,2017-05-10 13:30:24,1494433824,1494385200


In [41]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0003372920').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,777,JC006,2017-05-10,1049,28/02/70,"10/05/17 17:29:47,000000",V. VELHA,3372920,F,1494448187,...,-25.45985,-49.319845,2.983394,17:29:08,33567,NO_PROBLEM,1,2017-05-10 17:29:08,1494448148,1494385200
1,703,JC314,2017-05-10,733,28/02/70,"10/05/17 12:13:31,000000",CAIUÁ,3372920,F,1494429211,...,-25.451515,-49.294311,15.038858,12:13:42,32829,NO_PROBLEM,0,2017-05-10 12:13:42,1494429222,1494385200
2,777,JC008,2017-05-10,1257,28/02/70,"10/05/17 20:57:15,000000",V. VELHA,3372920,F,1494460635,...,,,,20:57:33,32829,BETWEEN,0,2017-05-10 20:57:33,1494460653,1494385200
3,776,JC002,2017-05-10,384,28/02/70,"10/05/17 06:24:52,000000",CARMELA DUTRA,3372920,F,1494408292,...,,,,06:24:38,33567,BETWEEN,1,2017-05-10 06:24:38,1494408278,1494385200


In [42]:
#Checking if there are any duplicate boarding entries
duplicate_board_entries = user_boardings.groupby(['cardNum','date','sec_group']).count().filter('count > 1')
print duplicate_board_entries.count()
print_df(duplicate_board_entries)

156


Unnamed: 0,cardNum,date,sec_group,count
0,1884144,2017-05-10,496,2
1,3826824,2017-05-10,1076,2
2,1194455,2017-05-10,500,2
3,3534417,2017-05-10,637,2
4,3824163,2017-05-10,484,2
5,3547094,2017-05-10,803,2
6,1735152,2017-05-10,375,2
7,2195541,2017-05-10,364,2
8,3476589,2017-05-10,810,2
9,3562158,2017-05-10,500,2


In [43]:
#Taking a look at a sample:
print_df(user_boardings.filter(F.col('cardNum') == '0001884144'))

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,205,BC306,2017-05-10,971,02/10/85,"10/05/17 16:11:54,000000",BARREIRINHA,1884144,F,1494443514,...,,,,16:11:11,35642,BETWEEN,2,2017-05-10 16:11:11,1494443471,1494385200
1,242,BA128,2017-05-10,496,02/10/85,"10/05/17 08:16:28,000000",V. LEONICE,1884144,F,1494414988,...,-25.349963,-49.253775,7.314722,08:16:32,2154,NO_PROBLEM,0,2017-05-10 08:16:32,1494414992,1494385200
2,242,BA128,2017-05-10,496,02/10/85,"10/05/17 08:16:28,000000",V. LEONICE,1884144,F,1494414988,...,-25.351126,-49.253041,36.294613,08:16:14,33115,NO_PROBLEM,2,2017-05-10 08:16:14,1494414974,1494385200


In [44]:
print_df(user_boardings.filter(F.col('cardNum') == '0001884144').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

Unnamed: 0,route,busCode,tripNum,date,sec_group,gps_timestamp,problem,timestamp,cardTimestamp
0,205,BC306,11,2017-05-10,971,2017-05-10 16:11:11,BETWEEN,16:11:11,"10/05/17 16:11:54,000000"
1,242,BA128,4,2017-05-10,496,2017-05-10 08:16:32,NO_PROBLEM,08:16:32,"10/05/17 08:16:28,000000"
2,242,BA128,3,2017-05-10,496,2017-05-10 08:16:14,NO_PROBLEM,08:16:14,"10/05/17 08:16:28,000000"


In [45]:
print_df(user_boardings.filter(F.col('cardNum') == '0003826824').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

Unnamed: 0,route,busCode,tripNum,date,sec_group,gps_timestamp,problem,timestamp,cardTimestamp
0,777,JC008,10,2017-05-10,1076,2017-05-10 17:56:49,NO_PROBLEM,17:56:49,"10/05/17 17:56:45,000000"
1,777,JC008,11,2017-05-10,1076,2017-05-10 17:56:49,NO_PROBLEM,17:56:49,"10/05/17 17:56:45,000000"


In [46]:
print_df(user_boardings.filter(F.col('cardNum') == '0002195541').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

Unnamed: 0,route,busCode,tripNum,date,sec_group,gps_timestamp,problem,timestamp,cardTimestamp
0,684,HA019,2,2017-05-10,364,2017-05-10 06:04:35,TRIP_PROBLEM,06:04:35,"10/05/17 06:04:21,000000"
1,684,HA019,1,2017-05-10,364,2017-05-10 06:04:35,NO_PROBLEM,06:04:35,"10/05/17 06:04:21,000000"


As we can see above, the duplicated boarding records are due to a problem with the BULMA output, as GPS records which occurred in the same trip are being associated to different trips. We will exclude such entries from our analysis.