In [190]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [266]:
def rename_columns(df, list_of_tuples):
    """Rename several columns of a frame in one call.

    Args:
        df: object exposing ``withColumnRenamed(old, new)`` that returns the
            renamed frame (a Spark DataFrame in this notebook).
        list_of_tuples: iterable of ``(old_col, new_col)`` pairs, applied in
            the order given.

    Returns:
        The frame obtained after applying every rename; ``df`` itself when
        ``list_of_tuples`` is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_file(filepath, sqlContext):
    """Read a headerless CSV file into a DataFrame with named columns and a date.

    The file is read with schema inference and "-" treated as null. If it has
    fewer than 16 columns, null-valued columns are appended until there are 16,
    and the positional columns ``_c0`` .. ``_c15`` are then given the names
    listed below. A constant ``date`` column is added, built from the name of
    the file's parent directory: its first three "_"-separated tokens joined
    with "-" (e.g. ".../2017_05_11_veiculos.csv/part-00000" -> "2017-05-11").

    Args:
        filepath: "/"-separated path of the CSV part file.
        sqlContext: context whose ``read.csv`` is used to load the file.

    Returns:
        The renamed DataFrame with the extra ``date`` column.
    """
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "numPassengers",
    ]

    frame = sqlContext.read.csv(filepath, header=False,
                                inferSchema=True, nullValue="-")

    # Pad short files with null columns so every positional name below exists.
    for position in range(len(frame.columns), 16):
        frame = frame.withColumn("_c" + str(position), F.lit(None))

    positional_pairs = [("_c" + str(position), name)
                        for position, name in enumerate(column_names)]
    frame = rename_columns(frame, positional_pairs)

    # The parent directory name carries the date, e.g. "2017_05_11_veiculos.csv".
    parent_dir = filepath.split("/")[-2]
    date = "-".join(parent_dir.split("_")[:3])

    return frame.withColumn("date", F.lit(date))

def print_df(df,l=20):
    """Return the first ``l`` rows of ``df`` converted with ``toPandas()``.

    Despite the name nothing is printed: the converted frame is returned so
    that the notebook renders it as the cell output.

    Args:
        df: object exposing ``limit(n)`` whose result exposes ``toPandas()``.
        l: maximum number of rows to keep (default 20).
    """
    first_rows = df.limit(l)
    return first_rows.toPandas()

### Read GPS Data

In [192]:
# Reuse the active SparkContext if there is one (otherwise create it) and wrap
# it in the SQLContext whose readers load the data below.
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [193]:
# Root folder of the sample data (absolute path on the author's machine).
# NOTE(review): the folder already ends with "/" and the suffixes appended to it
# start with "/", so the resulting paths contain "//".
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
# read_file takes the date from the parent directory name ("2017_05_11_veiculos.csv").
trips_data = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [194]:
trips_data.head(3)

[Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136916, shapeLat=-25.432724990605614, shapeLon=-49.27218701780396, distanceTraveledShape=10149.88, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:13:43', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136915, shapeLat=-25.432470122014415, shapeLon=-49.27231020615159, distanceTraveledShape=10119.046, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:45:14', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=1, shapeId=1891, shapeSequence=6136650, shapeLat=-25.413362156585787, shapeLon=-49.20592429766663, distanceTraveledShape=79.983, busCode=u'CC170', gpsPointId=None, gpsLat=-25.413378, gpsLon=-49.205836, distanceToShapePoint=9.041484, timestamp=u'05:41:14', busStopId=None, problem=

In [195]:
trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = false)



### Read Bus Card Data

In [196]:
# Load the bus card (ticketing) records; the .txt file is parsed with the JSON reader.
ticketing_data = sqlContext.read.json(exp_data_folder_path + '/ticketing-sample/doc1-2017051115.txt')

In [197]:
# Project the ticketing records onto seven fields, renaming each Portuguese
# source column to its English name. A generator expression is used so the loop
# variables do not leak into the notebook namespace under Python 2.
ticketing_data = ticketing_data.select(*(
    F.col(source_name).alias(english_name)
    for source_name, english_name in (
        ("CODLINHA", "route"),
        ("CODVEICULO", "busCode"),
        ("DATANASCIMENTO", "userBirthdate"),
        ("DATAUTILIZACAO", "cardTimestamp"),
        ("NOMELINHA", "lineName"),
        ("NUMEROCARTAO", "cardNum"),
        ("SEXO", "gender"),
    )
))

In [198]:
ticketing_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)



In [199]:
ticketing_data.head(5)

[Row(route=u'021', busCode=u'08046', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 20:15:16,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'021', busCode=u'08027', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 13:10:24,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'623', busCode=u'HA022', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 08:23:45,000000', lineName=u'PQ.INDUSTRIAL', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'000', busCode=u'03023', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 11:54:19,000000', lineName=u'OPER S/LINHA', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'TPH', busCode=u'03019', userBirthdate=u'23/11/79', cardTimestamp=u'10/05/17 13:30:10,000000', lineName=u'TERMINAL PINHEIRINHO', cardNum=u'0002425635', gender=u'F')]

### Pre-processing GPS data

In [200]:
# Warning: the dates in both data sources refer to the previous day, not to the
# day in the file name. Shift the GPS date back by one day here so that it
# matches the bus card data.
# NOTE: this cell reassigns trips_data from itself, so running it again shifts
# "date" back by one more day.
trips_data = (
    trips_data
    .withColumn("date", F.date_sub(F.col("date"), 1))
    # Full timestamp text "<date> <timestamp>" and its value in epoch seconds.
    .withColumn("gps_timestamp",
                F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
    .withColumn("gps_timestamp_in_secs",
                F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))
    # Epoch seconds of the date alone, used as the reference for sec_group.
    .withColumn("gps_date_in_secs",
                F.unix_timestamp(F.col("date"), "yyyy-MM-dd"))
)

In [201]:
trips_data.select(["date","gps_date_in_secs","timestamp","gps_timestamp_in_secs"]).show()

+----------+----------------+---------+---------------------+
|      date|gps_date_in_secs|timestamp|gps_timestamp_in_secs|
+----------+----------------+---------+---------------------+
|2017-05-10|      1494385200| 12:13:43|           1494429223|
|2017-05-10|      1494385200| 12:45:14|           1494431114|
|2017-05-10|      1494385200| 05:41:14|           1494405674|
|2017-05-10|      1494385200| 05:41:16|           1494405676|
|2017-05-10|      1494385200| 05:41:20|           1494405680|
|2017-05-10|      1494385200| 05:41:27|           1494405687|
|2017-05-10|      1494385200| 05:41:31|           1494405691|
|2017-05-10|      1494385200| 05:41:33|           1494405693|
|2017-05-10|      1494385200| 05:41:37|           1494405697|
|2017-05-10|      1494385200| 05:41:45|           1494405705|
|2017-05-10|      1494385200| 05:41:47|           1494405707|
|2017-05-10|      1494385200| 05:41:52|           1494405712|
|2017-05-10|      1494385200| 05:42:00|           1494405720|
|2017-05

In [202]:
def get_N_sec_group(timestamp_in_secs,date_in_secs,N):
    """Return the index of the N-second window a timestamp falls into.

    Computes ``F.floor((timestamp_in_secs - date_in_secs) / N)``.

    Args:
        timestamp_in_secs: event time in seconds (a Spark Column at the call
            sites in this notebook).
        date_in_secs: reference time in seconds that the windows are counted
            from (the epoch seconds of the record's date at the call sites).
        N: window length in seconds.
    """
    elapsed_secs = timestamp_in_secs - date_in_secs
    window_index = F.floor(elapsed_secs / N)
    return window_index

In [203]:
# Assign every GPS record to a 60-second window counted from gps_date_in_secs.
trips_data = trips_data.withColumn(
    "sec_group",
    get_N_sec_group(timestamp_in_secs=F.col("gps_timestamp_in_secs"),
                    date_in_secs=F.col("gps_date_in_secs"),
                    N=60))
trips_data.select("gps_timestamp", "gps_timestamp_in_secs", "sec_group").show()

+-------------------+---------------------+---------+
|      gps_timestamp|gps_timestamp_in_secs|sec_group|
+-------------------+---------------------+---------+
|2017-05-10 12:13:43|           1494429223|      733|
|2017-05-10 12:45:14|           1494431114|      765|
|2017-05-10 05:41:14|           1494405674|      341|
|2017-05-10 05:41:16|           1494405676|      341|
|2017-05-10 05:41:20|           1494405680|      341|
|2017-05-10 05:41:27|           1494405687|      341|
|2017-05-10 05:41:31|           1494405691|      341|
|2017-05-10 05:41:33|           1494405693|      341|
|2017-05-10 05:41:37|           1494405697|      341|
|2017-05-10 05:41:45|           1494405705|      341|
|2017-05-10 05:41:47|           1494405707|      341|
|2017-05-10 05:41:52|           1494405712|      341|
|2017-05-10 05:42:00|           1494405720|      342|
|2017-05-10 05:42:03|           1494405723|      342|
|2017-05-10 05:42:08|           1494405728|      342|
|2017-05-10 05:42:12|       

### Pre-processing Bus Card data

In [204]:
# cardTimestamp values look like "10/05/17 20:15:16,000000". The pattern below
# only describes the part before the comma, so the ",000000" suffix is stripped
# before parsing: lenient parsers ignore leftover text, but strict ones (such as
# the default datetime parser of Spark 3) reject input that the pattern does not
# fully consume.
ticketing_data = ticketing_data.withColumn(
    "card_timestamp_in_secs",
    F.unix_timestamp(F.split(F.col("cardTimestamp"), ",").getItem(0), "dd/MM/yy HH:mm:ss"))
# Day of the boarding as "yyyy-MM-dd" text, and the epoch seconds of that day
# (the reference that sec_group is counted from).
ticketing_data = ticketing_data.withColumn("date",F.from_unixtime(F.col("card_timestamp_in_secs"), "yyyy-MM-dd"))
ticketing_data = ticketing_data.withColumn("card_date_in_secs",F.unix_timestamp(F.col("date"),"yyyy-MM-dd"))
ticketing_data.select(["cardTimestamp","card_timestamp_in_secs","date","card_date_in_secs"]).show()

+--------------------+----------------------+----------+-----------------+
|       cardTimestamp|card_timestamp_in_secs|      date|card_date_in_secs|
+--------------------+----------------------+----------+-----------------+
|10/05/17 20:15:16...|            1494458116|2017-05-10|       1494385200|
|10/05/17 13:10:24...|            1494432624|2017-05-10|       1494385200|
|10/05/17 08:23:45...|            1494415425|2017-05-10|       1494385200|
|10/05/17 11:54:19...|            1494428059|2017-05-10|       1494385200|
|10/05/17 13:30:10...|            1494433810|2017-05-10|       1494385200|
|10/05/17 07:52:52...|            1494413572|2017-05-10|       1494385200|
|10/05/17 18:34:06...|            1494452046|2017-05-10|       1494385200|
|10/05/17 06:15:31...|            1494407731|2017-05-10|       1494385200|
|10/05/17 17:57:28...|            1494449848|2017-05-10|       1494385200|
|10/05/17 10:03:56...|            1494421436|2017-05-10|       1494385200|
|10/05/17 13:35:56...|   

In [207]:
# Assign every card record to a 60-second window counted from card_date_in_secs,
# the same bucketing applied to the GPS records.
ticketing_data = ticketing_data.withColumn(
    "sec_group",
    get_N_sec_group(timestamp_in_secs=F.col("card_timestamp_in_secs"),
                    date_in_secs=F.col("card_date_in_secs"),
                    N=60))
(ticketing_data
    .select("cardTimestamp", "card_timestamp_in_secs", "sec_group")
    .orderBy("card_timestamp_in_secs")
    .show())

+--------------------+----------------------+---------+
|       cardTimestamp|card_timestamp_in_secs|sec_group|
+--------------------+----------------------+---------+
|10/05/17 00:20:55...|            1494386455|       20|
|10/05/17 00:31:16...|            1494387076|       31|
|10/05/17 00:31:23...|            1494387083|       31|
|10/05/17 00:35:54...|            1494387354|       35|
|10/05/17 00:40:08...|            1494387608|       40|
|10/05/17 00:41:31...|            1494387691|       41|
|10/05/17 00:46:24...|            1494387984|       46|
|10/05/17 00:46:33...|            1494387993|       46|
|10/05/17 00:53:52...|            1494388432|       53|
|10/05/17 00:56:33...|            1494388593|       56|
|10/05/17 01:00:49...|            1494388849|       60|
|10/05/17 01:01:39...|            1494388899|       61|
|10/05/17 01:01:43...|            1494388903|       61|
|10/05/17 01:05:56...|            1494389156|       65|
|10/05/17 01:07:46...|            1494389266|   

In [208]:
trips_data.count()

7900150

In [209]:
ticketing_data.count()

306906

### Removing duplicate GPS records (occurred in the same time period)

In [242]:
# Drop GPS rows with a null route, busCode, busStopId or sec_group, then keep a
# single row per (route, busCode, tripNum, date, sec_group).
# NOTE(review): dropDuplicates does not say which of the duplicate rows is kept.
filtered_trips_data = trips_data.na.drop(subset=["route","busCode","busStopId","sec_group"]).dropDuplicates(["route","busCode", "tripNum", "date","sec_group"])
# Single-argument print(...) prints the same under Python 2 and Python 3.
print(filtered_trips_data.count())
filtered_trips_data.select(["route","busCode", "tripNum","busStopId", "sec_group"]).orderBy("route","busCode","tripNum","busStopId").limit(20).toPandas()

414149


Unnamed: 0,route,busCode,tripNum,busStopId,sec_group
0,1,BN997,1,26166,389
1,1,BN997,1,26360,401
2,1,BN997,1,26375,395
3,1,BN997,1,29082,394
4,1,BN997,1,29420,393
5,1,BN997,1,29887,381
6,1,BN997,1,30748,384
7,1,BN997,1,31453,382
8,1,BN997,1,35216,386
9,1,BN997,1,35219,391


### Removing Duplicate entries in ticketing data

In [245]:
# Keep one card record per (route, busCode, cardNum, date, sec_group), i.e. repeated
# records of the same card on the same bus within the same 60-second window collapse to one.
ticketing_data = ticketing_data.dropDuplicates(["route","busCode","cardNum","date","sec_group"])

In [246]:
ticketing_data.count()

292993

### Merging GPS and ticketing data 

In [215]:
ticketing_data.select(['route','busCode','date','sec_group']).orderBy(['route','busCode','sec_group']).limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group
0,0,1,2017-05-10,295
1,0,1,2017-05-10,295
2,0,1,2017-05-10,301
3,0,1,2017-05-10,326
4,0,1,2017-05-10,356
5,0,1,2017-05-10,357
6,0,1,2017-05-10,358
7,0,1,2017-05-10,364
8,0,1,2017-05-10,364
9,0,1,2017-05-10,365


In [226]:
filtered_trips_data.select(['route','busCode','date','sec_group']).orderBy(['route','busCode','sec_group']).limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group
0,1,BN997,2017-05-10,381
1,1,BN997,2017-05-10,382
2,1,BN997,2017-05-10,384
3,1,BN997,2017-05-10,386
4,1,BN997,2017-05-10,389
5,1,BN997,2017-05-10,391
6,1,BN997,2017-05-10,393
7,1,BN997,2017-05-10,394
8,1,BN997,2017-05-10,395
9,1,BN997,2017-05-10,401


In [None]:
#def get_timediff(timeA,timeB):
#    if timeA == None or timeB == None:
#        return -1
#    else:
#        return timeB-timeA
#   
#get_timediff_udf = F.udf(get_timediff, T.LongType())
#
#single_located_buses = filtered_trips_data.withColumn("timediff",(filtered_trips_data.gps_timestamp_in_secs - F.lag(filtered_trips_data.gps_timestamp_in_secs)
#.over(Window.partitionBy(["route","busCode","sec_group"]) \
#.orderBy("gps_timestamp_in_secs"))))

In [None]:
#single_located_buses.select(['route','busCode','busStopId','gps_timestamp','sec_group','gps_timestamp_in_secs','timediff']).orderBy(['route','busCode','busStopId','gps_timestamp','sec_group']).limit(20).toPandas()

In [None]:
#single_located_buses = filtered_trips_data.dropDuplicates(subset=['route','busCode','sec_group'])

In [None]:
#single_located_buses.count()

In [247]:
# Match each card record with the GPS rows of the same route, bus and date that
# fall in the same sec_group window; records without a match are dropped.
user_boardings = ticketing_data.join(
    filtered_trips_data,
    on=['route', 'busCode', 'date', 'sec_group'],
    how='inner')

In [248]:
user_boardings.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- date: string (nullable = true)
 |-- sec_group: long (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- card_timestamp_in_secs: long (nullable = true)
 |-- card_date_in_secs: long (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (null

In [249]:
user_boardings.select(['route','busCode','sec_group','timestamp','cardTimestamp']).limit(20).toPandas()

Unnamed: 0,route,busCode,sec_group,timestamp,cardTimestamp
0,11,BB306,825,13:45:55,"10/05/17 13:45:03,000000"
1,20,JB601,1125,18:45:15,"10/05/17 18:45:59,000000"
2,21,CB697,1015,16:55:07,"10/05/17 16:55:59,000000"
3,21,DR105,1217,20:17:21,"10/05/17 20:17:29,000000"
4,30,BB605,460,07:40:21,"10/05/17 07:40:38,000000"
5,30,BB605,1106,18:26:10,"10/05/17 18:26:42,000000"
6,30,BB608,800,13:20:46,"10/05/17 13:20:11,000000"
7,30,BR108,434,07:14:46,"10/05/17 07:14:21,000000"
8,30,CB603,893,14:53:43,"10/05/17 14:53:35,000000"
9,30,DR100,791,13:11:30,"10/05/17 13:11:23,000000"


In [250]:
user_boardings.count()

74187

In [251]:
user_boardings.select(['route','busCode','tripNum','busStopId','sec_group','cardNum','cardTimestamp','gps_timestamp']).orderBy('cardNum','cardTimestamp').limit(20).toPandas()

Unnamed: 0,route,busCode,tripNum,busStopId,sec_group,cardNum,cardTimestamp,gps_timestamp
0,828,LA053,4,35350,748,229948,"10/05/17 12:28:19,000000",2017-05-10 12:28:07
1,50,GB607,3,31751,1141,306573,"10/05/17 19:01:37,000000",2017-05-10 19:01:29
2,265,HN606,5,30640,685,307985,"10/05/17 11:25:35,000000",2017-05-10 11:25:03
3,50,LA054,3,32417,729,310241,"10/05/17 12:09:23,000000",2017-05-10 12:09:25
4,21,BB601,1,33068,370,312591,"10/05/17 06:10:17,000000",2017-05-10 06:10:59
5,372,BC312,2,26376,415,313992,"10/05/17 06:55:21,000000",2017-05-10 06:55:53
6,50,LB602,6,32415,1029,313992,"10/05/17 17:09:27,000000",2017-05-10 17:09:44
7,531,EA169,3,29960,448,314357,"10/05/17 07:28:31,000000",2017-05-10 07:28:46
8,335,DR406,3,33604,452,314918,"10/05/17 07:32:52,000000",2017-05-10 07:32:57
9,650,HA287,3,36274,507,316606,"10/05/17 08:27:53,000000",2017-05-10 08:27:25


### Analyzing Boarding data

In [252]:
ticketing_data.filter(F.col('cardNum') == '0000317896').toPandas()

Unnamed: 0,route,busCode,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,date,card_date_in_secs,sec_group
0,216,CA600,16/09/60,"10/05/17 17:05:57,000000",CABRAL / PORTÃO,317896,F,1494446757,2017-05-10,1494385200,1025
1,542,GA124,16/09/60,"10/05/17 07:01:40,000000",BAIRRO NOVO B,317896,F,1494410500,2017-05-10,1494385200,421


In [253]:
# Number of distinct cards that have at least one matched boarding.
total_passengers = user_boardings.select('cardNum').distinct().count()
# Formatted single-argument print(...): same text under Python 2 and Python 3
# (a multi-argument print(...) would show a tuple under Python 2).
print("Total #Passengers: {}".format(total_passengers))

Total #Passengers: 65960


In [254]:
# Number of matched boarding rows per card; the result has columns cardNum and count.
boarding_count = user_boardings.groupby('cardNum').count()

In [255]:
# Keep only the cards that have more than one matched boarding row.
multiple_boardings = boarding_count.where(boarding_count['count'] > 1)

In [256]:
# Number of cards with more than one matched boarding, and their share (in %) of
# all cards with a matched boarding.
passengers_mult_boardings = multiple_boardings.count()
# Use a float numerator: under Python 2, int / int truncates the percentage
# (11 instead of about 11.51). Guard the denominator so an empty join result
# reports 0 % instead of raising ZeroDivisionError.
if total_passengers:
    prop_mult_boardings = 100.0 * passengers_mult_boardings / total_passengers
else:
    prop_mult_boardings = 0.0
# Formatted single-argument print(...): same text under Python 2 and Python 3.
print("Passengers with Multiple Boardings: {} ( {:.2f} %)".format(passengers_mult_boardings, prop_mult_boardings))
multiple_boardings.show()

Passengers with Multiple Boardings: 7593 ( 11 %)
+----------+-----+
|   cardNum|count|
+----------+-----+
|0002786520|    2|
|0003408486|    2|
|0002083421|    2|
|0003637275|    2|
|0002752794|    2|
|0003398072|    2|
|0003372920|    4|
|0003697174|    2|
|0002986469|    3|
|0003478513|    2|
|0003713833|    2|
|0003759461|    2|
|0003480781|    3|
|0002633338|    2|
|0003801759|    2|
|0002578507|    2|
|0002602692|    2|
|0003359882|    2|
|0003437796|    2|
|0002699865|    2|
+----------+-----+
only showing top 20 rows



In [257]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0002167105').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,30,GR123,2017-05-10,704,07/08/91,"10/05/17 11:44:57,000000",INTERBAIRROS III,2167105,M,1494427497,...,-25.51372,-49.26571,75.32579,11:44:32,32848,NO_PROBLEM,0,2017-05-10 11:44:32,1494427472,1494385200
1,30,KB699,2017-05-10,527,07/08/91,"10/05/17 08:47:49,000000",INTERBAIRROS III,2167105,M,1494416869,...,-25.5135,-49.264663,32.41715,08:47:10,32609,TRIP_PROBLEM,0,2017-05-10 08:47:10,1494416830,1494385200


In [258]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0003637275').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,901,MC305,2017-05-10,411,27/10/97,"10/05/17 06:51:18,000000",STA. FELICIDADE,3637275,M,1494409878,...,,,,06:51:04,33677,BETWEEN,1,2017-05-10 06:51:04,1494409864,1494385200
1,821,MA005,2017-05-10,1107,27/10/97,"10/05/17 18:27:49,000000",FERNÃO DIAS,3637275,M,1494451669,...,-25.411221,-49.349313,5.034809,18:27:13,34132,NO_PROBLEM,0,2017-05-10 18:27:13,1494451633,1494385200


In [259]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0002986469').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,182,BC310,2017-05-10,774,18/05/73,"10/05/17 12:54:19,000000",ABRANCHES,2986469,M,1494431659,...,,,,12:54:05,30800,BETWEEN,2,2017-05-10 12:54:05,1494431645,1494385200
1,243,BA299,2017-05-10,431,18/05/73,"10/05/17 07:11:45,000000",STA. TEREZINHA,2986469,M,1494411105,...,-25.350926,-49.26875,45.49196,07:11:14,34522,NO_PROBLEM,1,2017-05-10 07:11:14,1494411074,1494385200
2,243,BA299,2017-05-10,810,18/05/73,"10/05/17 13:30:54,000000",STA. TEREZINHA,2986469,M,1494433854,...,-25.350905,-49.268806,39.894855,13:30:24,34522,NO_PROBLEM,0,2017-05-10 13:30:24,1494433824,1494385200


In [260]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0003372920').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,777,JC006,2017-05-10,1049,28/02/70,"10/05/17 17:29:47,000000",V. VELHA,3372920,F,1494448187,...,-25.45985,-49.319845,2.983394,17:29:08,33567,NO_PROBLEM,1,2017-05-10 17:29:08,1494448148,1494385200
1,703,JC314,2017-05-10,733,28/02/70,"10/05/17 12:13:31,000000",CAIUÁ,3372920,F,1494429211,...,-25.451515,-49.294311,15.038858,12:13:42,32829,NO_PROBLEM,0,2017-05-10 12:13:42,1494429222,1494385200
2,777,JC008,2017-05-10,1257,28/02/70,"10/05/17 20:57:15,000000",V. VELHA,3372920,F,1494460635,...,,,,20:57:33,32829,BETWEEN,0,2017-05-10 20:57:33,1494460653,1494385200
3,776,JC002,2017-05-10,384,28/02/70,"10/05/17 06:24:52,000000",CARMELA DUTRA,3372920,F,1494408292,...,,,,06:24:38,33567,BETWEEN,1,2017-05-10 06:24:38,1494408278,1494385200


In [268]:
# Check for duplicate boarding entries: (cardNum, date, sec_group) keys that are
# matched by more than one row of user_boardings.
duplicate_board_entries = user_boardings.groupby(['cardNum','date','sec_group']).count().filter('count > 1')
# Single-argument print(...) prints the same under Python 2 and Python 3.
print(duplicate_board_entries.count())
print_df(duplicate_board_entries)

156


Unnamed: 0,cardNum,date,sec_group,count
0,1884144,2017-05-10,496,2
1,3826824,2017-05-10,1076,2
2,1194455,2017-05-10,500,2
3,3534417,2017-05-10,637,2
4,3824163,2017-05-10,484,2
5,3547094,2017-05-10,803,2
6,1735152,2017-05-10,375,2
7,2195541,2017-05-10,364,2
8,3476589,2017-05-10,810,2
9,3562158,2017-05-10,500,2


In [265]:
#Taking a look at a sample:
print_df(user_boardings.filter(F.col('cardNum') == '0001884144'))

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,card_timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,205,BC306,2017-05-10,971,02/10/85,"10/05/17 16:11:54,000000",BARREIRINHA,1884144,F,1494443514,...,,,,16:11:11,35642,BETWEEN,2,2017-05-10 16:11:11,1494443471,1494385200
1,242,BA128,2017-05-10,496,02/10/85,"10/05/17 08:16:28,000000",V. LEONICE,1884144,F,1494414988,...,-25.349963,-49.253775,7.314722,08:16:32,2154,NO_PROBLEM,0,2017-05-10 08:16:32,1494414992,1494385200
2,242,BA128,2017-05-10,496,02/10/85,"10/05/17 08:16:28,000000",V. LEONICE,1884144,F,1494414988,...,-25.351126,-49.253041,36.294613,08:16:14,33115,NO_PROBLEM,2,2017-05-10 08:16:14,1494414974,1494385200


In [275]:
print_df(user_boardings.filter(F.col('cardNum') == '0001884144').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

Unnamed: 0,route,busCode,tripNum,date,sec_group,gps_timestamp,problem,timestamp,cardTimestamp
0,205,BC306,11,2017-05-10,971,2017-05-10 16:11:11,BETWEEN,16:11:11,"10/05/17 16:11:54,000000"
1,242,BA128,4,2017-05-10,496,2017-05-10 08:16:32,NO_PROBLEM,08:16:32,"10/05/17 08:16:28,000000"
2,242,BA128,3,2017-05-10,496,2017-05-10 08:16:14,NO_PROBLEM,08:16:14,"10/05/17 08:16:28,000000"


In [276]:
print_df(user_boardings.filter(F.col('cardNum') == '0003826824').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

Unnamed: 0,route,busCode,tripNum,date,sec_group,gps_timestamp,problem,timestamp,cardTimestamp
0,777,JC008,10,2017-05-10,1076,2017-05-10 17:56:49,NO_PROBLEM,17:56:49,"10/05/17 17:56:45,000000"
1,777,JC008,11,2017-05-10,1076,2017-05-10 17:56:49,NO_PROBLEM,17:56:49,"10/05/17 17:56:45,000000"


In [277]:
print_df(user_boardings.filter(F.col('cardNum') == '0002195541').select(['route','busCode','tripNum','date','sec_group','gps_timestamp','problem','timestamp','cardTimestamp']))

Unnamed: 0,route,busCode,tripNum,date,sec_group,gps_timestamp,problem,timestamp,cardTimestamp
0,684,HA019,2,2017-05-10,364,2017-05-10 06:04:35,TRIP_PROBLEM,06:04:35,"10/05/17 06:04:21,000000"
1,684,HA019,1,2017-05-10,364,2017-05-10 06:04:35,NO_PROBLEM,06:04:35,"10/05/17 06:04:21,000000"


As we can see above, the duplicated boarding records are caused by a problem in the BULMA output: GPS records that belong to the same trip are being assigned to different trips. We will exclude such entries from our analysis.

In [278]:
# Keep only the (cardNum, date, sec_group) keys matched by exactly one boarding row.
# NOTE(review): the result holds just these keys plus `count`; the remaining
# boarding columns of user_boardings are not carried over. Confirm whether a
# join back to user_boardings is intended before using this for analysis.
clean_boardings = (
    user_boardings
    .groupby('cardNum', 'date', 'sec_group')
    .count()
    .where(F.col('count') == 1)
)

In [279]:
print_df(clean_boardings)

Unnamed: 0,cardNum,date,sec_group,count
0,2793259,2017-05-10,800,1
1,3655283,2017-05-10,495,1
2,3588558,2017-05-10,431,1
3,2401786,2017-05-10,417,1
4,2066650,2017-05-10,457,1
5,3654850,2017-05-10,342,1
6,2650979,2017-05-10,850,1
7,3386754,2017-05-10,403,1
8,2266612,2017-05-10,342,1
9,3813755,2017-05-10,1152,1


In [280]:
clean_boardings.count()

73875

In [282]:
clean_boardings.select('cardNum').distinct().count()

65833