In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [2]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_file(filepath, sqlContext):
    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True,nullValue="-")
    while len(data_frame.columns) < 16:
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [
            ("_c0", "route"),
            ("_c1", "tripNum"),
            ("_c2", "shapeId"),
            ("_c3", "shapeSequence"),
            ("_c4", "shapeLat"),
            ("_c5", "shapeLon"),
            ("_c6", "distanceTraveledShape"),
            ("_c7", "busCode"),
            ("_c8", "gpsPointId"),
            ("_c9", "gpsLat"),
            ("_c10", "gpsLon"),
            ("_c11", "distanceToShapePoint"),
            ("_c12", "timestamp"),
            ("_c13", "busStopId"),
            ("_c14", "problem"),
            ("_c15", "numPassengers")
        ]
    )

    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))

    return data_frame

### Read GPS Data

In [3]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [85]:
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
trips_data = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [86]:
trips_data.head(3)

[Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136916, shapeLat=-25.432724990605614, shapeLon=-49.27218701780396, distanceTraveledShape=10149.88, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:13:43', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136915, shapeLat=-25.432470122014415, shapeLon=-49.27231020615159, distanceTraveledShape=10119.046, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:45:14', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=1, shapeId=1891, shapeSequence=6136650, shapeLat=-25.413362156585787, shapeLon=-49.20592429766663, distanceTraveledShape=79.983, busCode=u'CC170', gpsPointId=None, gpsLat=-25.413378, gpsLon=-49.205836, distanceToShapePoint=9.041484, timestamp=u'05:41:14', busStopId=None, problem=

In [87]:
trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = false)



### Read Bus Card Data

In [29]:
ticketing_data = sqlContext.read.json(exp_data_folder_path + '/ticketing-sample/doc1-2017051115.txt')

In [30]:
#Renaming columns to english
ticketing_data = ticketing_data.select(F.col("CODLINHA").alias("route"),
                                       F.col("CODVEICULO").alias("busCode"),
                                       F.col("DATANASCIMENTO").alias("userBirthdate"),
                                       F.col("DATAUTILIZACAO").alias("cardTimestamp"),
                                       F.col("NOMELINHA").alias("lineName"),
                                       F.col("NUMEROCARTAO").alias("cardNum"),
                                       F.col("SEXO").alias("gender"))

In [31]:
ticketing_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)



In [32]:
ticketing_data.head(5)

[Row(route=u'021', busCode=u'08046', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 20:15:16,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'021', busCode=u'08027', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 13:10:24,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'623', busCode=u'HA022', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 08:23:45,000000', lineName=u'PQ.INDUSTRIAL', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'000', busCode=u'03023', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 11:54:19,000000', lineName=u'OPER S/LINHA', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'TPH', busCode=u'03019', userBirthdate=u'23/11/79', cardTimestamp=u'10/05/17 13:30:10,000000', lineName=u'TERMINAL PINHEIRINHO', cardNum=u'0002425635', gender=u'F')]

### Pre-processing GPS data

In [88]:
#Warning: Both data sources dates refer to previous day, not to the day in the file name. Fixing this issue here to match bus card data.
trips_data = trips_data.withColumn("date", F.date_sub(F.col("date"), 1))
trips_data = trips_data.withColumn("gps_timestamp", F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
trips_data = trips_data.withColumn("gps_timestamp_in_secs", F.unix_timestamp(F.col("gps_timestamp"), "yyyy-MM-dd HH:mm:ss"))
trips_data = trips_data.withColumn("gps_date_in_secs", F.unix_timestamp(F.col("date"), "yyyy-MM-dd"))

In [89]:
trips_data.select(["date","gps_date_in_secs","timestamp","gps_timestamp_in_secs"]).show()

+----------+----------------+---------+---------------------+
|      date|gps_date_in_secs|timestamp|gps_timestamp_in_secs|
+----------+----------------+---------+---------------------+
|2017-05-10|      1494385200| 12:13:43|           1494429223|
|2017-05-10|      1494385200| 12:45:14|           1494431114|
|2017-05-10|      1494385200| 05:41:14|           1494405674|
|2017-05-10|      1494385200| 05:41:16|           1494405676|
|2017-05-10|      1494385200| 05:41:20|           1494405680|
|2017-05-10|      1494385200| 05:41:27|           1494405687|
|2017-05-10|      1494385200| 05:41:31|           1494405691|
|2017-05-10|      1494385200| 05:41:33|           1494405693|
|2017-05-10|      1494385200| 05:41:37|           1494405697|
|2017-05-10|      1494385200| 05:41:45|           1494405705|
|2017-05-10|      1494385200| 05:41:47|           1494405707|
|2017-05-10|      1494385200| 05:41:52|           1494405712|
|2017-05-10|      1494385200| 05:42:00|           1494405720|
|2017-05

In [109]:
def get_N_sec_group(timestamp_in_secs,date_in_secs,N):
    secs_since_midnight = timestamp_in_secs-date_in_secs
    return(F.floor(secs_since_midnight/N))

In [110]:
trips_data = trips_data.withColumn("sec_group",get_N_sec_group(F.col("gps_timestamp_in_secs"),F.col("gps_date_in_secs"),30))
trips_data.select(["gps_timestamp","gps_timestamp_in_secs","sec_group"]).show()

+-------------------+---------------------+---------+
|      gps_timestamp|gps_timestamp_in_secs|sec_group|
+-------------------+---------------------+---------+
|2017-05-10 12:13:43|           1494429223|     1467|
|2017-05-10 12:45:14|           1494431114|     1530|
|2017-05-10 05:41:14|           1494405674|      682|
|2017-05-10 05:41:16|           1494405676|      682|
|2017-05-10 05:41:20|           1494405680|      682|
|2017-05-10 05:41:27|           1494405687|      682|
|2017-05-10 05:41:31|           1494405691|      683|
|2017-05-10 05:41:33|           1494405693|      683|
|2017-05-10 05:41:37|           1494405697|      683|
|2017-05-10 05:41:45|           1494405705|      683|
|2017-05-10 05:41:47|           1494405707|      683|
|2017-05-10 05:41:52|           1494405712|      683|
|2017-05-10 05:42:00|           1494405720|      684|
|2017-05-10 05:42:03|           1494405723|      684|
|2017-05-10 05:42:08|           1494405728|      684|
|2017-05-10 05:42:12|       

## Pre-processing Bus Card data

In [92]:
ticketing_data = ticketing_data.withColumn("card_timestamp_in_secs", F.unix_timestamp(F.col("cardTimestamp"), "dd/MM/yy HH:mm:ss"))
ticketing_data = ticketing_data.withColumn("date",F.from_unixtime(F.col("card_timestamp_in_secs"), "yyyy-MM-dd"))
ticketing_data = ticketing_data.withColumn("card_date_in_secs",F.unix_timestamp(F.col("date"),"yyyy-MM-dd"))
ticketing_data.select(["cardTimestamp","card_timestamp_in_secs","date","card_date_in_secs"]).show()

+--------------------+----------------------+----------+-----------------+
|       cardTimestamp|card_timestamp_in_secs|      date|card_date_in_secs|
+--------------------+----------------------+----------+-----------------+
|10/05/17 18:04:56...|            1494450296|2017-05-10|       1494385200|
|10/05/17 10:12:52...|            1494421972|2017-05-10|       1494385200|
|10/05/17 12:57:27...|            1494431847|2017-05-10|       1494385200|
|10/05/17 06:04:53...|            1494407093|2017-05-10|       1494385200|
|10/05/17 10:35:53...|            1494423353|2017-05-10|       1494385200|
|10/05/17 12:10:32...|            1494429032|2017-05-10|       1494385200|
|10/05/17 17:20:10...|            1494447610|2017-05-10|       1494385200|
|10/05/17 17:41:32...|            1494448892|2017-05-10|       1494385200|
|10/05/17 16:09:48...|            1494443388|2017-05-10|       1494385200|
|10/05/17 12:55:18...|            1494431718|2017-05-10|       1494385200|
|10/05/17 07:12:03...|   

In [93]:
ticketing_data = ticketing_data.withColumn("sec_group",get_N_sec_group(F.col("card_timestamp_in_secs"),F.col("card_date_in_secs"),30))
ticketing_data.select(["cardTimestamp","card_timestamp_in_secs","sec_group"]).orderBy("timestamp_in_secs").show()

+--------------------+----------------------+---------+
|       cardTimestamp|card_timestamp_in_secs|sec_group|
+--------------------+----------------------+---------+
|10/05/17 00:20:55...|            1494386455|       41|
|10/05/17 00:31:16...|            1494387076|       62|
|10/05/17 00:31:23...|            1494387083|       62|
|10/05/17 00:35:54...|            1494387354|       71|
|10/05/17 00:40:08...|            1494387608|       80|
|10/05/17 00:41:31...|            1494387691|       83|
|10/05/17 00:46:24...|            1494387984|       92|
|10/05/17 00:46:33...|            1494387993|       93|
|10/05/17 00:53:52...|            1494388432|      107|
|10/05/17 00:56:33...|            1494388593|      113|
|10/05/17 01:00:49...|            1494388849|      121|
|10/05/17 01:01:39...|            1494388899|      123|
|10/05/17 01:01:43...|            1494388903|      123|
|10/05/17 01:05:56...|            1494389156|      131|
|10/05/17 01:07:46...|            1494389266|   

In [94]:
trips_data.count()

7900150

In [95]:
ticketing_data.count()

306904

### Removing duplicate GPS records (occurred in the same time period)

In [111]:
filtered_trips_data = trips_data.na.drop(subset=["busStopId"]).dropDuplicates(["route","busCode", "busStopId", "sec_group"])
print filtered_trips_data.count()
filtered_trips_data.select(["route","busCode", "busStopId", "sec_group"]).orderBy("route","busCode","busStopId").limit(20).toPandas()

541266


Unnamed: 0,route,busCode,busStopId,sec_group
0,1,,26360,
1,1,BN997,26166,1335.0
2,1,BN997,26166,1135.0
3,1,BN997,26166,1647.0
4,1,BN997,26166,1197.0
5,1,BN997,26166,2019.0
6,1,BN997,26166,1789.0
7,1,BN997,26166,1698.0
8,1,BN997,26166,1607.0
9,1,BN997,26166,939.0


### Removing Duplicate entries in ticketing data

In [97]:
ticketing_data = ticketing_data.dropDuplicates()

In [98]:
ticketing_data.count()

306904

### Merging GPS and ticketing data 

In [112]:
ticketing_data.select(['route','busCode','date','sec_group']).orderBy(['route','busCode','sec_group']).limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group
0,0,1,2017-05-10,591
1,0,1,2017-05-10,591
2,0,1,2017-05-10,603
3,0,1,2017-05-10,652
4,0,1,2017-05-10,713
5,0,1,2017-05-10,714
6,0,1,2017-05-10,716
7,0,1,2017-05-10,728
8,0,1,2017-05-10,729
9,0,1,2017-05-10,730


In [113]:
filtered_trips_data.select(['route','busCode','date','sec_group']).orderBy(['route','busCode','sec_group']).limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group
0,1,,2017-05-10,
1,1,BN997,2017-05-10,762.0
2,1,BN997,2017-05-10,764.0
3,1,BN997,2017-05-10,765.0
4,1,BN997,2017-05-10,769.0
5,1,BN997,2017-05-10,769.0
6,1,BN997,2017-05-10,772.0
7,1,BN997,2017-05-10,778.0
8,1,BN997,2017-05-10,782.0
9,1,BN997,2017-05-10,786.0


In [118]:
user_boardings = ticketing_data.join(filtered_trips_data, ['route','busCode','date','sec_group'], 'inner')

In [119]:
user_boardings.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- date: string (nullable = true)
 |-- sec_group: long (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- timestamp_in_secs: long (nullable = true)
 |-- date_in_secs: long (nullable = true)
 |-- card_timestamp_in_secs: long (nullable = true)
 |-- card_date_in_secs: long (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: stri

In [122]:
user_boardings.select(['route','busCode','sec_group','timestamp','cardTimestamp']).limit(20).toPandas()

Unnamed: 0,route,busCode,sec_group,timestamp,cardTimestamp
0,20,BB300,639,05:19:43,"10/05/17 05:19:35,000000"
1,20,CB604,2181,18:10:53,"10/05/17 18:10:43,000000"
2,20,CB604,2181,18:10:53,"10/05/17 18:10:46,000000"
3,20,CB604,2181,18:10:53,"10/05/17 18:10:55,000000"
4,20,CB604,2181,18:10:53,"10/05/17 18:10:36,000000"
5,20,CB604,2181,18:10:53,"10/05/17 18:10:49,000000"
6,20,CB604,2181,18:10:53,"10/05/17 18:10:33,000000"
7,21,BB601,1677,13:58:40,"10/05/17 13:58:41,000000"
8,21,CB697,1828,15:14:11,"10/05/17 15:14:25,000000"
9,30,BB498,782,06:31:19,"10/05/17 06:31:16,000000"


In [123]:
user_boardings.count()

52071

In [124]:
user_boardings.select(['route','busCode','sec_group','busStopId','cardNum','cardTimestamp']).orderBy('cardNum','cardTimestamp').limit(20).toPandas()

Unnamed: 0,route,busCode,sec_group,busStopId,cardNum,cardTimestamp
0,828,LA053,1496,35350,229948,"10/05/17 12:28:19,000000"
1,50,LA054,1458,32417,310241,"10/05/17 12:09:23,000000"
2,531,EA169,897,29960,314357,"10/05/17 07:28:31,000000"
3,335,DR406,905,33604,314918,"10/05/17 07:32:52,000000"
4,216,CA600,2051,29165,317896,"10/05/17 17:05:57,000000"
5,2,DN029,2270,40026,321169,"10/05/17 18:55:17,000000"
6,811,BA018,1975,30515,321469,"10/05/17 16:27:47,000000"
7,779,JC865,710,33571,321916,"10/05/17 05:55:17,000000"
8,646,HA006,1578,38516,322132,"10/05/17 13:09:01,000000"
9,205,BC306,984,29085,323896,"10/05/17 08:12:02,000000"


In [125]:
ticketing_data.filter(F.col('cardNum') == '0000323978').show()

+-----+-------+-------------+--------------------+------------+----------+------+-----------------+----------+------------+---------+----------------------+-----------------+
|route|busCode|userBirthdate|       cardTimestamp|    lineName|   cardNum|gender|timestamp_in_secs|      date|date_in_secs|sec_group|card_timestamp_in_secs|card_date_in_secs|
+-----+-------+-------------+--------------------+------------+----------+------+-----------------+----------+------------+---------+----------------------+-----------------+
|  000|  03009|     01/03/59|10/05/17 18:25:10...|OPER S/LINHA|0000323978|     F|       1494451510|2017-05-10|  1494385200|     2210|            1494451510|       1494385200|
|  673|  HN615|     01/03/59|10/05/17 07:31:54...|     FORMOSA|0000323978|     F|       1494412314|2017-05-10|  1494385200|      903|            1494412314|       1494385200|
|  665|  EC011|     01/03/59|10/05/17 17:01:34...|      V. REX|0000323978|     F|       1494446494|2017-05-10|  1494385200|  

In [137]:
single_located_buses = filtered_trips_data.dropDuplicates(subset=['route','busCode','gps_timestamp'])

In [140]:
single_located_buses.limit(20).toPandas()

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,date,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs,sec_group
0,1,3,2938,6451475,-25.433142,-49.276703,3090.819,BN997,,,...,,07:26:02,35219,BETWEEN,0,2017-05-10,2017-05-10 07:26:02,1494411962,1494385200,892
1,1,17,2938,6451490,-25.430831,-49.276405,3473.319,BN997,,-25.43072,...,16.932875,13:02:53,29420,NO_PROBLEM,0,2017-05-10,2017-05-10 13:02:53,1494432173,1494385200,1565
2,1,23,2938,6451415,-25.435452,-49.265462,1428.729,BN997,,-25.435375,...,9.929112,15:18:56,30748,NO_PROBLEM,0,2017-05-10,2017-05-10 15:18:56,1494440336,1494385200,1837
3,1,15,2938,6451495,-25.429963,-49.274031,3730.817,BN998,,,...,,12:52:19,29082,BETWEEN,0,2017-05-10,2017-05-10 12:52:19,1494431539,1494385200,1544
4,2,1,3077,4843598,-25.434697,-49.259186,6449.752,DN027,,-25.43478,...,19.806795,07:03:48,40030,NO_PROBLEM,0,2017-05-10,2017-05-10 07:03:48,1494410628,1494385200,847
5,2,15,3077,4843498,-25.433481,-49.277504,2102.369,DN027,,-25.433671,...,24.37203,16:08:26,47784,NO_PROBLEM,1,2017-05-10,2017-05-10 16:08:26,1494443306,1494385200,1936
6,2,2,3077,4843548,-25.437491,-49.266078,4576.746,DN028,,-25.437473,...,6.407318,07:53:20,10899,NO_PROBLEM,1,2017-05-10,2017-05-10 07:53:20,1494413600,1494385200,946
7,2,5,3077,4843558,-25.435805,-49.261686,5056.511,DN028,,-25.435816,...,18.653934,09:53:48,30224,NO_PROBLEM,0,2017-05-10,2017-05-10 09:53:48,1494420828,1494385200,1187
8,2,12,3077,4843537,-25.439944,-49.272466,3879.155,DN028,,-25.439885,...,10.611422,14:21:49,40031,NO_PROBLEM,0,2017-05-10,2017-05-10 14:21:49,1494436909,1494385200,1723
9,2,13,3077,4843614,-25.431378,-49.262145,7039.518,DN029,,,...,,14:52:00,31456,BETWEEN,0,2017-05-10,2017-05-10 14:52:00,1494438720,1494385200,1784


In [142]:
multi_located_buses = filtered_trips_data.join(single_located_buses,[],"anti")
multi_located_buses.limit(40).toPandas()

AnalysisException: u'USING column `anti` cannot be resolved on the left side of the join. The left-side columns: [route, tripNum, shapeId, shapeSequence, shapeLat, shapeLon, distanceTraveledShape, busCode, gpsPointId, gpsLat, gpsLon, distanceToShapePoint, timestamp, busStopId, problem, numPassengers, date, gps_timestamp, gps_timestamp_in_secs, gps_date_in_secs, sec_group];'

In [143]:
boarding_count = user_boardings.groupby('cardNum').count()

In [144]:
total_passengers = user_boardings.select('cardNum').distinct().count()
print "Total #Passengers:", total_passengers

Total #Passengers: 44264


In [145]:
#Filtering only users with more than one ride per day
multiple_boardings = boarding_count.filter(F.col('count') > 1)

In [146]:
passengers_mult_boardings = multiple_boardings.count()
prop_mult_boardings = 100*(passengers_mult_boardings)/total_passengers
print "Passengers with Multiple Boardings:", passengers_mult_boardings, "(", prop_mult_boardings, "%)" 
multiple_boardings.show()

Passengers with Multiple Boardings: 6739 ( 15 %)
+----------+-----+
|   cardNum|count|
+----------+-----+
|0002167105|    3|
|0003637275|    2|
|0002633338|    2|
|0003437796|    2|
|0002083421|    2|
|0003801759|    2|
|0003611815|    2|
|0002788327|    2|
|0002703900|    3|
|0003768761|    2|
|0003759461|    2|
|0002752794|    2|
|0003819969|    2|
|0002277554|    2|
|0003756853|    2|
|0002986469|    7|
|0003344773|    2|
|0002911856|    2|
|0003512933|    2|
|0003697174|    2|
+----------+-----+
only showing top 20 rows



In [147]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0002167105').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,30,GR123,2017-05-10,1409,07/08/91,"10/05/17 11:44:57,000000",INTERBAIRROS III,2167105,M,1494427497,...,-25.514071,-49.267173,50.379257,11:44:50,32611,NO_PROBLEM,2,2017-05-10 11:44:50,1494427490,1494385200
1,30,GR123,2017-05-10,1409,07/08/91,"10/05/17 11:44:57,000000",INTERBAIRROS III,2167105,M,1494427497,...,-25.51372,-49.26571,75.32579,11:44:32,32848,NO_PROBLEM,0,2017-05-10 11:44:32,1494427472,1494385200
2,30,KB699,2017-05-10,1055,07/08/91,"10/05/17 08:47:49,000000",INTERBAIRROS III,2167105,M,1494416869,...,-25.512981,-49.262606,20.480734,08:47:35,32607,TRIP_PROBLEM,1,2017-05-10 08:47:35,1494416855,1494385200


In [148]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0003637275').limit(20).toPandas()

Unnamed: 0,route,busCode,date,sec_group,userBirthdate,cardTimestamp,lineName,cardNum,gender,timestamp_in_secs,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,gps_timestamp,gps_timestamp_in_secs,gps_date_in_secs
0,901,MC305,2017-05-10,822,27/10/97,"10/05/17 06:51:18,000000",STA. FELICIDADE,3637275,M,1494409878,...,,,,06:51:04,33677,BETWEEN,1,2017-05-10 06:51:04,1494409864,1494385200
1,821,MA005,2017-05-10,2215,27/10/97,"10/05/17 18:27:49,000000",FERNÃO DIAS,3637275,M,1494451669,...,,,,18:27:36,34140,BETWEEN,3,2017-05-10 18:27:36,1494451656,1494385200
