In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [2]:
def rename_columns(df, list_of_tuples):
    for (old_col, new_col) in list_of_tuples:
        df = df.withColumnRenamed(old_col, new_col)
    return df

def read_file(filepath, sqlContext):
    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True,nullValue="-")
    while len(data_frame.columns) < 16:
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [
            ("_c0", "route"),
            ("_c1", "tripNum"),
            ("_c2", "shapeId"),
            ("_c3", "shapeSequence"),
            ("_c4", "shapeLat"),
            ("_c5", "shapeLon"),
            ("_c6", "distanceTraveledShape"),
            ("_c7", "busCode"),
            ("_c8", "gpsPointId"),
            ("_c9", "gpsLat"),
            ("_c10", "gpsLon"),
            ("_c11", "distanceToShapePoint"),
            ("_c12", "timestamp"),
            ("_c13", "busStopId"),
            ("_c14", "problem"),
            ("_c15", "numPassengers")
        ]
    )

    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))

    return data_frame

### Read Input Data

In [3]:
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

In [4]:
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
trips_data = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [5]:
trips_data.head(3)

[Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136916, shapeLat=-25.432724990605614, shapeLon=-49.27218701780396, distanceTraveledShape=10149.88, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:13:43', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=4, shapeId=1891, shapeSequence=6136915, shapeLat=-25.432470122014415, shapeLon=-49.27231020615159, distanceTraveledShape=10119.046, busCode=u'CC170', gpsPointId=None, gpsLat=None, gpsLon=None, distanceToShapePoint=None, timestamp=u'12:45:14', busStopId=None, problem=u'BETWEEN', numPassengers=None, date=u'2017-05-11'),
 Row(route=u'372', tripNum=1, shapeId=1891, shapeSequence=6136650, shapeLat=-25.413362156585787, shapeLon=-49.20592429766663, distanceTraveledShape=79.983, busCode=u'CC170', gpsPointId=None, gpsLat=-25.413378, gpsLon=-49.205836, distanceToShapePoint=9.041484, timestamp=u'05:41:14', busStopId=None, problem=

In [6]:
trips_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = false)



In [7]:
ticketing_data = sqlContext.read.json(exp_data_folder_path + '/ticketing-sample/doc1-2017051115.txt')

In [8]:
#Renaming columns to english
ticketing_data = ticketing_data.select(F.col("CODLINHA").alias("route"),
                                       F.col("CODVEICULO").alias("busCode"),
                                       F.col("DATANASCIMENTO").alias("userBirthdate"),
                                       F.col("DATAUTILIZACAO").alias("cardTimestamp"),
                                       F.col("NOMELINHA").alias("lineName"),
                                       F.col("NUMEROCARTAO").alias("cardNum"),
                                       F.col("SEXO").alias("gender"))

In [9]:
ticketing_data.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)



In [10]:
ticketing_data.head(5)

[Row(route=u'021', busCode=u'08046', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 20:15:16,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'021', busCode=u'08027', userBirthdate=u'26/01/72', cardTimestamp=u'10/05/17 13:10:24,000000', lineName=u'INTERB II ANTI H', cardNum=u'0001937533', gender=u'F'),
 Row(route=u'623', busCode=u'HA022', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 08:23:45,000000', lineName=u'PQ.INDUSTRIAL', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'000', busCode=u'03023', userBirthdate=u'06/03/71', cardTimestamp=u'10/05/17 11:54:19,000000', lineName=u'OPER S/LINHA', cardNum=u'0001311020', gender=u'F'),
 Row(route=u'TPH', busCode=u'03019', userBirthdate=u'23/11/79', cardTimestamp=u'10/05/17 13:30:10,000000', lineName=u'TERMINAL PINHEIRINHO', cardNum=u'0002425635', gender=u'F')]

## Pre-processing data

In [11]:
trips_data = trips_data.withColumn("gps_timestamp", F.concat(F.col("date"), F.lit(" "), F.col("timestamp")))
trips_data = trips_data.withColumn("gps_timestamp", F.unix_timestamp(F.col("gps_timestamp"), "YYYY-MM-DD HH:mm:ss"))

In [12]:
trips_data.select(["date","timestamp","gps_timestamp"]).show()

+----------+---------+-------------+
|      date|timestamp|gps_timestamp|
+----------+---------+-------------+
|2017-05-11| 12:13:43|   1483283623|
|2017-05-11| 12:45:14|   1483285514|
|2017-05-11| 05:41:14|   1483260074|
|2017-05-11| 05:41:16|   1483260076|
|2017-05-11| 05:41:20|   1483260080|
|2017-05-11| 05:41:27|   1483260087|
|2017-05-11| 05:41:31|   1483260091|
|2017-05-11| 05:41:33|   1483260093|
|2017-05-11| 05:41:37|   1483260097|
|2017-05-11| 05:41:45|   1483260105|
|2017-05-11| 05:41:47|   1483260107|
|2017-05-11| 05:41:52|   1483260112|
|2017-05-11| 05:42:00|   1483260120|
|2017-05-11| 05:42:03|   1483260123|
|2017-05-11| 05:42:08|   1483260128|
|2017-05-11| 05:42:12|   1483260132|
|2017-05-11| 05:42:17|   1483260137|
|2017-05-11| 05:42:21|   1483260141|
|2017-05-11| 05:42:23|   1483260143|
|2017-05-11| 05:42:25|   1483260145|
+----------+---------+-------------+
only showing top 20 rows



In [13]:
ticketing_data = ticketing_data.withColumn("card_timestamp", F.unix_timestamp(F.col("cardTimestamp"), "DD/MM/YY HH:mm:ss"))

In [14]:
ticketing_data.select(["cardTimestamp","card_timestamp"]).show()

+--------------------+--------------+
|       cardTimestamp|card_timestamp|
+--------------------+--------------+
|10/05/17 20:15:16...|    1483312516|
|10/05/17 13:10:24...|    1483287024|
|10/05/17 08:23:45...|    1483269825|
|10/05/17 11:54:19...|    1483282459|
|10/05/17 13:30:10...|    1483288210|
|10/05/17 07:52:52...|    1483267972|
|10/05/17 18:34:06...|    1483306446|
|10/05/17 06:15:31...|    1483262131|
|10/05/17 17:57:28...|    1483304248|
|10/05/17 10:03:56...|    1483275836|
|10/05/17 13:35:56...|    1483288556|
|10/05/17 07:50:18...|    1483267818|
|10/05/17 12:26:01...|    1483284361|
|10/05/17 14:34:27...|    1483292067|
|10/05/17 14:34:30...|    1483292070|
|10/05/17 14:34:32...|    1483292072|
|10/05/17 14:02:43...|    1483290163|
|10/05/17 11:04:25...|    1483279465|
|10/05/17 06:44:08...|    1483263848|
|10/05/17 17:38:18...|    1483303098|
+--------------------+--------------+
only showing top 20 rows



In [15]:
curr_date = trips_data.sample(False, 0.1, seed=0).limit(1).withColumn("curr_date", F.unix_timestamp(F.col("date"), "YYYY-MM-DD")).select("curr_date").rdd.flatMap(list).first()

In [16]:
print curr_date

1483239600


In [17]:
def get_group_N_sec(timestamp,curr_date_timestamp,N):
    secs_since_midnight = timestamp-curr_date_timestamp
    return(F.floor(secs_since_midnight/N))

In [18]:
ticketing_data = ticketing_data.withColumn("sec_group",get_group_N_sec(F.col("card_timestamp"),curr_date,30))
ticketing_data.select(["card_timestamp","sec_group"]).show()

+--------------+---------+
|card_timestamp|sec_group|
+--------------+---------+
|    1483312516|     2430|
|    1483287024|     1580|
|    1483269825|     1007|
|    1483282459|     1428|
|    1483288210|     1620|
|    1483267972|      945|
|    1483306446|     2228|
|    1483262131|      751|
|    1483304248|     2154|
|    1483275836|     1207|
|    1483288556|     1631|
|    1483267818|      940|
|    1483284361|     1492|
|    1483292067|     1748|
|    1483292070|     1749|
|    1483292072|     1749|
|    1483290163|     1685|
|    1483279465|     1328|
|    1483263848|      808|
|    1483303098|     2116|
+--------------+---------+
only showing top 20 rows



In [19]:
trips_data = trips_data.withColumn("sec_group",get_group_N_sec(F.col("gps_timestamp"),curr_date,30))
trips_data.select(["gps_timestamp","sec_group"]).show()

+-------------+---------+
|gps_timestamp|sec_group|
+-------------+---------+
|   1483283623|     1467|
|   1483285514|     1530|
|   1483260074|      682|
|   1483260076|      682|
|   1483260080|      682|
|   1483260087|      682|
|   1483260091|      683|
|   1483260093|      683|
|   1483260097|      683|
|   1483260105|      683|
|   1483260107|      683|
|   1483260112|      683|
|   1483260120|      684|
|   1483260123|      684|
|   1483260128|      684|
|   1483260132|      684|
|   1483260137|      684|
|   1483260141|      684|
|   1483260143|      684|
|   1483260145|      684|
+-------------+---------+
only showing top 20 rows



In [20]:
trips_data.count()

7900150

In [21]:
ticketing_data.count()

306906

### Removing duplicate GPS records (occurred in the same time period)

In [22]:
filtered_trips_data = trips_data.na.drop(subset=["busStopId"]).dropDuplicates(["route","busCode", "busStopId", "sec_group"])
print filtered_trips_data.count()
filtered_trips_data.select(["route","busCode", "busStopId", "sec_group"]).limit(20).toPandas()

541266


Unnamed: 0,route,busCode,busStopId,sec_group
0,372,CC170,30174,838
1,876,BC190,30334,1215
2,876,BC190,33774,1295
3,876,BC190,33740,2123
4,876,BC190,34228,2384
5,203,GD349,26187,1916
6,203,GD349,25707,2113
7,203,GD349,25385,2174
8,511,EA171,28067,1190
9,511,EA171,28431,1728


### Removing Duplicate entries in ticketing data

In [23]:
ticketing_data = ticketing_data.dropDuplicates()

In [24]:
ticketing_data.count()

306904

### Merging GPS and ticketing data 

In [25]:
user_boardings = ticketing_data.join(filtered_trips_data, ['route','busCode','sec_group'], 'inner')

In [26]:
user_boardings.printSchema()

root
 |-- route: string (nullable = true)
 |-- busCode: string (nullable = true)
 |-- sec_group: long (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- card_timestamp: long (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- numPassengers: integer (nullable = true)
 |-- date: string (nullable = tr

In [27]:
user_boardings.select(['route','busCode','sec_group']).limit(20).toPandas()

Unnamed: 0,route,busCode,sec_group
0,20,BB300,639
1,20,CB604,2181
2,20,CB604,2181
3,20,CB604,2181
4,20,CB604,2181
5,20,CB604,2181
6,20,CB604,2181
7,21,BB601,1677
8,21,CB697,1828
9,30,BB498,782


In [28]:
user_boardings.count()

52071

In [29]:
user_boardings.select(['route','busCode','sec_group','busStopId','cardNum','cardTimestamp']).orderBy('cardNum','cardTimestamp').limit(20).toPandas()

Unnamed: 0,route,busCode,sec_group,busStopId,cardNum,cardTimestamp
0,828,LA053,1496,35350,229948,"10/05/17 12:28:19,000000"
1,50,LA054,1458,32417,310241,"10/05/17 12:09:23,000000"
2,531,EA169,897,29960,314357,"10/05/17 07:28:31,000000"
3,335,DR406,905,33604,314918,"10/05/17 07:32:52,000000"
4,216,CA600,2051,29165,317896,"10/05/17 17:05:57,000000"
5,2,DN029,2270,40026,321169,"10/05/17 18:55:17,000000"
6,811,BA018,1975,30515,321469,"10/05/17 16:27:47,000000"
7,779,JC865,710,33571,321916,"10/05/17 05:55:17,000000"
8,646,HA006,1578,38516,322132,"10/05/17 13:09:01,000000"
9,205,BC306,984,29085,323896,"10/05/17 08:12:02,000000"


In [30]:
ticketing_data.filter(F.col('cardNum') == '0000323978').show()

+-----+-------+-------------+--------------------+------------+----------+------+--------------+---------+
|route|busCode|userBirthdate|       cardTimestamp|    lineName|   cardNum|gender|card_timestamp|sec_group|
+-----+-------+-------------+--------------------+------------+----------+------+--------------+---------+
|  673|  HN615|     01/03/59|10/05/17 07:31:54...|     FORMOSA|0000323978|     F|    1483266714|      903|
|  000|  03009|     01/03/59|10/05/17 18:25:10...|OPER S/LINHA|0000323978|     F|    1483305910|     2210|
|  665|  EC011|     01/03/59|10/05/17 17:01:34...|      V. REX|0000323978|     F|    1483300894|     2043|
+-----+-------+-------------+--------------------+------------+----------+------+--------------+---------+



In [31]:
single_located_buses = filtered_trips_data.dropDuplicates(subset=['route','busCode','gps_timestamp']).count()

In [32]:
multi_located_buses = filtered_trips_data.subtract(single_located_buses)
multi_located_buses.limit(40).toPandas()

AttributeError: 'int' object has no attribute '_jdf'

In [None]:
boarding_count = user_boardings.groupby('cardNum').count()

In [None]:
total_passengers = user_boardings.select('cardNum').distinct().count()
print "Total #Passengers:", total_passengers

In [None]:
#Filtering only users with more than one ride per day
multiple_boardings = boarding_count.filter(F.col('count') > 1)

In [None]:
passengers_mult_boardings = multiple_boardings.count()
prop_mult_boardings = 100*(passengers_mult_boardings)/total_passengers
print "Passengers with Multiple Boardings:", passengers_mult_boardings, "(", prop_mult_boardings, "%)" 
multiple_boardings.show()

In [None]:
#Taking a look at a sample:
user_boardings.filter(F.col('cardNum') == '0002167105').limit(20).toPandas()