In [26]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [83]:
def rename_columns(df, list_of_tuples):
    """Apply a sequence of column renames to a DataFrame.

    Args:
        df: object exposing ``withColumnRenamed(old, new)`` that returns a
            new frame (a Spark DataFrame in this notebook).
        list_of_tuples: iterable of ``(old_name, new_name)`` pairs, applied
            in order.

    Returns:
        The frame obtained after applying every rename; ``df`` itself when
        the iterable is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_file(filepath, sqlContext):
    """Load a headerless CSV part-file into a DataFrame with named columns.

    The file is read with schema inference and "-" treated as null. If it has
    fewer columns than the expected schema, null columns are appended so that
    every expected name exists after renaming. A ``date`` column is added,
    parsed from the parent directory name of ``filepath`` (its first three
    "_"-separated tokens joined with "-") and shifted back by one day.

    Args:
        filepath: path to the CSV file; its parent directory name must start
            with ``YYYY_MM_DD`` (e.g. ``.../2017_05_11_veiculos.csv/part-00000``).
        sqlContext: Spark SQL entry point exposing ``read.csv``.

    Returns:
        Spark DataFrame with the renamed columns plus ``date``.
    """
    # Expected schema, in positional order (_c0, _c1, ...).
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "userBirthdate",
        "cardTimestamp",
        "lineName",
        "cardNum",
        "userGender",
    ]

    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True, nullValue="-")

    # Pad up to the full schema. The pad count is derived from the name list
    # (previously a fixed 16 against 20 names), so a short file can no longer
    # end up without the trailing columns such as cardNum.
    for position in range(len(data_frame.columns), len(column_names)):
        # Cast so the padded column is a nullable string rather than NullType,
        # which Spark's file writers do not accept.
        data_frame = data_frame.withColumn("_c" + str(position),
                                           F.lit(None).cast("string"))

    data_frame = rename_columns(
        data_frame,
        [("_c" + str(index), name) for index, name in enumerate(column_names)]
    )

    # e.g. ".../2017_05_11_veiculos.csv/part-00000" -> "2017-05-11"
    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))
    # NOTE(review): the stored date is one day before the folder's date;
    # presumably the folder is named after the day following the trips - confirm.
    data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"), 1))

    return data_frame

def print_df(df,l=10):
    """Return the first ``l`` rows of ``df`` converted with ``toPandas()``.

    Despite the name nothing is printed; the result is meant to be the last
    expression of a cell so the notebook renders it.
    """
    preview = df.limit(l)
    return preview.toPandas()

### Read BUSTE result Data

In [81]:
# Reuse the running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# SQL entry point; passed to read_file() below for CSV loading.
sqlContext = pyspark.SQLContext(sc)

In [84]:
# Root folder of the experiment data (hard-coded absolute local path).
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
# read_file() takes the `date` column from the parent folder name
# ("2017_05_11_veiculos.csv"). The folder path already ends with "/", so the
# concatenated path contains "//".
user_boardings = read_file(exp_data_folder_path + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [85]:
# Preview the first 10 rows of the loaded data.
print_df(user_boardings)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,372,1,1891,6136720,-25.428213,-49.204401,2604.58,CC170,,,...,,05:46:53,30817,BETWEEN,24/04/83,05:46:55,TARUMÃ,3395018,M,2017-05-10
1,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,13/06/62,06:07:55,TARUMÃ,459714,F,2017-05-10
2,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,28/07/70,06:07:00,TARUMÃ,2354665,F,2017-05-10
3,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,08/01/90,06:07:10,TARUMÃ,2694276,M,2017-05-10
4,372,1,1890,6137134,-25.428043,-49.204411,7388.022,CC170,,-25.428011,...,5.374579,06:23:08,30818,NO_PROBLEM,14/03/92,06:23:23,TARUMÃ,1707734,F,2017-05-10
5,372,1,1890,6137134,-25.428043,-49.204411,7388.022,CC170,,-25.428011,...,5.374579,06:23:08,30818,NO_PROBLEM,14/03/92,06:23:27,TARUMÃ,1707734,F,2017-05-10
6,372,1,1890,6137169,-25.42329,-49.201495,8200.134,CC170,,-25.423383,...,11.410669,06:25:25,30792,NO_PROBLEM,29/06/93,06:26:16,TARUMÃ,3794429,M,2017-05-10
7,372,1,1890,6137188,-25.419992,-49.200146,8748.949,CC170,,-25.419886,...,11.780608,06:26:54,30781,NO_PROBLEM,21/03/73,06:27:06,TARUMÃ,2525065,F,2017-05-10
8,372,1,1890,6137197,-25.417748,-49.201223,9020.108,CC170,,-25.417635,...,12.612804,06:27:45,30779,NO_PROBLEM,13/06/75,06:28:25,TARUMÃ,3203155,F,2017-05-10
9,372,1,1890,6137205,-25.415692,-49.20221,9268.598,CC170,,-25.415638,...,8.902769,06:28:27,30769,NO_PROBLEM,09/02/85,06:29:14,TARUMÃ,3768283,M,2017-05-10


In [86]:
# Show the column types inferred by the CSV reader.
user_boardings.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- userGender: string (nullable = true)
 |-- date: date (nullable = true)



In [87]:
# Total number of rows loaded.
user_boardings.count()

159889

In [88]:
# Number of distinct cardNum values in the raw data.
user_boardings.select('cardNum').distinct().count()

106885

In [90]:
#user_boardings.select(['route','busCode','tripNum','busStopId','sec_group','cardNum','cardTimestamp','gps_timestamp']).orderBy('cardNum','cardTimestamp').limit(20).toPandas()

### Removing rows with missing data

In [91]:
# Drop every row that has a null in at least one of the listed columns.
filtered_boardings = user_boardings.na.drop(subset=["route","busCode","busStopId","timestamp","gpsLat","gpsLon","cardNum","cardTimestamp"])

In [92]:
# Row count after dropping incomplete records, followed by a preview.
# Single-argument print(...) prints the same under Python 2 and is valid Python 3.
print(filtered_boardings.count())
print_df(filtered_boardings)

117704


Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,13/06/62,06:07:55,TARUMÃ,459714,F,2017-05-10
1,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,28/07/70,06:07:00,TARUMÃ,2354665,F,2017-05-10
2,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,08/01/90,06:07:10,TARUMÃ,2694276,M,2017-05-10
3,372,1,1890,6137134,-25.428043,-49.204411,7388.022,CC170,,-25.428011,...,5.374579,06:23:08,30818,NO_PROBLEM,14/03/92,06:23:23,TARUMÃ,1707734,F,2017-05-10
4,372,1,1890,6137134,-25.428043,-49.204411,7388.022,CC170,,-25.428011,...,5.374579,06:23:08,30818,NO_PROBLEM,14/03/92,06:23:27,TARUMÃ,1707734,F,2017-05-10
5,372,1,1890,6137169,-25.42329,-49.201495,8200.134,CC170,,-25.423383,...,11.410669,06:25:25,30792,NO_PROBLEM,29/06/93,06:26:16,TARUMÃ,3794429,M,2017-05-10
6,372,1,1890,6137188,-25.419992,-49.200146,8748.949,CC170,,-25.419886,...,11.780608,06:26:54,30781,NO_PROBLEM,21/03/73,06:27:06,TARUMÃ,2525065,F,2017-05-10
7,372,1,1890,6137197,-25.417748,-49.201223,9020.108,CC170,,-25.417635,...,12.612804,06:27:45,30779,NO_PROBLEM,13/06/75,06:28:25,TARUMÃ,3203155,F,2017-05-10
8,372,1,1890,6137205,-25.415692,-49.20221,9268.598,CC170,,-25.415638,...,8.902769,06:28:27,30769,NO_PROBLEM,09/02/85,06:29:14,TARUMÃ,3768283,M,2017-05-10
9,372,1,1890,6137205,-25.415692,-49.20221,9268.598,CC170,,-25.415638,...,8.902769,06:28:27,30769,NO_PROBLEM,17/11/94,06:28:30,TARUMÃ,3599362,M,2017-05-10


### Removing duplicated passenger data

In [93]:
# Keep a single row per (cardNum, date, cardTimestamp) combination.
# NOTE(review): which of the duplicate rows survives is not controlled here.
filtered_boardings = filtered_boardings.dropDuplicates(['cardNum','date','cardTimestamp'])

In [94]:
# Row count after de-duplication, followed by a preview.
# Single-argument print(...) prints the same under Python 2 and is valid Python 3.
print(filtered_boardings.count())
print_df(filtered_boardings)

117702


Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,547,4,1994,6165294,-25.541827,-49.264612,4099.014,GA142,,-25.54168,...,31.68976,05:53:33,31271,NO_PROBLEM,22/04/63,05:54:23,V. SÃO PEDRO,431802,F,2017-05-10
1,828,1,2941,4249004,-25.460141,-49.340811,9660.0,LA001,,-25.460058,...,9.355352,06:06:58,34257,NO_PROBLEM,28/05/63,06:07:54,C.COMP/C.RASO,433069,F,2017-05-10
2,777,11,2195,4299895,-25.442293,-49.269757,1126.717,JC006,,-25.442241,...,7.72891,17:59:39,32903,NO_PROBLEM,03/02/93,18:00:33,V. VELHA,444810,F,2017-05-10
3,658,1,2115,6642358,-25.491613,-49.349015,2871.251,JA027,,-25.491595,...,3.681068,06:18:32,35720,NO_PROBLEM,28/07/71,06:19:29,C.RASO/CAIUÁ,624028,M,2017-05-10
4,628,2,2055,6035022,-25.544264,-49.315882,11215.314,HA016,,-25.544026,...,40.47706,07:59:07,35163,NO_PROBLEM,06/03/89,07:59:30,CARBOMAFRA,768474,F,2017-05-10
5,614,4,2030,5808518,-25.451123,-49.252929,9972.134,GR405,,-25.45111,...,4.154369,12:56:27,31749,NO_PROBLEM,20/09/66,12:57:31,FAZENDINHA/PUC,829222,M,2017-05-10
6,561,15,2005,6405261,-25.462226,-49.262185,3596.773,EC002,,-25.46229,...,11.532125,18:59:00,31986,NO_PROBLEM,24/04/57,18:59:08,GUILHERMINA,846408,F,2017-05-10
7,911,10,2260,4981458,-25.381588,-49.364092,6371.622,MN400,,-25.381475,...,22.17709,13:22:15,38953,NO_PROBLEM,18/10/83,13:23:17,OP. CONTIGENCIA,883963,F,2017-05-10
8,684,2,2148,6636357,-25.592501,-49.332138,1905.881,HR408,,-25.592538,...,5.097825,07:11:26,39373,NO_PROBLEM,06/08/73,07:11:39,RIO BONITO,927336,F,2017-05-10
9,515,3,1956,6367399,-25.511734,-49.221422,5028.205,EA164,,-25.511803,...,8.828443,07:35:50,33516,NO_PROBLEM,24/10/84,07:36:14,IGUAPE II,952666,F,2017-05-10


### Removing records of single-trip users

In [95]:
# Number of boardings per card over all filtered rows (adds a `count` column).
boarding_count = filtered_boardings.groupby('cardNum').count()

In [96]:
# Keep only cards with more than one boarding. boarding_count is grouped by
# cardNum alone, so "more than one" is over all loaded rows, not per date.
multiple_boardings = boarding_count.filter(F.col('count') > 1)

In [97]:
# Distinct card numbers in the raw (unfiltered) data.
# NOTE(review): distinct() keeps NULL as a value, so rows with a missing
# cardNum would add one to this total - confirm whether that is intended.
total_passengers = user_boardings.select('cardNum').distinct().count()
# Formatted single-argument print: same text under Python 2, valid Python 3.
print("Total #Passengers: {}".format(total_passengers))

Total #Passengers: 106885


In [98]:
# Cards with more than one boarding (counted after filtering/de-duplication).
passengers_mult_boardings = multiple_boardings.count()
# Use float arithmetic: with two ints, Python 2's `/` floors the result and the
# share was reported as a truncated whole number. Guard the empty-data case too.
# NOTE(review): the denominator comes from the raw data, the numerator from the
# filtered data - confirm this is the intended ratio.
prop_mult_boardings = (100.0 * passengers_mult_boardings / total_passengers
                       if total_passengers else 0.0)
print("Passengers with Multiple Boardings: {} ( {:.2f} %)".format(
    passengers_mult_boardings, prop_mult_boardings))
multiple_boardings.show()

Passengers with Multiple Boardings: 19895 ( 18 %)
+-------+-----+
|cardNum|count|
+-------+-----+
|3790541|    2|
|3754400|    2|
|2298968|    2|
|3821560|    2|
|3553903|    2|
|3639748|    4|
|3337195|    3|
|2921321|    2|
|1842089|    2|
|1065595|    2|
|3599737|    2|
|3513400|    2|
|3780899|    2|
|2570449|    2|
| 814837|    2|
|2356476|    3|
|2279865|    2|
|1921487|    3|
|3804401|    2|
|2695352|    2|
+-------+-----+
only showing top 20 rows



In [99]:
# Suffix the aggregate's columns so they are distinct from the boarding columns
# used in the join. withColumnRenamed is a no-op when the source column is
# absent, so re-running this cell is harmless (select/alias on the same
# variable fails on a second run because `cardNum` no longer exists).
multiple_boardings = (multiple_boardings
                      .withColumnRenamed("cardNum", "cardNum1")
                      .withColumnRenamed("count", "count1"))

In [100]:
# Left-semi join: keep the boardings whose card appears in multiple_boardings;
# only columns from filtered_boardings are retained.
clean_boardings = filtered_boardings.join(
    multiple_boardings,
    on=(filtered_boardings.cardNum == multiple_boardings.cardNum1),
    how='leftsemi',
)

In [101]:
# Number of boardings kept after removing cards with a single boarding.
clean_boardings.count()

43554

In [102]:
# Preview the first 10 cleaned boardings.
print_df(clean_boardings)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,777,11,2195,4299895,-25.442293,-49.269757,1126.717,JC006,,-25.442241,...,7.72891,17:59:39,32903,NO_PROBLEM,03/02/93,18:00:33,V. VELHA,444810,F,2017-05-10
1,658,1,2115,6642358,-25.491613,-49.349015,2871.251,JA027,,-25.491595,...,3.681068,06:18:32,35720,NO_PROBLEM,28/07/71,06:19:29,C.RASO/CAIUÁ,624028,M,2017-05-10
2,614,4,2030,5808518,-25.451123,-49.252929,9972.134,GR405,,-25.45111,...,4.154369,12:56:27,31749,NO_PROBLEM,20/09/66,12:57:31,FAZENDINHA/PUC,829222,M,2017-05-10
3,684,2,2148,6636357,-25.592501,-49.332138,1905.881,HR408,,-25.592538,...,5.097825,07:11:26,39373,NO_PROBLEM,06/08/73,07:11:39,RIO BONITO,927336,F,2017-05-10
4,20,4,3259,6011355,-25.486591,-49.255923,15948.485,BB613,,-25.486621,...,25.342314,12:51:26,32712,NO_PROBLEM,31/12/83,12:51:53,INTERBAIRR II H,993266,M,2017-05-10
5,40,1,1717,3366719,-25.43289,-49.352078,5740.658,JB499,,-25.433115,...,26.680422,06:41:13,33973,NO_PROBLEM,02/11/63,06:41:17,OP. CONTIGENCIA,1074995,F,2017-05-10
6,338,1,1871,5884653,-25.483813,-49.223565,9008.924,EA001,,-25.483675,...,15.946461,06:22:02,32574,NO_PROBLEM,22/12/54,06:22:13,CENTENÁRIO/HAUER,1366300,F,2017-05-10
7,860,9,2240,4494930,-25.44779,-49.308205,4419.491,LC506,,-25.447751,...,10.117585,18:04:40,30221,NO_PROBLEM,27/05/76,18:05:34,V. SANDRA,1393454,M,2017-05-10
8,650,6,4166,6543310,-25.559689,-49.338679,2885.735,HA031,,-25.559665,...,7.632886,11:23:40,36242,NO_PROBLEM,27/04/58,11:24:41,STA.RITA/PINHEIRINHO,1432524,M,2017-05-10
9,653,10,3119,6040541,-25.507529,-49.334542,1560.973,HA011,,-25.507698,...,18.89802,12:59:43,33855,NO_PROBLEM,08/09/58,05:19:17,SABARÁ,1536386,F,2017-05-10


In [103]:
# Write the cleaned boardings as CSV with a header row, replacing any existing
# output at that path. exp_data_folder_path already ends with "/", so the
# resulting path contains "//".
clean_boardings.write.csv(path=exp_data_folder_path+'/clean_boardings',header=True, mode='overwrite')