In [26]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [35]:
def rename_columns(df, list_of_tuples):
    """Apply a sequence of column renames to a DataFrame.

    Args:
        df: object exposing `withColumnRenamed(old, new)`.
        list_of_tuples: iterable of `(old_name, new_name)` pairs, applied
            in the order given.

    Returns:
        The result of chaining every rename onto `df`; `df` itself is
        returned when the iterable is empty.
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_file(filepath, sqlContext):
    """Load one header-less CSV part file into a DataFrame with named columns.

    The file is read with schema inference and "-" treated as null. Spark
    names the columns `_c0`, `_c1`, ...; they are renamed positionally to
    the names listed in `column_names`. A file with fewer columns than the
    schema is padded with all-null columns so that every schema name exists
    in the result. A constant `date` column is appended, built from the first
    three "_"-separated tokens of the parent directory name
    (".../2017_05_11_veiculos.csv/part-00000" -> "2017-05-11").

    Args:
        filepath: "/"-separated path to the part file; it must contain at
            least one parent directory component.
        sqlContext: object exposing `read.csv(...)`.

    Returns:
        DataFrame with the renamed columns plus the `date` column.

    Raises:
        IndexError: if `filepath` has no parent directory component.
    """
    column_names = [
        "route",
        "tripNum",
        "shapeId",
        "shapeSequence",
        "shapeLat",
        "shapeLon",
        "distanceTraveledShape",
        "busCode",
        "gpsPointId",
        "gpsLat",
        "gpsLon",
        "distanceToShapePoint",
        "timestamp",
        "busStopId",
        "problem",
        "userBirthdate",
        "cardTimestamp",
        "lineName",
        "cardNum",
        "userGender",
    ]

    data_frame = sqlContext.read.csv(filepath, header=False,
                                     inferSchema=True, nullValue="-")

    # Pad up to the full schema width (not a hard-coded 16): otherwise a short
    # file would be left without the trailing named columns that the renames
    # below expect.
    while len(data_frame.columns) < len(column_names):
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [("_c" + str(position), name)
         for position, name in enumerate(column_names)]
    )

    # The parent directory is named "<YYYY>_<MM>_<DD>_...": keep the first
    # three tokens and join them with dashes.
    date = "-".join(filepath.split("/")[-2].split("_")[:3])

    data_frame = data_frame.withColumn("date", F.lit(date))

    return data_frame

def print_df(df,l=10):
    """Return the first `l` rows of `df` converted with `toPandas()`.

    Despite the name nothing is printed; the notebook renders the returned
    object when the call is the last expression of a cell.
    """
    preview = df.limit(l)
    return preview.toPandas()

### Reading BUSTE result data

In [36]:
# Reuse the active SparkContext when there is one (otherwise create it) and
# wrap it in a SQLContext, which provides the DataFrame reader.
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sparkContext=sc)

In [53]:
# Root folder of the experiment's sample data (machine-specific absolute path).
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
# Strip any trailing "/" from the root before appending the relative part so
# the resulting path does not contain a doubled separator.
user_boardings = read_file(exp_data_folder_path.rstrip('/') + '/bulma-output/2017_05_11_veiculos.csv/part-00000', sqlContext)

In [54]:
# Preview the first rows (print_df's default limit) of the loaded boardings.
print_df(user_boardings)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,372,1,1891,6136720,-25.428213,-49.204401,2604.58,CC170,,,...,,05:46:53,30817,BETWEEN,24/04/83,05:46:55,TARUMÃ,3395018,M,2017-05-11
1,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,13/06/62,06:07:55,TARUMÃ,459714,F,2017-05-11
2,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,28/07/70,06:07:00,TARUMÃ,2354665,F,2017-05-11
3,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,08/01/90,06:07:10,TARUMÃ,2694276,M,2017-05-11
4,372,1,1890,6137134,-25.428043,-49.204411,7388.022,CC170,,-25.428011,...,5.374579,06:23:08,30818,NO_PROBLEM,14/03/92,06:23:23,TARUMÃ,1707734,F,2017-05-11
5,372,1,1890,6137134,-25.428043,-49.204411,7388.022,CC170,,-25.428011,...,5.374579,06:23:08,30818,NO_PROBLEM,14/03/92,06:23:27,TARUMÃ,1707734,F,2017-05-11
6,372,1,1890,6137169,-25.42329,-49.201495,8200.134,CC170,,-25.423383,...,11.410669,06:25:25,30792,NO_PROBLEM,29/06/93,06:26:16,TARUMÃ,3794429,M,2017-05-11
7,372,1,1890,6137188,-25.419992,-49.200146,8748.949,CC170,,-25.419886,...,11.780608,06:26:54,30781,NO_PROBLEM,21/03/73,06:27:06,TARUMÃ,2525065,F,2017-05-11
8,372,1,1890,6137197,-25.417748,-49.201223,9020.108,CC170,,-25.417635,...,12.612804,06:27:45,30779,NO_PROBLEM,13/06/75,06:28:25,TARUMÃ,3203155,F,2017-05-11
9,372,1,1890,6137205,-25.415692,-49.20221,9268.598,CC170,,-25.415638,...,8.902769,06:28:27,30769,NO_PROBLEM,09/02/85,06:29:14,TARUMÃ,3768283,M,2017-05-11


In [55]:
# Show the column names and the types inferred while reading the CSV.
user_boardings.printSchema()

root
 |-- route: string (nullable = true)
 |-- tripNum: integer (nullable = true)
 |-- shapeId: integer (nullable = true)
 |-- shapeSequence: integer (nullable = true)
 |-- shapeLat: double (nullable = true)
 |-- shapeLon: double (nullable = true)
 |-- distanceTraveledShape: double (nullable = true)
 |-- busCode: string (nullable = true)
 |-- gpsPointId: string (nullable = true)
 |-- gpsLat: double (nullable = true)
 |-- gpsLon: double (nullable = true)
 |-- distanceToShapePoint: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- busStopId: integer (nullable = true)
 |-- problem: string (nullable = true)
 |-- userBirthdate: string (nullable = true)
 |-- cardTimestamp: string (nullable = true)
 |-- lineName: string (nullable = true)
 |-- cardNum: integer (nullable = true)
 |-- userGender: string (nullable = true)
 |-- date: string (nullable = false)



In [56]:
# Number of rows loaded, before any filtering.
user_boardings.count()

159889

In [57]:
# Number of distinct values in the cardNum column of the unfiltered data.
distinct_cards = user_boardings.select('cardNum').distinct()
distinct_cards.count()

106885

In [58]:
# NOTE: disabled exploratory query. It selects 'sec_group' and 'gps_timestamp',
# which are not among the columns produced by read_file(), so it cannot run
# against user_boardings as written.
#user_boardings.select(['route','busCode','tripNum','busStopId','sec_group','cardNum','cardTimestamp','gps_timestamp']).orderBy('cardNum','cardTimestamp').limit(20).toPandas()

### Removing rows with missing data

In [64]:
# Discard every row that has a null in any of the fields listed below.
required_fields = ["route", "busCode", "busStopId", "timestamp",
                   "gpsLat", "gpsLon", "cardNum", "cardTimestamp"]
filtered_boardings = user_boardings.dropna(how="any", subset=required_fields)

In [65]:
# Row count after dropping incomplete records, followed by a preview.
# print(...) with a single argument prints the same text on Python 2 and 3,
# unlike the Python 2-only print statement.
print(filtered_boardings.count())
print_df(filtered_boardings)

117704


Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,13/06/62,06:07:55,TARUMÃ,459714,F,2017-05-11
1,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,28/07/70,06:07:00,TARUMÃ,2354665,F,2017-05-11
2,372,1,1891,6136922,-25.432867,-49.270472,10381.698,CC170,,-25.432988,...,14.326287,06:05:23,26336,NO_PROBLEM,08/01/90,06:07:10,TARUMÃ,2694276,M,2017-05-11
3,372,1,1890,6137134,-25.428043,-49.204411,7388.022,CC170,,-25.428011,...,5.374579,06:23:08,30818,NO_PROBLEM,14/03/92,06:23:23,TARUMÃ,1707734,F,2017-05-11
4,372,1,1890,6137134,-25.428043,-49.204411,7388.022,CC170,,-25.428011,...,5.374579,06:23:08,30818,NO_PROBLEM,14/03/92,06:23:27,TARUMÃ,1707734,F,2017-05-11
5,372,1,1890,6137169,-25.42329,-49.201495,8200.134,CC170,,-25.423383,...,11.410669,06:25:25,30792,NO_PROBLEM,29/06/93,06:26:16,TARUMÃ,3794429,M,2017-05-11
6,372,1,1890,6137188,-25.419992,-49.200146,8748.949,CC170,,-25.419886,...,11.780608,06:26:54,30781,NO_PROBLEM,21/03/73,06:27:06,TARUMÃ,2525065,F,2017-05-11
7,372,1,1890,6137197,-25.417748,-49.201223,9020.108,CC170,,-25.417635,...,12.612804,06:27:45,30779,NO_PROBLEM,13/06/75,06:28:25,TARUMÃ,3203155,F,2017-05-11
8,372,1,1890,6137205,-25.415692,-49.20221,9268.598,CC170,,-25.415638,...,8.902769,06:28:27,30769,NO_PROBLEM,09/02/85,06:29:14,TARUMÃ,3768283,M,2017-05-11
9,372,1,1890,6137205,-25.415692,-49.20221,9268.598,CC170,,-25.415638,...,8.902769,06:28:27,30769,NO_PROBLEM,17/11/94,06:28:30,TARUMÃ,3599362,M,2017-05-11


### Removing duplicated passenger data

In [66]:
# Rows sharing the same card, date and card timestamp are treated as one
# boarding; only one row per such key is kept.
dedup_key = ['cardNum', 'date', 'cardTimestamp']
filtered_boardings = filtered_boardings.drop_duplicates(subset=dedup_key)

In [67]:
# Row count after de-duplication, followed by a preview.
# print(...) with a single argument prints the same text on Python 2 and 3,
# unlike the Python 2-only print statement.
print(filtered_boardings.count())
print_df(filtered_boardings)

117702


Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,216,7,1785,5950930,-25.466362,-49.279723,11325.265,CA600,,-25.466531,...,24.603973,17:05:48,29165,NO_PROBLEM,16/09/60,17:05:57,CABRAL / PORTÃO,317896,F,2017-05-11
1,471,3,1932,6356220,-25.436524,-49.261766,8309.606,EC294,,-25.436408,...,14.867619,08:47:41,30746,NO_PROBLEM,11/07/65,08:48:57,V. SÃO PAULO,361111,M,2017-05-11
2,629,10,2057,6460060,-25.512859,-49.258778,4986.604,KA698,,-25.512841,...,4.009584,16:28:16,29829,NO_PROBLEM,08/08/60,16:29:37,ALTO BOQUEIRÃO,384718,F,2017-05-11
3,30,1,1716,6293373,-25.426396,-49.202344,7173.084,BB608,,-25.426533,...,15.485907,06:19:30,30813,NO_PROBLEM,29/05/64,06:19:41,OP. CONTIGENCIA,432642,F,2017-05-11
4,265,6,2796,6414685,-25.42376,-49.270031,4828.191,HN616,,-25.423806,...,7.186085,15:15:27,28615,NO_PROBLEM,20/12/79,15:16:11,AHÚ/LOS ANGELES,567437,F,2017-05-11
5,876,1,2251,6065322,-25.414226,-49.334509,3759.8,BC299,,-25.414233,...,0.975568,06:40:56,34661,NO_PROBLEM,12/09/69,06:41:13,SAVÓIA,616792,F,2017-05-11
6,10,2,1708,5859650,-25.449428,-49.254131,8702.205,BB305,,-25.449436,...,10.500334,07:40:54,33171,NO_PROBLEM,28/11/85,08:04:58,INTERBAIRROS I H,691213,F,2017-05-11
7,924,4,2884,6322747,-25.390508,-49.283515,10180.475,BA140,,-25.390503,...,10.590269,12:47:31,27949,NO_PROBLEM,22/12/68,12:47:43,STA. FELICIDADE / STA. CÂNDIDA,703069,F,2017-05-11
8,542,7,1988,6403984,-25.549763,-49.259976,4240.869,GA124,,-25.549681,...,15.571715,07:29:32,31259,NO_PROBLEM,29/06/83,07:30:45,BAIRRO NOVO B,723951,M,2017-05-11
9,535,3,1979,6393338,-25.556785,-49.255025,592.564,EA076,,-25.556726,...,7.44891,06:25:50,36878,NO_PROBLEM,17/07/78,06:26:01,OSTERNACK/BOQ.,730511,M,2017-05-11


### Removing records of single-trip users

In [68]:
# One row per card with the number of boarding records it has ('count').
boardings_by_card = filtered_boardings.groupBy('cardNum')
boarding_count = boardings_by_card.count()

In [69]:
# Keep only the cards that have more than one boarding record. The grouping
# is by card only; the single file loaded above covers one date, so this is
# "more than one ride per day".
rode_more_than_once = boarding_count['count'] > 1
multiple_boardings = boarding_count.where(rode_more_than_once)

In [70]:
# Distinct cardNum values in the unfiltered data; used below as the
# denominator for the share of multi-boarding passengers.
# NOTE(review): this counts cards before the null/duplicate filtering, while
# the numerator comes from filtered_boardings - confirm that is intended.
total_passengers = user_boardings.select('cardNum').distinct().count()
# A single formatted string prints identically on Python 2 and Python 3.
print("Total #Passengers: {}".format(total_passengers))

Total #Passengers: 106885


In [71]:
# Number of cards with more than one boarding and their share of all cards.
passengers_mult_boardings = multiple_boardings.count()
# 100.0 forces float division: with two ints Python 2 would truncate the
# percentage to a whole number. An empty dataset yields 0.0 instead of a
# ZeroDivisionError.
if total_passengers:
    prop_mult_boardings = 100.0 * passengers_mult_boardings / total_passengers
else:
    prop_mult_boardings = 0.0
# A single formatted string prints identically on Python 2 and Python 3.
print("Passengers with Multiple Boardings: {} ( {:.2f} %)".format(
    passengers_mult_boardings, prop_mult_boardings))
multiple_boardings.show()

Passengers with Multiple Boardings: 19895 ( 18 %)
+-------+-----+
|cardNum|count|
+-------+-----+
|2869360|    2|
|3639748|    4|
|2298968|    2|
|3804401|    2|
|1920221|    2|
|3757862|    2|
|1959648|    2|
|3177352|    2|
|3834572|    2|
|3219791|    2|
|3773168|    2|
|2387514|    2|
|2570449|    2|
|3570263|    3|
|2417305|    2|
|3463650|    4|
|3686423|    2|
|1793246|    2|
|3556833|    2|
|3337195|    3|
+-------+-----+
only showing top 20 rows



In [72]:
# Give both columns new names so they cannot be confused with the cardNum
# column of filtered_boardings in the join that follows.
renamed_columns = [
    multiple_boardings["cardNum"].alias("cardNum1"),
    multiple_boardings["count"].alias("count1"),
]
multiple_boardings = multiple_boardings.select(*renamed_columns)

In [73]:
# Left semi join: keep the boardings whose card appears in multiple_boardings,
# without adding any of its columns to the result.
same_card = filtered_boardings.cardNum == multiple_boardings.cardNum1
clean_boardings = filtered_boardings.join(multiple_boardings, on=same_card, how='leftsemi')

In [74]:
# Number of boarding records that belong to multi-boarding cards.
clean_boardings.count()

43554

In [75]:
# Preview the first rows (print_df's default limit) of the cleaned boardings.
print_df(clean_boardings)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,216,7,1785,5950930,-25.466362,-49.279723,11325.265,CA600,,-25.466531,...,24.603973,17:05:48,29165,NO_PROBLEM,16/09/60,17:05:57,CABRAL / PORTÃO,317896,F,2017-05-11
1,471,3,1932,6356220,-25.436524,-49.261766,8309.606,EC294,,-25.436408,...,14.867619,08:47:41,30746,NO_PROBLEM,11/07/65,08:48:57,V. SÃO PAULO,361111,M,2017-05-11
2,629,10,2057,6460060,-25.512859,-49.258778,4986.604,KA698,,-25.512841,...,4.009584,16:28:16,29829,NO_PROBLEM,08/08/60,16:29:37,ALTO BOQUEIRÃO,384718,F,2017-05-11
3,876,1,2251,6065322,-25.414226,-49.334509,3759.8,BC299,,-25.414233,...,0.975568,06:40:56,34661,NO_PROBLEM,12/09/69,06:41:13,SAVÓIA,616792,F,2017-05-11
4,10,2,1708,5859650,-25.449428,-49.254131,8702.205,BB305,,-25.449436,...,10.500334,07:40:54,33171,NO_PROBLEM,28/11/85,08:04:58,INTERBAIRROS I H,691213,F,2017-05-11
5,924,4,2884,6322747,-25.390508,-49.283515,10180.475,BA140,,-25.390503,...,10.590269,12:47:31,27949,NO_PROBLEM,22/12/68,12:47:43,STA. FELICIDADE / STA. CÂNDIDA,703069,F,2017-05-11
6,535,3,1979,6393338,-25.556785,-49.255025,592.564,EA076,,-25.556726,...,7.44891,06:25:50,36878,NO_PROBLEM,17/07/78,06:26:01,OSTERNACK/BOQ.,730511,M,2017-05-11
7,20,3,3260,6012518,-25.388074,-49.283011,27166.955,KB605,,-25.387981,...,11.500394,11:13:20,27950,NO_PROBLEM,16/10/57,11:13:55,OP. CONTIGENCIA,744699,F,2017-05-11
8,285,6,1837,5985394,-25.431418,-49.272796,5502.652,BN628,,-25.431406,...,14.338867,14:05:22,26553,NO_PROBLEM,26/01/82,14:11:32,JUVEVE/A.VERDE,794318,F,2017-05-11
9,243,2,1817,5894195,-25.356545,-49.274381,4768.989,BA015,,-25.356585,...,22.466381,06:41:57,27536,NO_PROBLEM,05/06/73,06:44:35,STA. TEREZINHA,866478,F,2017-05-11


In [77]:
# Persist the cleaned boardings as CSV with a header row, replacing any
# previous output at that location. Any trailing "/" is stripped from the
# root folder so the output path does not contain a doubled separator.
clean_boardings.write.csv(path=exp_data_folder_path.rstrip('/') + '/clean_boardings',
                          header=True, mode='overwrite')