In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql import types as T

import json
import numpy as np

In [2]:
def rename_columns(df, list_of_tuples):
    """Apply a sequence of column renames to a DataFrame.

    Args:
        df: object exposing `withColumnRenamed(old, new)`.
        list_of_tuples: iterable of `(old_name, new_name)` pairs, applied in
            the order given.

    Returns:
        The result of chaining `withColumnRenamed` once per pair (the input
        itself when no pairs are given).
    """
    renamed = df
    for pair in list_of_tuples:
        current_name, target_name = pair
        renamed = renamed.withColumnRenamed(current_name, target_name)
    return renamed

def read_buste_data_v2(sqlContext, folderpath):
    """Load a BUSTE v2 output folder (header-less CSV) into a named DataFrame.

    The CSV is read with schema inference and "-" treated as null. If fewer
    than 16 columns were read, null columns are appended so that all 16
    expected names can be assigned. A "date" column is then added, holding
    the day before the date found in the output folder name.

    Args:
        sqlContext: object exposing the Spark `read` API (e.g. an SQLContext).
        folderpath: path of the output folder. One of its "/"-separated
            components is expected to start with `<year>_<month>_<day>_`,
            e.g. ".../2017_05_11_veiculos.csv/". The trailing "/" is optional.

    Returns:
        DataFrame with the 16 BUSTE v2 columns plus "date".
    """
    data_frame = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "false")\
        .option("inferSchema", "true") \
        .option("nullValue", "-")\
        .load(folderpath)

    # Pad with null columns up to the 16 positional names renamed below.
    while len(data_frame.columns) < 16:
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [
            ("_c0", "route"),
            ("_c1", "tripNum"),
            ("_c2", "shapeId"),
            ("_c3", "shapeSequence"),
            ("_c4", "shapeLat"),
            ("_c5", "shapeLon"),
            ("_c6", "distanceTraveledShape"),
            ("_c7", "busCode"),
            ("_c8", "gpsPointId"),
            ("_c9", "gpsLat"),
            ("_c10", "gpsLon"),
            ("_c11", "distanceToShapePoint"),
            ("_c12", "timestamp"),
            ("_c13", "busStopId"),
            ("_c14", "problem"),
            ("_c15", "numPassengers")
        ]
    )

    # Take the date from the last path component that starts with three
    # numeric "_"-separated tokens, so the result does not depend on whether
    # the path ends with "/".
    date_tokens = None
    for component in reversed(folderpath.split("/")):
        tokens = component.split("_")
        if len(tokens) >= 3 and all(token.isdigit() for token in tokens[:3]):
            date_tokens = tokens[:3]
            break
    if date_tokens is None:
        # No date-like component: keep the original positional lookup.
        date_tokens = folderpath.split("/")[-2].split("_")[:3]
    date = "-".join(date_tokens)

    # The stored date is one day before the date in the folder name.
    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"),1))

    return data_frame

def read_buste_data_v3(sqlContext, folderpath):
    """Load a BUSTE v3 output folder (header-less CSV) into a named DataFrame.

    The CSV is read with schema inference and "-" treated as null. If fewer
    than 20 columns were read, null columns are appended so that all 20
    expected names can be assigned. A "date" column is then added, holding
    the day before the date found in the output folder name.

    Args:
        sqlContext: object exposing the Spark `read` API (e.g. an SQLContext).
        folderpath: path of the output folder. One of its "/"-separated
            components is expected to start with `<year>_<month>_<day>_`,
            e.g. ".../2017_05_11_veiculos.csv/". The trailing "/" is optional.

    Returns:
        DataFrame with the 20 BUSTE v3 columns plus "date".
    """
    data_frame = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "false")\
        .option("inferSchema", "true") \
        .option("nullValue", "-")\
        .load(folderpath)

    # Pad with null columns up to the 20 positional names renamed below.
    while len(data_frame.columns) < 20:
        col_name = "_c" + str(len(data_frame.columns))
        data_frame = data_frame.withColumn(col_name, F.lit(None))

    data_frame = rename_columns(
        data_frame,
        [
            ("_c0", "route"),
            ("_c1", "tripNum"),
            ("_c2", "shapeId"),
            ("_c3", "shapeSequence"),
            ("_c4", "shapeLat"),
            ("_c5", "shapeLon"),
            ("_c6", "distanceTraveledShape"),
            ("_c7", "busCode"),
            ("_c8", "gpsPointId"),
            ("_c9", "gpsLat"),
            ("_c10", "gpsLon"),
            ("_c11", "distanceToShapePoint"),
            ("_c12", "timestamp"),
            ("_c13", "busStopId"),
            ("_c14", "problem"),
            ("_c15", "userBirthdate"),
            ("_c16", "cardTimestamp"),
            ("_c17", "lineName"),
            ("_c18", "cardNum"),
            ("_c19", "userGender")
        ]
    )

    # Take the date from the last path component that starts with three
    # numeric "_"-separated tokens, so the result does not depend on whether
    # the path ends with "/".
    date_tokens = None
    for component in reversed(folderpath.split("/")):
        tokens = component.split("_")
        if len(tokens) >= 3 and all(token.isdigit() for token in tokens[:3]):
            date_tokens = tokens[:3]
            break
    if date_tokens is None:
        # No date-like component: keep the original positional lookup.
        date_tokens = folderpath.split("/")[-2].split("_")[:3]
    date = "-".join(date_tokens)

    # The stored date is one day before the date in the folder name.
    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"),1))

    return data_frame

def read_buste_data_v3a(sqlContext, folderpath):
    """Load a BUSTE v3a output folder (CSV with a header row) into a DataFrame.

    The CSV is read with its header, schema inference, and "-" treated as
    null. Columns "cardNum18" and "cardNum19" are renamed to "cardNum" and
    "userGender". A "date" column is then added, holding the day before the
    date found in the output folder name.

    Args:
        sqlContext: object exposing the Spark `read` API (e.g. an SQLContext).
        folderpath: path of the output folder. One of its "/"-separated
            components is expected to start with `<year>_<month>_<day>_`,
            e.g. ".../2017_05_11_veiculos.csv/". The trailing "/" is optional.

    Returns:
        DataFrame with the header-defined columns (after the two renames)
        plus "date".
    """
    data_frame = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferSchema", "true") \
        .option("nullValue", "-")\
        .load(folderpath)

    # NOTE(review): "cardNum18"/"cardNum19" are presumably generated when the
    # header repeats the name "cardNum" at positions 18 and 19 - confirm
    # against the v3a output header.
    data_frame = rename_columns(
        data_frame,
        [
            ("cardNum18", "cardNum"),
            ("cardNum19", "userGender"),
        ]
    )

    # Take the date from the last path component that starts with three
    # numeric "_"-separated tokens, so the result does not depend on whether
    # the path ends with "/".
    date_tokens = None
    for component in reversed(folderpath.split("/")):
        tokens = component.split("_")
        if len(tokens) >= 3 and all(token.isdigit() for token in tokens[:3]):
            date_tokens = tokens[:3]
            break
    if date_tokens is None:
        # No date-like component: keep the original positional lookup.
        date_tokens = folderpath.split("/")[-2].split("_")[:3]
    date = "-".join(date_tokens)

    # The stored date is one day before the date in the folder name.
    data_frame = data_frame.withColumn("date", F.lit(date))
    data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"),1))

    return data_frame

def printdf(df,l=10):
    """Return the first `l` rows of `df` (10 by default) as a pandas object.

    The row limit is applied through `df.limit(l)` before `toPandas()` is
    called on the limited frame.
    """
    preview = df.limit(l)
    return preview.toPandas()

In [3]:
# Reuse the running SparkContext if there is one (otherwise create it) and
# wrap it in an SQLContext; the reader functions use it via `sqlContext.read`.
sc = SparkContext.getOrCreate()
sqlContext = pyspark.SQLContext(sc)

### Analyze BUSTE v2 result Data

In [4]:
# Base folder of the BUSTE v2 sample data (absolute, machine-specific path).
old_exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v2/'

In [5]:
# Load the v2 output folder named 2017_05_11; the reader derives the "date"
# column from that folder name.
old_user_boardings = read_buste_data_v2(sqlContext, old_exp_data_folder_path + '/buste-output/2017_05_11_veiculos.csv/')

In [6]:
# Number of raw v2 rows, before any null/duplicate removal.
old_user_boardings.count()

7900150

In [7]:
# Key columns for the v2 data: the cleaning step requires all of them to be
# non-null and uses them as the de-duplication key.
main_features = ['route','busCode','tripNum','busStopId','timestamp']

In [8]:
# Clean the v2 rows: drop rows with a null key column, keep one row per key,
# and sort by route, bus, trip and timestamp. Then preview up to 100 rows.
clean_old_user_boardings = (
    old_user_boardings
        .na.drop(subset=main_features)
        .dropDuplicates(main_features)
        .orderBy(['route','busCode','tripNum','timestamp'])
)
printdf(clean_old_user_boardings,l=100)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,gpsLon,distanceToShapePoint,timestamp,busStopId,problem,numPassengers,date
0,001,1,2938,6451385,-25.428133,-49.264531,338.081,BN997,,-25.428198,-49.264428,12.657534,06:21:08,29887,NO_PROBLEM,0,2017-05-10
1,001,1,2938,6451394,-25.430622,-49.263426,635.815,BN997,,-25.430830,-49.263316,25.616758,06:22:10,31453,NO_PROBLEM,0,2017-05-10
2,001,1,2938,6451403,-25.433455,-49.262219,972.378,BN997,,,,,06:22:44,31454,BETWEEN,0,2017-05-10
3,001,1,2938,6451415,-25.435452,-49.265462,1428.729,BN997,,-25.435406,-49.265455,5.110665,06:24:42,30748,NO_PROBLEM,0,2017-05-10
4,001,1,2938,6451418,-25.435893,-49.266571,1550.500,BN997,,-25.435816,-49.266521,9.893764,06:24:52,30749,NO_PROBLEM,0,2017-05-10
5,001,1,2938,6451424,-25.436991,-49.269392,1859.320,BN997,,-25.436960,-49.269536,14.841262,06:26:15,35216,NO_PROBLEM,0,2017-05-10
6,001,1,2938,6451461,-25.435749,-49.274886,2652.419,BN997,,-25.435708,-49.274871,4.839960,06:29:21,26166,NO_PROBLEM,0,2017-05-10
7,001,1,2938,6451475,-25.433142,-49.276703,3090.819,BN997,,,,,06:31:19,35219,BETWEEN,0,2017-05-10
8,001,1,2938,6451490,-25.430831,-49.276405,3473.319,BN997,,,,,06:33:21,29420,BETWEEN,0,2017-05-10
9,001,1,2938,6451495,-25.429963,-49.274031,3730.817,BN997,,,,,06:34:09,29082,BETWEEN,0,2017-05-10


In [9]:
# Spot-check one route ('002') in the cleaned v2 data, key columns only.
printdf(
    clean_old_user_boardings
        .filter(clean_old_user_boardings.route == '002')
        .select(main_features),
    l=30,
)

Unnamed: 0,route,busCode,tripNum,busStopId,timestamp
0,2,DN027,1,26550,06:46:03
1,2,DN027,1,26551,06:47:40
2,2,DN027,1,26552,06:48:16
3,2,DN027,1,28603,06:49:19
4,2,DN027,1,34249,06:50:08
5,2,DN027,1,28604,06:50:27
6,2,DN027,1,29080,06:50:53
7,2,DN027,1,47784,06:51:14
8,2,DN027,1,40026,06:52:20
9,2,DN027,1,39724,06:53:15


In [10]:
# Number of v2 rows remaining after cleaning.
clean_old_user_boardings.count()

540908

### Analyze BUSTE v3 result Data

In [11]:
# Base folder of the BUSTE v3 sample data (absolute, machine-specific path).
v3_exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3/'

In [12]:
# Load the v3 output folder named 2017_05_11; the reader derives the "date"
# column from that folder name.
v3_user_boardings = read_buste_data_v3(sqlContext, v3_exp_data_folder_path + '/buste-output/2017_05_11_veiculos.csv/')

In [13]:
# Number of raw v3 rows, before any null/duplicate removal.
v3_user_boardings.count()

159889

In [14]:
# Key columns for the v3 data: the v2 key plus 'cardNum'. The cleaning step
# requires all of them to be non-null and de-duplicates on them.
main_featuresv3 = ['route','busCode','tripNum','busStopId','timestamp','cardNum']

In [15]:
# Clean the v3 rows: drop rows with a null key column, keep one row per key,
# and sort by route, bus, trip and timestamp. Then preview up to 100 rows.
clean_v3_user_boardings = (
    v3_user_boardings
        .na.drop(subset=main_featuresv3)
        .dropDuplicates(main_featuresv3)
        .orderBy(['route','busCode','tripNum','timestamp'])
)
printdf(clean_v3_user_boardings,l=100)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,busStopId,problem,userBirthdate,cardTimestamp,lineName,cardNum,userGender,date
0,001,1,2938,6451534,-25.428539,-49.267148,4546.136,BN997,,-25.428566,...,3.468380,06:41:44,26360,NO_PROBLEM,07/07/52,06:46:35,C. CENTRO,1287114,F,2017-05-10
1,001,1,2938,6451534,-25.428539,-49.267148,4546.136,BN997,,-25.428566,...,3.468380,06:41:44,26360,NO_PROBLEM,30/06/76,06:45:29,C. CENTRO,1415447,F,2017-05-10
2,001,1,2938,6451534,-25.428539,-49.267148,4546.136,BN997,,-25.428566,...,3.468380,06:41:44,26360,NO_PROBLEM,,06:45:14,C. CENTRO,10054641,,2017-05-10
3,001,1,2938,6451534,-25.428539,-49.267148,4546.136,BN997,,-25.428566,...,3.468380,06:41:44,26360,NO_PROBLEM,,06:43:32,C. CENTRO,10060773,,2017-05-10
4,001,2,2938,6451403,-25.433455,-49.262219,972.378,BN997,,-25.433371,...,14.042668,06:49:45,31454,NO_PROBLEM,,06:50:12,C. CENTRO,10043861,,2017-05-10
5,001,2,2938,6451490,-25.430831,-49.276405,3473.319,BN997,,-25.430690,...,17.824696,07:00:16,29420,NO_PROBLEM,07/12/76,07:01:14,C. CENTRO,3265381,F,2017-05-10
6,001,3,2938,6451385,-25.428133,-49.264531,338.081,BN997,,-25.428088,...,4.959445,07:13:25,29887,NO_PROBLEM,16/12/68,07:14:12,C. CENTRO,2269224,M,2017-05-10
7,001,3,2938,6451509,-25.429985,-49.271273,4070.474,BN997,,,...,,07:30:34,26375,BETWEEN,03/12/63,07:33:08,C. CENTRO,3360767,M,2017-05-10
8,001,4,2938,6451385,-25.428133,-49.264531,338.081,BN997,,-25.428031,...,11.533046,07:38:36,29887,NO_PROBLEM,10/11/94,07:39:31,C. CENTRO,3461019,M,2017-05-10
9,001,4,2938,6451394,-25.430622,-49.263426,635.815,BN997,,-25.430811,...,22.178640,07:40:10,31453,NO_PROBLEM,17/02/98,07:40:42,C. CENTRO,3818927,F,2017-05-10


In [16]:
# Spot-check one route ('002') in the cleaned v3 data, key columns only.
printdf(
    clean_v3_user_boardings
        .filter(clean_v3_user_boardings.route == '002')
        .select(main_featuresv3),
    l=30,
)

Unnamed: 0,route,busCode,tripNum,busStopId,timestamp,cardNum
0,2,DN027,1,26552,06:48:16,10046062
1,2,DN027,1,47784,06:51:14,3396510
2,2,DN027,1,40029,07:02:44,2908816
3,2,DN027,1,31458,07:08:13,3815627
4,2,DN027,1,31458,07:08:13,3215276
5,2,DN027,1,31458,07:08:13,3367073
6,2,DN027,1,31458,07:08:13,2137936
7,2,DN027,1,31458,07:08:13,3499752
8,2,DN027,1,31458,07:08:13,2820014
9,2,DN027,2,40026,07:30:29,3587172


In [17]:
# Number of v3 rows remaining after cleaning.
clean_v3_user_boardings.count()

132323

### Analyze BUSTE v3a result Data

In [18]:
# Base folder of the BUSTE v3a sample data (absolute, machine-specific path).
# Note that this literal ends with '//', unlike the v2/v3 base paths.
v3a_exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/buste-v3a//'

In [19]:
# Load the v3a output folder named 2017_05_11; the reader derives the "date"
# column from that folder name.
v3a_user_boardings = read_buste_data_v3a(sqlContext, v3a_exp_data_folder_path + '/buste-output/2017_05_11_veiculos.csv/')

In [20]:
# Number of raw v3a rows, before any null/duplicate removal.
v3a_user_boardings.count()

60979

In [21]:
# Key columns for the v3a data. The stop column is 'stopPointId' here, where
# the v2/v3 key lists use 'busStopId'.
main_features_v3a = ['route','busCode','tripNum','stopPointId','timestamp','cardNum']

In [22]:
# Clean the v3a rows: drop rows with a null key column, keep one row per key,
# and sort by route, bus, trip and timestamp. Then preview up to 100 rows.
clean_v3a_user_boardings = (
    v3a_user_boardings
        .na.drop(subset=main_features_v3a)
        .dropDuplicates(main_features_v3a)
        .orderBy(['route','busCode','tripNum','timestamp'])
)
printdf(clean_v3a_user_boardings,l=100)

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,stopPointId,problem,birthdate,cardTimestamp,lineName,cardNum,userGender,date
0,60,3,1721,4389251,-25.441576,-49.346829,0.000,HB302,,-25.416683,...,12679.947000,04:54:25,31003,OUTLIER_POINT,06/02/76,04:57:44,INTERBAIRROS VI,2584115,F,2017-05-10
1,60,3,1721,4389251,-25.441576,-49.346829,0.000,HB302,,-25.416683,...,12679.947000,04:54:25,31003,OUTLIER_POINT,20/05/86,05:07:52,INTERBAIRROS VI,3785710,F,2017-05-10
2,60,3,1721,4390315,-25.519425,-49.294535,19860.780,HB302,,,...,,06:13:12,33272,BETWEEN,23/07/82,06:13:57,INTERBAIRROS VI,2965373,M,2017-05-10
3,60,4,1722,4392541,-25.529083,-49.326491,6191.071,HB302,,-25.529088,...,1.683573,05:43:45,33238,NO_PROBLEM,29/04/75,05:44:38,INTERBAIRROS VI,3672578,F,2017-05-10
4,60,4,1722,4392558,-25.525203,-49.330124,6755.664,HB302,,,...,,05:45:35,33233,BETWEEN,07/09/80,05:45:57,INTERBAIRROS VI,1979370,F,2017-05-10
5,60,4,1722,4392558,-25.525203,-49.330124,6755.664,HB302,,,...,,05:45:35,33233,BETWEEN,16/10/72,05:45:48,INTERBAIRROS VI,3734104,F,2017-05-10
6,60,4,1722,4392586,-25.519669,-49.334358,7505.616,HB302,,-25.519696,...,7.450906,05:46:45,33232,NO_PROBLEM,25/04/76,05:47:25,INTERBAIRROS VI,3615381,F,2017-05-10
7,60,4,1722,4392586,-25.519669,-49.334358,7505.616,HB302,,-25.519696,...,7.450906,05:46:45,33232,NO_PROBLEM,20/02/80,05:47:29,INTERBAIRROS VI,2357789,M,2017-05-10
8,60,4,1722,4392708,-25.485663,-49.346612,11485.391,HB302,,,...,,05:53:37,33204,BETWEEN,19/04/56,05:56:15,INTERBAIRROS VI,3458036,F,2017-05-10
9,60,4,1722,4392708,-25.485663,-49.346612,11485.391,HB302,,,...,,05:53:37,33204,BETWEEN,25/01/81,05:56:25,INTERBAIRROS VI,3630182,M,2017-05-10


In [23]:
# Spot-check route 2 in the cleaned v3a data, key columns only.
# NOTE(review): `route` is compared with the integer 2 here, while the v2/v3
# cells compare with the string '002' - confirm the inferred type of `route`
# in this frame.
printdf(
    clean_v3a_user_boardings
        .filter(clean_v3a_user_boardings.route == 2)
        .select(main_features_v3a),
    l=30,
)

Unnamed: 0,route,busCode,tripNum,stopPointId,timestamp,cardNum


In [24]:
# Number of v3a rows remaining after cleaning.
clean_v3a_user_boardings.count()

10284

In [25]:
# Same route-2 check on the raw (uncleaned) v3a frame, to see whether the
# rows are absent from the source rather than removed by cleaning.
printdf(
    v3a_user_boardings
        .filter(v3a_user_boardings.route == 2)
        .select(main_features_v3a),
    l=30,
)

Unnamed: 0,route,busCode,tripNum,stopPointId,timestamp,cardNum


### Read Bus Card Data

In [26]:
# Base folder of the sample data (absolute, machine-specific path) and the
# raw ticketing (bus card) records, read from a JSON file.
exp_data_folder_path = '/local/tarciso/masters/experiments/preliminary-exp/preliminary-exp-sample-data/'
ticketing_data = sqlContext.read.json(exp_data_folder_path + '/ticketing-sample/doc1-2017051115.txt')

In [27]:
#Renaming columns to english
ticketing_data = ticketing_data.select(F.col("CODLINHA").alias("route"),
                                       F.col("CODVEICULO").alias("busCode"),
                                       F.col("DATANASCIMENTO").alias("userBirthdate"),
                                       F.col("DATAUTILIZACAO").alias("cardTimestamp"),
                                       F.col("NOMELINHA").alias("lineName"),
                                       F.col("NUMEROCARTAO").alias("cardNum"),
                                       F.col("SEXO").alias("gender"))

In [28]:
# Preview the renamed ticketing records. printdf already limits the frame to
# its default of 10 rows, so no extra .limit(10) is needed here.
printdf(ticketing_data)

Unnamed: 0,route,busCode,userBirthdate,cardTimestamp,lineName,cardNum,gender
0,021,08046,26/01/72,"10/05/17 20:15:16,000000",INTERB II ANTI H,1937533,F
1,021,08027,26/01/72,"10/05/17 13:10:24,000000",INTERB II ANTI H,1937533,F
2,623,HA022,06/03/71,"10/05/17 08:23:45,000000",PQ.INDUSTRIAL,1311020,F
3,000,03023,06/03/71,"10/05/17 11:54:19,000000",OPER S/LINHA,1311020,F
4,TPH,03019,23/11/79,"10/05/17 13:30:10,000000",TERMINAL PINHEIRINHO,2425635,F
5,040,JB603,28/08/84,"10/05/17 07:52:52,000000",INTERBAIRROS IV,3591720,F
6,000,09025,28/08/84,"10/05/17 18:34:06,000000",OPER S/LINHA,3591720,F
7,000,05018,17/01/79,"10/05/17 06:15:31,000000",OPER S/LINHA,2699543,F
8,040,MB605,17/08/57,"10/05/17 17:57:28,000000",INTERBAIRROS IV,2293790,M
9,535,EA077,20/05/96,"10/05/17 10:03:56,000000",OSTERNACK/BOQ.,3606743,M


In [29]:
# Clean the ticketing records: drop rows missing route, bus, card number or
# card timestamp, keep one row per combination of those four columns, and
# sort by card and timestamp.
# NOTE(review): in the displayed sample `cardTimestamp` looks like a
# "dd/MM/yy HH:mm:ss" string, so this ordering would be lexicographic -
# fine within one day; confirm before using on multi-day data.
clean_ticketing_data = (
    ticketing_data
        .na.drop(subset=["route","busCode","cardNum","cardTimestamp"])
        .dropDuplicates(["route","busCode","cardNum","cardTimestamp"])
        .orderBy(['cardNum','cardTimestamp'])
)

In [30]:
# Preview up to 30 cleaned ticketing records (ordered by card and timestamp).
printdf(clean_ticketing_data,l=30)

Unnamed: 0,route,busCode,userBirthdate,cardTimestamp,lineName,cardNum,gender
0,000,08024,27/04/94,"10/05/17 07:02:44,000000",OPER S/LINHA,228696,F
1,828,LA053,28/04/95,"10/05/17 12:28:19,000000",C.COMP/C.RASO,229948,F
2,000,05062,28/04/95,"10/05/17 15:25:52,000000",OPER S/LINHA,229948,F
3,633,GA159,05/02/85,"10/05/17 06:29:20,000000",MARIA ANGÉLICA,242949,F
4,000,06024,05/02/85,"10/05/17 09:33:50,000000",OPER S/LINHA,242949,F
5,000,03048,17/09/74,"10/05/17 14:52:25,000000",OPER S/LINHA,272904,F
6,000,03047,17/09/74,"10/05/17 16:57:55,000000",OPER S/LINHA,272904,F
7,654,HA240,20/05/59,"10/05/17 06:58:37,000000",CAMPO ALEGRE,300327,F
8,000,03057,20/05/59,"10/05/17 16:49:14,000000",OPER S/LINHA,300327,F
9,000,03068,08/07/64,"10/05/17 13:47:01,000000",OPER S/LINHA,304096,F


In [31]:
# Preview ticketing records on routes other than '000', ordered by route,
# bus and card timestamp.
printdf(
    clean_ticketing_data
        .filter(clean_ticketing_data.route != '000')
        .orderBy('route','busCode','cardTimestamp'),
    l=30,
)

Unnamed: 0,route,busCode,userBirthdate,cardTimestamp,lineName,cardNum,gender
0,2,DN027,23/02/75,"10/05/17 06:41:23,000000",C. CENTRO (AH),2825022,F
1,2,DN027,,"10/05/17 06:48:29,000000",C. CENTRO (AH),10046062,
2,2,DN027,15/07/97,"10/05/17 06:52:08,000000",C. CENTRO (AH),3396510,F
3,2,DN027,18/01/78,"10/05/17 07:03:21,000000",C. CENTRO (AH),2908816,F
4,2,DN027,13/03/81,"10/05/17 07:12:10,000000",C. CENTRO (AH),3815627,F
5,2,DN027,28/03/67,"10/05/17 07:12:17,000000",C. CENTRO (AH),3367073,F
6,2,DN027,06/04/74,"10/05/17 07:13:34,000000",C. CENTRO (AH),3215276,F
7,2,DN027,06/04/74,"10/05/17 07:13:37,000000",C. CENTRO (AH),3215276,F
8,2,DN027,04/01/84,"10/05/17 07:14:32,000000",C. CENTRO (AH),2820014,F
9,2,DN027,07/06/79,"10/05/17 07:14:40,000000",C. CENTRO (AH),3499752,F


### Analyze usage of line 000

In [32]:
# Number of distinct cards in the cleaned ticketing data.
clean_ticketing_data.select('cardNum').distinct().count()

185937

In [33]:
# Number of distinct cards with at least one record on route '000'.
clean_ticketing_data.filter(clean_ticketing_data.route == '000').select('cardNum').distinct().count()

97518

In [34]:
# Number of cleaned ticketing records per card (columns: cardNum, count).
num_boardings = clean_ticketing_data.groupby('cardNum').count()

In [35]:
# Preview the per-card record counts (printdf shows 10 rows by default).
printdf(num_boardings)

Unnamed: 0,cardNum,count
0,328918,2
1,353208,2
2,425245,2
3,425429,1
4,430520,1
5,430891,1
6,431802,2
7,456908,2
8,496775,2
9,537146,1


In [36]:
# Number of cards with more than one record.
num_boardings.filter(F.col('count') > 1).count()

91434

In [37]:
# Share of cards with more than one record: 91434 such cards (output of
# In [36]) over 185937 distinct cards (output of In [32]). The literals are
# copied from those outputs and must be updated if the data changes.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(91434/float(185937))

0.491747204698
