In [1]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Read Boarding Data

In [2]:
boarding_data = pd.read_csv('/local/tarciso/masters/data/bus_trips/test/doc1-2017_05_10.csv')

In [3]:
boarding_data.head()

Unnamed: 0,CODLINHA,NOMELINHA,CODVEICULO,NUMEROCARTAO,HORAUTILIZACAO,DATAUTILIZACAO,DATANASCIMENTO,SEXO
0,0,OPER S/LINHA,00070,1353891,11:05:57,09/05/17,22/11/58,M
1,542,BAIRRO NOVO B,GA117,2357837,17:25:14,09/05/17,23/03/72,F
2,0,OPER S/LINHA,09053,2357837,17:57:33,09/05/17,23/03/72,F
3,0,OPER S/LINHA,09053,2357837,17:57:28,09/05/17,23/03/72,F
4,21,INTERB II ANTI H,08046,1937533,20:17:34,09/05/17,26/01/72,F


In [6]:
boardings_total = len(boarding_data)
print boardings_total

320292


In [7]:
boarding_data.dtypes

CODLINHA          object
NOMELINHA         object
CODVEICULO        object
NUMEROCARTAO       int64
HORAUTILIZACAO    object
DATAUTILIZACAO    object
DATANASCIMENTO    object
SEXO              object
dtype: object

#### Adding datetime column to ticketing dataframe 

In [8]:
boarding_data['boarding_datetime'] = pd.to_datetime(boarding_data['DATAUTILIZACAO'] + ' ' + boarding_data['HORAUTILIZACAO'],format='%d/%m/%y %H:%M:%S')

### Read GPS data for the same day

In [9]:
gps_data = pd.read_csv('/local/tarciso/masters/data/bus_trips/test/buste-version-its-april-2018/2017_05_09.csv', dtype = {'route': str}, na_values='-')

In [10]:
gps_data.head()

Unnamed: 0,route,tripNum,shapeId,shapeSequence,shapeLat,shapeLon,distanceTraveledShape,busCode,gpsPointId,gpsLat,...,distanceToShapePoint,timestamp,stopPointId,problem,birthdate,cardTimestamp,lineName,cardNum,gender,date
0,500,1.0,4130,6463875,-25.50155,-49.23759,2129.862,GE718,,,...,,06:37:02,27472,BETWEEN,,,,,,2017_05_09
1,500,1.0,4130,6463945,-25.481825,-49.246977,4513.26,GE718,,-25.481693,...,15.561821,06:42:45,27551,NO_PROBLEM,,,,,,2017_05_09
2,500,1.0,4130,6464041,-25.459174,-49.25816,7268.131,GE718,,,...,,06:47:54,25471,BETWEEN,,,,,,2017_05_09
3,500,1.0,4130,6464140,-25.438837,-49.268129,9735.966,GE718,,-25.438738,...,12.105323,06:53:44,25515,NO_PROBLEM,,,,,,2017_05_09
4,500,1.0,4130,6464167,-25.43379,-49.270262,10543.422,GE718,,-25.433916,...,21.556105,06:58:25,27560,NO_PROBLEM,,,,,,2017_05_09


In [11]:
gps_data.dtypes

route                     object
tripNum                  float64
shapeId                    int64
shapeSequence              int64
shapeLat                 float64
shapeLon                 float64
distanceTraveledShape    float64
busCode                   object
gpsPointId               float64
gpsLat                   float64
gpsLon                   float64
distanceToShapePoint     float64
timestamp                 object
stopPointId                int64
problem                   object
birthdate                 object
cardTimestamp             object
lineName                  object
cardNum                  float64
gender                    object
date                      object
dtype: object

In [12]:
len(gps_data)

642639

#### Add date and datetime to gps dataframe

In [13]:
gps_data['date'] = gps_data['date'].str.replace('_','-')
gps_data['gps_datetime'] = pd.to_datetime(gps_data['date'] + ' ' + gps_data['timestamp'],format='%Y-%m-%d %H:%M:%S')
gps_data['boarding_datetime'] = pd.to_datetime(gps_data['date'] + ' ' + gps_data['cardTimestamp'],format='%Y-%m-%d %H:%M:%S')
gps_data['date'] = pd.to_datetime(gps_data['date'], format='%Y-%m-%d') 

In [17]:
gps_data[['date','gps_datetime','boarding_datetime']][pd.notnull(gps_data['boarding_datetime'])].head(10)

Unnamed: 0,date,gps_datetime,boarding_datetime
148,2017-05-09,2017-05-09 06:16:34,2017-05-09 06:17:31
150,2017-05-09,2017-05-09 06:18:08,2017-05-09 06:19:04
151,2017-05-09,2017-05-09 06:18:08,2017-05-09 06:19:00
152,2017-05-09,2017-05-09 06:18:08,2017-05-09 06:18:15
153,2017-05-09,2017-05-09 06:18:08,2017-05-09 06:18:11
154,2017-05-09,2017-05-09 06:19:18,2017-05-09 06:19:34
155,2017-05-09,2017-05-09 06:20:46,2017-05-09 06:21:39
157,2017-05-09,2017-05-09 06:23:04,2017-05-09 06:23:53
158,2017-05-09,2017-05-09 06:23:56,2017-05-09 06:24:41
163,2017-05-09,2017-05-09 06:29:02,2017-05-09 06:30:09


#### Analyzing Boarding and GPS Data

In [18]:
boarding_data[['NUMEROCARTAO','DATAUTILIZACAO','HORAUTILIZACAO','CODLINHA','CODVEICULO']] \
    .sort_values(['NUMEROCARTAO','DATAUTILIZACAO']).head(10)

Unnamed: 0,NUMEROCARTAO,DATAUTILIZACAO,HORAUTILIZACAO,CODLINHA,CODVEICULO
243380,228696,09/05/17,07:01:00,000,08024
243381,228696,09/05/17,13:55:35,000,05312
61748,229948,09/05/17,06:29:09,654,HA017
199680,233641,09/05/17,12:08:17,021,04020
199681,233641,09/05/17,06:39:21,000,01026
199751,257342,09/05/17,17:24:51,000,01021
199752,257342,09/05/17,07:48:58,511,EA172
14035,272904,09/05/17,17:24:52,000,03047
199772,300327,09/05/17,16:52:22,000,03057
199773,300327,09/05/17,06:56:45,OPC,HA240


In [20]:
gps_data[['cardNum','cardTimestamp','timestamp','route','busCode','tripNum','stopPointId']] \
            .sort_values(['cardNum','cardTimestamp','timestamp']).head(10)

Unnamed: 0,cardNum,cardTimestamp,timestamp,route,busCode,tripNum,stopPointId
558392,229948.0,06:29:09,06:28:49,654,HA017,2.0,35350
260499,257342.0,07:48:58,07:47:51,511,EA172,4.0,31195
408836,300327.0,06:56:45,06:55:54,654,HA240,3.0,36094
199446,304127.0,10:24:50,10:22:28,175,BC010,5.0,31748
305000,304127.0,10:57:36,10:42:18,370,LC016,3.0,3377
517064,304627.0,14:17:23,14:16:09,370,BC032,6.0,3377
421546,304627.0,16:24:40,15:45:37,370,LC020,6.0,30303
108958,306135.0,07:56:49,07:56:48,777,JC004,3.0,32097
258047,310241.0,17:55:48,17:55:42,468,DC090,5.0,30884
112368,312500.0,13:26:11,13:24:48,175,BC282,6.0,28632


### Checking for problems

#### Analyzing Trip Start/End times

In [21]:
trip_initial_time = gps_data[(gps_data['route'] == '654') & (gps_data['busCode'] == 'HA017')].sort_values('timestamp').groupby('tripNum').first() \
    .reset_index() \
    .sort_values(['tripNum','timestamp'])[['tripNum','timestamp']]
    
trip_final_time = gps_data[(gps_data['route'] == '654') & (gps_data['busCode'] == 'HA017')].sort_values('timestamp').groupby('tripNum').last() \
    .reset_index() \
    .sort_values(['tripNum','timestamp'])[['tripNum','timestamp']]

trip_initial_final_time = trip_initial_time.merge(trip_final_time, on='tripNum', how='inner')

trip_initial_final_time

Unnamed: 0,tripNum,timestamp_x,timestamp_y
0,1.0,05:16:44,05:56:19
1,2.0,05:58:50,06:32:01
2,3.0,06:36:39,07:19:31
3,4.0,07:24:13,08:04:12
4,5.0,08:05:18,08:45:37
5,6.0,08:50:02,09:37:32
6,7.0,09:40:53,10:22:45
7,8.0,10:25:55,11:05:00
8,9.0,11:09:38,11:57:20
9,10.0,12:03:37,12:46:17


#### Analyzing unique boarding-gps matches (there should be no duplicates)

In [24]:
first_cols = ['cardNum', 'boarding_datetime','gps_datetime','route','busCode','stopPointId']
other_cols = [col for col in gps_data.columns if col not in first_cols]
cols_order = first_cols + other_cols
boarding_key_cols = ['cardNum','boarding_datetime']
gps_by_boarding = gps_data[cols_order] \
    .dropna(subset=boarding_key_cols) \
    .sort_values(boarding_key_cols)
gps_by_boarding.head(20)

Unnamed: 0,cardNum,boarding_datetime,gps_datetime,route,busCode,stopPointId,tripNum,shapeId,shapeSequence,shapeLat,...,gpsLat,gpsLon,distanceToShapePoint,timestamp,problem,birthdate,cardTimestamp,lineName,gender,date
558392,229948.0,2017-05-09 06:29:09,2017-05-09 06:28:49,654,HA017,35350,2.0,2953,3848651,-25.492612,...,,,,06:28:49,BETWEEN,28/04/95,06:29:09,CAMPO ALEGRE,F,2017-05-09
260499,257342.0,2017-05-09 07:48:58,2017-05-09 07:47:51,511,EA172,31195,4.0,2746,5846156,-25.498727,...,-25.498735,-49.247771,9.667548,07:47:51,NO_PROBLEM,24/01/71,07:48:58,SÃO FRANCISCO,F,2017-05-09
408836,300327.0,2017-05-09 06:56:45,2017-05-09 06:55:54,654,HA240,36094,3.0,2106,6040871,-25.501307,...,-25.501313,-49.319971,0.73727,06:55:54,NO_PROBLEM,20/05/59,06:56:45,OP. CONTIGENCIA,F,2017-05-09
199446,304127.0,2017-05-09 10:24:50,2017-05-09 10:22:28,175,BC010,31748,5.0,1743,5444496,-25.450869,...,-25.450986,-49.254015,13.270845,10:22:28,NO_PROBLEM,09/06/54,10:24:50,BOM RETIRO / PUC,M,2017-05-09
305000,304127.0,2017-05-09 10:57:36,2017-05-09 10:42:18,370,LC016,3377,3.0,3669,6592154,-25.42822,...,-25.42821,-49.246896,5.426374,10:42:18,NO_PROBLEM,09/06/54,10:57:36,RUA XV / BARIGUI,M,2017-05-09
517064,304627.0,2017-05-09 14:17:23,2017-05-09 14:16:09,370,BC032,3377,6.0,2789,5510110,-25.428214,...,-25.428125,-49.246675,18.124441,14:16:09,NO_PROBLEM,05/04/53,14:17:23,RUA XV / BARIGUI,F,2017-05-09
421546,304627.0,2017-05-09 16:24:40,2017-05-09 15:45:37,370,LC020,30303,6.0,2789,5510843,-25.455839,...,-25.455803,-49.323013,5.475631,15:45:37,NO_PROBLEM,05/04/53,16:24:40,RUA XV / BARIGUI,F,2017-05-09
108958,306135.0,2017-05-09 07:56:49,2017-05-09 07:56:48,777,JC004,32097,3.0,2194,4299781,-25.44637,...,-25.446305,-49.275308,32.238018,07:56:48,NO_PROBLEM,29/10/52,07:56:49,V. VELHA,M,2017-05-09
258047,310241.0,2017-05-09 17:55:48,2017-05-09 17:55:42,468,DC090,30884,5.0,1926,4418132,-25.455683,...,-25.455618,-49.241156,8.32335,17:55:42,NO_PROBLEM,13/01/72,17:55:48,JD. ITIBERÊ,M,2017-05-09
112368,312500.0,2017-05-09 13:26:11,2017-05-09 13:24:48,175,BC282,28632,6.0,2743,5404932,-25.431576,...,-25.431573,-49.27168,6.383659,13:24:48,NO_PROBLEM,26/02/82,13:26:11,BOM RETIRO / PUC,F,2017-05-09
