In [1]:
import pyspark
from pyspark import SparkContext
# from pyspark.sql.window import Window
# from pyspark.sql import functions as F
# from pyspark.sql import types as T

import json
import pandas as pd
import numpy as np

#### Functions

In [2]:
def read_data(sqlContext, filepath):
    """Load a CSV file through the ``com.databricks.spark.csv`` source.

    The reader is configured with a header row, schema inference turned on,
    and ``-`` interpreted as a null value.

    Args:
        sqlContext: Object exposing a ``read`` attribute that supports the
            ``format`` / ``option`` / ``load`` reader interface.
        filepath: Location of the CSV file; passed unchanged to ``load``.

    Returns:
        The object returned by ``load(filepath)``.
    """
    reader = sqlContext.read.format("com.databricks.spark.csv")
    csv_options = (
        ("header", "true"),
        ("inferSchema", "true"),
        ("nullValue", "-"),
    )
    # Options are applied in the order listed above.
    for option_name, option_value in csv_options:
        reader = reader.option(option_name, option_value)
    return reader.load(filepath)

def printdf(df, l=10):
    """Return the first ``l`` rows of ``df`` converted via ``toPandas()``.

    Args:
        df: Object supporting ``limit(n)`` whose result supports ``toPandas()``.
        l: Maximum number of rows to keep (default 10).

    Returns:
        The result of ``df.limit(l).toPandas()``.
    """
    preview = df.limit(l)
    return preview.toPandas()

In [3]:
# Obtain a SparkContext via getOrCreate() so re-running this cell does not
# try to construct a second context.
sc = SparkContext.getOrCreate()
# SQLContext built on top of `sc`; read_data() above takes an object like this
# as its first argument.
sqlContext = pyspark.SQLContext(sc)

### Read Boarding Data

In [4]:
# The ticketing sample is line-delimited JSON (lines=True).
# CODLINHA and CODVEICULO are identifiers whose leading zeros matter
# (e.g. '000', '00070'): later cells filter on CODLINHA == '000' and join
# CODVEICULO against the string column COD_URBS. Pin both to str so that
# read_json's dtype inference can never turn them into numbers.
boarding_data = pd.read_json(
    '/local/tarciso/masters/data/bus_trips/preliminary-exp-sample-data/ticketing-sample/doc1-2017051015.txt',
    lines=True,
    dtype={'CODLINHA': str, 'CODVEICULO': str},
)

In [5]:
# Preview the first rows to check which columns were parsed from the file.
boarding_data.head()

Unnamed: 0,CODLINHA,CODVEICULO,DATANASCIMENTO,DATAUTILIZACAO,NOMELINHA,NUMEROCARTAO,SEXO
0,0,00070,22/11/58,"09/05/17 11:05:57,000000",OPER S/LINHA,1353891,M
1,542,GA117,23/03/72,"09/05/17 17:25:14,000000",BAIRRO NOVO B,2357837,F
2,0,09053,23/03/72,"09/05/17 17:57:33,000000",OPER S/LINHA,2357837,F
3,0,09053,23/03/72,"09/05/17 17:57:28,000000",OPER S/LINHA,2357837,F
4,21,08046,26/01/72,"09/05/17 20:17:34,000000",INTERB II ANTI H,1937533,F


In [6]:
# Total number of ticketing records. len() counts rows regardless of nulls in
# any column and does not rely on positional indexing into a label-indexed
# Series.
boardings_total = len(boarding_data)

#### Checking number of boardings on route 000

In [7]:
# Number of records whose line code is '000'. Summing the boolean mask counts
# the matching rows directly, independent of column order or nulls elsewhere.
boardings_000 = int((boarding_data['CODLINHA'] == '000').sum())

In [8]:
# Route-000 boardings, total boardings, and the route-000 share of the total.
# Single-argument print(...) is valid on both Python 2 and Python 3.
print('{} {} {}'.format(boardings_000, boardings_total, boardings_000 / float(boardings_total)))

134629 320292 0.420332071984


#### Looking at 000 boardings in more detail

In [9]:
# Keep only the ticketing records whose line code is '000'.
boardings_000_df = boarding_data.loc[boarding_data['CODLINHA'] == '000']

In [10]:
# Inspect a sample of the route-000 records.
boardings_000_df.head(20)

Unnamed: 0,CODLINHA,CODVEICULO,DATANASCIMENTO,DATAUTILIZACAO,NOMELINHA,NUMEROCARTAO,SEXO
0,0,70,22/11/58,"09/05/17 11:05:57,000000",OPER S/LINHA,1353891,M
2,0,9053,23/03/72,"09/05/17 17:57:33,000000",OPER S/LINHA,2357837,F
3,0,9053,23/03/72,"09/05/17 17:57:28,000000",OPER S/LINHA,2357837,F
13,0,3009,28/08/84,"09/05/17 18:57:22,000000",OPER S/LINHA,3591720,F
15,0,3031,29/12/63,"09/05/17 07:20:19,000000",OPER S/LINHA,1225547,M
17,0,5040,17/01/79,"09/05/17 15:29:11,000000",OPER S/LINHA,2699543,F
20,0,9005,14/09/76,"09/05/17 06:59:45,000000",OPER S/LINHA,2788103,F
22,0,3037,01/06/89,"09/05/17 17:00:00,000000",OPER S/LINHA,2900987,F
25,0,5056,02/07/63,"09/05/17 07:42:22,000000",OPER S/LINHA,1947358,M
26,0,9029,22/02/65,"09/05/17 07:00:15,000000",OPER S/LINHA,3543165,M


In [31]:
# Summary statistics for every column; include='all' also reports the
# non-numeric columns (unique / top / freq).
boardings_000_df.describe(include='all')

Unnamed: 0,CODLINHA,CODVEICULO,DATANASCIMENTO,DATAUTILIZACAO,NOMELINHA,NUMEROCARTAO,SEXO
count,134629.0,134629.0,134629.0,134629,134629,134629.0,134629
unique,1.0,311.0,18778.0,45061,1,,3
top,0.0,3014.0,,"09/05/17 18:12:08,000000",OPER S/LINHA,,F
freq,134629.0,2479.0,1351.0,17,134629,,81325
mean,,,,,,2998047.0,
std,,,,,,1100731.0,
min,,,,,,228696.0,
25%,,,,,,2341897.0,
50%,,,,,,3230604.0,
75%,,,,,,3660119.0,


#### Read Line 000 Codes translation table

In [79]:
# COD_URBS is read as str so it can be joined against the string vehicle codes.
line_000_terminals = pd.read_csv(
    '/local/tarciso/masters/data/urbs/Tubos e Terminais FINAL corresp..csv',
    dtype={'COD_URBS': str},
)

In [80]:
# Preview the columns of the translation table.
line_000_terminals.head()

Unnamed: 0,LINHA,NOME,COD_URBS,GRUPO_ID,LAT,LON
0,0,TERMINAL BOQUEIRÃO,1,14489,-25.516723,-49.230568
1,0,TERMINAL BOQUEIRÃO,2,14489,-25.516723,-49.230568
2,0,TERMINAL BOQUEIRÃO,3,14489,-25.516723,-49.230568
3,0,TERMINAL CARMO,4,14485,-25.501311,-49.237825
4,0,TERMINAL CARMO,5,14485,-25.501311,-49.237825


In [81]:
# Non-null count per column of the translation table.
line_000_terminals.count()

LINHA       360
NOME        360
COD_URBS    360
GRUPO_ID    360
LAT         360
LON         360
dtype: int64

#### Match CODVEICULO to COD_URBS to find line 000 boarding stations

In [87]:
# Distinct (CODLINHA, CODVEICULO) pairs seen among the route-000 records.
route_000_vehicles = (
    boardings_000_df
    .loc[:, ['CODLINHA', 'CODVEICULO']]
    .drop_duplicates()
)

In [88]:
# Non-null count per column of the distinct (CODLINHA, CODVEICULO) pairs.
route_000_vehicles.count()

CODLINHA      311
CODVEICULO    311
dtype: int64

In [89]:
# Preview the distinct codes that will be matched against COD_URBS.
route_000_vehicles.head()

Unnamed: 0,CODLINHA,CODVEICULO
0,0,70
2,0,9053
13,0,3009
15,0,3031
17,0,5040


In [94]:
# Inner join: keep only the codes that appear in the translation table.
matched_000_stations = pd.merge(
    route_000_vehicles,
    line_000_terminals,
    how='inner',
    left_on='CODVEICULO',
    right_on='COD_URBS',
)

In [95]:
# Non-null count per column after the inner join; compare with the count of
# route_000_vehicles to see how many codes found a match.
matched_000_stations.count()

CODLINHA      287
CODVEICULO    287
LINHA         287
NOME          287
COD_URBS      287
GRUPO_ID      287
LAT           287
LON           287
dtype: int64

In [96]:
# Inspect a sample of the matched codes with their station names and coordinates.
matched_000_stations.head(20)

Unnamed: 0,CODLINHA,CODVEICULO,LINHA,NOME,COD_URBS,GRUPO_ID,LAT,LON
0,0,70,0,TERMINAL HAUER,70,14487,-25.4819,-49.247078
1,0,9053,0,ESTAÇÃO ARROIO CERCADO - SENT. TERM. PINHEIRINHO,9053,41833,-25.534667,-49.282752
2,0,3009,0,TERMINAL PORTÃO,3009,14499,-25.47641,-49.292681
3,0,3031,0,ESTAÇÃO SEBASTIÃO PARANÁ - SENT. BAIRRO,3031,26104,-25.461169,-49.289857
4,0,5040,0,ESTAÇÃO PASSEIO PÚBLICO - SENT. TERM. SANTA CÂ...,5040,41773,-25.423959,-49.268338
5,0,9005,0,TUBO TERM. CAIUÁ,9005,14515,-25.483504,-49.350138
6,0,3037,0,ESTAÇÃO SILVA JARDIM - SENT. BAIRRO,3037,26105,-25.448323,-49.287777
7,0,5056,0,ESTAÇÃO ANTÔNIO LAGO - SENT. CENTRO,5056,41812,-25.387903,-49.235215
8,0,9029,0,ESTAÇÃO VITAL BRASIL - SENT. BAIRRO,9029,26121,-25.464607,-49.291302
9,0,5053,0,ESTAÇÃO HOLANDA - SENT. BAIRRO,5053,41771,-25.399555,-49.246224


#### Read Stops GTFS file

In [97]:
# NOTE(review): this file lives under /local/tarciso/data/ while the other
# inputs are read from /local/tarciso/masters/data/ -- confirm the path is intended.
stops_df = pd.read_csv(
    '/local/tarciso/data/gtfs/curitiba/stops.txt'
)

In [98]:
# Preview the stops table.
stops_df.head(40)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,70,104505,Terminal Campina do Siqueira - 303 - Centenári...,Terminal Campina do Siqueira - Campo Comprido,-25.435724,-49.306998,,,0,14506.0,,
1,270,104905,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501341,-49.237597,,,0,14485.0,,
2,276,105606,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.45155,-49.214917,,,0,14481.0,,
3,299,105603,Terminal Oficinas - 030 - Interbairros III,Terminal Oficinas - 030 - Interbairros III (Se...,-25.451665,-49.215086,,,0,14481.0,,
4,308,104907,Terminal Carmo - 030 - Interbairros III,Terminal Carmo - 030 - Interbairros III (Senti...,-25.501311,-49.237825,,,0,14485.0,,
5,568,190836,"R. Dep. José Hoffmann, 80 - Vista Alegre",150 - C. Música / V.Alegre (Ponto Final),-25.408609,-49.29986,,,0,,,
6,581,110312,Praça Santos Andrade - 150 - C. da Música / Vi...,Praça Santos Andrade 150 - C.Música / V. Alegre .,-25.42821,-49.265846,,,0,,,
7,597,190896,"R. Eng. Agro. Lauro Klas, 106 - Pilarzinho",160 - Jd. Mercês / Guanabara (Ponto Final),-25.39841,-49.293255,,,0,,,
8,616,150689,"Rua Rio de Janeiro, 1293 - Água Verde",Ponto Final 160 - Jd. Mercês / Guanabara (Sent...,-25.462635,-49.27792,,,0,,,
9,662,190600,"Rua São Francisco Xavier, 132 - Pilarzinho",166 - Vila Nori (Ponto Final) 167 - Fredolin W...,-25.389211,-49.302757,,,0,,,


In [99]:
# Keep only the stops whose location_type equals 1.
terminal_stops = stops_df.loc[stops_df['location_type'] == 1]

In [100]:
# Number of stops with location_type == 1.
len(terminal_stops)

328

In [101]:
# Preview the location_type == 1 stops.
terminal_stops.head(40)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
118,9475,,Terminal Urbano Campo Largo,,-25.455531,-49.524119,,,1,,,
143,14415,,Estação Tubo Santa Quitéria,,-25.459128,-49.302436,,,1,,,
144,14432,,Estação Tubo Mercês,,-25.422391,-49.291635,,,1,,,
145,14434,,Estação Tubo Centro Cívico - Palácio Iguaçú,,-25.415392,-49.26791,,,1,,,
146,14470,,Terminal Santa Cândida,,-25.377073,-49.22436,,,1,,,
147,14471,,Terminal Boa Vista,,-25.393295,-49.241051,,,1,,,
148,14474,,Terminal Cabral,,-25.406659,-49.252791,,,1,,,
149,14476,,Terminal Bairro Alto,,-25.413047,-49.20548,,,1,,,
150,14478,,Terminal Capão da Imbuia,,-25.43998,-49.221858,,,1,,,
151,14481,,Terminal Oficinas,,-25.451505,-49.214964,,,1,,,


#### Match terminal stop codes to line 000 terminals

In [102]:
# Attach the GTFS stop record whose stop_id equals the station's GRUPO_ID.
line_000_stations_stops = pd.merge(
    matched_000_stations,
    terminal_stops,
    how='inner',
    left_on='GRUPO_ID',
    right_on='stop_id',
)

In [103]:
# Row count after joining on GRUPO_ID == stop_id; compare with the row count
# of matched_000_stations to see whether the inner join dropped any station.
len(line_000_stations_stops)

287

In [104]:
# Inspect a sample of the joined station / GTFS stop rows.
line_000_stations_stops.head(20)

Unnamed: 0,CODLINHA,CODVEICULO,LINHA,NOME,COD_URBS,GRUPO_ID,LAT,LON,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,0,70,0,TERMINAL HAUER,70,14487,-25.4819,-49.247078,14487,,Terminal Hauer,,-25.481927,-49.246999,,,1,,,
1,0,9,0,TERMINAL HAUER,9,14487,-25.4819,-49.247078,14487,,Terminal Hauer,,-25.481927,-49.246999,,,1,,,
2,0,7,0,TERMINAL HAUER,7,14487,-25.4819,-49.247078,14487,,Terminal Hauer,,-25.481927,-49.246999,,,1,,,
3,0,8,0,TERMINAL HAUER,8,14487,-25.4819,-49.247078,14487,,Terminal Hauer,,-25.481927,-49.246999,,,1,,,
4,0,9053,0,ESTAÇÃO ARROIO CERCADO - SENT. TERM. PINHEIRINHO,9053,41833,-25.534667,-49.282752,41833,,Estação Tubo Arroio Cercado,Estação Tubo Arroio Cercado - Sitio Cercado,-25.534673,-49.282746,,,1,,,
5,0,9052,0,ESTAÇÃO ARROIO CERCADO - SENT. TERM. SÍTIO CER...,9052,41833,-25.534667,-49.282752,41833,,Estação Tubo Arroio Cercado,Estação Tubo Arroio Cercado - Sitio Cercado,-25.534673,-49.282746,,,1,,,
6,0,3009,0,TERMINAL PORTÃO,3009,14499,-25.47641,-49.292681,14499,,Terminal Portão,,-25.476335,-49.292629,,,1,,,
7,0,3014,0,TERMINAL PORTÃO,3014,14499,-25.47641,-49.292681,14499,,Terminal Portão,,-25.476335,-49.292629,,,1,,,
8,0,3006,0,TERMINAL PORTÃO,3006,14499,-25.47641,-49.292681,14499,,Terminal Portão,,-25.476335,-49.292629,,,1,,,
9,0,3007,0,TERMINAL PORTÃO,3007,14499,-25.47641,-49.292681,14499,,Terminal Portão,,-25.476335,-49.292629,,,1,,,


In [105]:
# Translation table: the boarding code (CODVEICULO) followed by the GTFS stop columns.
line_000_codes_translation = line_000_stations_stops.loc[:, [
    'CODVEICULO',
    'stop_id',
    'stop_code',
    'stop_name',
    'stop_desc',
    'stop_lat',
    'stop_lon',
    'zone_id',
    'stop_url',
    'location_type',
    'parent_station',
    'stop_timezone',
    'wheelchair_boarding',
]]

In [106]:
# Show a bounded preview instead of dumping the whole translation table,
# matching the other preview cells in this notebook.
line_000_codes_translation.head(20)

Unnamed: 0,CODVEICULO,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,00070,14487,,Terminal Hauer,,-25.481927,-49.246999,,,1,,,
1,00009,14487,,Terminal Hauer,,-25.481927,-49.246999,,,1,,,
2,00007,14487,,Terminal Hauer,,-25.481927,-49.246999,,,1,,,
3,00008,14487,,Terminal Hauer,,-25.481927,-49.246999,,,1,,,
4,09053,41833,,Estação Tubo Arroio Cercado,Estação Tubo Arroio Cercado - Sitio Cercado,-25.534673,-49.282746,,,1,,,
5,09052,41833,,Estação Tubo Arroio Cercado,Estação Tubo Arroio Cercado - Sitio Cercado,-25.534673,-49.282746,,,1,,,
6,03009,14499,,Terminal Portão,,-25.476335,-49.292629,,,1,,,
7,03014,14499,,Terminal Portão,,-25.476335,-49.292629,,,1,,,
8,03006,14499,,Terminal Portão,,-25.476335,-49.292629,,,1,,,
9,03007,14499,,Terminal Portão,,-25.476335,-49.292629,,,1,,,


#### Match Line 000 stations to ticketing data

In [107]:
# Inner join on CODVEICULO: boardings whose code has no translation are dropped.
# NOTE(review): a CODVEICULO repeated in the translation table would duplicate
# boardings here -- confirm the key is unique.
boardings_000_stations = pd.merge(
    boardings_000_df,
    line_000_codes_translation,
    how='inner',
    on='CODVEICULO',
)

In [109]:
# Number of rows in the joined result; compare with len(boardings_000_df) to
# see how many route-000 boardings were assigned a stop.
len(boardings_000_stations)

122864