In [1]:
# Exploring and preparing GTFS file to use to clean the dataset

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1608594734397_0002,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Helpfunl links
# https://sites.google.com/site/gtfschanges/proposals/route-type --> route types3 
# GTFS Basics -- > https://www.youtube.com/watch?v=azd2Pbt2fho by Esri Working Group

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## O que é GTFS?/ What is GTFS?

### GTFS --> https://developers.google.com/transit/gtfs

A Especificação Geral sobre Feeds de Transporte Público (GTFS, na sigla em inglês), 
também chamada de GTFS estática ou transporte público estático para diferenciá-la da 
extensão GTFS Realtime, define um formato comum para os horários de transporte público e 
as informações geográficas relacionadas. Os "feeds" GTFS permitem que empresas publiquem 
dados relevantes e que desenvolvedores criem aplicativos que consumam essas informações com interoperabilidade.

Cada arquivo modela um aspecto específico das informações sobre o transporte público: 
paradas, trajetos, viagens e outros dados relativos a horários. 
Os detalhes de cada arquivo são definidos na referência GTFS.


### Referência de arquivos/ Files reference 

- **reference:** https://developers.google.com/transit/gtfs/reference


### Arquivos utilizados para o processamento / GTFS Files for processing data

- routes.txt
- trips.txt
- shapes.txt
- stops.txt
- stop_times.txt


### Arquivos AVL / AVL Files
- MO_XXX --> registro de posicoes dos veiculos do dia XXX de outubro/vehicle positions registers of the buses
- AL_XXX --> arquivo descrevendo rota da linha e direção/file describing line route and direction

In [3]:
# https://sites.google.com/site/gtfschanges/proposals/route-type --> route types3 
# GTFS Basics -- > https://www.youtube.com/watch?v=azd2Pbt2fho by Esri Working Group

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Attempt 1
# 1- Open file MO and AL
# 2- For each row MO, get line_id
# 3- search the line-id in AL file and get route and direction
# 4- for each row in MO put route and direction found in AL file
# PS1: in AL file the direction can be 1 or 2, while the direction in trips.txt (GTFS file) can be 0 or 1, 
# so a transformation in the future will be required
# PS2: maybe stops.txt will be added for processing the dataset for identifying bus terminals

# Tentativa 1
# 1- Abrir os arquivos MO e AL
# 2- Para cada linha em MO, obter o campo line_id
# 3- Procurar o line_id no arquivo AL correspondente do MO, obter no arquivo AL a rota e direção
# 4- Para cada linha em MO, identificar a rota e direção correspondente no arquivo AL
# OBS1:  No arquivo AL a direção do veículo pode ser 1 ou 2, já no arquivo de trips.txt do GTFS a direcao é 1 ou 0 
# no futuro alguma transformação será necessária para corrigir essa diferença
# OBS2:  talvez o arquivo stops.txt será utilizado para processar o dataset para identificar os pontos em terminais  de onibus

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
from pyspark import SparkConf
from pyspark import SparkContext

spark_conf = (SparkConf().set("spark.speculation", "false"))
sc = SparkContext.getOrCreate(conf = spark_conf)

# spark = sparkSession
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version","2")

# installing required packages for this notebook session
# installing required packages for this notebook session
sc.install_pypi_package("matplotlib")
sc.install_pypi_package("descartes")
sc.install_pypi_package("shapely")
sc.install_pypi_package("pyproj==2.2.2")
sc.install_pypi_package("geopandas")
sc.install_pypi_package("boto3")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting matplotlib
  Using cached https://files.pythonhosted.org/packages/30/f2/10c822cb0ca5ebec58bd1892187bc3e3db64a867ac26531c6204663fc218/matplotlib-3.3.3-cp37-cp37m-manylinux1_x86_64.whl
Collecting python-dateutil>=2.1 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl
Collecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/8a/bb/488841f56197b13700afd5658fc279a2025a39e22449b7cf29864669b15d/pyparsing-2.4.7-py2.py3-none-any.whl
Collecting pillow>=6.2.0 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/af/fa/c1302a26d5e1a17fa8e10e43417b6cf038b0648c4b79fcf2302a4a0c5d30/Pillow-8.0.1-cp37-cp37m-manylinux1_x86_64.whl
Collecting cycler>=0.10 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/f7/d2/e07d3ebb2bd7af696440ce7e754c59dd546ffe1bbe732c8a

In [6]:
# libs
import geopandas as gpd
from shapely.geometry import Point, Polygon
from pyspark.sql.functions import *

# Reading São Paulo ESRI shape file
sp_shape = gpd.read_file('s3://mobility-traces-sp/aux-files/shape-sp/DISTRITO_MUNICIPAL_SP_SMDUPolygon.shp')

# Receives a point (lat,long) in a row, and sp variable shape file
def get_region(row, sp):
    # row[0] = longitude, row[1] = latitude
    point = Point((float(row[0]), float(row[1])))
    
    # 96 districts
    for i in range(96):
        # if the point is within that region, the function return the region Name
        if point.within(sp.loc[i, "geometry"]):
            return sp.loc[i, "Nome"]
    # if the point is outside sp, the function returns "None"
    return "None"


# user definied function for spark
def get_region_udf(sp):
    return udf(lambda x: get_region(x, sp))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Joining Stop_time x Stops by stop_id

stops = spark.read.csv("s3a://mobility-traces-sp/aux-files/gtfs/stops.csv",header=True)
stop_times = spark.read.csv("s3a://mobility-traces-sp/aux-files/gtfs/stop_times.csv",header=True)


stops.show(10)
stop_times.show(10)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+---------------+---------+----------+----------+
|stop_id|      stop_name|stop_desc|  stop_lat|  stop_lon|
+-------+---------------+---------+----------+----------+
|  18848|       Clínicas|     null| -23.55419|-46.670723|
|  18849|  Vila Madalena|     null| -23.54672|-46.690738|
|  18850|     Consolação|     null|-23.557501|-46.660872|
|  18851|      Conceição|     null|-23.636214|-46.641141|
|  18852|      Jabaquara|     null|-23.645754|-46.642084|
|  18853|      São Judas|     null| -23.62558|-46.640778|
|  18854|          Saúde|     null|-23.618285|-46.639148|
|  18855|Praça Da Árvore|     null|-23.610578|-46.637848|
|  18856|     Santa Cruz|     null|-23.599225|-46.636659|
|  18857|   Vila Mariana|     null|-23.589541|-46.634701|
+-------+---------------+---------+----------+----------+
only showing top 10 rows

+---------+------------+--------------+---------+-------------+
|  trip_id|arrival_time|departure_time|  stop_id|stop_sequence|
+---------+------------+----------

In [9]:
# Joining Stop_time x Stops by stop_id, and dropping not useful columns

joined = stop_times.join(stops,on=["stop_id"],how="left").drop("arrival_time","departure_time","stop_name","stop_desc")


# Adding region for each stop
joined_with_region = joined.withColumn("region", get_region_udf(sc.broadcast(sp_shape).value)
                                                           (struct(joined["stop_lon"],
                                                             joined["stop_lat"])))

joined_with_region = joined_with_region.repartition(100)


joined_with_region.repartition(100).write.parquet("s3://mobility-traces-sp/aux-files/gtfs/stopsXstop_times-region/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Checking if a shapes starts on the stops
# Example 
# Trip_id 1016-10-0
# Shape_ids = 53755 (checked in trips.csv)

all_stops = spark.read.parquet("s3a://mobility-traces-sp/aux-files/gtfs/stopsXstop_times-region/")

coords_stops = all_stops.filter("trip_id == '1016-10-0'").select("stop_lat","stop_lon").rdd.map(lambda r: [r[0],r[1]]).collect()

# reading shapes
all_shapes = spark.read.csv("s3a://mobility-traces-sp/aux-files/gtfs/shapes.csv",header=True)

coords_shapes = all_shapes.filter("shape_id == '53755'").select("shape_pt_lat","shape_pt_lon").rdd.map(lambda r: [r[0],r[1]]).collect()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
sc.install_pypi_package("folium")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting folium
  Using cached https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl
Collecting branca>=0.3.0 (from folium)
  Using cached https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Collecting jinja2>=2.9 (from folium)
  Using cached https://files.pythonhosted.org/packages/30/9e/f663a2aa66a09d838042ae1a2c5659828bb9b41ea3a6efa20a20fd92b121/Jinja2-2.11.2-py2.py3-none-any.whl
Collecting requests (from folium)
  Using cached https://files.pythonhosted.org/packages/29/c1/24814557f1d22c56d50280771a17307e6bf87b70727d975fd6b2ce6b014a/requests-2.25.1-py2.py3-none-any.whl
Collecting MarkupSafe>=0.23 (from jinja2>=2.9->folium)
  Using cached https://files.pythonhosted.org/packages/98/7b/ff284bd8c80654e471b769062a9b43cc5d03e7a615048d96f4619df8d420/MarkupSafe-1.1.1-cp37-cp37m-manylinux1_x86_64.whl
Collecting chardet<5

In [19]:
import folium

map_folium = folium.Map([-23.477476,-46.610409], zoom_start=15)

tile = folium.TileLayer('cartodbpositron').add_to(map_folium)


counter = len(coords_shapes)

for coord in coords_shapes:
    counter = counter - 1
    print(counter)
    icon = folium.features.CustomIcon('/home/hadoop/green-point.png', icon_size=(10,10))
    folium.Marker(coord,icon=icon).add_to(map_folium)
    
    
for coord in coords_stops:
    counter = counter - 1
    print(counter)
    icon = folium.features.CustomIcon('/home/hadoop/red-point.png', icon_size=(10,10))
    folium.Marker(coord,icon=icon).add_to(map_folium)

    
map_folium.save("/tmp/visualizing-shapes-and-stops.html")


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

259
258
257
256
255
254
253
252
251
250
249
248
247
246
245
244
243
242
241
240
239
238
237
236
235
234
233
232
231
230
229
228
227
226
225
224
223
222
221
220
219
218
217
216
215
214
213
212
211
210
209
208
207
206
205
204
203
202
201
200
199
198
197
196
195
194
193
192
191
190
189
188
187
186
185
184
183
182
181
180
179
178
177
176
175
174
173
172
171
170
169
168
167
166
165
164
163
162
161
160
159
158
157
156
155
154
153
152
151
150
149
148
147
146
145
144
143
142
141
140
139
138
137
136
135
134
133
132
131
130
129
128
127
126
125
124
123
122
121
120
119
118
117
116
115
114
113
112
111
110
109
108
107
106
105
104
103
102
101
100
99
98
97
96
95
94
93
92
91
90
89
88
87
86
85
84
83
82
81
80
79
78
77
76
75
74
73
72
71
70
69
68
67
66
65
64
63
62
61
60
59
58
57
56
55
54
53
52
51
50
49
48
47
46
45
44
43
42
41
40
39
38
37
36
35
34
33
32
31
30
29
28
27
26
25
24
23
22
21
20
19
18
17
16
15
14
13
12
11
10
9
8
7
6
5
4
3
2
1
0
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20

In [None]:
# Some Notes
# 1- Juntar arquivo MO e , no final dt_avl,id_avl,x,y,line_id,route_id (rota/direcao)
# 2- Juntar arquivos routes, stops, trips, stop_times
# 3- Cruzar dois arquivos --> dt_avl,id_avl,x,y,line_id,route_id, trip_id, stop_id, stop_sequence, stopX, direction
# 4- Computar distancia entre Lat e Long do StopX,StopY.
# 5- Fazer marcacao do ponto mais proximo


In [None]:
# # Some Notes

# - Linhas circulares
# - Andando para trás
# - O onibus fica emitindo registros varias vezes a partirdo terminal de saída



In [20]:
# Some Notes
# # Como calcular velocidade
# 1- Dada uma viagem (ordenacao de timestamp), pega o último registro, com o ultimo StopId da sequencia
# 2- Para cada registro consecutivos, calcular distancia entre pontos, calcular delta tempo. Vmedia = distancia/delta tempo = velocidade média escalar
# 2a - Filtro de velocidade (onibus com velocidade muito alta ou muito baixao)
# 3- A velocidade média dauqela viagem é a média de todas as velocidades escalares
# 4- Velocidades serão agrupadas
#     Média por região
#     Média por dia
    
    
#     pq nao calcular --> VmViagem = (ponto final - ponto inicial)/(Tfinal - Tinicial) (validar ideia grosso)
    
    
    



VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Some Notes

# Como calcular as métricas de conectividade
# 1- Particionar os dados por Dia / hora /min /região --> 08/19/36/pompeia
# 2- Computar distância para registros de cada partição.
#     Se distancia < 100m, criar conectividade
# Será montado um grafo com pesos
# quais regioes há mais conexoes
# O numero de conexoes será o numero de contatos do onibus

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…