In [None]:
!ls ../../../datascience/data/urbs/2018-11/19-23/*_veiculos.json

### **INIT SPARK CONTEXT AND SET CONFIGURATIONS**

In [1]:
# import findspark
# findspark.init()
import pixiedust
import pyspark
import random
from datetime import datetime
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from sklearn.cluster import DBSCAN
import numpy as np 

# Import `pyplot` 
import matplotlib.pyplot as plt

# Set the style to `ggplot`
plt.style.use("ggplot")
pixiedust.enableJobMonitor()

conf = SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '4G')
        .set('spark.driver.memory', '30G')
        .set('spark.driver.maxResultSize', '10G'))

sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

Pixiedust database opened successfully


Successfully enabled Spark Job Progress Monitor


#### **LOAD DATA FILES**

In [2]:
path='../../../datascience/data/urbs/2018-11/19-23/'

position_events = sqlContext.read.json(path+'*_veiculos.json')

toDateTime =  udf(lambda x: datetime.strptime(x, '%d/%m/%Y %H:%M:%S'), DateType())

position_events = position_events.withColumn("DATA", toDateTime(col('DTHR')))

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
display(position_events)

COD_LINHA,DTHR,LAT,LON,VEIC,DATA
652,21/11/2018 11:06:46,-25.52336,-49.333561,LA006,2018-11-21 00:00:00
652,21/11/2018 11:05:16,-25.517328,-49.32638,LA006,2018-11-21 00:00:00
652,21/11/2018 11:00:48,-25.511756,-49.325066,LA006,2018-11-21 00:00:00
652,21/11/2018 10:50:06,-25.519795,-49.326856,LA006,2018-11-21 00:00:00
652,21/11/2018 10:46:44,-25.53289,-49.32993,LA006,2018-11-21 00:00:00


In [4]:
lines = sqlContext.read.json(path+'2018_11_23_linhas.json')

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
display(lines)

CATEGORIA_SERVICO,COD,NOME,NOME_COR,SOMENTE_CARTAO
CONVENCIONAL,464,A. MUNHOZ / J. BOTANICO,AMARELA,S
ALIMENTADOR,226,ABAETE,LARANJA,N
TRONCAL,182,ABRANCHES,AMARELA,N
ALIMENTADOR,332,ACROPOLE,LARANJA,N
ALIMENTADOR,334,AGRICOLA,LARANJA,N


#### **JOIN VEHICLE TRACKER DATA WITH BUS LINES**

In [6]:
full_dataset = lines.join(position_events, position_events.COD_LINHA == lines.COD, "left_outer")

0,1,2
▸,:,


In [7]:
display(full_dataset)

CATEGORIA_SERVICO,COD,NOME,NOME_COR,SOMENTE_CARTAO,COD_LINHA,DTHR,LAT,LON,VEIC,DATA
ALIMENTADOR,829,UNIV.POSITIVO,LARANJA,N,829,22/11/2018 22:45:19,-25.445413,-49.354233,BA011,2018-11-22 00:00:00
ALIMENTADOR,829,UNIV.POSITIVO,LARANJA,N,829,22/11/2018 22:44:04,-25.450261,-49.35387,BA011,2018-11-22 00:00:00
ALIMENTADOR,829,UNIV.POSITIVO,LARANJA,N,829,22/11/2018 22:31:30,-25.441608,-49.346521,BA011,2018-11-22 00:00:00
ALIMENTADOR,829,UNIV.POSITIVO,LARANJA,N,829,22/11/2018 22:25:07,-25.441515,-49.347606,BA011,2018-11-22 00:00:00
ALIMENTADOR,829,UNIV.POSITIVO,LARANJA,N,829,22/11/2018 22:24:35,-25.441795,-49.350026,BA011,2018-11-22 00:00:00


---

#### **HIP I - % de linhas que são rastreadas**

In [6]:
qtd_linhas_existentes = lines.select("NOME").distinct().count()
qtd_linhas_rastreadas = position_events.select("COD_LINHA").distinct().count()
pct_linhas_rastreadas = (qtd_linhas_rastreadas/qtd_linhas_existentes)*100

print("Foram encontradas {0} linhas das quais {1} linhas estão posicionando eventos. \n Somente {2:.0f}% das linhas são rastreadas?".format(qtd_linhas_existentes,qtd_linhas_rastreadas,pct_linhas_rastreadas))

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Foram encontradas 309 linhas das quais 125 linhas estão posicionando eventos. 
 Somente 40% das linhas são rastreadas?


<IPython.core.display.Javascript object>

---

#### **HIP II - Qualidade dos rastreadores**

In [9]:
df_registros = position_events    \
    .groupBy("COD_LINHA").count() \
    .toPandas()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
print("Nº médio de eventos posicionados {0:.0f}".format(df_registros['count'].mean()))
print("Nº mediano de eventos posicionados {0:.0f}".format(df_registros['count'].median()))
print("Nº máximo de eventos posicionados {0:.0f}".format(df_registros['count'].max()))
print("Nº mínimo de eventos posicionados {0:.0f}".format(df_registros['count'].min()))

0,1,2
▸,:,


Nº médio de eventos posicionados 44769
Nº mediano de eventos posicionados 37534
Nº máximo de eventos posicionados 186714
Nº mínimo de eventos posicionados 8


In [11]:
position_events_df = \
        full_dataset.select("COD_LINHA","DATA")   \
        .groupBy("COD_LINHA","DATA").count()      \
        .sort(col("COD_LINHA"),col("DATA").asc()) \
        .toPandas()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
registros_por_data = \
        full_dataset.select("COD_LINHA","DATA") \
        .groupBy("DATA").count()                \
        .sort(col("DATA").asc())                \
        .toPandas()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
registros_por_data

0,1,2
▸,:,


Unnamed: 0,DATA,count
0,,192
1,2018-11-19,135868
2,2018-11-20,753584
3,2018-11-21,1229876
4,2018-11-22,1325769
5,2018-11-23,1416010
6,2018-11-24,714819


In [14]:
registros_por_linha = \
            full_dataset.select("COD","NOME") \
            .groupBy("COD","NOME").count()    \
            .sort(col("COD").asc())           

0,1,2
▸,:,


#### Linhas sem dados de rastreamento

In [15]:
display(registros_por_linha)

COD,NOME,count
2,CIRCULAR CENTRO (ANTI-HORARIO),1
10,INTERBAIRROS I (HORARIO),73903
30,INTERBAIRROS III,184125
166,V. NORI,61466
168,RAPOSO TAVARES,70213
169,JD. KOSMOS,45029
171,PRIMAVERA,47229
180,AGUA VERDE/ ABRANCHES,1
182,ABRANCHES,42927
183,JD. CHAPARRAL,23963


In [16]:
full_dataset.select("COD_LINHA","DTHR","DATA","LAT","LON") \
            .filter("COD_LINHA == '010'")                  \
            .sort(col("DTHR").asc())                       \
            .toPandas().head()

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,COD_LINHA,DTHR,DATA,LAT,LON
0,10,21/11/2018 05:29:40,2018-11-21,-25.415033,-49.28065
1,10,21/11/2018 05:29:59,2018-11-21,-25.414886,-49.280471
2,10,21/11/2018 05:30:00,2018-11-21,-25.415003,-49.281313
3,10,21/11/2018 05:30:01,2018-11-21,-25.414798,-49.280315
4,10,21/11/2018 05:30:05,2018-11-21,-25.414775,-49.280003


<IPython.core.display.Javascript object>

In [17]:
full_dataset.select("COD_LINHA","NOME","DATA","VEIC")                \
        .filter("COD_LINHA == '010' or COD_LINHA=='011'").distinct() \
        .groupBy("COD_LINHA","NOME","DATA").count()                  \
        .sort(col("COD_LINHA"), col("DATA").asc())  \
        .toPandas().head(20)

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,COD_LINHA,NOME,DATA,count
0,10,INTERBAIRROS I (HORARIO),2018-11-21,6
1,10,INTERBAIRROS I (HORARIO),2018-11-22,6
2,10,INTERBAIRROS I (HORARIO),2018-11-23,6
3,10,INTERBAIRROS I (HORARIO),2018-11-24,3
4,11,INTERBAIRROS I (ANTI-HORARIO),2018-11-21,4
5,11,INTERBAIRROS I (ANTI-HORARIO),2018-11-22,5
6,11,INTERBAIRROS I (ANTI-HORARIO),2018-11-23,4
7,11,INTERBAIRROS I (ANTI-HORARIO),2018-11-24,3


<IPython.core.display.Javascript object>

### **BUNCHING BUS DETECTION - POC**

In [18]:
linha_010 = full_dataset                                    \
        .filter("COD_LINHA == '010' and DATA='2018-11-21'") \
        .sort(col("DTHR").asc())

0,1,2
▸,:,


In [19]:
display(linha_010)

CATEGORIA_SERVICO,COD,NOME,NOME_COR,SOMENTE_CARTAO,COD_LINHA,DTHR,LAT,LON,VEIC,DATA
INTERBAIRROS,10,INTERBAIRROS I (HORARIO),VERDE,N,10,21/11/2018 05:31:34,-25.414145,-49.281243,BB309,2018-11-21 00:00:00
INTERBAIRROS,10,INTERBAIRROS I (HORARIO),VERDE,N,10,21/11/2018 05:52:42,-25.434178,-49.261743,BB309,2018-11-21 00:00:00
INTERBAIRROS,10,INTERBAIRROS I (HORARIO),VERDE,N,10,21/11/2018 05:57:05,-25.443635,-49.25302,BB309,2018-11-21 00:00:00
INTERBAIRROS,10,INTERBAIRROS I (HORARIO),VERDE,N,10,21/11/2018 06:01:15,-25.410531,-49.275285,BB303,2018-11-21 00:00:00
INTERBAIRROS,10,INTERBAIRROS I (HORARIO),VERDE,N,10,21/11/2018 06:05:55,-25.412653,-49.26563,BB303,2018-11-21 00:00:00


In [20]:
from datetime import datetime

def create_key(row):
    
    sec = int(datetime.strptime(row['DTHR'], '%d/%m/%Y %H:%M:%S').second)
    mi =  str(datetime.strptime(row['DTHR'], '%d/%m/%Y %H:%M:%S').minute)
    hr =  str(datetime.strptime(row['DTHR'], '%d/%m/%Y %H:%M:%S').hour)
    
    partition = ''
    if(sec <= 20):
         partition = hr+'-'+mi+'-020'
    elif(sec > 20 and sec <= 40):
        partition = hr+'-'+mi+'-040'
    else:
        partition = hr+'-'+mi+'-060'
    
    key = row['COD_LINHA']+'-'+str(row['DATA'])+'-'+partition
    return key

linha_010_kv = linha_010.rdd.map(lambda x: (create_key(x), x))

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
a = linha_010_kv.map(lambda x: (x[0], 1)).reduceByKey(lambda a,b: a+b)
a.take(10)

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[('010-2018-11-21-5-39-020', 2),
 ('010-2018-11-21-8-30-040', 10),
 ('010-2018-11-21-12-4-060', 8),
 ('010-2018-11-21-13-35-060', 7),
 ('010-2018-11-21-15-37-020', 8),
 ('010-2018-11-21-18-48-040', 9),
 ('010-2018-11-21-20-15-020', 7),
 ('010-2018-11-21-20-42-020', 5),
 ('010-2018-11-21-21-50-020', 4),
 ('010-2018-11-21-6-27-020', 4)]

<IPython.core.display.Javascript object>

In [22]:
from pyspark.sql.types import Row

def f(x):
    d = {}
    for i in range(len(x)):
        d[str(i)] = x[i]
    return d

df = linha_010_kv.filter(lambda x: x[0] == '010-2018-11-21-12-46-020').map(lambda x: Row(**f(x[1]))).toDF()
display(df)

0,1,10,2,3,4,5,6,7,8,9
INTERBAIRROS,10,2018-11-21 00:00:00,INTERBAIRROS I (HORARIO),VERDE,N,10,21/11/2018 12:46:04,-25.414698,-49.260141,BB309
INTERBAIRROS,10,2018-11-21 00:00:00,INTERBAIRROS I (HORARIO),VERDE,N,10,21/11/2018 12:46:16,-25.44529,-49.255658,BB310


In [23]:
df = linha_010_kv.filter(lambda x: x[0] == '010-2018-11-21-12-46-020')

0,1,2
▸,:,


<IPython.core.display.Javascript object>

In [24]:
df.take(3)

0,1,2
▸,:,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[('010-2018-11-21-12-46-020',
  Row(CATEGORIA_SERVICO='INTERBAIRROS', COD='010', NOME='INTERBAIRROS I (HORARIO)', NOME_COR='VERDE', SOMENTE_CARTAO='N', COD_LINHA='010', DTHR='21/11/2018 12:46:04', LAT='-25.414698', LON='-49.260141', VEIC='BB309', DATA=datetime.date(2018, 11, 21))),
 ('010-2018-11-21-12-46-020',
  Row(CATEGORIA_SERVICO='INTERBAIRROS', COD='010', NOME='INTERBAIRROS I (HORARIO)', NOME_COR='VERDE', SOMENTE_CARTAO='N', COD_LINHA='010', DTHR='21/11/2018 12:46:04', LAT='-25.444895', LON='-49.255731', VEIC='BB310', DATA=datetime.date(2018, 11, 21))),
 ('010-2018-11-21-12-46-020',
  Row(CATEGORIA_SERVICO='INTERBAIRROS', COD='010', NOME='INTERBAIRROS I (HORARIO)', NOME_COR='VERDE', SOMENTE_CARTAO='N', COD_LINHA='010', DTHR='21/11/2018 12:46:06', LAT='-25.44467', LON='-49.284456', VEIC='BB304', DATA=datetime.date(2018, 11, 21)))]

<IPython.core.display.Javascript object>

In [25]:
display(df.map(lambda row: (row[1]['VEIC'],row[1]['DTHR'],row[1]['LAT'],row[1]['LON'])).toDF())

_1,_2,_3,_4
BB309,21/11/2018 12:46:04,-25.414698,-49.260141
BB310,21/11/2018 12:46:04,-25.444895,-49.255731
BB304,21/11/2018 12:46:06,-25.44467,-49.284456
BB310,21/11/2018 12:46:11,-25.445008,-49.25579
BB301,21/11/2018 12:46:12,-25.447028,-49.256338
BB309,21/11/2018 12:46:12,-25.4153,-49.259853
BB309,21/11/2018 12:46:14,-25.415455,-49.259708
BB310,21/11/2018 12:46:16,-25.44529,-49.255658
BB301,21/11/2018 12:46:17,-25.447075,-49.25666
BB309,21/11/2018 12:46:18,-25.415745,-49.259528


In [None]:
sc.stop()