In [None]:
# List the `*_veiculos.json` files present in the input data directory.
!ls ../../../datascience/data/urbs/2018-11/19-23/*_veiculos.json

### **INIT SPARK CONTEXT AND SET CONFIGURATIONS**

In [None]:
# import findspark
# findspark.init()
import pixiedust
import pyspark
import random
from datetime import datetime
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from sklearn.cluster import DBSCAN
import numpy as np 

# Import `pyplot` 
import matplotlib.pyplot as plt

# Use the `ggplot` look for every matplotlib figure in this notebook.
plt.style.use("ggplot")
# Turn on pixiedust's Spark job monitor.
pixiedust.enableJobMonitor()

# The notebook imports only the `pyspark` package, not the individual classes,
# so `SparkConf`, `SparkContext` and `SQLContext` are reached through it.
# Local master using every core; memory limits are sized for a large driver.
conf = (
    pyspark.SparkConf()
    .setAppName("App")
    .setMaster('local[*]')
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '30G')
    .set('spark.driver.maxResultSize', '10G')
)

# Reuse a running context if there is one, otherwise start it with `conf`.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlContext = pyspark.sql.SQLContext(sc)

#### **LOAD DATA FILES**

In [None]:
# Directory (relative to the notebook) that holds the input JSON files.
path='../../../datascience/data/urbs/2018-11/19-23/'

# One DataFrame over every `*_veiculos.json` file found in `path`.
position_events = sqlContext.read.json(path+'*_veiculos.json')


def _dthr_to_date(dthr):
    """Turn a 'dd/mm/YYYY HH:MM:SS' string into a `datetime.date`.

    A null input is passed through as None so that one missing timestamp does
    not abort the whole Spark job; a malformed string still raises ValueError.
    """
    if dthr is None:
        return None
    # `.date()` makes the value match the declared DateType explicitly.
    return datetime.strptime(dthr, '%d/%m/%Y %H:%M:%S').date()


toDateTime = udf(_dthr_to_date, DateType())

# `DATA` carries only the calendar day of the `DTHR` event timestamp.
position_events = position_events.withColumn("DATA", toDateTime(col('DTHR')))

In [None]:
# Inspect the position events, including the derived `DATA` column.
display(position_events)

In [None]:
# Read `2018_11_23_linhas.json` from `path`; later cells use its `COD` and
# `NOME` columns.
lines = sqlContext.read.json(path+'2018_11_23_linhas.json')

In [None]:
# Inspect the bus-line DataFrame loaded above.
display(lines)

#### **JOIN VEHICLE TRACKER DATA WITH BUS LINES**

In [None]:
# Left outer join keeps every row of `lines`: a line whose `COD` matches no
# `COD_LINHA` in `position_events` still appears once, with all event columns
# (`COD_LINHA`, `DTHR`, `DATA`, ...) set to null.
full_dataset = lines.join(position_events, position_events.COD_LINHA == lines.COD, "left_outer")

In [None]:
# Inspect the joined lines + position events dataset.
display(full_dataset)

---

#### **HIP I - % de linhas que são rastreadas**

In [None]:
# A line is identified by its code (`COD`), the same key used to join lines
# with position events, so both counts are taken over codes. Counting distinct
# names on one side and distinct codes on the other is not comparable.
codigos_linhas = lines.select("COD").distinct()
qtd_linhas_existentes = codigos_linhas.count()

# A line is "tracked" when its code shows up in at least one position event.
# The semi join keeps only catalogued codes, so null or unknown `COD_LINHA`
# values cannot inflate the numerator above the denominator.
qtd_linhas_rastreadas = codigos_linhas.join(
    position_events.select("COD_LINHA").distinct(),
    col("COD") == col("COD_LINHA"),
    "left_semi",
).count()

# Guard against an empty catalogue instead of raising ZeroDivisionError.
pct_linhas_rastreadas = (
    (qtd_linhas_rastreadas / qtd_linhas_existentes) * 100
    if qtd_linhas_existentes
    else 0.0
)

print("Foram encontradas {0} linhas das quais {1} linhas estão posicionando eventos. \n Somente {2:.0f}% das linhas são rastreadas?".format(qtd_linhas_existentes,qtd_linhas_rastreadas,pct_linhas_rastreadas))

---

#### **HIP II - Qualidade dos rastreadores**

In [None]:
# Number of position events per line code, brought to the driver as pandas.
df_registros = (
    position_events
    .groupBy("COD_LINHA")
    .count()
    .toPandas()
)

In [None]:
# Summary statistics of the per-line event counts, one printed line each.
contagens = df_registros['count']
for mensagem, valor in (
    ("Nº médio de eventos posicionados {0:.0f}", contagens.mean()),
    ("Nº mediano de eventos posicionados {0:.0f}", contagens.median()),
    ("Nº máximo de eventos posicionados {0:.0f}", contagens.max()),
    ("Nº mínimo de eventos posicionados {0:.0f}", contagens.min()),
):
    print(mensagem.format(valor))

In [None]:
# Row count per (line code, day) over the joined dataset, ordered by line and
# then day, collected as a pandas DataFrame.
position_events_df = (
    full_dataset
    .select("COD_LINHA", "DATA")
    .groupBy("COD_LINHA", "DATA")
    .count()
    .orderBy(col("COD_LINHA").asc(), col("DATA").asc())
    .toPandas()
)

In [None]:
# Row count per day over the joined dataset, oldest day first, as pandas.
registros_por_data = (
    full_dataset
    .select("COD_LINHA", "DATA")
    .groupBy("DATA")
    .count()
    .orderBy(col("DATA").asc())
    .toPandas()
)

In [None]:
# Render the per-day counts as this cell's output.
registros_por_data

In [None]:
# Number of position events per catalogued line.
# `full_dataset` is a left outer join, so a line without events still has one
# row (with null event columns). Counting rows would report 1 for it; counting
# the non-null `COD_LINHA` values reports 0, which is what makes the lines
# without tracking data visible. The result column keeps the name `count`.
registros_por_linha = (
    full_dataset
    .select("COD", "NOME", "COD_LINHA")
    .groupBy("COD", "NOME")
    .agg(pyspark.sql.functions.count("COD_LINHA").alias("count"))
    .sort(col("COD").asc())
)

#### Linhas sem dados de rastreamento

In [None]:
# Inspect the per-line record counts computed in the previous cell.
display(registros_por_linha)

In [None]:
# First five position events of line 010, ordered by `DTHR`.
# The limit is applied in Spark, so only five rows reach the driver instead of
# the whole filtered dataset.
# NOTE(review): `DTHR` is a day-first string, so this ordering is lexicographic
# and is chronological only while all rows share the same month and year.
(
    full_dataset
    .select("COD_LINHA","DTHR","DATA","LAT","LON")
    .filter("COD_LINHA == '010'")
    .sort(col("DTHR").asc())
    .limit(5)
    .toPandas()
)

In [None]:
# Distinct vehicles (`VEIC`) seen per line and day for lines 010 and 011:
# `distinct()` leaves one row per (line, name, day, vehicle), which the
# group-by then counts. The 20-row cut is done in Spark so the driver does not
# have to collect the full result first.
(
    full_dataset
    .select("COD_LINHA","NOME","DATA","VEIC")
    .filter("COD_LINHA == '010' or COD_LINHA=='011'")
    .distinct()
    .groupBy("COD_LINHA","NOME","DATA")
    .count()
    .sort(col("COD_LINHA"), col("DATA").asc())
    .limit(20)
    .toPandas()
)

### **BUNCHING BUS DETECTION - POC**

In [None]:
# Events of line 010 on 2018-11-21, ordered by the `DTHR` string.
linha_010 = (
    full_dataset
    .where("COD_LINHA == '010' and DATA='2018-11-21'")
    .orderBy(col("DTHR").asc())
)

In [None]:
# Inspect the line-010 events selected for the bunching proof of concept.
display(linha_010)

In [None]:
from datetime import datetime

def create_key(row):
    """Build the grouping key '<COD_LINHA>-<DATA>-<hour>-<minute>-<bucket>'.

    The event timestamp `row['DTHR']` ('dd/mm/YYYY HH:MM:SS') is parsed once
    and its seconds are mapped to a bucket label:
    seconds 0-20 -> '020', 21-40 -> '040', 41-59 -> '060'.
    Hour and minute are written without zero padding.

    Parameters
    ----------
    row : mapping-like
        Must provide 'DTHR' (str), 'COD_LINHA' (str) and 'DATA'
        (anything `str()` can render).

    Returns
    -------
    str
        The key, e.g. '010-2018-11-21-12-46-020'.

    Raises
    ------
    ValueError
        If `DTHR` does not match the expected format.
    TypeError
        If `DTHR` or `COD_LINHA` is not a string.
    """
    # Parse once; the same timestamp feeds hour, minute and second.
    moment = datetime.strptime(row['DTHR'], '%d/%m/%Y %H:%M:%S')

    if moment.second <= 20:
        bucket = '020'
    elif moment.second <= 40:
        bucket = '040'
    else:
        bucket = '060'

    return '-'.join((
        row['COD_LINHA'],
        str(row['DATA']),
        str(moment.hour),
        str(moment.minute),
        bucket,
    ))

# Pair every row with its bucket key, giving an RDD of (key, row) tuples.
linha_010_kv = linha_010.rdd.keyBy(create_key)

In [None]:
# Number of rows that share each bucket key; peek at ten of the results.
a = (
    linha_010_kv
    .mapValues(lambda _row: 1)
    .reduceByKey(lambda left, right: left + right)
)
a.take(10)

In [None]:
from pyspark.sql.types import Row

def f(x):
    """Return a dict mapping each position of `x`, as a string, to its value.

    Example: f(('a', 'b')) -> {'0': 'a', '1': 'b'}.
    """
    return {str(position): value for position, value in enumerate(x)}

# Rows of a single bucket, rebuilt as Rows whose field names are the column
# positions ('0', '1', ...), then turned back into a DataFrame for display.
df = (
    linha_010_kv
    .filter(lambda par: par[0] == '010-2018-11-21-12-46-020')
    .values()
    .map(lambda registro: Row(**f(registro)))
    .toDF()
)
display(df)

In [None]:
# Rebinds `df` to the filtered RDD of (key, row) pairs for that one bucket
# (no conversion to a DataFrame here).
df = linha_010_kv.filter(lambda x: x[0] == '010-2018-11-21-12-46-020')

In [None]:
# Peek at the first three (key, row) pairs of the selected bucket.
df.take(3)

In [None]:
# Vehicle, timestamp and coordinates of every event in the selected bucket.
display(
    df
    .values()
    .map(lambda evento: (evento['VEIC'], evento['DTHR'], evento['LAT'], evento['LON']))
    .toDF()
)

In [None]:
# Shut down the SparkContext; cells above must be re-run from the setup cell
# before any further Spark work.
sc.stop()