In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType, StringType

from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark import SparkContext

In [2]:
# Spark configuration for this notebook, built as a single chain:
# a local master using all cores, explicit memory limits, and two SQL options.
conf = (
    SparkConf()
    .setAppName("App")
    .setMaster('local[*]')
    .set('spark.executor.memory', '4G')
    .set('spark.driver.memory', '2G')
    .set('spark.driver.maxResultSize', '4G')
    # -1 turns automatic broadcast joins off (per the Spark SQL configuration docs).
    .set('spark.sql.autoBroadcastJoinThreshold', '-1')
    # "dynamic": an overwrite replaces only the partitions being written
    # (per the Spark SQL configuration docs).
    .set("spark.sql.sources.partitionOverwriteMode", "dynamic")
)

In [3]:
# Obtain the SparkContext via getOrCreate, passing `conf`,
# then wrap it in the SQLContext that every read in this notebook goes through.
sc = SparkContext.getOrCreate(conf)
sqlContext = SQLContext(sc)

In [38]:
# Vehicles (veiculos)

In [40]:
# Stage 1: load the raw vehicle JSON records.
veiculos_raw = sqlContext.read.json("/opt/work/data/raw/2020-05/veiculos")

# 'dthr' is parsed as dd/MM/yyyy HH:mm:ss and re-rendered as a
# "yyyy-MM-dd HH:mm:ss" string.
event_timestamp_col = F.date_format(
    F.unix_timestamp('dthr', 'dd/MM/yyyy HH:mm:ss').cast('timestamp'),
    "yyyy-MM-dd HH:mm:ss",
).alias('event_timestamp')

# Stage 2: rename/cast the columns, derive the partition columns from the
# event timestamp, remove duplicate rows and coalesce to 10 partitions.
veiculos_trusted = (
    veiculos_raw
    .select(
        F.col("COD_LINHA").alias("line_code"),
        event_timestamp_col,
        F.col("LAT").cast('double').alias("latitude"),
        F.col("LON").cast('double').alias("longitude"),
        F.col("VEIC").alias("vehicle"),
    )
    .withColumn("year", F.year("event_timestamp"))
    .withColumn("month", F.month("event_timestamp"))
    .withColumn("day", F.dayofmonth("event_timestamp"))
    .dropDuplicates()
    .coalesce(10)
)

# Stage 3: write the result as parquet partitioned by year/month/day,
# in overwrite mode.
(
    veiculos_trusted.write
    .mode('overwrite')
    .partitionBy("year", "month", "day")
    .format("parquet")
    .save("/opt/work/data/trusted/veiculos")
)

In [52]:
# Preview: pull 10 rows of the trusted vehicles dataset into pandas for display.
veiculos_trusted_sample = sqlContext.read.parquet("/opt/work/data/trusted/veiculos").limit(10)
veiculos_trusted_sample.toPandas()

Unnamed: 0,line_code,event_timestamp,latitude,longitude,vehicle,year,month,day
0,236,2020-05-04 23:39:13,-25.368045,-49.227028,BA001,2020,5,4
1,236,2020-05-04 23:12:42,-25.358168,-49.23304,BA001,2020,5,4
2,245,2020-05-04 08:13:26,-25.386028,-49.261593,BA002,2020,5,4
3,245,2020-05-04 07:21:52,-25.391033,-49.260733,BA002,2020,5,4
4,245,2020-05-04 05:45:40,-25.354481,-49.263068,BA002,2020,5,4
5,812,2020-05-04 23:14:01,-25.41862,-49.347355,BA005,2020,5,4
6,812,2020-05-04 22:16:45,-25.411798,-49.358738,BA005,2020,5,4
7,812,2020-05-04 21:58:53,-25.421536,-49.334896,BA005,2020,5,4
8,812,2020-05-04 21:50:44,-25.435991,-49.307181,BA005,2020,5,4
9,812,2020-05-04 21:45:12,-25.430713,-49.32249,BA005,2020,5,4


In [33]:
def extract(sqlContext, src):
    """Read the JSON data under ``src`` and derive date columns from file names.

    Each row is tagged with the path of the file it was read from
    (``filepath``) and with that file's base name (``filename``). The base
    name is split on ``_`` and its first three pieces become the ``year``,
    ``month`` and ``day`` columns, exactly as they appear in the name.

    Args:
        sqlContext: object whose ``read.json`` is used to load the data.
        src: path handed to ``read.json``.

    Returns:
        The loaded DataFrame with the ``filepath``, ``filename``, ``year``,
        ``month`` and ``day`` columns appended.
    """
    df = sqlContext.read.json(src).withColumn("filepath", F.input_file_name())

    # Keep whatever follows the last '/' instead of a fixed path-segment
    # index, so the file name is found no matter how deep `src` is nested.
    df = df.withColumn('filename', F.substring_index(df['filepath'], '/', -1))

    name_parts = F.split(df['filename'], '_')
    for position, column_name in enumerate(('year', 'month', 'day')):
        df = df.withColumn(column_name, name_parts.getItem(position))
    return df

In [36]:
# Lines (linhas)

In [45]:
# Count the distinct `linhas` rows once the helper path columns are dropped.
linhas_raw = extract(sqlContext, "/opt/work/data/raw/2020-05/linhas")
linhas_raw.drop("filepath", "filename").dropDuplicates().count()

1854

In [53]:
# Preview: pull 10 rows of the trusted lines dataset into pandas for display.
linhas_trusted_sample = sqlContext.read.parquet("/opt/work/data/trusted/linhas").limit(10)
linhas_trusted_sample.toPandas()

Unnamed: 0,categoria_servico,cod,nome,nome_cor,somente_cartao,year,month,day
0,ALIMENTADOR,642,GANCHINHO,LARANJA,N,2020,5,5
1,LINHA DIRETA,506,BAIRRO NOVO,PRATA,N,2020,5,5
2,ALIMENTADOR,641,LUIZ NICHELE,LARANJA,N,2020,5,5
3,MADRUGUEIRO,389,MAD. TARUM? / AUGUSTA,MADRUGUEIRO,S,2020,5,5
4,CONVENCIONAL,472,UBERABA,AMARELA,S,2020,5,5
5,CONVENCIONAL,865,JD. ESPLANADA,AMARELA,S,2020,5,5
6,ALIMENTADOR,718,PORTO BELO,LARANJA,N,2020,5,5
7,ALIMENTADOR,816,CAMP. SIQUEIRA / STA.FELICIDADE,LARANJA,N,2020,5,5
8,ALIMENTADOR,622,RONDON,LARANJA,F,2020,5,5
9,CONVENCIONAL,670,S?O JORGE,AMARELA,S,2020,5,5


In [48]:
# Count the distinct `pontoslinha` rows, ignoring the helper path columns and GRUPO.
pontoslinha_raw = extract(sqlContext, "/opt/work/data/raw/2020-05/pontoslinha")
pontoslinha_raw.drop("filepath", "filename", "GRUPO").dropDuplicates().count()

123198

In [51]:
# Count the distinct `tabelaveiculo` rows once the helper path columns are dropped.
# The trailing .count() is required: without it the cell evaluates to a
# DataFrame rather than the integer row count recorded as this cell's output.
extract(sqlContext, "/opt/work/data/raw/2020-05/tabelaveiculo").drop("filepath", "filename").dropDuplicates().count()

145596