In [None]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Spark configuration with speculative execution turned off.
# NOTE(review): getOrCreate() presumably ignores `conf` when a SparkContext is
# already running (as on a managed notebook cluster) — confirm that the
# setting actually takes effect in this environment.
spark_conf = SparkConf().set("spark.speculation", "false")
sc = SparkContext.getOrCreate(conf=spark_conf)

# Obtain the session explicitly instead of relying on a `spark` global that the
# kernel may or may not have injected; getOrCreate() reuses a running session.
spark = SparkSession.builder.getOrCreate()

hadoop_conf = spark._jsc.hadoopConfiguration()
# Handle the s3a:// scheme with the S3A file system implementation
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# Use version 2 of the file output committer algorithm
hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")

In [None]:
# Selecting random working days and weekend days for analysis

import numpy as np

# Day 12 is always part of the selection (the original notes count it as the holiday)
HOLIDAY = 12

# Final selection: 4 days = 1 holiday + 1 weekend day + 2 work days
days = {HOLIDAY}
weekends = [3,10,17,24,31,4,11,18,25]

# 1 weekend day
random_weekend_choice = np.random.choice(weekends, 1)
days.add(int(random_weekend_choice[0]))  # plain int keeps the set free of numpy scalars

work_days = set(range(1, 32))

# The holiday is already selected, so it is removed from the work-day pool;
# otherwise drawing it again would silently shrink the selection.
valid_days = work_days - set(weekends) - {HOLIDAY}

# 2 work days; replace=False guarantees the two draws are different days
random_work_days = np.random.choice(sorted(valid_days), 2, replace=False)
days.add(int(random_work_days[0]))
days.add(int(random_work_days[1]))
days

In [None]:
# getting day's files
# The day is zero-padded so that single-digit days produce a two-digit suffix
# (day 3 -> "MO_151003" instead of "MO_15103").
# NOTE(review): assumes the trace folders follow a "MO_1510DD" layout — confirm against the bucket.
filenames = ["MO_1510{:02d}".format(int(day)) for day in days]

In [None]:
# Install, in order, the packages this notebook session needs
required_packages = ("matplotlib", "descartes", "shapely", "geopandas")
for package_name in required_packages:
    sc.install_pypi_package(package_name)

In [None]:
# libs
import geopandas as gpd
from shapely.geometry import Point, Polygon
from pyspark.sql.functions import *

# Location of the shapefile loaded into `sp_shape` (municipal districts, per its file name)
sp_shape_path = 's3://mobility-traces-sp/aux-files/shape-sp/DISTRITO_MUNICIPAL_SP_SMDUPolygon.shp'
sp_shape = gpd.read_file(sp_shape_path)

In [None]:
# Sao Paulo region names, grouped under the zone key they are assigned to.
# Key order and list order are preserved because later cells iterate over them.
regions = dict(
    centro=[
        "ALTO DE PINHEIROS", "BARRA FUNDA", "BELA VISTA", "BELEM", "BOM RETIRO",
        "BRAS", "CAMBUCI", "CONSOLACAO", "JARDIM PAULISTA", "LAPA", "LIBERDADE",
        "MOEMA", "MOOCA", "PARI", "PERDIZES", "PINHEIROS", "REPUBLICA",
        "SANTA CECILIA", "SAUDE", "SE", "VILA LEOPOLDINA", "VILA MARIANA",
    ],
    sul=[
        "CAMPO BELO", "CAMPO GRANDE", "CIDADE ADEMAR", "CIDADE DUTRA", "GRAJAU",
        "JABAQUARA", "MARSILAC", "PARELHEIROS", "PEDREIRA", "SOCORRO",
    ],
    noroeste=[
        "SAO DOMINGOS", "ANHANGUERA", "BRASILANDIA", "CACHOEIRINHA",
        "FREGUESIA DO O", "JAGUARA", "JARAGUA", "LIMAO", "PERUS", "PIRITUBA",
    ],
    leste=[
        "ARICANDUVA", "ARTUR ALVIM", "CARRAO", "CIDADE LIDER", "CIDADE TIRADENTES",
        "GUAIANASES", "IGUATEMI", "ITAQUERA", "JOSE BONIFACIO", "PARQUE DO CARMO",
        "SAO RAFAEL", "VILA FORMOSA", "VILA MATILDE", "SAO MATEUS",
    ],
    oeste=[
        "BUTANTA", "CAMPO LIMPO", "JAGUARE", "MORUMBI", "RAPOSO TAVARES",
        "RIO PEQUENO", "VILA ANDRADE", "VILA SONIA", "AGUA RASA",
    ],
    sudoeste=[
        "SANTO AMARO", "CAPAO REDONDO", "JARDIM ANGELA", "JARDIM SAO LUIS",
        "ITAIM BIBI",
    ],
    sudeste=[
        "CURSINO", "IPIRANGA", "SACOMA", "SAO LUCAS", "SAPOPEMBA", "VILA PRUDENTE",
    ],
    nordeste=[
        "TATUAPE", "CANGAIBA", "ERMELINO MATARAZZO", "ITAIM PAULISTA",
        "JARDIM HELENA", "LAJEADO", "PENHA", "PONTE RASA", "SAO MIGUEL",
        "VILA CURUCA", "VILA JACUI",
    ],
    norte=[
        "CASA VERDE", "JACANA", "MANDAQUI", "SANTANA", "TREMEMBE", "TUCURUVI",
        "VILA GUILHERME", "VILA MARIA", "VILA MEDEIROS",
    ],
)

In [None]:
# Names listed in the shapefile's "Nome" column, plus an empty accumulator
# for the configured regions that turn out to be missing from it
regions_shape = [shape_name for shape_name in sp_shape["Nome"]]
not_found = list()

In [None]:
# Collect every configured region that does not appear in the shapefile.
# The list is rebuilt from scratch on each run, so re-executing this cell
# never accumulates duplicate entries.
known_shape_names = set(regions_shape)  # set membership instead of repeated list scans
not_found = [
    region
    for zone_regions in regions.values()
    for region in zone_regions
    if region not in known_shape_names
]
not_found

In [None]:
# selecting random Sao Paulo Regions: 2 distinct regions from every zone
import numpy as np
random_regions = set()
for key in regions.keys():
    zone_regions = regions[key]
    # Never request more regions than the zone holds
    sample_size = min(2, len(zone_regions))
    # replace=False: without it the same index can be drawn twice and the zone
    # would contribute a single region instead of two
    choices = np.random.choice(len(zone_regions), sample_size, replace=False)
    # Index back into the list so the set holds plain Python strings
    random_regions.update(zone_regions[int(index)] for index in choices)


In [None]:
# Number of distinct regions selected across all zones
len(random_regions)

In [None]:
# Selecting some random regions and day to analyze in the map
from pyspark.sql import functions as F
def in_region(region):
    """Return True when `region` is one of the randomly selected regions."""
    return region in random_regions

from pyspark.sql.types import *
# Kept so that any other cell relying on this name keeps working; the loop
# below filters with the native `isin` expression instead of this Python UDF.
in_region_udf = F.udf(in_region, BooleanType())

# Input traces are read per day folder; output is written per day / region / bus
INPUT_ROOT = "s3a://mobility-traces-sp/processed-data/using-server-hour/records-between-6-23-only-sp-server-hour/"
OUTPUT_ROOT = "s3://mobility-traces-sp/processed-data/using-server-hour/exploring-data-on-map-only-sp-6-23-server-hour/"
BUSES_PER_REGION = 2
OUTPUT_PARTITIONS = 20

selected_regions = sorted(random_regions)

for file in filenames:
    traces = spark.read.parquet(INPUT_ROOT + file + "/")

    # Keep only the traces that fall in one of the selected regions
    new_traces_in_regions = traces.filter(F.col("region").isin(selected_regions))

    for region in selected_regions:
        traces_region = new_traces_in_regions.filter(F.col("region") == region)

        # Pick up to BUSES_PER_REGION *distinct* bus ids. Sampling raw trace rows
        # could return the same bus twice (the second write would then hit an
        # already existing path) or fewer than two rows (IndexError).
        buses = (
            traces_region.select("id_avl")
            .distinct()
            .orderBy(F.rand(seed=0))
            .limit(BUSES_PER_REGION)
            .collect()
        )

        # Iterating over what was actually found tolerates regions with 0 or 1 bus
        for bus in buses:
            bus_id = bus["id_avl"]
            traces_bus = traces_region.filter(F.col("id_avl") == bus_id)
            # A plain repartition() after sort() would shuffle the rows and discard
            # the ordering; range-partitioning on the timestamp and sorting inside
            # each partition keeps the output files ordered by dt_server.
            (
                traces_bus
                .repartitionByRange(OUTPUT_PARTITIONS, "dt_server")
                .sortWithinPartitions("dt_server")
                .write.parquet(OUTPUT_ROOT + file + "/" + region + "/" + str(bus_id) + "/")
            )
        