In [None]:
# Descrição
# Este notebook verifica se há campos com valores nulos ou vazios nos dados não processados csv e 
# salva as estatísticas em formato csv.
#
# Description
# This notebook checks whether the raw CSV data contains null, NaN, or empty fields, then stores the results in a CSV file.

In [None]:
# Spark Config
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Speculative execution is turned off for this job
# (presumably so duplicate task attempts never write the same output twice -- TODO confirm).
spark_conf = SparkConf().set("spark.speculation", "false")
sc = SparkContext.getOrCreate(conf=spark_conf)

# Bind `spark` explicitly instead of relying on the kernel to pre-inject it.
# getOrCreate() returns the already-running session when one exists, so this is
# a no-op on kernels that predefine `spark` and makes the notebook runnable on
# a fresh kernel otherwise.
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

# Hadoop settings: register the S3A filesystem implementation for "s3a://" paths
# and select version 2 of the file output committer algorithm.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")

In [None]:
from pyspark.sql.functions import col
import numpy as np

# Schema applied when reading the raw CSV files: eight columns, declared in
# file order. Every field keeps the default nullable=True.
custom_schema = (
    StructType()
    .add("dt_server", StringType())
    .add("dt_avl", StringType())
    .add("line_id", IntegerType())
    .add("latitude", DoubleType())
    .add("longitude", DoubleType())
    .add("id_avl", IntegerType())
    .add("event", IntegerType())
    .add("id_point", IntegerType())
)

In [None]:
# Names of the files that contain null/NaN/empty fields. The list starts with
# the placeholder entry "Nenhum", so the DataFrame written below is never empty.
file_with_nulls = ["Nenhum"]

# October: days 1 to 31
for day in range(1, 32):
    # NOTE(review): the day is not zero-padded, so day 1 maps to "MO_15101" --
    # confirm this matches the object names in the bucket.
    filename = "MO_1510" + str(day)

    # Read the raw data file (no header row) with the explicit schema.
    traces = spark.read.csv(
        "s3a://mobility-traces-sp/raw-data/" + filename + ".csv",
        header="false",
        schema=custom_schema,
    )

    # For each column of the file just read, 1 if at least one row is
    # null/NaN/empty, else 0. limit(1) stops at the first match, so count()
    # never exceeds 1. The columns come from `traces` itself; the original
    # referenced an undefined `df`.
    nulls = [
        traces.where(
            col(column).isNull() | isnan(col(column)) | (col(column) == "")
        ).limit(1).count()
        for column in traces.columns
    ]

    # Number of columns that have at least one null/NaN/empty value.
    num_nulls = np.sum(nulls)

    # Record the file when any column is affected.
    if num_nulls > 0:
        file_with_nulls.append(filename)

    # Save the accumulated (partial) result list under the current file name,
    # so each day's output holds everything found up to and including that day.
    # NOTE(review): input is read via "s3a://" but output is written via "s3://";
    # confirm the "s3" scheme is configured in the target environment.
    output = spark.createDataFrame([file_with_nulls])
    output.write.csv("s3://mobility-traces-sp/statistics/null-values/" + filename)

In [None]:
# Resultados
# Não há nenhum arquivo cujos campos tenham valores nulos ou vazios. 
# No futuro, talvez algum tratamento seja necessário para cada coluna ou tipo de dado para calculo das métricas.
#
# Results
# There is no file with null/empty/NaN fields.
# Maybe in the future, some treatment will be required for each column or data type to calculate the metrics.
