In [1]:
# All imports required by the application.
import os
import sagemaker_pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField,StringType, FloatType
from pyspark import SparkContext, SparkConf
from sagemaker_pyspark import classpath_jars
from pyspark.sql.functions import create_map, struct
from pyspark.sql.functions import exp
from pyspark.sql.functions import regexp_replace
from pyspark.sql import SQLContext
from pyspark.sql.functions import when, lit
from functools import reduce 
from pyspark.sql import DataFrame

In [2]:
# Build the JVM classpath from the jars bundled with sagemaker_pyspark.
# os.pathsep is ';' on Windows and ':' on POSIX; a hard-coded ':' produces an
# unusable classpath on Windows, where every path already contains 'C:'.
classpath = os.pathsep.join(sagemaker_pyspark.classpath_jars())

# Session builder for the "Dados Enem" application. Each config() call stores
# the option on the builder, so the calls do not need to be chained.
builder = SparkSession.builder.appName("Dados Enem")

# Output / SQL behaviour.
builder.config(
    "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
builder.config("spark.speculation", "false")
builder.config("spark.sql.parquet.compression.codec", "gzip")
builder.config("spark.debug.maxToStringFields", "100")

# Driver settings, including the sagemaker_pyspark jars on its classpath.
builder.config("spark.driver.extraClassPath", classpath)
builder.config("spark.driver.memory", "1g")
builder.config("spark.driver.cores", "1")

# Executor settings. The property is "spark.executor.memory"; the hyphenated
# spelling "executor-memory" only exists as a spark-submit flag, so the former
# "spark.executor-memory" key was not a recognised property.
# NOTE(review): with master "local[*]" the work runs inside the driver JVM, so
# the 1g driver memory is probably the effective limit - confirm the sizing.
builder.config("spark.executor.memory", "20g")
builder.config("spark.executor.cores", "4")

# Run locally, using every available core.
builder.master("local[*]")

spark = builder.getOrCreate()
spark

In [3]:
# Folders of the ENEM microdata store; all of them live under one root.
# (Folder names suggest a landed -> raw -> modeled -> self-service flow;
# confirm against the project layout.)
_ENEM_ROOT = 'C:\\BigData\\Fontes\\microdados_enem\\'

LANDED = _ENEM_ROOT + 'Landed\\'
RAW = _ENEM_ROOT + 'Raw\\'
MODELED = _ENEM_ROOT + 'Modeled\\'
SELF = _ENEM_ROOT + 'Self\\'

In [4]:
# File names of the reduced ENEM parquet datasets, one per exam year.
_REDUCED_PARQUET_NAME = 'MICRODADOS_ENEM_{year}_REDUZIDO.parquet'

ENEM_2018_PARQUET = _REDUCED_PARQUET_NAME.format(year=2018)
ENEM_2017_PARQUET = _REDUCED_PARQUET_NAME.format(year=2017)
ENEM_2016_PARQUET = _REDUCED_PARQUET_NAME.format(year=2016)

In [5]:
# SQLContext built on the SparkContext of the active session.
spark_context = spark.sparkContext
sqlContext = SQLContext(spark_context)

# Read the reduced 2016 dataset from the Raw folder and print its schema.
path_2016 = RAW + ENEM_2016_PARQUET
df_2016 = spark.read.parquet(path_2016)
df_2016.printSchema()

root
 |-- NU_ANO: integer (nullable = true)
 |-- NU_NOTA_CN: double (nullable = true)
 |-- NU_NOTA_CH: double (nullable = true)
 |-- NU_NOTA_LC: double (nullable = true)
 |-- NU_NOTA_MT: double (nullable = true)
 |-- NU_NOTA_REDACAO: integer (nullable = true)
 |-- Q001: string (nullable = true)
 |-- Q002: string (nullable = true)
 |-- Q005: integer (nullable = true)
 |-- RENDA_FAMILIAR: string (nullable = true)



In [6]:
# Read the reduced 2017 dataset from the Raw folder and print its schema.
path_2017 = RAW + ENEM_2017_PARQUET
df_2017 = spark.read.parquet(path_2017)
df_2017.printSchema()

root
 |-- NU_ANO: integer (nullable = true)
 |-- NU_NOTA_CN: double (nullable = true)
 |-- NU_NOTA_CH: double (nullable = true)
 |-- NU_NOTA_LC: double (nullable = true)
 |-- NU_NOTA_MT: double (nullable = true)
 |-- NU_NOTA_REDACAO: integer (nullable = true)
 |-- Q001: string (nullable = true)
 |-- Q002: string (nullable = true)
 |-- Q005: integer (nullable = true)
 |-- RENDA_FAMILIAR: string (nullable = true)



In [7]:
# Read the reduced 2018 dataset from the Raw folder and print its schema.
path_2018 = RAW + ENEM_2018_PARQUET
df_2018 = spark.read.parquet(path_2018)
df_2018.printSchema()

root
 |-- NU_ANO: integer (nullable = true)
 |-- NU_NOTA_CN: double (nullable = true)
 |-- NU_NOTA_CH: double (nullable = true)
 |-- NU_NOTA_LC: double (nullable = true)
 |-- NU_NOTA_MT: double (nullable = true)
 |-- NU_NOTA_REDACAO: integer (nullable = true)
 |-- Q001: string (nullable = true)
 |-- Q002: string (nullable = true)
 |-- Q005: integer (nullable = true)
 |-- RENDA_FAMILIAR: string (nullable = true)



In [8]:
def unionAll(*dfs):
    """Combine any number of DataFrames into one by unioning them in order.

    The frames are folded left to right with ``DataFrame.union``, the
    non-deprecated equivalent of ``DataFrame.unionAll``. As with SQL
    ``UNION ALL``, duplicates are kept and columns are matched by position,
    so every frame is expected to share the same column order.

    Args:
        *dfs: One or more DataFrames to stack.

    Returns:
        The union of all given frames; a single frame is returned unchanged.

    Raises:
        TypeError: If no DataFrame is passed.
    """
    if not dfs:
        # reduce() would raise a TypeError about an empty sequence here;
        # keep the exception type but say what the caller did wrong.
        raise TypeError("unionAll() requires at least one DataFrame")
    return reduce(lambda left, right: left.union(right), dfs)

# Stack the three yearly frames, oldest first, into a single DataFrame.
yearly_frames = (df_2016, df_2017, df_2018)
df_all = unionAll(*yearly_frames)

In [9]:
# Write the combined frame as parquet under the Modeled folder, replacing
# any output left there by an earlier run.
union_output_path = MODELED + 'MICRODADOS_ENEM_UNION.parquet'
df_all.write.mode("overwrite").parquet(union_output_path)