# Criação da sessão do PySpark

In [11]:
#iniciar spark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pandas as pd

# Build the Spark configuration in a single call:
#  - spark.jars.packages: Maven coordinates of the extra packages requested for the
#    session (hadoop-aws, the MSSQL connector and spark-xml).
#  - fs.s3a.aws.credentials.provider: credentials provider class used for s3a:// paths.
conf = SparkConf().setAll([
    ('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.2.2,com.microsoft.azure:spark-mssql-connector_2.12:1.2.0,com.databricks:spark-xml_2.12:0.6.0'),
    ('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.InstanceProfileCredentialsProvider'),
])

# Reuse an already running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Lendo arquivo CSV vindo de um bucket da S3

In [12]:
# Load the raw cholera CSV: comma-delimited, first line holds the column names.
df_cholera = (
    spark.read
    .options(delimiter=',', header='true')
    .csv('s3a://andre-sprint03-sptech-bucket-bruto/cholera-cases.csv')
)

In [13]:
# Load the raw sanitation/water XML through the spark-xml data source;
# every <row> element under the <data> root becomes one DataFrame row.
df_sanitation_water = (
    spark.read
    .format("com.databricks.spark.xml")
    .options(rootTag="data", rowTag="row")
    .load('s3a://andre-sprint03-sptech-bucket-bruto/sanitation-water-global.xml')
)

                                                                                

# Tratando os datatypes do dataframe

In [14]:
# Print the schema of the cholera DataFrame as loaded from the CSV (before any casts).
df_cholera.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Number of reported cases of cholera: string (nullable = true)
 |-- Number of reported deaths from cholera: string (nullable = true)
 |-- Cholera case fatality rate: string (nullable = true)
 |-- WHO Region: string (nullable = true)



In [15]:
# Print the schema of the sanitation/water DataFrame as loaded from the XML (before any casts).
df_sanitation_water.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Country_code: string (nullable = true)
 |-- Population_practising_open_defecation_Rural: double (nullable = true)
 |-- Population_practising_open_defecation_Total: double (nullable = true)
 |-- Population_practising_open_defecation_Urban: double (nullable = true)
 |-- Population_using_at_least_basic_drinking_water_services_Rural: double (nullable = true)
 |-- Population_using_at_least_basic_drinking_water_services_Total: double (nullable = true)
 |-- Population_using_at_least_basic_drinking_water_services_Urban: double (nullable = true)
 |-- Population_using_at_least_basic_sanitation_services_Total: double (nullable = true)
 |-- Population_using_at_least_basic_sanitation_services_Urban: double (nullable = true)
 |-- Population_using_safely_managed_drinking_water_services_Rural: double (nullable = true)
 |-- Population_using_safely_managed_drinking_water_services_Total: double (nullable = true)
 |-- Population_using_safely_managed_drinkin

In [20]:
from pyspark.sql.types import IntegerType, DecimalType
from pyspark.sql.functions import col

# Convert the cholera columns to their target types and rename them.
# Each entry is (source column, target type or None to keep the type, output name).
cholera_column_specs = [
    ('Country', None, 'COUNTRY'),
    ('Year', IntegerType(), 'YEAR'),
    ('Number of reported cases of cholera', IntegerType(), 'CASES_CHOLERA'),
    ('Number of reported deaths from cholera', IntegerType(), 'DEATHS_CHOLERA'),
    ('Cholera case fatality rate', DecimalType(18, 2), 'FATALITY_RATE'),
    ('WHO Region', None, 'REGION'),
]

df_cholera = df_cholera.select(*[
    (col(source) if target_type is None else col(source).cast(target_type)).alias(output)
    for source, target_type, output in cholera_column_specs
])


In [21]:
# Print the schema again to confirm the new column names and types.
df_cholera.printSchema()

root
 |-- COUNTRY: string (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- CASES_CHOLERA: integer (nullable = true)
 |-- DEATHS_CHOLERA: integer (nullable = true)
 |-- FATALITY_RATE: decimal(18,2) (nullable = true)
 |-- REGION: string (nullable = true)



In [22]:
from pyspark.sql.types import IntegerType, DecimalType, StringType, DoubleType
from pyspark.sql.functions import col

# Change the datatypes of the sanitation/water DataFrame.
# Identifier columns keep their own explicit types; every listed metric column
# is cast to DecimalType(18, 2). The select order is: identifiers, then metrics.
sanitation_identifier_casts = [
    ('Year', IntegerType()),
    ('Region', StringType()),
    ('Country_code', StringType()),
    ('Country', StringType()),
]

sanitation_decimal_columns = [
    'Population_using_at_least_basic_drinking_water_services_Rural',
    'Population_using_at_least_basic_drinking_water_services_Total',
    'Population_using_at_least_basic_drinking_water_services_Urban',
    'Population_using_safely_managed_drinking_water_services_Rural',
    'Population_using_safely_managed_drinking_water_services_Total',
    'Population_using_safely_managed_drinking_water_services_Urban',
    'Population_using_at_least_basic_sanitation_services_Total',
    'Population_using_at_least_basic_sanitation_services_Urban',
    'Population_using_safely_managed_sanitation_services_Rural',
    'Population_using_safely_managed_sanitation_services_Total',
    'Population_using_safely_managed_sanitation_services_Urban',
    'Population_with_basic_handwashing_facilities_at_home_Rural',
    'Population_with_basic_handwashing_facilities_at_home_Total',
    'Population_with_basic_handwashing_facilities_at_home_Urban',
    'Population_practising_open_defecation_Rural',
    'Population_practising_open_defecation_Total',
    'Population_practising_open_defecation_Urban',
]

df_sanitation_water = df_sanitation_water.select(
    *[col(name).cast(target_type) for name, target_type in sanitation_identifier_casts],
    *[col(name).cast(DecimalType(18, 2)) for name in sanitation_decimal_columns],
)

In [23]:
# Schema of the sanitation/water DataFrame after the datatype casts
# (this cell runs on the already-cast DataFrame, not on the raw one).
df_sanitation_water.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Country_code: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Population_using_at_least_basic_drinking_water_services_Rural: decimal(18,2) (nullable = true)
 |-- Population_using_at_least_basic_drinking_water_services_Total: decimal(18,2) (nullable = true)
 |-- Population_using_at_least_basic_drinking_water_services_Urban: decimal(18,2) (nullable = true)
 |-- Population_using_safely_managed_drinking_water_services_Rural: decimal(18,2) (nullable = true)
 |-- Population_using_safely_managed_drinking_water_services_Total: decimal(18,2) (nullable = true)
 |-- Population_using_safely_managed_drinking_water_services_Urban: decimal(18,2) (nullable = true)
 |-- Population_using_at_least_basic_sanitation_services_Total: decimal(18,2) (nullable = true)
 |-- Population_using_at_least_basic_sanitation_services_Urban: decimal(18,2) (nullable = true)
 |-- Population_using_safely_managed_sanitati

# Verificando se dataframe possui dados NaN ou Null

In [24]:
# Preview the first rows of the sanitation/water DataFrame to look for missing values.
df_sanitation_water.show()

+----+------+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-------------------------------------------+-------------------------------------------+--------------------------

In [25]:
# Summary statistics (count, mean, stddev, min, max) for each metric column,
# printed one table per column. A count lower than the others reveals nulls.
sanitation_metric_columns = [
    'Population_using_at_least_basic_drinking_water_services_Rural',
    'Population_using_at_least_basic_drinking_water_services_Total',
    'Population_using_at_least_basic_drinking_water_services_Urban',
    'Population_using_safely_managed_drinking_water_services_Rural',
    'Population_using_safely_managed_drinking_water_services_Total',
    'Population_using_safely_managed_drinking_water_services_Urban',
    'Population_using_at_least_basic_sanitation_services_Total',
    'Population_using_at_least_basic_sanitation_services_Urban',
    'Population_using_safely_managed_sanitation_services_Rural',
    'Population_using_safely_managed_sanitation_services_Total',
    'Population_using_safely_managed_sanitation_services_Urban',
    'Population_with_basic_handwashing_facilities_at_home_Rural',
    'Population_with_basic_handwashing_facilities_at_home_Total',
    'Population_with_basic_handwashing_facilities_at_home_Urban',
    'Population_practising_open_defecation_Rural',
    'Population_practising_open_defecation_Total',
    'Population_practising_open_defecation_Urban',
]

for metric_column in sanitation_metric_columns:
    df_sanitation_water.describe(metric_column).show()

                                                                                

+-------+-------------------------------------------------------------+
|summary|Population_using_at_least_basic_drinking_water_services_Rural|
+-------+-------------------------------------------------------------+
|  count|                                                         2953|
|   mean|                                                    74.984047|
| stddev|                                           24.676125950356724|
|    min|                                                         4.08|
|    max|                                                       100.00|
+-------+-------------------------------------------------------------+



                                                                                

+-------+-------------------------------------------------------------+
|summary|Population_using_at_least_basic_drinking_water_services_Total|
+-------+-------------------------------------------------------------+
|  count|                                                         3449|
|   mean|                                                    84.286587|
| stddev|                                           18.873825766196795|
|    min|                                                        18.70|
|    max|                                                       100.00|
+-------+-------------------------------------------------------------+



                                                                                

+-------+-------------------------------------------------------------+
|summary|Population_using_at_least_basic_drinking_water_services_Urban|
+-------+-------------------------------------------------------------+
|  count|                                                         3013|
|   mean|                                                    92.390037|
| stddev|                                            9.563595057836439|
|    min|                                                        49.49|
|    max|                                                       100.00|
+-------+-------------------------------------------------------------+



                                                                                

+-------+-------------------------------------------------------------+
|summary|Population_using_safely_managed_drinking_water_services_Rural|
+-------+-------------------------------------------------------------+
|  count|                                                          612|
|   mean|                                                    38.149118|
| stddev|                                            28.13971260930169|
|    min|                                                         0.00|
|    max|                                                       100.00|
+-------+-------------------------------------------------------------+



                                                                                

+-------+-------------------------------------------------------------+
|summary|Population_using_safely_managed_drinking_water_services_Total|
+-------+-------------------------------------------------------------+
|  count|                                                         1745|
|   mean|                                                    74.498613|
| stddev|                                           27.518824099908116|
|    min|                                                         4.53|
|    max|                                                       100.00|
+-------+-------------------------------------------------------------+



                                                                                

+-------+-------------------------------------------------------------+
|summary|Population_using_safely_managed_drinking_water_services_Urban|
+-------+-------------------------------------------------------------+
|  count|                                                          929|
|   mean|                                                    68.634801|
| stddev|                                           25.867416121398463|
|    min|                                                         3.71|
|    max|                                                       100.00|
+-------+-------------------------------------------------------------+



                                                                                

+-------+---------------------------------------------------------+
|summary|Population_using_at_least_basic_sanitation_services_Total|
+-------+---------------------------------------------------------+
|  count|                                                     3439|
|   mean|                                                71.211966|
| stddev|                                       30.342367114183762|
|    min|                                                     3.40|
|    max|                                                   100.00|
+-------+---------------------------------------------------------+



                                                                                

+-------+---------------------------------------------------------+
|summary|Population_using_at_least_basic_sanitation_services_Urban|
+-------+---------------------------------------------------------+
|  count|                                                     2992|
|   mean|                                                75.765515|
| stddev|                                       25.793371387983765|
|    min|                                                     9.43|
|    max|                                                   100.00|
+-------+---------------------------------------------------------+



                                                                                

+-------+---------------------------------------------------------+
|summary|Population_using_safely_managed_sanitation_services_Rural|
+-------+---------------------------------------------------------+
|  count|                                                      783|
|   mean|                                                54.222771|
| stddev|                                        33.66585235730465|
|    min|                                                     0.96|
|    max|                                                   100.00|
+-------+---------------------------------------------------------+

+-------+---------------------------------------------------------+
|summary|Population_using_safely_managed_sanitation_services_Total|
+-------+---------------------------------------------------------+
|  count|                                                     1571|
|   mean|                                                62.768409|
| stddev|                                      

                                                                                

+-------+----------------------------------------------------------+
|summary|Population_with_basic_handwashing_facilities_at_home_Total|
+-------+----------------------------------------------------------+
|  count|                                                       921|
|   mean|                                                 47.501911|
| stddev|                                         32.77440614718687|
|    min|                                                      1.03|
|    max|                                                    100.00|
+-------+----------------------------------------------------------+



                                                                                

+-------+----------------------------------------------------------+
|summary|Population_with_basic_handwashing_facilities_at_home_Urban|
+-------+----------------------------------------------------------+
|  count|                                                       898|
|   mean|                                                 55.588686|
| stddev|                                         31.46764551429652|
|    min|                                                      1.76|
|    max|                                                    100.00|
+-------+----------------------------------------------------------+



                                                                                

+-------+-------------------------------------------+
|summary|Population_practising_open_defecation_Rural|
+-------+-------------------------------------------+
|  count|                                       2902|
|   mean|                                  18.086999|
| stddev|                         24.423633718705887|
|    min|                                       0.00|
|    max|                                      97.95|
+-------+-------------------------------------------+



                                                                                

+-------+-------------------------------------------+
|summary|Population_practising_open_defecation_Total|
+-------+-------------------------------------------+
|  count|                                       3402|
|   mean|                                  10.825744|
| stddev|                          17.66044778055224|
|    min|                                       0.00|
|    max|                                      84.59|
+-------+-------------------------------------------+

+-------+-------------------------------------------+
|summary|Population_practising_open_defecation_Urban|
+-------+-------------------------------------------+
|  count|                                       2990|
|   mean|                                   3.790906|
| stddev|                          7.623980204142354|
|    min|                                       0.00|
|    max|                                      64.50|
+-------+-------------------------------------------+



In [26]:
# Replace missing values with 0 (fillna is the DataFrame-level alias of na.fill).
# NOTE(review): the original rationale was that the minimum of every decimal column
# is 0, but the describe() output above shows minimums above 0 for several columns
# (e.g. 4.08, 18.70), and the fill is not restricted to a column subset.
# Confirm that 0 is the intended placeholder for missing data.
df_sanitation_water = df_sanitation_water.fillna(0)

In [27]:
# Preview the sanitation/water DataFrame after the missing values were replaced.
df_sanitation_water.show()

+----+------+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+----------------------------------------------------------+-------------------------------------------+-------------------------------------------+--------------------------

In [28]:
# Preview the first rows of the cholera DataFrame after the casts and renames.
df_cholera.show()

+-----------+----+-------------+--------------+-------------+--------------------+
|    COUNTRY|YEAR|CASES_CHOLERA|DEATHS_CHOLERA|FATALITY_RATE|              REGION|
+-----------+----+-------------+--------------+-------------+--------------------+
|Afghanistan|2016|          677|             5|         0.70|Eastern Mediterra...|
|Afghanistan|2015|        58064|             8|         0.01|Eastern Mediterra...|
|Afghanistan|2014|        45481|             4|         0.00|Eastern Mediterra...|
|Afghanistan|2013|         3957|            14|         0.35|Eastern Mediterra...|
|Afghanistan|2012|           12|             0|         0.10|Eastern Mediterra...|
|Afghanistan|2011|         3733|            44|         1.18|Eastern Mediterra...|
|Afghanistan|2010|         2369|            10|         0.42|Eastern Mediterra...|
|Afghanistan|2009|          662|            11|         1.66|Eastern Mediterra...|
|Afghanistan|2008|         4384|            22|         0.50|Eastern Mediterra...|
|Afg

In [29]:
# Summary statistics for each numeric cholera column, one table per column.
for cholera_metric in ('CASES_CHOLERA', 'DEATHS_CHOLERA', 'FATALITY_RATE'):
    df_cholera.describe(cholera_metric).show()

+-------+------------------+
|summary|     CASES_CHOLERA|
+-------+------------------+
|  count|              2469|
|   mean| 3718.379100850547|
| stddev|14904.906044893929|
|    min|                 0|
|    max|            340311|
+-------+------------------+

+-------+------------------+
|summary|    DEATHS_CHOLERA|
+-------+------------------+
|  count|              2373|
|   mean|378.08849557522126|
| stddev|3570.2829791470526|
|    min|                 0|
|    max|            124227|
+-------+------------------+

+-------+------------------+
|summary|     FATALITY_RATE|
+-------+------------------+
|  count|              2363|
|   mean|          5.758028|
| stddev|15.566521726111958|
|    min|              0.00|
|    max|            450.00|
+-------+------------------+



In [30]:
# Replace nulls with 0; the describe() output above reports a minimum of 0 for
# CASES_CHOLERA, DEATHS_CHOLERA and FATALITY_RATE.
# NOTE(review): the fill is not restricted to those three columns - confirm that
# no other column (e.g. YEAR) can be null.
df_cholera = df_cholera.fillna(0)

In [31]:
# Preview the cholera DataFrame after the nulls were replaced.
df_cholera.show()

+-----------+----+-------------+--------------+-------------+--------------------+
|    COUNTRY|YEAR|CASES_CHOLERA|DEATHS_CHOLERA|FATALITY_RATE|              REGION|
+-----------+----+-------------+--------------+-------------+--------------------+
|Afghanistan|2016|          677|             5|         0.70|Eastern Mediterra...|
|Afghanistan|2015|        58064|             8|         0.01|Eastern Mediterra...|
|Afghanistan|2014|        45481|             4|         0.00|Eastern Mediterra...|
|Afghanistan|2013|         3957|            14|         0.35|Eastern Mediterra...|
|Afghanistan|2012|           12|             0|         0.10|Eastern Mediterra...|
|Afghanistan|2011|         3733|            44|         1.18|Eastern Mediterra...|
|Afghanistan|2010|         2369|            10|         0.42|Eastern Mediterra...|
|Afghanistan|2009|          662|            11|         1.66|Eastern Mediterra...|
|Afghanistan|2008|         4384|            22|         0.50|Eastern Mediterra...|
|Afg

# Fazendo upload dos datasets no S3

In [32]:
# Write the treated cholera data as CSV with a header row,
# overwriting whatever is already stored at the destination path.
df_cholera.write.csv(
    's3a://andre-sprint03-sptech-bucket-tratados/cases_of_cholera',
    mode='overwrite',
    header='true',
)

22/06/15 00:42:47 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
22/06/15 00:42:48 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                

In [33]:
# Write the treated sanitation/water data as CSV with a header row,
# overwriting whatever is already stored at the destination path.
df_sanitation_water.write.csv(
    's3a://andre-sprint03-sptech-bucket-tratados/sanitation_water',
    mode='overwrite',
    header='true',
)

22/06/15 00:42:51 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
22/06/15 00:42:51 WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
                                                                                