In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.types import *

# Get (or reuse) a Spark session that runs locally on two worker threads.
spark = (
    SparkSession.builder
    .master('local[2]')
    .getOrCreate()
)


In [12]:


# Load the Bronze-layer CSV: semicolon-separated, first line consumed as the header.
adultinternetusagepenetration = (
    spark.read
    .options(header=True, sep=";")
    .csv("hdfs://hdfs-nn:9000/user/Projeto TABD/Bronze/AdultInternetUsagePenetration.csv")
)

In [13]:
# Inspect the freshly loaded frame: schema, a console preview, and a pandas rendering.
adultinternetusagepenetration.printSchema()
adultinternetusagepenetration.show()
adultinternetusagepenetration.toPandas()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+----+----+------+----+
| _c0| _c1|   _c2| _c3|
+----+----+------+----+
|null|null|  null|null|
|null|Male|Female|null|
|2000|  54|    50|in %|
|2005|  69|    67|in %|
|2009|  77|    75|in %|
|2010|  77|    76|in %|
|2013|  84|    84|in %|
|2014|  84|    84|in %|
|2015|  86|    86|in %|
|2016|  89|    86|in %|
|2018|  89|    88|in %|
|2019|  90|    91|in %|
+----+----+------+----+



Unnamed: 0,_c0,_c1,_c2,_c3
0,,,,
1,,Male,Female,
2,2000.0,54,50,in %
3,2005.0,69,67,in %
4,2009.0,77,75,in %
5,2010.0,77,76,in %
6,2013.0,84,84,in %
7,2014.0,84,84,in %
8,2015.0,86,86,in %
9,2016.0,89,86,in %


In [14]:
# Keep the first three positional columns under meaningful names;
# the fourth column (_c3) is dropped by not selecting it.
adultinternetusagepenetration = adultinternetusagepenetration.select(
    col("_c0").alias("year"),
    col("_c1").alias("male"),
    col("_c2").alias("female"),
)

In [15]:
# Preview the frame after the column rename.
adultinternetusagepenetration.toPandas()

Unnamed: 0,year,male,female
0,,,
1,,Male,Female
2,2000.0,54,50
3,2005.0,69,67
4,2009.0,77,75
5,2010.0,77,76
6,2013.0,84,84
7,2014.0,84,84
8,2015.0,86,86
9,2016.0,89,86


In [16]:
# The file carries its column labels ("Male" / "Female") as an ordinary data row.
# Turn those labels into NULL so that row always contains a missing value and is
# removed by dropna() below, regardless of what its other cells hold.
for column_name, header_label in (("male", "Male"), ("female", "Female")):
    adultinternetusagepenetration = adultinternetusagepenetration.withColumn(
        column_name,
        when(col(column_name) == header_label, lit(None)).otherwise(col(column_name)))

# Discard every row that still has at least one missing value
# (the blank line and the label row).
adultinternetusagepenetration = adultinternetusagepenetration.dropna(how='any')

In [17]:
# Preview the frame after rows with missing values were dropped.
adultinternetusagepenetration.toPandas()

Unnamed: 0,year,male,female
0,2000,54,50
1,2005,69,67
2,2009,77,75
3,2010,77,76
4,2013,84,84
5,2014,84,84
6,2015,86,86
7,2016,89,86
8,2018,89,88
9,2019,90,91


In [18]:
# Unpivot: turn the male/female columns into (gender, penetration_percentage) rows,
# producing two output rows per input year.
adultinternetusagepenetration = adultinternetusagepenetration.selectExpr(
    "year",
    "stack(2, 'male', `male`, 'female', `female`) as (gender,penetration_percentage)",
)


In [19]:
# Preview the unpivoted (long-format) frame.
adultinternetusagepenetration.toPandas()

Unnamed: 0,year,gender,penetration_percentage
0,2000,male,54
1,2000,female,50
2,2005,male,69
3,2005,female,67
4,2009,male,77
5,2009,female,75
6,2010,male,77
7,2010,female,76
8,2013,male,84
9,2013,female,84


In [20]:
# Bring the frame to the common column layout used by the other tables:
#   * cast the measured columns (year, penetration_percentage) to integers;
#   * add the columns this dataset does not have as constant placeholders,
#     already cast to their target types;
#   * emit the columns in the shared order.
# NOTE(review): text placeholders are the literal string 'null' (not SQL NULL) and
# numeric placeholders are 0 — confirm downstream consumers expect these values.
adultinternetusagepenetration = adultinternetusagepenetration.select(
    col("year").cast(IntegerType()).alias("year"),
    lit("null").alias("region"),
    col("gender"),
    lit("0").cast(DoubleType()).alias("usage_rate"),
    col("penetration_percentage").cast(IntegerType()).alias("penetration_percentage"),
    lit("0").cast(DoubleType()).alias("internet_users"),
    lit("null").alias("internet_activity"),
    lit("0").cast(IntegerType()).alias("activity_percentage"),
    lit("null").alias("frequency_of_access"),
    lit("null").alias("individuals"),
    lit("0").cast(IntegerType()).alias("frequency_of_access_percentage"),
)

In [21]:
# Print the final schema to check column names, order and types before writing.
adultinternetusagepenetration.printSchema()

root
 |-- year: integer (nullable = true)
 |-- region: string (nullable = false)
 |-- gender: string (nullable = true)
 |-- usage_rate: double (nullable = true)
 |-- penetration_percentage: integer (nullable = true)
 |-- internet_users: double (nullable = true)
 |-- internet_activity: string (nullable = false)
 |-- activity_percentage: integer (nullable = true)
 |-- frequency_of_access: string (nullable = false)
 |-- individuals: string (nullable = false)
 |-- frequency_of_access_percentage: integer (nullable = true)



In [22]:
# Persist the result as Parquet in the warehouse, replacing any previous output.
(
    adultinternetusagepenetration.write
    .mode("overwrite")
    .parquet("hdfs://hdfs-nn:9000/warehouse/tabd.db/AdultInternetUsagePenetration/")
)