In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.sql import *

# Start (or reuse) a Spark session that runs locally with two worker threads.
spark = (
    SparkSession.builder
    .master('local[2]')
    .getOrCreate()
)


In [28]:


# Load the semicolon-delimited bronze CSV; the file's first line supplies the column names.
globalinternetusageratebygenderregion = (
    spark.read
    .options(header=True, sep=";")
    .csv("hdfs://hdfs-nn:9000/user/Projeto TABD/Bronze/GlobalInternetUsageRateByGenderRegion.csv")
)

In [29]:
# Inspect the raw load three ways: the inferred schema, Spark's text preview, and a
# pandas rendering (the last expression, so it becomes the cell's displayed value).
# In the recorded output the CSV title line became the first column's name and the
# remaining columns were auto-named _c1.._c3.
globalinternetusageratebygenderregion.printSchema()
globalinternetusageratebygenderregion.show()
globalinternetusageratebygenderregion.toPandas()


root
 |-- Global internet usage rate 2019, by gender and region: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+-----------------------------------------------------+------+----+----+
|Global internet usage rate 2019, by gender and region|   _c1| _c2| _c3|
+-----------------------------------------------------+------+----+----+
|                                                 null|  null|null|null|
|                                                 null|Female|Male|null|
|                                               Europe|  80,1|85,1|in %|
|                                         The Americas|    77|76,3|in %|
|                                                  CIS|  71,7|  74|in %|
|                                          Arab States|  47,3|61,3|in %|
|                                         Asia Pacific|  41,3|48,3|in %|
|                                               Africa|  20,2|37,1|in %|
+

Unnamed: 0,"Global internet usage rate 2019, by gender and region",_c1,_c2,_c3
0,,,,
1,,Female,Male,
2,Europe,801,851,in %
3,The Americas,77,763,in %
4,CIS,717,74,in %
5,Arab States,473,613,in %
6,Asia Pacific,413,483,in %
7,Africa,202,371,in %


In [30]:
# Keep three columns under short names (region / female / male); `_c3` is not selected.
renamed_columns = (
    "`Global internet usage rate 2019, by gender and region` as region",
    "_c1 as female",
    "_c2 as male",
)
globalinternetusageratebygenderregion = globalinternetusageratebygenderregion.selectExpr(*renamed_columns)

In [31]:
# Display the frame after the rename; the blank row and the Female/Male label row are still present.
globalinternetusageratebygenderregion.toPandas()

Unnamed: 0,region,female,male
0,,,
1,,Female,Male
2,Europe,801,851
3,The Americas,77,763
4,CIS,717,74
5,Arab States,473,613
6,Asia Pacific,413,483
7,Africa,202,371


In [32]:
# Replace the embedded header labels ("Female" / "Male") with None so that the row
# carrying them is guaranteed to be removed by the dropna below, independently of
# whether its region cell happens to be null. All other values pass through untouched.
for column_name, header_label in (("female", "Female"), ("male", "Male")):
    globalinternetusageratebygenderregion = globalinternetusageratebygenderregion.withColumn(
        column_name,
        when(col(column_name) == header_label, None).otherwise(col(column_name)),
    )

# Drop every row that has a null in any of region / female / male
# (the blank spacer row and the header-label row).
globalinternetusageratebygenderregion = globalinternetusageratebygenderregion.dropna(how='any')

In [33]:
# Display the frame after null rows were dropped; only region rows with both values remain.
globalinternetusageratebygenderregion.toPandas()

Unnamed: 0,region,female,male
0,Europe,801,851
1,The Americas,77,763
2,CIS,717,74
3,Arab States,473,613
4,Asia Pacific,413,483
5,Africa,202,371


In [34]:
# Unpivot: turn the female / male columns into (gender, usage_rate) rows,
# producing two output rows for each input row.
unpivot_expr = "stack(2, 'female', `female`, 'male', `male`) as (gender,usage_rate)"
globalinternetusageratebygenderregion = globalinternetusageratebygenderregion.selectExpr(
    "region",
    unpivot_expr,
)


In [35]:
# Display the unpivoted (long-format) frame: one row per region and gender.
globalinternetusageratebygenderregion.toPandas()

Unnamed: 0,region,gender,usage_rate
0,Europe,female,801
1,Europe,male,851
2,The Americas,female,77
3,The Americas,male,763
4,CIS,female,717
5,CIS,male,74
6,Arab States,female,473
7,Arab States,male,613
8,Asia Pacific,female,413
9,Asia Pacific,male,483


In [36]:
## Add the remaining columns so this table has the same layout as the other tables.
# Every added column is a constant: year is '2019', numeric placeholders are '0',
# and text placeholders are 'null'.
# NOTE(review): 'null' is a quoted SQL string literal, so those columns hold the
# four-character text "null" rather than SQL NULL (the printed schema reports them
# as nullable = false) - confirm this is what the downstream tables expect.
padded_columns = [
    "'2019' as year",
    "region as region",
    "gender as gender",
    "usage_rate as usage_rate",
    "'null' as individuals",
    "'0' as activity_percentage",
    "'0' as penetration_percentage",
    "'0' as internet_users",
    "'null' as internet_activity",
    "'null' as frequency_of_access",
    "'0' as frequency_of_access_percentage",
]

globalinternetusageratebygenderregion = globalinternetusageratebygenderregion.selectExpr(*padded_columns)


In [37]:
# usage_rate arrives as text with a decimal comma (e.g. "80,1"): swap the comma for a
# decimal point and strip spaces so the value can be cast to a double.
usage_rate_text = f.regexp_replace(
    f.regexp_replace(col("usage_rate"), ",", "."),
    " ",
    "",
)

# Apply every type conversion exactly once, then put the columns in the same order
# as the other tables. withColumn on an existing name keeps that column's position,
# so casting before the final select yields the same layout as casting after it.
globalinternetusageratebygenderregion = (
    globalinternetusageratebygenderregion
    .withColumn("usage_rate", usage_rate_text.cast(DoubleType()))
    .withColumn("year", col("year").cast(IntegerType()))
    .withColumn("internet_users", col("internet_users").cast(DoubleType()))
    .withColumn("penetration_percentage", col("penetration_percentage").cast(IntegerType()))
    .withColumn("activity_percentage", col("activity_percentage").cast(IntegerType()))
    .withColumn(
        "frequency_of_access_percentage",
        col("frequency_of_access_percentage").cast(IntegerType()),
    )
    .select(
        'year',
        'region',
        'gender',
        'usage_rate',
        'penetration_percentage',
        'internet_users',
        'internet_activity',
        'activity_percentage',
        'frequency_of_access',
        'individuals',
        'frequency_of_access_percentage',
    )
)

In [38]:
# Display the final frame after padding, type conversion and column reordering.
globalinternetusageratebygenderregion.toPandas()

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,internet_activity,activity_percentage,frequency_of_access,individuals,frequency_of_access_percentage
0,2019,Europe,female,80.1,0,0.0,,0,,,0
1,2019,Europe,male,85.1,0,0.0,,0,,,0
2,2019,The Americas,female,77.0,0,0.0,,0,,,0
3,2019,The Americas,male,76.3,0,0.0,,0,,,0
4,2019,CIS,female,71.7,0,0.0,,0,,,0
5,2019,CIS,male,74.0,0,0.0,,0,,,0
6,2019,Arab States,female,47.3,0,0.0,,0,,,0
7,2019,Arab States,male,61.3,0,0.0,,0,,,0
8,2019,Asia Pacific,female,41.3,0,0.0,,0,,,0
9,2019,Asia Pacific,male,48.3,0,0.0,,0,,,0


In [39]:
# Print the final schema to check the column types before the frame is written out.
globalinternetusageratebygenderregion.printSchema()

root
 |-- year: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- usage_rate: double (nullable = true)
 |-- penetration_percentage: integer (nullable = true)
 |-- internet_users: double (nullable = true)
 |-- internet_activity: string (nullable = false)
 |-- activity_percentage: integer (nullable = true)
 |-- frequency_of_access: string (nullable = false)
 |-- individuals: string (nullable = false)
 |-- frequency_of_access_percentage: integer (nullable = true)



In [40]:
# Persist the result as Parquet in the warehouse directory, replacing any existing output.
(
    globalinternetusageratebygenderregion
    .write
    .format("parquet")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/GlobalInternetUsageRateByGenderRegion/")
)