In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.types import *

# Create (or reuse an already running) Spark session that executes locally with two worker threads.
spark = SparkSession.builder.master('local[2]').getOrCreate()


In [2]:


# Load the raw Internet usage frequency extract from the Bronze folder on HDFS.
# No schema is supplied and inference is not requested, so every column is read as a string.
bronze_csv_path = "hdfs://hdfs-nn:9000/user/Projeto TABD/Bronze/InternetUsageFrequency.csv"

internetusagefrequency = (
    spark.read
    .options(header=True, sep=",")
    .csv(bronze_csv_path)
)

In [3]:
# Inspect the freshly loaded data: column names/types, then the first rows as plain text.
internetusagefrequency.printSchema()
internetusagefrequency.show()
# toPandas() collects every row to the driver; being the last expression, its result is what the cell displays.
internetusagefrequency.toPandas()

root
 |-- TIME: string (nullable = true)
 |-- GEO: string (nullable = true)
 |-- INDIC_IS: string (nullable = true)
 |-- UNIT: string (nullable = true)
 |-- IND_TYPE: string (nullable = true)
 |-- Value: string (nullable = true)

+----+--------------------+--------------------+--------------------+---------------+-----+
|TIME|                 GEO|            INDIC_IS|                UNIT|       IND_TYPE|Value|
+----+--------------------+--------------------+--------------------+---------------+-----+
|2011|European Union - ...|Frequency of inte...|Percentage of ind...|All Individuals|   65|
|2011|European Union - ...|Frequency of inte...|Percentage of ind...|All Individuals|   94|
|2011|European Union - ...|Frequency of inte...|Percentage of ind...|All Individuals|   54|
|2011|European Union - ...|Frequency of inte...|Percentage of ind...|All Individuals|   78|
|2011|European Union - ...|Frequency of inte...|Percentage of ind...|All Individuals|   11|
|2011|European Union - ...|Frequen

Unnamed: 0,TIME,GEO,INDIC_IS,UNIT,IND_TYPE,Value
0,2011,European Union - 27 countries (from 2020),Frequency of internet access: once a week (inc...,Percentage of individuals,All Individuals,65
1,2011,European Union - 27 countries (from 2020),Frequency of internet access: once a week (inc...,Percentage of individuals who used internet in...,All Individuals,94
2,2011,European Union - 27 countries (from 2020),Frequency of internet access: daily,Percentage of individuals,All Individuals,54
3,2011,European Union - 27 countries (from 2020),Frequency of internet access: daily,Percentage of individuals who used internet in...,All Individuals,78
4,2011,European Union - 27 countries (from 2020),Frequency of internet access: at least once a ...,Percentage of individuals,All Individuals,11
...,...,...,...,...,...,...
5275,2020,Kosovo (under United Nations Security Council ...,Frequency of internet access: at least once a ...,Percentage of individuals who used internet in...,All Individuals,:
5276,2020,Kosovo (under United Nations Security Council ...,Frequency of internet access: less than once a...,Percentage of individuals,All Individuals,:
5277,2020,Kosovo (under United Nations Security Council ...,Frequency of internet access: less than once a...,Percentage of individuals who used internet in...,All Individuals,:
5278,2020,Kosovo (under United Nations Security Council ...,Frequency of internet access: less than once a...,Percentage of individuals,All Individuals,1


In [4]:
# Rename the source columns to the descriptive snake_case names used from here on.
# The mapping order is also the resulting column order.
column_renames = {
    "TIME": "year",
    "GEO": "region",
    "INDIC_IS": "frequency_of_access",
    "UNIT": "individuals",
    "IND_TYPE": "individuals_type",
    "Value": "frequency_of_access_percentage",
}

internetusagefrequency = internetusagefrequency.select(
    *[col(source_name).alias(target_name) for source_name, target_name in column_renames.items()]
)

In [5]:
# Display the DataFrame after the column rename (toPandas() collects all rows to the driver).
internetusagefrequency.toPandas()

Unnamed: 0,year,region,frequency_of_access,individuals,individuals_type,frequency_of_access_percentage
0,2011,European Union - 27 countries (from 2020),Frequency of internet access: once a week (inc...,Percentage of individuals,All Individuals,65
1,2011,European Union - 27 countries (from 2020),Frequency of internet access: once a week (inc...,Percentage of individuals who used internet in...,All Individuals,94
2,2011,European Union - 27 countries (from 2020),Frequency of internet access: daily,Percentage of individuals,All Individuals,54
3,2011,European Union - 27 countries (from 2020),Frequency of internet access: daily,Percentage of individuals who used internet in...,All Individuals,78
4,2011,European Union - 27 countries (from 2020),Frequency of internet access: at least once a ...,Percentage of individuals,All Individuals,11
...,...,...,...,...,...,...
5275,2020,Kosovo (under United Nations Security Council ...,Frequency of internet access: at least once a ...,Percentage of individuals who used internet in...,All Individuals,:
5276,2020,Kosovo (under United Nations Security Council ...,Frequency of internet access: less than once a...,Percentage of individuals,All Individuals,:
5277,2020,Kosovo (under United Nations Security Council ...,Frequency of internet access: less than once a...,Percentage of individuals who used internet in...,All Individuals,:
5278,2020,Kosovo (under United Nations Security Council ...,Frequency of internet access: less than once a...,Percentage of individuals,All Individuals,1


In [6]:

# Drop the individuals_type column; it is not part of the output written by this notebook.
internetusagefrequency = internetusagefrequency.drop("individuals_type")

# Shorten the verbose frequency labels. Any label not listed below is kept unchanged.
internetusagefrequency = internetusagefrequency.withColumn(
    "frequency_of_access",
    when(
        (col("frequency_of_access") == "Frequency of internet access: once a week (including every day)"), "Once a week"
    ).when(
        (col("frequency_of_access") == "Frequency of internet access: daily"), "Daily"
    ).when(
        (col("frequency_of_access") == "Frequency of internet access: at least once a week (but not every day)"), "At least once a week"
    ).when(
        (col("frequency_of_access") == "Frequency of internet access: at least once a month"), "At least once a month"
    ).when(
        (col("frequency_of_access") == "Frequency of internet access: less than once a month"), "Less than once a month"
    ).when(
        (col("frequency_of_access") == "Frequency of internet access: less than once a week"), "Less than once a week"
    ).otherwise(col("frequency_of_access")))

# Relabel the population descriptions. Any value not listed below is kept unchanged.
internetusagefrequency = internetusagefrequency.withColumn(
    "individuals",
    when(
        (col("individuals") == "Percentage of individuals"), "All Individuals"
    ).when(
        (col("individuals") == "Percentage of individuals who used internet in the last 3 months"), "All individuals who used internet in the last 3 months"
    ).otherwise(col("individuals")))

# Shorten the long region names. Any region not listed below is kept unchanged.
internetusagefrequency = internetusagefrequency.withColumn(
    "region",
    when(
        (col("region") == "Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015)"), "Euro Area"
    ).when(
        (col("region") == "Germany (until 1990 former territory of the FRG)"), "Germany"
    ).when(
        (col("region") == "Kosovo (under United Nations Security Council Resolution 1244/99)"), "Kosovo"
    ).when(
        (col("region") == "European Union - 27 countries (from 2020)"), "European Union from 2020"
    ).when(
        (col("region") == "European Union - 28 countries (2013-2020)"), "European Union from 2013 to 2020"
    ).when(
        (col("region") == "European Union - 27 countries (2007-2013)"), "European Union from 2007 to 2013"
    ).when(
        (col("region") == "European Union - 25 countries (2004-2006)"), "European Union from 2004 to 2006"
    ).when(
        # Fixed: the literal previously ended with a stray extra ")" and therefore could never match.
        (col("region") == "European Union - 15 countries (1995-2004)"), "European Union from 1995 to 2004"
    ).otherwise(col("region")))


In [7]:
# Display the DataFrame after the label normalisation (toPandas() collects all rows to the driver).
internetusagefrequency.toPandas()

Unnamed: 0,year,region,frequency_of_access,individuals,frequency_of_access_percentage
0,2011,European Union from 2020,Once a week,All Individuals,65
1,2011,European Union from 2020,Once a week,All individuals who used internet in the last ...,94
2,2011,European Union from 2020,Daily,All Individuals,54
3,2011,European Union from 2020,Daily,All individuals who used internet in the last ...,78
4,2011,European Union from 2020,At least once a week,All Individuals,11
...,...,...,...,...,...
5275,2020,Kosovo,At least once a month,All individuals who used internet in the last ...,:
5276,2020,Kosovo,Less than once a month,All Individuals,:
5277,2020,Kosovo,Less than once a month,All individuals who used internet in the last ...,:
5278,2020,Kosovo,Less than once a week,All Individuals,1


In [8]:
# Convert the two numeric columns, which were read as text, to integers and
# then replace the resulting NULLs in numeric columns with 0.
# NOTE(review): source values of ":" end up as 0 after the cast + fillna, so a
# missing measurement is indistinguishable from a real 0% - confirm this is intended.
typed_frequency = (
    internetusagefrequency
    .withColumn("year", col("year").cast(IntegerType()))
    .withColumn(
        "frequency_of_access_percentage",
        col("frequency_of_access_percentage").cast(IntegerType()),
    )
    .fillna(0)
)

# Add the remaining placeholder columns, arrange the columns in the same order
# as the other tables, and give each placeholder its matching type.
# NOTE(review): gender and internet_activity hold the literal string "null",
# not a SQL NULL - presumably a convention shared with the other tables; verify.
internetusagefrequency = typed_frequency.select(
    "year",
    "region",
    lit("null").alias("gender"),
    lit("0").cast(DoubleType()).alias("usage_rate"),
    lit("0").cast(IntegerType()).alias("penetration_percentage"),
    lit("0").cast(DoubleType()).alias("internet_users"),
    lit("null").alias("internet_activity"),
    lit("0").cast(IntegerType()).alias("activity_percentage"),
    "frequency_of_access",
    "individuals",
    "frequency_of_access_percentage",
)

In [9]:
# Display the final DataFrame with the placeholder columns added (toPandas() collects all rows to the driver).
internetusagefrequency.toPandas()

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,internet_activity,activity_percentage,frequency_of_access,individuals,frequency_of_access_percentage
0,2011,European Union from 2020,,0.0,0,0.0,,0,Once a week,All Individuals,65
1,2011,European Union from 2020,,0.0,0,0.0,,0,Once a week,All individuals who used internet in the last ...,94
2,2011,European Union from 2020,,0.0,0,0.0,,0,Daily,All Individuals,54
3,2011,European Union from 2020,,0.0,0,0.0,,0,Daily,All individuals who used internet in the last ...,78
4,2011,European Union from 2020,,0.0,0,0.0,,0,At least once a week,All Individuals,11
...,...,...,...,...,...,...,...,...,...,...,...
5275,2020,Kosovo,,0.0,0,0.0,,0,At least once a month,All individuals who used internet in the last ...,0
5276,2020,Kosovo,,0.0,0,0.0,,0,Less than once a month,All Individuals,0
5277,2020,Kosovo,,0.0,0,0.0,,0,Less than once a month,All individuals who used internet in the last ...,0
5278,2020,Kosovo,,0.0,0,0.0,,0,Less than once a week,All Individuals,1


In [10]:
# Print the final column names and types before the data is written out.
internetusagefrequency.printSchema()

root
 |-- year: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- gender: string (nullable = false)
 |-- usage_rate: double (nullable = true)
 |-- penetration_percentage: integer (nullable = true)
 |-- internet_users: double (nullable = true)
 |-- internet_activity: string (nullable = false)
 |-- activity_percentage: integer (nullable = true)
 |-- frequency_of_access: string (nullable = true)
 |-- individuals: string (nullable = true)
 |-- frequency_of_access_percentage: integer (nullable = true)



In [11]:
# Persist the result as Parquet in the warehouse location, replacing any data already stored there.
warehouse_target = "hdfs://hdfs-nn:9000/warehouse/tabd.db/InternetUsageFrequency/"

(
    internetusagefrequency
    .write
    .format("parquet")
    .mode("overwrite")
    .save(warehouse_target)
)