In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.types import *

# Start (or reuse, if one already exists) a Spark session that runs locally
# with two worker threads.
spark = (
    SparkSession.builder
    .master('local[2]')
    .getOrCreate()
)

In [2]:
# Load the raw InternetJoinDate CSV from the Bronze folder on HDFS into a DataFrame.
# The file has a header row and uses ";" as the field separator. No schema is
# supplied and schema inference is not enabled, so every column is read as a string.
# Other supported sources (e.g., json, orc, parquet) are described at
# https://spark.apache.org/docs/2.2.0/sql-programming-guide.html#data-sources
internetjoindate = (
    spark.read
    .options(header=True, sep=";")
    .csv("hdfs://hdfs-nn:9000/user/Projeto TABD/Bronze/InternetJoinDate.csv")
)

In [3]:
# Inspect the freshly loaded data: column names/types, then the first 20 rows.
internetjoindate.printSchema()
internetjoindate.show()
# Last expression of the cell, so the notebook renders it as a pandas table.
# NOTE(review): toPandas() collects every row to the driver; fine for a small
# lookup file, but prefer a limited sample for larger inputs.
internetjoindate.toPandas()

root
 |-- iso2c: string (nullable = true)
 |-- ccTLD_registration_date: string (nullable = true)

+-----+-----------------------+
|iso2c|ccTLD_registration_date|
+-----+-----------------------+
|   AF|             1997-10-16|
|   AX|             2006-06-21|
|   AL|             1992-04-21|
|   DZ|             1994-01-03|
|   AS|             1997-06-12|
|   AD|             1996-01-09|
|   AO|             1995-11-15|
|   AI|             1995-02-16|
|   AQ|             1992-02-26|
|   AG|             1991-09-03|
|   AR|             1987-09-23|
|   AM|             1994-08-26|
|   AW|             1996-02-20|
|   AU|             1986-03-05|
|   AT|             1988-01-20|
|   AZ|             1993-08-25|
|   BS|             1991-09-03|
|   BH|             1994-02-01|
|   BD|             1999-05-20|
|   BB|             1991-09-03|
+-----+-----------------------+
only showing top 20 rows



Unnamed: 0,iso2c,ccTLD_registration_date
0,AF,1997-10-16
1,AX,2006-06-21
2,AL,1992-04-21
3,DZ,1994-01-03
4,AS,1997-06-12
...,...,...
240,VE,1991-03-07
241,VN,1994-04-14
242,VG,1997-02-20
243,VI,1995-08-31


In [4]:
# Rename the two source columns and add the remaining output columns as
# constant placeholders. All placeholders are string literals at this point
# ('' or '0'); the typed casts are applied in a later step.
select_exprs = [
    "iso2c as country_id",
    "`ccTLD_registration_date` as join_date",
    "'' as country_or_area",
    "'0' as area_km",
    "'0' as current_account_balance",
    "'0' as internet_hosts",
    "'0' as `telephone_main_lines_in_use`",
    "'0' as telephone_mobile_celular",
    "'0' as status",
    "'0' as population",
    "'0' as population_rank",
    "'0' as internet_users",
    "'0' as internet_users_percentage",
    "'0' as internet_user_rank",
]
internetjoindate = internetjoindate.selectExpr(*select_exprs)


In [5]:
# Cast the string columns to their target types and put the columns in the
# output order, all in a single projection. This replaces a chain of one
# withColumn() call per column (each of which adds its own projection to the
# plan) with one select(); the resulting schema and column order are the same.

# Target type for every column that must not remain a string.
# NOTE(review): 'area_km' is deliberately absent because the previous version
# never cast it either, so it stays a string holding '0'. Confirm whether it
# should be numeric like the other '0' placeholders.
cast_types = {
    "join_date": DateType(),
    "current_account_balance": IntegerType(),
    "internet_hosts": IntegerType(),
    "telephone_main_lines_in_use": IntegerType(),
    "telephone_mobile_celular": IntegerType(),
    "status": IntegerType(),
    "population": IntegerType(),
    "population_rank": IntegerType(),
    "internet_users": IntegerType(),
    "internet_users_percentage": FloatType(),
    "internet_user_rank": IntegerType(),
}

# Column order of the resulting frame.
column_order = [
    "country_or_area",
    "area_km",
    "current_account_balance",
    "internet_hosts",
    "telephone_main_lines_in_use",
    "telephone_mobile_celular",
    "country_id",
    "status",
    "join_date",
    "population",
    "population_rank",
    "internet_users",
    "internet_users_percentage",
    "internet_user_rank",
]

internetjoindate = internetjoindate.select([
    # alias() keeps the original column name after the cast.
    internetjoindate[name].cast(cast_types[name]).alias(name)
    if name in cast_types
    else internetjoindate[name]
    for name in column_order
])

In [6]:
# Verify the column types and order after the casts.
internetjoindate.printSchema()

root
 |-- country_or_area: string (nullable = false)
 |-- area_km: string (nullable = false)
 |-- current_account_balance: integer (nullable = true)
 |-- internet_hosts: integer (nullable = true)
 |-- telephone_main_lines_in_use: integer (nullable = true)
 |-- telephone_mobile_celular: integer (nullable = true)
 |-- country_id: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- join_date: date (nullable = true)
 |-- population: integer (nullable = true)
 |-- population_rank: integer (nullable = true)
 |-- internet_users: integer (nullable = true)
 |-- internet_users_percentage: float (nullable = true)
 |-- internet_user_rank: integer (nullable = true)



In [7]:
# Replace nulls with 0. With an integer fill value only numeric columns are
# affected; string and date columns are left untouched.
internetjoindate = internetjoindate.na.fill(0)

# Render the result as a pandas table (collects all rows to the driver).
internetjoindate.toPandas()

Unnamed: 0,country_or_area,area_km,current_account_balance,internet_hosts,telephone_main_lines_in_use,telephone_mobile_celular,country_id,status,join_date,population,population_rank,internet_users,internet_users_percentage,internet_user_rank
0,,0,0,0,0,0,AF,0,1997-10-16,0,0,0,0.0,0
1,,0,0,0,0,0,AX,0,2006-06-21,0,0,0,0.0,0
2,,0,0,0,0,0,AL,0,1992-04-21,0,0,0,0.0,0
3,,0,0,0,0,0,DZ,0,1994-01-03,0,0,0,0.0,0
4,,0,0,0,0,0,AS,0,1997-06-12,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,,0,0,0,0,0,VE,0,1991-03-07,0,0,0,0.0,0
241,,0,0,0,0,0,VN,0,1994-04-14,0,0,0,0.0,0
242,,0,0,0,0,0,VG,0,1997-02-20,0,0,0,0.0,0
243,,0,0,0,0,0,VI,0,1995-08-31,0,0,0,0.0,0


In [8]:
# Persist the frame as Parquet at the warehouse location, replacing any data
# that already exists at that path.
(
    internetjoindate.write
    .format("parquet")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/InternetJoinDate/")
)