In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.types import *

# Start (or reuse) a Spark session running in local mode with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .getOrCreate()
)

In [9]:
# Load the list of Least Developed Countries from the Bronze folder on HDFS.
# The file is a semicolon-separated CSV whose first row holds the column names.
# No schema is supplied or inferred, so every column is read as a string.
# Other supported source formats (json, orc, parquet, ...) are described at
# https://spark.apache.org/docs/2.2.0/sql-programming-guide.html#data-sources
data_file = "hdfs://hdfs-nn:9000/user/Projeto TABD/Bronze/ListLeastDevelopedCountries.csv"
listleastdevelopedcountries = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .csv(data_file)
)


In [10]:
# Inspect the raw load three ways: column names/types, a plain-text preview,
# and the full table rendered through pandas (this collects every row to the driver).
listleastdevelopedcountries.printSchema()
listleastdevelopedcountries.show(n=20, truncate=True)  # same as the show() defaults
listleastdevelopedcountries.toPandas()

root
 |-- Country name: string (nullable = true)
 |-- di_id: string (nullable = true)
 |-- LDC status marker (1=LDC): string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)

+--------------------+-----+-------------------------+----+----+----+----+
|        Country name|di_id|LDC status marker (1=LDC)| _c3| _c4| _c5| _c6|
+--------------------+-----+-------------------------+----+----+----+----+
|         Afghanistan|   AF|                        1|null|null|null|null|
|              Angola|   AO|                        1|null|null|null|null|
|          Bangladesh|   BD|                        1|null|null|null|null|
|               Benin|   BJ|                        1|null|null|null|null|
|              Bhutan|   BT|                        1|null|null|null|null|
|        Burkina Faso|   BF|                        1|null|null|null|null|
|             Burundi|   BI|             

Unnamed: 0,Country name,di_id,LDC status marker (1=LDC),_c3,_c4,_c5,_c6
0,Afghanistan,AF,1,,,,
1,Angola,AO,1,,,,
2,Bangladesh,BD,1,,,,
3,Benin,BJ,1,,,,
4,Bhutan,BT,1,,,,
5,Burkina Faso,BF,1,,,,
6,Burundi,BI,1,,,,
7,Cambodia,KH,1,,,,
8,Central African Republic,CF,1,,,,
9,Chad,TD,1,,,,


In [11]:
# Reshape the raw CSV columns into the target layout:
#   * the three real CSV columns are renamed,
#   * the unnamed trailing CSV columns (_c3 ... _c6) are dropped by not selecting them,
#   * the remaining attributes are added as constant string placeholders
#     ('0', or '9999-01-01' for the date) that a later cell casts to their types.
# This cell overwrites its own input, so it cannot be re-run on its own: after the
# rename, `Country name` no longer exists and the load cell must be run again first.
renamed_columns = [
    "`Country name` as country_or_area",
    "di_id as country_id",
    "`LDC status marker (1=LDC)` as status",
]
placeholder_columns = [
    "'9999-01-01' as join_date",
    "'0' as area_km",
    "'0' as current_account_balance",
    "'0' as internet_hosts",
    "'0' as `telephone_main_lines_in_use`",
    "'0' as telephone_mobile_celular",
    "'0' as population",
    "'0' as population_rank",
    "'0' as internet_users",
    "'0' as internet_users_percentage",
    "'0' as internet_user_rank",
]
listleastdevelopedcountries = listleastdevelopedcountries.selectExpr(
    *renamed_columns, *placeholder_columns
)
listleastdevelopedcountries.toPandas()

Unnamed: 0,country_or_area,country_id,status,join_date,area_km,current_account_balance,internet_hosts,telephone_main_lines_in_use,telephone_mobile_celular,population,population_rank,internet_users,internet_users_percentage,internet_user_rank
0,Afghanistan,AF,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
1,Angola,AO,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
2,Bangladesh,BD,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
3,Benin,BJ,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
4,Bhutan,BT,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
5,Burkina Faso,BF,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
6,Burundi,BI,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
7,Cambodia,KH,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
8,Central African Republic,CF,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
9,Chad,TD,1,9999-01-01,0,0,0,0,0,0,0,0,0,0


In [12]:
# Normalise the name of Sao Tome and Principe to plain ASCII.
# The name arrives with its accented letters garbled (presumably an encoding
# mismatch when the CSV is read — TODO confirm the file's encoding). An exact
# comparison against a literal full of replacement characters only works for
# that one particular mis-decoding, so each accented position is matched with a
# one-or-two character wildcard instead. This covers the garbled spelling, the
# correctly decoded spelling and the already-ASCII spelling.
# Rows that do not match, including nulls, keep their current value.
listleastdevelopedcountries = listleastdevelopedcountries.withColumn(
    "country_or_area",
    F.when(
        F.col("country_or_area").rlike(r"^S.{1,2}o Tom.{1,2} and Pr.{1,2}ncipe$"),
        "Sao Tome and Principe",
    ).otherwise(F.col("country_or_area")),
)

In [13]:
# Render the table through pandas to check the result of the country-name clean-up.
# toPandas() collects every row to the driver.
listleastdevelopedcountries.toPandas()

Unnamed: 0,country_or_area,country_id,status,join_date,area_km,current_account_balance,internet_hosts,telephone_main_lines_in_use,telephone_mobile_celular,population,population_rank,internet_users,internet_users_percentage,internet_user_rank
0,Afghanistan,AF,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
1,Angola,AO,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
2,Bangladesh,BD,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
3,Benin,BJ,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
4,Bhutan,BT,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
5,Burkina Faso,BF,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
6,Burundi,BI,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
7,Cambodia,KH,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
8,Central African Republic,CF,1,9999-01-01,0,0,0,0,0,0,0,0,0,0
9,Chad,TD,1,9999-01-01,0,0,0,0,0,0,0,0,0,0


In [14]:
# Give the columns their final types. Every column is still a string at this
# point (the CSV was read without schema inference and the placeholders were
# created as string literals), so each entry below is cast in turn.
# NOTE(review): area_km is the only placeholder that is not cast and therefore
# stays a string in the written table — confirm that this is intended.
column_types = [
    ("current_account_balance", IntegerType()),
    ("internet_hosts", IntegerType()),
    ("telephone_main_lines_in_use", IntegerType()),
    ("telephone_mobile_celular", IntegerType()),
    ("status", IntegerType()),
    ("join_date", DateType()),
    ("population", IntegerType()),
    ("population_rank", IntegerType()),
    ("internet_users", IntegerType()),
    ("internet_users_percentage", FloatType()),
    ("internet_user_rank", IntegerType()),
]
for column_name, target_type in column_types:
    listleastdevelopedcountries = listleastdevelopedcountries.withColumn(
        column_name,
        listleastdevelopedcountries[column_name].cast(target_type),
    )

# Fix the final column order of the output table.
output_columns = [
    "country_or_area",
    "area_km",
    "current_account_balance",
    "internet_hosts",
    "telephone_main_lines_in_use",
    "telephone_mobile_celular",
    "country_id",
    "status",
    "join_date",
    "population",
    "population_rank",
    "internet_users",
    "internet_users_percentage",
    "internet_user_rank",
]
listleastdevelopedcountries = listleastdevelopedcountries.select(*output_columns)

In [15]:
# Print the schema to confirm the column types and order after the casts.
listleastdevelopedcountries.printSchema()

root
 |-- country_or_area: string (nullable = true)
 |-- area_km: string (nullable = false)
 |-- current_account_balance: integer (nullable = true)
 |-- internet_hosts: integer (nullable = true)
 |-- telephone_main_lines_in_use: integer (nullable = true)
 |-- telephone_mobile_celular: integer (nullable = true)
 |-- country_id: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- join_date: date (nullable = true)
 |-- population: integer (nullable = true)
 |-- population_rank: integer (nullable = true)
 |-- internet_users: integer (nullable = true)
 |-- internet_users_percentage: float (nullable = true)
 |-- internet_user_rank: integer (nullable = true)



In [16]:
# Persist the typed table as Parquet in the warehouse location,
# replacing whatever was written there by a previous run.
(
    listleastdevelopedcountries.write
    .format("parquet")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/ListLeastDevelopedCountries/")
)