In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Default HDFS location under which managed databases and tables are stored.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build (or reuse) a Hive-enabled session: the warehouse directory comes from
# warehouse_location and the Hive metastore is reached over Thrift.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [21]:
# List the tables registered in the tabd_db database.
show_tables_query = """
    SHOW TABLES FROM tabd_db
    """
spark.sql(show_tables_query).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|           countries|      false|
| tabd_db|countryinternetusers|      false|
| tabd_db|dailyinternetusag...|      false|
| tabd_db|    internetjoindate|      false|
| tabd_db|listleastdevelope...|      false|
| tabd_db|listofcountriesby...|      false|
+--------+--------------------+-----------+



In [4]:
# Drop any previous definition first so this cell can be re-run safely.
drop_table_query = """
    DROP TABLE IF EXISTS tabd_db.ListOfCountriesByNumberOfInternetUsers
    """

# Declare the table as EXTERNAL, stored as Parquet, at an explicit HDFS location.
create_table_query = """
    CREATE EXTERNAL TABLE tabd_db.ListOfCountriesByNumberOfInternetUsers (
    country_or_area VARCHAR(45),
        area_km VARCHAR(45),
        current_account_balance INT,
        internet_hosts INT,
        telephone_main_lines_in_use INT,
        telephone_mobile_celular INT,
        country_id VARCHAR(45),
        status INT,
        join_date DATE,
        population INT,
        population_rank INT,
        internet_users INT,
        internet_users_percentage FLOAT,
        internet_user_rank INT
    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/ListOfCountriesByNumberOfInternetUsers/'
    """

spark.sql(drop_table_query)
spark.sql(create_table_query)

# NOTE: TBLPROPERTIES('skip.header.line.count'='1') can be used for Hive tables
# backed by CSV files that contain a header row.
# However, Spark SQL currently does not honor that property when reading the data through SQL queries,
# so when creating Hive tables backed by CSV files, store the files without a header row.
# This project does not use Hive text tables, so this limitation does not affect us.

DataFrame[]

In [23]:
# List the tables registered in the tabd_db database.
tables_df = spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
)
tables_df.show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|           countries|      false|
| tabd_db|countryinternetusers|      false|
| tabd_db|dailyinternetusag...|      false|
| tabd_db|    internetjoindate|      false|
| tabd_db|listleastdevelope...|      false|
| tabd_db|listofcountriesby...|      false|
+--------+--------------------+-----------+



In [24]:
# Let's look into HDFS

In [25]:
# Print the contents of the external table to the cell output.
select_all_query = """
    SELECT *
    FROM tabd_db.ListOfCountriesByNumberOfInternetUsers
    """
spark.sql(select_all_query).show()

+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+
|country_or_area|area_km|current_account_balance|internet_hosts|telephone_main_lines_in_use|telephone_mobile_celular|country_id|status|join_date|population|population_rank|internet_users|internet_users_percentage|internet_user_rank|
+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+
+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+



In [26]:
# Inspect the table's column definitions and metadata, rendered as a pandas DataFrame.
describe_query = """
    DESCRIBE FORMATTED tabd_db.ListOfCountriesByNumberOfInternetUsers
    """
spark.sql(describe_query).toPandas()

Unnamed: 0,col_name,data_type,comment
0,country_or_area,varchar(45),
1,area_km,varchar(45),
2,current_account_balance,int,
3,internet_hosts,int,
4,telephone_main_lines_in_use,int,
5,telephone_mobile_celular,int,
6,country_id,varchar(45),
7,status,int,
8,join_date,date,
9,population,int,


In [27]:
# Let's put the files into HDFS

In [3]:
# Read the whole table and render it as a pandas DataFrame.
# toPandas() collects every row to the driver.
countries_query = """
    SELECT *
    FROM tabd_db.ListOfCountriesByNumberOfInternetUsers
    """
spark.sql(countries_query).toPandas()

Unnamed: 0,country_or_area,area_km,current_account_balance,internet_hosts,telephone_main_lines_in_use,telephone_mobile_celular,country_id,status,join_date,population,population_rank,internet_users,internet_users_percentage,internet_user_rank
0,China,0,0,0,0,,,0,,1409517397,1.0,765367947.0,54.299999,116.0
1,India,0,0,0,0,,,0,,1339180127,2.0,461347554.0,34.450001,145.0
2,United States,0,0,0,0,,,0,,324459463,3.0,244090854.0,75.230003,68.0
3,Brazil,0,0,0,0,,,0,,209288278,4.0,141206801.0,67.470001,83.0
4,Japan,0,0,0,0,,,0,,127484450,5.0,115845120.0,90.870003,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,Falkland Islands,0,0,0,0,,,0,,2910,211.0,2881.0,99.019997,
211,Montserrat,0,0,0,0,,,0,,5177,212.0,2833.0,54.549999,115.0
212,Wallis and Futuna,0,0,0,0,,,0,,11773,213.0,1383.0,8.950000,
213,Niue,0,0,0,0,,,0,,1618,214.0,1034.0,86.900002,30.0


In [29]:


# Print the contents of the external table to the cell output.
countries_df = spark.sql(
    """
    SELECT *
    FROM tabd_db.ListOfCountriesByNumberOfInternetUsers
    """
)
countries_df.show()

+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+
|country_or_area|area_km|current_account_balance|internet_hosts|telephone_main_lines_in_use|telephone_mobile_celular|country_id|status|join_date|population|population_rank|internet_users|internet_users_percentage|internet_user_rank|
+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+
+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+



In [36]:
# Stop the Spark session now that the notebook's queries are done.
spark.stop()