In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# HDFS directory used as the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build (or reuse) a Hive-enabled SparkSession that talks to the metastore
# exposed over thrift and uses the warehouse directory configured above.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# List the tables currently registered in the tabd_db database.
spark.sql("""
    SHOW TABLES FROM tabd_db
    """).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|           countries|      false|
| tabd_db|countryinternetusers|      false|
| tabd_db|dailyinternetusag...|      false|
| tabd_db|    internetjoindate|      false|
| tabd_db|listleastdevelope...|      false|
| tabd_db|listofcountriesby...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausebyp...|      false|
| tabd_db|socialmediausebyt...|      false|
+--------+--------------------+-----------+



In [3]:
# Recreate the external Parquet table: drop any previous definition first so
# this cell can be re-run, then declare the schema and its HDFS location.
#
# Side note on Hive text tables: tblproperties('skip.header.line.count'='1')
# can be used for CSV files that carry a header row, but Spark SQL does not
# currently understand it when reading the data through SQL queries. So when
# creating Hive tables backed by CSV files, avoid headers. This project does
# not use Hive text tables, so the limitation does not apply here.
spark.sql("""
    DROP TABLE IF EXISTS tabd_db.P_CountryInternetUsers
    """)

# NOTE(review): the LOCATION directory is named tabd.db while the database is
# tabd_db -- confirm this is the intended path for the data files.
spark.sql("""
    CREATE EXTERNAL TABLE tabd_db.P_CountryInternetUsers (
    country_or_area VARCHAR(45),
        area_km VARCHAR(45),
        current_account_balance INT,
        internet_hosts INT,
        telephone_main_lines_in_use INT,
        telephone_mobile_celular INT,
        country_id VARCHAR(45),
        status INT,
        join_date DATE,
        population INT,
        population_rank INT,
        internet_users INT,
        internet_users_percentage FLOAT,
        internet_user_rank INT
    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/P_CountryInternetUsers/'
    """)

DataFrame[]

In [4]:
# List the tables of tabd_db again to confirm the new table is registered.
spark.sql("""
    SHOW TABLES FROM tabd_db
    """).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|           countries|      false|
| tabd_db|countryinternetusers|      false|
| tabd_db|dailyinternetusag...|      false|
| tabd_db|    internetjoindate|      false|
| tabd_db|listleastdevelope...|      false|
| tabd_db|listofcountriesby...|      false|
| tabd_db|p_countryinternet...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausebyp...|      false|
| tabd_db|socialmediausebyt...|      false|
+--------+--------------------+-----------+



In [5]:
# Let's look into HDFS

In [6]:
# Print the contents of the newly created table.
spark.sql("""
    SELECT *
    FROM tabd_db.P_CountryInternetUsers
    """).show()

+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+
|country_or_area|area_km|current_account_balance|internet_hosts|telephone_main_lines_in_use|telephone_mobile_celular|country_id|status|join_date|population|population_rank|internet_users|internet_users_percentage|internet_user_rank|
+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+
+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+



In [7]:
# Show the table's column and metadata description as a pandas DataFrame.
spark.sql("""
    DESCRIBE FORMATTED tabd_db.P_CountryInternetUsers
    """).toPandas()

Unnamed: 0,col_name,data_type,comment
0,country_or_area,varchar(45),
1,area_km,varchar(45),
2,current_account_balance,int,
3,internet_hosts,int,
4,telephone_main_lines_in_use,int,
5,telephone_mobile_celular,int,
6,country_id,varchar(45),
7,status,int,
8,join_date,date,
9,population,int,


In [8]:
# Let's put the Parquet data files into the table's LOCATION directory in HDFS

In [9]:
# This table was already queried earlier in the session, before the data files
# were put into HDFS. Spark SQL caches the metadata (including the file listing)
# of Parquet-backed metastore tables, so files added by an external tool are not
# picked up until the cached entry is invalidated. Refresh the table first so
# the SELECT below sees the files that are now in its LOCATION.
spark.sql(
    """
    REFRESH TABLE tabd_db.P_CountryInternetUsers
    """
)

# Read the table back; toPandas() collects the full result on the driver, which
# is only appropriate because the result is expected to be small.
spark.sql(
    """
    SELECT *
    FROM tabd_db.P_CountryInternetUsers
    """
).toPandas()

Unnamed: 0,country_or_area,area_km,current_account_balance,internet_hosts,telephone_main_lines_in_use,telephone_mobile_celular,country_id,status,join_date,population,population_rank,internet_users,internet_users_percentage,internet_user_rank


In [10]:


# Print the table contents once more in Spark's tabular text format.
spark.sql("""
    SELECT *
    FROM tabd_db.P_CountryInternetUsers
    """).show()

+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+
|country_or_area|area_km|current_account_balance|internet_hosts|telephone_main_lines_in_use|telephone_mobile_celular|country_id|status|join_date|population|population_rank|internet_users|internet_users_percentage|internet_user_rank|
+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+
+---------------+-------+-----------------------+--------------+---------------------------+------------------------+----------+------+---------+----------+---------------+--------------+-------------------------+------------------+



In [23]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()