In [34]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [35]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...| 

In [36]:
spark.sql(
    """
    DROP TABLE IF EXISTS tabd_db.SocialMediaUseByPurpose 
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE tabd_db.SocialMediaUseByPurpose  (
        year INT,
        region VARCHAR(45),
        percentage INT,
        advertising_type VARCHAR(45),
        purpose VARCHAR(45)
    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/SocialMediaUseByPurpose/'
    """
)

# tblproperties('skip.header.line.count'='1')
# can be used for csvs with header
# but spark sql cannot understand that at the moment, while reading the data using sql queries
# so just when creating hive tables backed up by csvs, avoid headers
# in the project we wont use hive text tables, so all good.

DataFrame[]

In [37]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...| 

In [38]:
# Let's look into HDFS

In [39]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.SocialMediaUseByPurpose
    """
).show()

+----+------+----------+----------------+-------+
|year|region|percentage|advertising_type|purpose|
+----+------+----------+----------------+-------+
+----+------+----------+----------------+-------+



In [40]:
spark.sql(
    """
    DESCRIBE FORMATTED tabd_db.SocialMediaUseByPurpose
    """
).toPandas()

Unnamed: 0,col_name,data_type,comment
0,year,int,
1,region,varchar(45),
2,percentage,int,
3,advertising_type,varchar(45),
4,purpose,varchar(45),
5,,,
6,# Detailed Table Information,,
7,Database,tabd_db,
8,Table,socialmediausebypurpose,
9,Owner,jovyan,


In [41]:
# Let's put the files into HDFS

In [42]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.SocialMediaUseByPurpose
    """
).toPandas()

Unnamed: 0,year,region,percentage,advertising_type,purpose


In [43]:
# recover partitions is needed so that the Hive Metastore (Catalog)
# is updated. Otherwise Hive and the querying engines do not know
# that there are new parittions in the partitioned table.

spark.sql(
    """
    SELECT *
    FROM tabd_db.SocialMediaUseByPurpose
    """
).show()

+----+------+----------+----------------+-------+
|year|region|percentage|advertising_type|purpose|
+----+------+----------+----------------+-------+
+----+------+----------+----------------+-------+



In [44]:
spark.stop()