In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# HDFS directory used as the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build (or reuse, via getOrCreate) a Hive-enabled SparkSession that talks to
# the metastore at the configured thrift URI.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# List the tables currently registered in the tabd_db database
# (state before the DeviceUsageImpact table is created).
tables_before_df = spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
)
tables_before_df.show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|dailyinternetusag...|      false|
| tabd_db|enterprisesocialm...|      false|
| tabd_db|  socialnetworkusers|      false|
+--------+--------------------+-----------+



In [3]:
# Recreate tabd_db.DeviceUsageImpact from scratch: drop any existing definition,
# then register it again as an external Parquet table at a fixed HDFS location.
#
# NOTE(review): the outputs recorded further down in this notebook (SELECT * and
# DESCRIBE FORMATTED) list `year` as a trailing partition column, which this DDL
# does not declare -- those outputs look stale; re-run the notebook to confirm.
drop_device_usage_sql = """
    DROP TABLE IF EXISTS tabd_db.DeviceUsageImpact
    """

create_device_usage_sql = """
    CREATE EXTERNAL TABLE tabd_db.DeviceUsageImpact (
        year INT,
        mobile_usage INT,
        desktop_usage INT,
        impact FLOAT
    )
    STORED AS PARQUET

    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/DeviceUsageImpact/'
    """

spark.sql(drop_device_usage_sql)

# Kept as the last bare expression so the cell still echoes the (empty) result.
spark.sql(create_device_usage_sql)

# Note: tblproperties('skip.header.line.count'='1') can be used for Hive tables
# backed by CSV files that contain a header row.
# However, Spark SQL currently does not honor that property when reading the data through SQL queries,
# so when creating Hive tables backed by CSVs, use files without headers.
# In this project we won't use Hive text tables, so this is not a concern here.

DataFrame[]

In [4]:
# List the tables of tabd_db again, after the CREATE EXTERNAL TABLE statement,
# to check that the new table is now registered.
tables_after_df = spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
)
tables_after_df.show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|dailyinternetusag...|      false|
| tabd_db|   deviceusageimpact|      false|
| tabd_db|enterprisesocialm...|      false|
| tabd_db|  socialnetworkusers|      false|
+--------+--------------------+-----------+



In [5]:
# Let's look into HDFS

In [6]:
# Read back every row and column of the freshly created table and print it.
device_usage_df = spark.sql(
    """
    SELECT *
    FROM tabd_db.DeviceUsageImpact
    """
)
device_usage_df.show()

+------------+-------------+------+----+
|mobile_usage|desktop_usage|impact|year|
+------------+-------------+------+----+
+------------+-------------+------+----+



In [7]:
# Fetch the table's metadata (columns plus detailed table information) and
# convert it to pandas so the notebook renders it as a rich table.
table_details_df = spark.sql(
    """
    DESCRIBE FORMATTED tabd_db.DeviceUsageImpact
    """
)
table_details_df.toPandas()

Unnamed: 0,col_name,data_type,comment
0,mobile_usage,int,
1,desktop_usage,int,
2,impact,float,
3,year,int,
4,# Partition Information,,
5,# col_name,data_type,comment
6,year,int,
7,,,
8,# Detailed Table Information,,
9,Database,tabd_db,


In [8]:
# Let's put the files into HDFS

In [9]:
# The data files are placed into the table's HDFS location outside of Spark,
# after the table has already been queried in this session. Spark caches the
# metadata of Hive-metastore Parquet tables, so invalidate that cache first;
# otherwise the query below may keep returning the old (empty) contents.
# NOTE(review): if the table is partitioned, newly added partition directories
# must also be registered in the metastore (e.g. MSCK REPAIR TABLE) -- confirm
# against the table definition.
spark.catalog.refreshTable("tabd_db.DeviceUsageImpact")

# Read the full table back and render it as a pandas DataFrame.
spark.sql(
    """
    SELECT *
    FROM tabd_db.DeviceUsageImpact
    """
).toPandas()

Unnamed: 0,mobile_usage,desktop_usage,impact,year


In [10]:


# Same full-table read as before, printed with Spark's text table formatter.
device_usage_after_df = spark.sql(
    """
    SELECT *
    FROM tabd_db.DeviceUsageImpact
    """
)
device_usage_after_df.show()

+------------+-------------+------+----+
|mobile_usage|desktop_usage|impact|year|
+------------+-------------+------+----+
+------------+-------------+------+----+



In [11]:
# Stop the SparkSession that was created in the first cell of this notebook.
spark.stop()