In [23]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# HDFS directory used as the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build (or reuse) a Hive-enabled SparkSession that talks to the metastore
# over Thrift and uses the warehouse directory configured above.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [24]:
# Print the tables currently registered in the tabd_db database.
show_tables_sql = """
    SHOW TABLES FROM tabd_db
    """
spark.sql(show_tables_sql).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|countryinternetusers|      false|
| tabd_db|dailyinternetusag...|      false|
| tabd_db|   deviceusageimpact|      false|
| tabd_db|enterprisesocialm...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagebyco...|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...|      false|
| tabd_db|  socialnetworkusers|      false|
+--------+--------------------+-----------+



In [25]:
# Dropping first makes this cell safe to re-run: the CREATE below would fail
# if the table were still registered in the metastore.
# NOTE(review): "County" in the table name looks like a typo for "Country";
# it is left unchanged because other code may already reference this name.
drop_table_sql = """
    DROP TABLE IF EXISTS tabd_db.InternetUsageByCountyLevelDevelopment
    """

# External Parquet table whose files live at the explicit HDFS LOCATION below.
# NOTE(review): the path uses "tabd.db" while the database is "tabd_db" —
# confirm this is the intended directory.
# NOTE(review): the outputs recorded later in this notebook list `country` and
# `year` after the other columns, which differs from the order declared here;
# possibly left over from a run with a different (e.g. partitioned) DDL — confirm.
create_table_sql = """
    CREATE EXTERNAL TABLE tabd_db.InternetUsageByCountyLevelDevelopment (
        year INT,
        country VARCHAR(45),
        internet_users INT,
        population INT,
        population_rank INT,
        internet_users_percentage FLOAT,
        internet_users_rank INT,
        status INT,
        join_date DATE,
        usage_rate FLOAT
    )
    STORED AS PARQUET
    
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/InternetUsageByCountyLevelDevelopment/'
    """

spark.sql(drop_table_sql)

# Kept as the final expression so the cell still displays the (empty) result.
spark.sql(create_table_sql)

DataFrame[]

In [26]:
# List the tables of tabd_db again, after the DROP/CREATE in the previous cell.
show_tables_sql = """
    SHOW TABLES FROM tabd_db
    """
spark.sql(show_tables_sql).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|countryinternetusers|      false|
| tabd_db|dailyinternetusag...|      false|
| tabd_db|   deviceusageimpact|      false|
| tabd_db|enterprisesocialm...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagebyco...|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...|      false|
| tabd_db|  socialnetworkusers|      false|
+--------+--------------------+-----------+



In [27]:
# Preview the table's rows; show() prints at most 20 rows by default.
select_all_sql = """
    SELECT *
    FROM tabd_db.InternetUsageByCountyLevelDevelopment
    """
spark.sql(select_all_sql).show()

+--------------+----------+---------------+-------------------------+-------------------+------+---------+----------+-------+----+
|internet_users|population|population_rank|internet_users_percentage|internet_users_rank|status|join_date|usage_rate|country|year|
+--------------+----------+---------------+-------------------------+-------------------+------+---------+----------+-------+----+
+--------------+----------+---------------+-------------------------+-------------------+------+---------+----------+-------+----+



In [28]:
# Fetch the table description and convert it to a pandas DataFrame so the
# notebook renders it as a rich table.
describe_sql = """
    DESCRIBE FORMATTED tabd_db.InternetUsageByCountyLevelDevelopment
    """
spark.sql(describe_sql).toPandas()

Unnamed: 0,col_name,data_type,comment
0,internet_users,int,
1,population,int,
2,population_rank,int,
3,internet_users_percentage,float,
4,internet_users_rank,int,
5,status,int,
6,join_date,date,
7,usage_rate,float,
8,country,varchar(45),
9,year,int,


In [29]:
# Pull a bounded preview of the table into pandas for rich notebook display.
# toPandas() collects every row of the DataFrame into driver memory, so the
# row count is capped first (20 mirrors the default of DataFrame.show())
# rather than materialising the whole table once it has been populated.
spark.sql(
    """
    SELECT *
    FROM tabd_db.InternetUsageByCountyLevelDevelopment
    """
).limit(20).toPandas()

Unnamed: 0,internet_users,population,population_rank,internet_users_percentage,internet_users_rank,status,join_date,usage_rate,country,year


In [30]:


# Preview the table's rows once more (same query as the earlier show() cell);
# show() prints at most 20 rows by default.
select_all_sql = """
    SELECT *
    FROM tabd_db.InternetUsageByCountyLevelDevelopment
    """
spark.sql(select_all_sql).show()

+--------------+----------+---------------+-------------------------+-------------------+------+---------+----------+-------+----+
|internet_users|population|population_rank|internet_users_percentage|internet_users_rank|status|join_date|usage_rate|country|year|
+--------------+----------+---------------+-------------------------+-------------------+------+---------+----------+-------+----+
+--------------+----------+---------------+-------------------------+-------------------+------+---------+----------+-------+----+



In [31]:
# Stop the SparkSession created at the top of the notebook now that all
# queries have run.
spark.stop()