In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [11]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|distributiongloba...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...|      false|
| tabd_db|socialmediausebyp...|      false|
| tabd_db|socialmediausebyt...|      false|
| tabd_db|socialmediausersa...|      false|
| tabd_db|  socialnetworkusers|      false|
+--------+--------------------+-

In [12]:
spark.sql(
    """
    DROP TABLE IF EXISTS tabd_db.DistributionGlobalSocialMediaUsersByRegion2020 
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE tabd_db.DistributionGlobalSocialMediaUsersByRegion2020  (
        region VARCHAR(45),
        social_media_users DOUBLE,
        year INT,
        mobile_penetration_percentage INT,
        social_media_users_percentage FLOAT
    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/DistributionGlobalSocialMediaUsersByRegion2020/'
    """
)

DataFrame[]

In [13]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|distributiongloba...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...|      false|
| tabd_db|socialmediausebyp...|      false|
| tabd_db|socialmediausebyt...|      false|
| tabd_db|socialmediausersa...|      false|
| tabd_db|  socialnetworkusers|      false|
+--------+--------------------+-

In [14]:
# Let's look into HDFS
spark.sql(
    """
    SELECT *
    FROM tabd_db.DistributionGlobalSocialMediaUsersByRegion2020
    """
).show()

+--------------------+------------------+----+-----------------------------+-----------------------------+
|              region|social_media_users|year|mobile_penetration_percentage|social_media_users_percentage|
+--------------------+------------------+----+-----------------------------+-----------------------------+
|        Eastern Asia|               0.0|2020|                            0|                         29.6|
|      Southeast Asia|               0.0|2020|                            0|                         13.4|
|       Southern Asia|               0.0|2020|                            0|                         13.1|
|    Northern America|               0.0|2020|                            0|                          9.2|
|       South America|               0.0|2020|                            0|                          7.6|
|Central & Western...|               0.0|2020|                            0|                          5.5|
|        Western Asia|               

In [15]:
spark.sql(
    """
    DESCRIBE FORMATTED tabd_db.DistributionGlobalSocialMediaUsersByRegion2020
    """
).toPandas()

Unnamed: 0,col_name,data_type,comment
0,region,varchar(45),
1,social_media_users,double,
2,year,int,
3,mobile_penetration_percentage,int,
4,social_media_users_percentage,float,
5,,,
6,# Detailed Table Information,,
7,Database,tabd_db,
8,Table,distributionglobalsocialmediausersbyregion2020,
9,Owner,jovyan,


In [16]:
# Let's put the files into HDFS

spark.sql(
    """
    SELECT *
    FROM tabd_db.DistributionGlobalSocialMediaUsersByRegion2020
    """
).toPandas()

Unnamed: 0,region,social_media_users,year,mobile_penetration_percentage,social_media_users_percentage
0,Eastern Asia,0.0,2020,0,29.6
1,Southeast Asia,0.0,2020,0,13.4
2,Southern Asia,0.0,2020,0,13.1
3,Northern America,0.0,2020,0,9.2
4,South America,0.0,2020,0,7.6
5,Central & Western Europe,0.0,2020,0,5.5
6,Western Asia,0.0,2020,0,4.4
7,Southern Europe,0.0,2020,0,4.0
8,Eastern Europe,0.0,2020,0,3.3
9,Northern Africa,0.0,2020,0,3.0


In [17]:
# recover partitions is needed so that the Hive Metastore (Catalog)
# is updated. Otherwise Hive and the querying engines do not know
# that there are new parittions in the partitioned table.

spark.sql(
    """
    SELECT *
    FROM tabd_db.DistributionGlobalSocialMediaUsersByRegion2020
    """
).show()

+--------------------+------------------+----+-----------------------------+-----------------------------+
|              region|social_media_users|year|mobile_penetration_percentage|social_media_users_percentage|
+--------------------+------------------+----+-----------------------------+-----------------------------+
|        Eastern Asia|               0.0|2020|                            0|                         29.6|
|      Southeast Asia|               0.0|2020|                            0|                         13.4|
|       Southern Asia|               0.0|2020|                            0|                         13.1|
|    Northern America|               0.0|2020|                            0|                          9.2|
|       South America|               0.0|2020|                            0|                          7.6|
|Central & Western...|               0.0|2020|                            0|                          5.5|
|        Western Asia|               

In [18]:
spark.stop()