In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# spark = SparkSession.builder.master('local[2]').getOrCreate()

from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [10]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...|      false|
| tabd_db|socialmediausebyp...| 

In [11]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.DistributionGlobalSocialMediaUsersByRegion2020
    
    """
).show()

+--------------------+------------------+----+-----------------------------+-----------------------------+
|              region|social_media_users|year|mobile_penetration_percentage|social_media_users_percentage|
+--------------------+------------------+----+-----------------------------+-----------------------------+
|        Eastern Asia|               0.0|2020|                            0|                         29.6|
|      Southeast Asia|               0.0|2020|                            0|                         13.4|
|       Southern Asia|               0.0|2020|                            0|                         13.1|
|    Northern America|               0.0|2020|                            0|                          9.2|
|       South America|               0.0|2020|                            0|                          7.6|
|Central & Western...|               0.0|2020|                            0|                          5.5|
|        Western Asia|               

In [12]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.GlobalMobilePenetration
    
    """
).show()

+----------------+------------------+----+-----------------------------+-----------------------------+
|          region|social_media_users|year|mobile_penetration_percentage|social_media_users_percentage|
+----------------+------------------+----+-----------------------------+-----------------------------+
|    Eastern Asia|               0.0|2020|                           70|                          0.0|
|Northern America|               0.0|2020|                           61|                          0.0|
|   South America|               0.0|2020|                           61|                          0.0|
| Northern Europe|               0.0|2020|                           59|                          0.0|
| Central America|               0.0|2020|                           59|                          0.0|
|  Southeast Asia|               0.0|2020|                           56|                          0.0|
|         Oceania|               0.0|2020|                           51| 

In [13]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.NumberOfWorldwideSocialMediaUse
    
    """
).show()

+--------------------+------------------+----+-----------------------------+-----------------------------+
|              region|social_media_users|year|mobile_penetration_percentage|social_media_users_percentage|
+--------------------+------------------+----+-----------------------------+-----------------------------+
|        Eastern Asia|           1065.88|2020|                            0|                          0.0|
|      Southeast Asia|            482.73|2020|                            0|                          0.0|
|       Southern Asia|            470.01|2020|                            0|                          0.0|
|    Northern America|            329.25|2020|                            0|                          0.0|
|       South America|            274.22|2020|                            0|                          0.0|
|Central & Western...|            198.07|2020|                            0|                          0.0|
|        Western Asia|            158

In [14]:
GlobalSocialMediaUsage = spark.sql(
    """
    
    SELECT region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage
    
    FROM (
      SELECT region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage FROM tabd_db.DistributionGlobalSocialMediaUsersByRegion2020 
      UNION ALL 
      SELECT region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage FROM tabd_db.GlobalMobilePenetration
      UNION ALL
      SELECT region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage FROM tabd_db.NumberOfWorldwideSocialMediaUse
     ) 
     
     GROUP BY region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage
        
    """
)

In [15]:
spark.sql(
    """
    
    SELECT region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage
    
    FROM (
      SELECT region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage FROM tabd_db.DistributionGlobalSocialMediaUsersByRegion2020 
      UNION ALL 
      SELECT region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage FROM tabd_db.GlobalMobilePenetration
      UNION ALL
      SELECT region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage FROM tabd_db.NumberOfWorldwideSocialMediaUse
     ) 
     
     GROUP BY region, social_media_users, year, mobile_penetration_percentage, social_media_users_percentage
        
    """
).toPandas()

Unnamed: 0,region,social_media_users,year,mobile_penetration_percentage,social_media_users_percentage
0,Global average,0.0,2020,42,0.0
1,Southeast Asia,482.73,2020,0,0.0
2,Southern Asia,470.01,2020,0,0.0
3,Northern Africa,106.48,2020,0,0.0
4,Southern Europe,143.27,2020,0,0.0
5,Western Asia,0.0,2020,0,4.4
6,Southern Asia,0.0,2020,0,13.1
7,South America,0.0,2020,61,0.0
8,Central Asia,13.93,2020,0,0.0
9,Central America,29.66,2020,0,0.0


In [16]:
GlobalSocialMediaUsage \
    .write \
    .format("parquet") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/GlobalSocialMediaUsage/")