In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# spark = SparkSession.builder.master('local[2]').getOrCreate()

from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [2]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...| 

In [3]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.SocialMediaUseByTypeInternetAdvertising
    
    """
).show()

+----+--------------------+----------+--------------------+-------+
|year|              region|percentage|    advertising_type|purpose|
+----+--------------------+----------+--------------------+-------+
|2013|European Union fr...|         0|Pay to advertise ...|   null|
|2013|European Union fr...|         0|Enterprises using...|   null|
|2013|European Union fr...|         0|Have a website an...|   null|
|2013|European Union fr...|         0|Have web sales to...|   null|
|2013|European Union fr...|         0|Use social networ...|   null|
|2013|European Union fr...|         0|Use enterprise bl...|   null|
|2013|European Union fr...|         0|Use multimedia co...|   null|
|2013|European Union fr...|         0|Use wiki based kn...|   null|
|2013|European Union fr...|         0|Use any social media|   null|
|2013|European Union fr...|         0|Use only one type...|   null|
|2013|European Union fr...|         0|Use two or more s...|   null|
|2013|European Union fr...|         0|Have a Web

In [4]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.SocialMediaUseByPurpose
    """
).show()

+----+--------------------+----------+----------------+--------------------+
|year|              region|percentage|advertising_type|             purpose|
+----+--------------------+----------+----------------+--------------------+
|2013|European Union fr...|         5|            null|Website has onlin...|
|2015|European Union fr...|         0|            null|Website has onlin...|
|2017|European Union fr...|         0|            null|Website has onlin...|
|2019|European Union fr...|         0|            null|Website has onlin...|
|2013|European Union fr...|        21|            null|Develop the enter...|
|2015|European Union fr...|        29|            null|Develop the enter...|
|2017|European Union fr...|        38|            null|Develop the enter...|
|2019|European Union fr...|        43|            null|Develop the enter...|
|2013|European Union fr...|        14|            null|Obtain or respond...|
|2015|European Union fr...|        19|            null|Obtain or respond...|

In [5]:
EnterpriseSocialMediaUse = spark.sql(
    """
    
    SELECT tabd_db.SocialMediaUseByTypeInternetAdvertising.year, tabd_db.SocialMediaUseByTypeInternetAdvertising.region,
           tabd_db.SocialMediaUseByTypeInternetAdvertising.percentage, 
           tabd_db.SocialMediaUseByTypeInternetAdvertising.advertising_type, 
           tabd_db.SocialMediaUseByPurpose.purpose
    
    FROM (tabd_db.SocialMediaUseByTypeInternetAdvertising 
    LEFT JOIN tabd_db.SocialMediaUseByPurpose 
    ON (tabd_db.SocialMediaUseByTypeInternetAdvertising.year = tabd_db.SocialMediaUseByPurpose.year and 
    tabd_db.SocialMediaUseByTypeInternetAdvertising.region = tabd_db.SocialMediaUseByPurpose.region and
    tabd_db.SocialMediaUseByTypeInternetAdvertising.percentage = tabd_db.SocialMediaUseByPurpose.percentage))
    
      
    """
)

In [6]:
spark.sql(
    """
    
    SELECT tabd_db.SocialMediaUseByTypeInternetAdvertising.year, tabd_db.SocialMediaUseByTypeInternetAdvertising.region,
           tabd_db.SocialMediaUseByTypeInternetAdvertising.percentage, 
           tabd_db.SocialMediaUseByTypeInternetAdvertising.advertising_type, 
           tabd_db.SocialMediaUseByPurpose.purpose
    
    FROM (tabd_db.SocialMediaUseByTypeInternetAdvertising 
    LEFT JOIN tabd_db.SocialMediaUseByPurpose 
    ON (tabd_db.SocialMediaUseByTypeInternetAdvertising.year = tabd_db.SocialMediaUseByPurpose.year and 
    tabd_db.SocialMediaUseByTypeInternetAdvertising.region = tabd_db.SocialMediaUseByPurpose.region and
    tabd_db.SocialMediaUseByTypeInternetAdvertising.percentage = tabd_db.SocialMediaUseByPurpose.percentage))
        
    """
).toPandas()

Unnamed: 0,year,region,percentage,advertising_type,purpose
0,2013,European Union from 2020,0,Pay to advertise on the internet,
1,2013,European Union from 2020,0,Enterprises using information about visitors b...,
2,2013,European Union from 2020,0,Have a website and pay to advertise on the int...,
3,2013,European Union from 2020,0,Have web sales to private consumers (B2C) and ...,
4,2013,European Union from 2020,0,"Use social networks (Facebook, LinkedIn, Xing,...",
...,...,...,...,...,...
10287,2019,Bosnia and Herzegovina,0,"Use any social media, only for posting paid ad...",Use two or more social media for more than one...
10288,2019,Bosnia and Herzegovina,0,"Use any social media, only for posting paid ad...",Use only one type of social medium for more th...
10289,2019,Bosnia and Herzegovina,0,"Use any social media, only for posting paid ad...",Use two or more social media for only one purp...
10290,2019,Bosnia and Herzegovina,0,"Use any social media, only for posting paid ad...",Use only one type of social medium for only on...


In [7]:
EnterpriseSocialMediaUse \
    .write \
    .format("parquet") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/P_EnterpriseSocialMediaUse/")