In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# spark = SparkSession.builder.master('local[2]').getOrCreate()

from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [2]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|           countries|      false|
| tabd_db|countryinternetusers|      false|
| tabd_db|dailyinternetusag...|      false|
| tabd_db|    internetjoindate|      false|
| tabd_db|listleastdevelope...|      false|
| tabd_db|listofcountriesby...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausebyp...|      false|
| tabd_db|socialmediausebyt...|      false|
+--------+--------------------+-----------+



In [7]:
spark.sql(
    """
    
    REFRESH TABLE tabd_db.P_CountryInternetUsers
    
    """
)
spark.sql(
    """
    SELECT *
    FROM tabd_db.P_CountryInternetUsers
    
    """
).toPandas()

Unnamed: 0,country_or_area,area_km,current_account_balance,internet_hosts,telephone_main_lines_in_use,telephone_mobile_celular,country_id,status,join_date,population,population_rank,internet_users,internet_users_percentage,internet_user_rank
0,France,547030,-305000000.0,2396761.0,33905400.0,41683100.0,,,,64979548,13.0,52308536.0,80.500000,50.0
1,Nigeria,923768,0.0,1142.0,853100.0,3149500.0,,,,190015955,12.0,30557175.0,27.510000,161.0
2,Malaysia,329750,0.0,107971.0,4571600.0,11124100.0,,,,31624264,30.0,25343685.0,80.139999,52.0
3,Yemen,527970,369900000.0,138.0,542200.0,411100.0,,1.0,,28250420,62.0,7548512.0,26.719999,164.0
4,Burkina Faso,274200,-471700000.0,442.0,65400.0,227000.0,BF,1.0,1993-03-29,19193382,101.0,3047909.0,15.880000,183.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,Maldives,,,,,,,,,436330,169.0,275717.0,63.189999,95.0
211,Monaco,,,,,,,,,38695,199.0,37553.0,97.050003,10.0
212,Brunei,,,,,,,,,428697,157.0,406705.0,94.870003,16.0
213,Tonga,,,,,,,,,108020,196.0,44558.0,41.250000,136.0


In [8]:
spark.sql(
    """
    
    REFRESH TABLE tabd_db.P_EnterpriseSocialMediaUse
    
    """
)


spark.sql(
    """
    SELECT *
    FROM tabd_db.P_EnterpriseSocialMediaUse
    
    """
).toPandas()

Unnamed: 0,year,region,percentage,advertising_type,purpose
0,2013,European Union from 2020,0,Pay to advertise on the internet,
1,2013,European Union from 2020,0,Enterprises using information about visitors b...,
2,2013,European Union from 2020,0,Have a website and pay to advertise on the int...,
3,2013,European Union from 2020,0,Have web sales to private consumers (B2C) and ...,
4,2013,European Union from 2020,0,"Use social networks (Facebook, LinkedIn, Xing,...",
...,...,...,...,...,...
10287,2019,Bosnia and Herzegovina,0,"Use any social media, only for posting paid ad...",Use two or more social media for more than one...
10288,2019,Bosnia and Herzegovina,0,"Use any social media, only for posting paid ad...",Use only one type of social medium for more th...
10289,2019,Bosnia and Herzegovina,0,"Use any social media, only for posting paid ad...",Use two or more social media for only one purp...
10290,2019,Bosnia and Herzegovina,0,"Use any social media, only for posting paid ad...",Use only one type of social medium for only on...


In [36]:
spark.sql(
    """
    DROP TABLE IF EXISTS tabd_db.D_SocialMediaUsageForEnterprisesLevelDevelopment
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE tabd_db.D_SocialMediaUsageForEnterprisesLevelDevelopment (
    country_or_area VARCHAR(45),
    population_rank INT,
    internet_user_rank INT,
    purpose VARCHAR(45),
    ratio_percentage_enterprise DOUBLE,
    internet_users_percentage FLOAT,
    avg_internet_users_region DOUBLE,
    ratio_internet_users_region DOUBLE
      )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/D_SocialMediaUsageForEnterprisesLevelDevelopment/'
    """
)

DataFrame[]

In [38]:
spark.sql(
    """
    
    
    SELECT A.country_or_area,
 A.population_rank, 
 A.internet_user_rank,
A.purpose,
 ((E.total_percentage_region/F.total_percentage)*100) as ratio_percentage_enterprise,
 A.internet_users_percentage,
 B.avg_internet_users_region,
   ((C.total_internet_users_region/D.total_internet_users)*100) as ratio_internet_users_region
   
    
    FROM
   (SELECT DISTINCT tabd_db.P_EnterpriseSocialMediaUse.percentage,
 tabd_db.P_EnterpriseSocialMediaUse.purpose,
 tabd_db.P_CountryInternetUsers.country_or_area,
 tabd_db.P_CountryInternetUsers.internet_users_percentage,
 tabd_db.P_CountryInternetUsers.population_rank,
  tabd_db.P_CountryInternetUsers.internet_user_rank
 
 
  FROM (tabd_db.P_CountryInternetUsers 
    INNER JOIN tabd_db.P_EnterpriseSocialMediaUse 
    ON tabd_db.P_CountryInternetUsers.country_or_area = tabd_db.P_EnterpriseSocialMediaUse.region)) as A
    
    INNER JOIN 
    
    (SELECT DISTINCT country_or_area, AVG(internet_users_percentage) AS avg_internet_users_region
    FROM tabd_db.P_CountryInternetUsers   GROUP BY country_or_area
         ) as B 
    ON B.country_or_area = A.country_or_area

    INNER JOIN 
    
     (SELECT DISTINCT country_or_area, SUM(internet_users_percentage) AS total_internet_users_region
    FROM tabd_db.P_CountryInternetUsers
    GROUP BY country_or_area) as C
    ON C.country_or_area = A.country_or_area
    
    INNER JOIN 
(SELECT DISTINCT SUM(internet_users_percentage) AS total_internet_users
    FROM tabd_db.P_CountryInternetUsers) as D
    
    INNER JOIN 
    
     (SELECT DISTINCT region, SUM(percentage) AS total_percentage_region
    FROM tabd_db.P_EnterpriseSocialMediaUse
    GROUP BY region) as E
    ON E.region = A.country_or_area
    
    INNER JOIN 
(SELECT DISTINCT SUM(percentage) AS total_percentage
    FROM tabd_db.P_EnterpriseSocialMediaUse) as F
    
    """
).toPandas()

Unnamed: 0,country_or_area,population_rank,internet_user_rank,purpose,ratio_percentage_enterprise,internet_users_percentage,avg_internet_users_region,ratio_internet_users_region
0,Sweden,51,12,,3.397165,96.410004,96.410004,0.805718
1,Bosnia and Herzegovina,109,80,,0.556084,69.489998,69.489998,0.580742
2,Finland,83,29,,3.427497,87.470001,87.470001,0.731004
3,Latvia,125,45,,1.937193,81.320000,81.320000,0.679608
4,Serbia,75,76,,1.466038,70.330002,70.330002,0.587762
...,...,...,...,...,...,...,...,...
1290,Iceland,164,3,Use only one type of social medium for more th...,2.068631,98.260002,98.260002,0.821178
1291,Hungary,63,60,Involve customers in development or innovation...,2.007967,76.750000,76.750000,0.641415
1292,Estonia,133,26,,2.337573,88.099998,88.099998,0.736269
1293,Latvia,125,45,,1.937193,81.320000,81.320000,0.679608


In [39]:
countryusers = spark.sql(
      """
    
    
    SELECT A.country_or_area,
 A.population_rank, 
 A.internet_user_rank,
A.purpose,
 ((E.total_percentage_region/F.total_percentage)*100) as ratio_percentage_enterprise,
 A.internet_users_percentage,
 B.avg_internet_users_region,
   ((C.total_internet_users_region/D.total_internet_users)*100) as ratio_internet_users_region
   
    
    FROM
   (SELECT DISTINCT tabd_db.P_EnterpriseSocialMediaUse.percentage,
 tabd_db.P_EnterpriseSocialMediaUse.purpose,
 tabd_db.P_CountryInternetUsers.country_or_area,
 tabd_db.P_CountryInternetUsers.internet_users_percentage,
 tabd_db.P_CountryInternetUsers.population_rank,
  tabd_db.P_CountryInternetUsers.internet_user_rank
 
 
  FROM (tabd_db.P_CountryInternetUsers 
    INNER JOIN tabd_db.P_EnterpriseSocialMediaUse 
    ON tabd_db.P_CountryInternetUsers.country_or_area = tabd_db.P_EnterpriseSocialMediaUse.region)) as A
    
    INNER JOIN 
    
    (SELECT DISTINCT country_or_area, AVG(internet_users_percentage) AS avg_internet_users_region
    FROM tabd_db.P_CountryInternetUsers   GROUP BY country_or_area
         ) as B 
    ON B.country_or_area = A.country_or_area

    INNER JOIN 
    
     (SELECT DISTINCT country_or_area, SUM(internet_users_percentage) AS total_internet_users_region
    FROM tabd_db.P_CountryInternetUsers
    GROUP BY country_or_area) as C
    ON C.country_or_area = A.country_or_area
    
    INNER JOIN 
(SELECT DISTINCT SUM(internet_users_percentage) AS total_internet_users
    FROM tabd_db.P_CountryInternetUsers) as D
    
    INNER JOIN 
    
     (SELECT DISTINCT region, SUM(percentage) AS total_percentage_region
    FROM tabd_db.P_EnterpriseSocialMediaUse
    GROUP BY region) as E
    ON E.region = A.country_or_area
    
    INNER JOIN 
(SELECT DISTINCT SUM(percentage) AS total_percentage
    FROM tabd_db.P_EnterpriseSocialMediaUse) as F
    
    """
)

In [40]:
countryusers \
    .write \
    .format("parquet") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/D_SocialMediaUsageForEnterprisesLevelDevelopment/")