In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [2]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).toPandas()

Unnamed: 0,database,tableName,isTemporary
0,tabd_db,adultinternetusagepenetration,False
1,tabd_db,distributionglobalsocialmediausersbyregion2020,False
2,tabd_db,globalinternetusageratebygenderregion,False
3,tabd_db,globalmobilepenetration,False
4,tabd_db,globalsocialnetworks,False
5,tabd_db,internetactivities1,False
6,tabd_db,internetactivities2,False
7,tabd_db,internetusage,False
8,tabd_db,internetusagefrequency,False
9,tabd_db,numberofworldwideinternetusersbyregion,False


In [6]:
spark.sql(
    """
    DROP TABLE IF EXISTS tabd_db.SocialMediaUsersAgeByRegion
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE tabd_db.SocialMediaUsersAgeByRegion (
        region VARCHAR(45),
        individuals VARCHAR(45),
        internet_users DOUBLE,
        avg_internet_users_individuals DOUBLE, 
        total_internet_users_region DOUBLE,
        avg_internet_users_region DOUBLE, 
        total_internet_users DOUBLE, 
        ratio_internet_users_region DOUBLE, 
        avg_internet_users DOUBLE

    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/SocialMediaUsersAgeByRegion/'
    """
)

     

DataFrame[]

In [7]:
socialmediausersagebyregion = spark.sql(
    """
    
    SELECT A.region, F.individuals, G.internet_users, F.avg_internet_users_individuals, A.total_internet_users_region,
           B.avg_internet_users_region, D.total_internet_users, 
           ((A.total_internet_users_region/D.total_internet_users)*100) as ratio_internet_users_region, E.avg_internet_users
           
    FROM
    (SELECT DISTINCT region, SUM(internet_users) AS total_internet_users_region
    FROM tabd_db.P_InternetUsage
    
    GROUP BY region) as A
    
    INNER JOIN
    
    (SELECT DISTINCT region, AVG(internet_users) AS avg_internet_users_region
    FROM tabd_db.P_InternetUsage
    
    GROUP BY region) as B ON A.region = B.region
    
    INNER JOIN
    
    (SELECT DISTINCT COUNT(DISTINCT region) AS count_region
    FROM tabd_db.P_InternetUsage) as C 
    
    INNER JOIN
    
    (SELECT DISTINCT SUM(internet_users) AS total_internet_users
    FROM tabd_db.P_InternetUsage) as D
    
    INNER JOIN
    
    (SELECT DISTINCT AVG(internet_users) AS avg_internet_users
    FROM tabd_db.P_InternetUsage) as E
    
    INNER JOIN
    
    (SELECT DISTINCT individuals, AVG(internet_users) AS avg_internet_users_individuals
    FROM tabd_db.P_InternetUsage
    
    GROUP BY individuals) as F
    
    INNER JOIN
    
    (SELECT DISTINCT region, internet_users
    FROM tabd_db.P_InternetUsage) as G
    
    """
)

In [8]:
socialmediausersagebyregion.toPandas()

Unnamed: 0,region,individuals,internet_users,avg_internet_users_individuals,total_internet_users_region,avg_internet_users_region,total_internet_users,ratio_internet_users_region,avg_internet_users
0,Europe,All Individuals,273.07,508.828126,1251023.34,634.393174,6393298.52,19.567729,503.250828
1,Europe,All Individuals,77.02,508.828126,1251023.34,634.393174,6393298.52,19.567729,503.250828
2,Europe,All Individuals,327.57,508.828126,1251023.34,634.393174,6393298.52,19.567729,503.250828
3,Europe,All Individuals,90.00,508.828126,1251023.34,634.393174,6393298.52,19.567729,503.250828
4,Europe,All Individuals,132.59,508.828126,1251023.34,634.393174,6393298.52,19.567729,503.250828
...,...,...,...,...,...,...,...,...,...
1612,Asia,All individuals who used internet in the last ...,184.86,496.577460,2971184.77,1695.881718,6393298.52,46.473425,503.250828
1613,Asia,All individuals who used internet in the last ...,302.01,496.577460,2971184.77,1695.881718,6393298.52,46.473425,503.250828
1614,Asia,All individuals who used internet in the last ...,320.06,496.577460,2971184.77,1695.881718,6393298.52,46.473425,503.250828
1615,Asia,All individuals who used internet in the last ...,254.92,496.577460,2971184.77,1695.881718,6393298.52,46.473425,503.250828


In [9]:
socialmediausersagebyregion \
    .write \
    .format("parquet") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/SocialMediaUsersAgeByRegion/")