In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# spark = SparkSession.builder.master('local[2]').getOrCreate()

from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [3]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...| 

In [4]:
spark.sql(
    """
    
    REFRESH TABLE tabd_db.IndividualsActivities1
    
    """
)

spark.sql(
    """
    SELECT *
    FROM tabd_db.IndividualsActivities1
    
    """
).toPandas()

Unnamed: 0,year,region,internet_activity,activity_percentage,individuals
0,2011,European Union from 2020,,20,All Individuals
1,2012,European Union from 2020,,25,All Individuals
2,2013,European Union from 2020,,24,All Individuals
3,2014,European Union from 2020,,28,All Individuals
4,2015,European Union from 2020,,28,All Individuals
...,...,...,...,...,...
5755,2016,South Korea,,0,Individuals with high formal education
5756,2017,South Korea,,0,Individuals with high formal education
5757,2018,South Korea,,0,Individuals with high formal education
5758,2019,South Korea,,0,Individuals with high formal education


In [5]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.IndividualsActivities2
    
    """
).toPandas()

Unnamed: 0,year,region,internet_activity,activity_percentage,individuals
0,2011,European Union from 2020,"Looking for information about education, train...",28,
1,2012,European Union from 2020,"Looking for information about education, train...",0,
2,2013,European Union from 2020,"Looking for information about education, train...",30,
3,2014,European Union from 2020,"Looking for information about education, train...",0,
4,2015,European Union from 2020,"Looking for information about education, train...",31,
...,...,...,...,...,...
5275,2016,South Korea,"Playing/downloading games, listening to music ...",0,
5276,2017,South Korea,"Playing/downloading games, listening to music ...",0,
5277,2018,South Korea,"Playing/downloading games, listening to music ...",0,
5278,2019,South Korea,"Playing/downloading games, listening to music ...",0,


In [6]:
internetusage = spark.sql(
    """
    
    SELECT DISTINCT tabd_db.IndividualsActivities1.year, tabd_db.IndividualsActivities1.region,
           tabd_db.IndividualsActivities2.internet_activity,
           tabd_db.IndividualsActivities2.activity_percentage,
           tabd_db.IndividualsActivities1.individuals
           
    
    FROM tabd_db.IndividualsActivities1 INNER JOIN tabd_db.IndividualsActivities2 
         ON tabd_db.IndividualsActivities1.year = tabd_db.IndividualsActivities2.year and 
         tabd_db.IndividualsActivities1.region = tabd_db.IndividualsActivities2.region
  
    """
)

In [7]:
spark.sql(
    """
    
    SELECT DISTINCT tabd_db.IndividualsActivities1.year, tabd_db.IndividualsActivities1.region,
           tabd_db.IndividualsActivities2.internet_activity,
           tabd_db.IndividualsActivities2.activity_percentage,
           tabd_db.IndividualsActivities1.individuals
           
    
    FROM tabd_db.IndividualsActivities1 INNER JOIN tabd_db.IndividualsActivities2 
         ON tabd_db.IndividualsActivities1.year = tabd_db.IndividualsActivities2.year and 
         tabd_db.IndividualsActivities1.region = tabd_db.IndividualsActivities2.region
  
    """
).toPandas()


Unnamed: 0,year,region,internet_activity,activity_percentage,individuals
0,2018,European Union from 2020,"Looking for information about education, train...",0,Individuals with medium formal education
1,2014,European Union from 2020,Sending/receiving e-mails,67,Individuals with 15 years old or less
2,2015,European Union from 2020,Sending/receiving e-mails,67,Individuals with 55 to 64 years old
3,2013,European Union from 2020,"Playing/downloading games, images, films or music",0,Individuals with high formal education
4,2018,European Union from 2020,Listening to web radios and/or watching web TV,0,All Individuals
...,...,...,...,...,...
63355,2015,South Korea,Finding information about goods and services,0,Individuals with 15 years old or less
63356,2017,South Korea,"Playing/downloading games, images, films or music",0,Individuals with 16 to 24 years old
63357,2018,South Korea,"Playing/downloading games, images, films or music",0,Individuals with 25 to 34 years old
63358,2011,South Korea,Listening to web radios and/or watching web TV,0,Individuals with no or low formal education


In [8]:
internetusage \
    .write \
    .format("parquet") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/P_IndividualsActivities/")

In [10]:
spark.sql(
    """
    DROP TABLE tabd_db.IndividualsActivities1
    
    """
)

DataFrame[]

In [12]:
spark.sql(
    """
    DROP TABLE tabd_db.IndividualsActivities2
    
    """
)

DataFrame[]