In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import *

# Default location for managed databases and tables (passed to Spark below
# as spark.sql.warehouse.dir).
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build the notebook-wide session, or reuse one that already exists
# (getOrCreate), with Hive support enabled and the metastore URI set to the
# thrift endpoint below.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# List the tables registered in the tabd_db database.
list_tables_sql = """
    SHOW TABLES FROM tabd_db
    """
spark.sql(list_tables_sql).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...| 

In [3]:
# Remove any previous definition first; IF EXISTS keeps this from failing when
# the table is not there yet.
drop_table_ddl = """
    DROP TABLE IF EXISTS tabd_db.IndividualsActivities2
    """
spark.sql(drop_table_ddl)

# Declare an EXTERNAL table stored as Parquet whose LOCATION is the HDFS
# directory named in the statement.
# NOTE(review): the SELECT outputs further down show `individuals` as null for
# every displayed row -- confirm that this column's name/type matches what is
# actually in the Parquet files.
create_table_ddl = """
    CREATE EXTERNAL TABLE tabd_db.IndividualsActivities2 (
        year INT,
        region VARCHAR(45),
        internet_activity VARCHAR(45),
        activity_percentage INT,
        individuals VARCHAR(45)

    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/IndividualsActivities2/'
    """
# Kept as the bare final expression so the cell still echoes the (empty)
# DataFrame returned by the DDL statement.
spark.sql(create_table_ddl)

        

# Note: tblproperties('skip.header.line.count'='1') can be used for Hive tables
# backed by CSV files that contain a header row.
# However, Spark SQL does not currently honor that property when reading the
# data through SQL queries, so when creating Hive tables backed by CSVs, use
# files without a header row.
# In this project we won't use Hive text tables, so this is not an issue here.

DataFrame[]

In [4]:
# List tabd_db's tables again, now that the DROP/CREATE cell above has run.
tables_after_create_sql = """
    SHOW TABLES FROM tabd_db
    """
spark.sql(tables_after_create_sql).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...| 

In [5]:
# Let's look into HDFS

In [6]:
# Preview the table's contents; show() prints a truncated text table.
activities_preview_sql = """
    SELECT *
    FROM tabd_db.IndividualsActivities2
    """
spark.sql(activities_preview_sql).show()

+----+--------------------+--------------------+-------------------+-----------+
|year|              region|   internet_activity|activity_percentage|individuals|
+----+--------------------+--------------------+-------------------+-----------+
|2011|European Union fr...|Looking for infor...|                 28|       null|
|2012|European Union fr...|Looking for infor...|                  0|       null|
|2013|European Union fr...|Looking for infor...|                 30|       null|
|2014|European Union fr...|Looking for infor...|                  0|       null|
|2015|European Union fr...|Looking for infor...|                 31|       null|
|2016|European Union fr...|Looking for infor...|                  0|       null|
|2017|European Union fr...|Looking for infor...|                  0|       null|
|2018|European Union fr...|Looking for infor...|                  0|       null|
|2019|European Union fr...|Looking for infor...|                  0|       null|
|2020|European Union fr...|L

In [7]:
# Inspect the table's column definitions and detailed table information,
# converted to pandas so the notebook renders it as a table.
describe_table_sql = """
    DESCRIBE FORMATTED tabd_db.IndividualsActivities2
    """
spark.sql(describe_table_sql).toPandas()

Unnamed: 0,col_name,data_type,comment
0,year,int,
1,region,varchar(45),
2,internet_activity,varchar(45),
3,activity_percentage,int,
4,individuals,varchar(45),
5,,,
6,# Detailed Table Information,,
7,Database,tabd_db,
8,Table,individualsactivities2,
9,Owner,jovyan,


In [8]:
# Let's put the files into HDFS

In [9]:
# Load the whole table into a pandas DataFrame for rich display.
# Note: toPandas() collects every row onto the driver, so this is only
# appropriate while the table stays small.
activities_all_sql = """
    SELECT *
    FROM tabd_db.IndividualsActivities2
    """
spark.sql(activities_all_sql).toPandas()

Unnamed: 0,year,region,internet_activity,activity_percentage,individuals
0,2011,European Union from 2020,"Looking for information about education, train...",28,
1,2012,European Union from 2020,"Looking for information about education, train...",0,
2,2013,European Union from 2020,"Looking for information about education, train...",30,
3,2014,European Union from 2020,"Looking for information about education, train...",0,
4,2015,European Union from 2020,"Looking for information about education, train...",31,
...,...,...,...,...,...
5275,2016,South Korea,"Playing/downloading games, listening to music ...",0,
5276,2017,South Korea,"Playing/downloading games, listening to music ...",0,
5277,2018,South Korea,"Playing/downloading games, listening to music ...",0,
5278,2019,South Korea,"Playing/downloading games, listening to music ...",0,


In [10]:


# Same SELECT as the earlier preview cell, printed again with show().
activities_recheck_sql = """
    SELECT *
    FROM tabd_db.IndividualsActivities2
    """
spark.sql(activities_recheck_sql).show()

+----+--------------------+--------------------+-------------------+-----------+
|year|              region|   internet_activity|activity_percentage|individuals|
+----+--------------------+--------------------+-------------------+-----------+
|2011|European Union fr...|Looking for infor...|                 28|       null|
|2012|European Union fr...|Looking for infor...|                  0|       null|
|2013|European Union fr...|Looking for infor...|                 30|       null|
|2014|European Union fr...|Looking for infor...|                  0|       null|
|2015|European Union fr...|Looking for infor...|                 31|       null|
|2016|European Union fr...|Looking for infor...|                  0|       null|
|2017|European Union fr...|Looking for infor...|                  0|       null|
|2018|European Union fr...|Looking for infor...|                  0|       null|
|2019|European Union fr...|Looking for infor...|                  0|       null|
|2020|European Union fr...|L

In [11]:
# Stop the SparkSession created in the first cell; cells that use `spark`
# cannot be re-run after this without re-creating the session.
spark.stop()