In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [2]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|distributiongloba...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...|      false|
| tabd_db|socialmediausebyp...|      false|
| tabd_db|socialmediausebyt...|      false|
| tabd_db|socialmediausersa...|      false|
| tabd_db|  socialnetworkusers| 

In [3]:
spark.sql(
    """
    DROP TABLE IF EXISTS tabd_db.InternetUsageFrequency
    """
)

spark.sql(
    """
    CREATE EXTERNAL TABLE tabd_db.InternetUsageFrequency (
        year INT,
        region VARCHAR(45),
        gender VARCHAR(45),
        usage_rate DOUBLE,
        penetration_percentage INT,
        internet_users DOUBLE,
        frequency_of_access VARCHAR(45),
        individuals VARCHAR(45),
        frequency_of_access_percentage INT

    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/InternetUsageFrequency/'
    """
)

        

# tblproperties('skip.header.line.count'='1')
# can be used for csvs with header
# but spark sql cannot understand that at the moment, while reading the data using sql queries
# so just when creating hive tables backed up by csvs, avoid headers
# in the project we wont use hive text tables, so all good.

DataFrame[]

In [4]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|distributiongloba...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...|      false|
| tabd_db|socialmediausebyp...|      false|
| tabd_db|socialmediausebyt...|      false|
| tabd_db|socialmediausersa...| 

In [5]:
# Let's look into HDFS

In [6]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.InternetUsageFrequency
    """
).show()

+----+--------------------+------+----------+----------------------+--------------+--------------------+--------------------+------------------------------+
|year|              region|gender|usage_rate|penetration_percentage|internet_users| frequency_of_access|         individuals|frequency_of_access_percentage|
+----+--------------------+------+----------+----------------------+--------------+--------------------+--------------------+------------------------------+
|2011|European Union fr...|  null|       0.0|                     0|           0.0|         Once a week|     All Individuals|                            65|
|2011|European Union fr...|  null|       0.0|                     0|           0.0|         Once a week|All individuals w...|                            94|
|2011|European Union fr...|  null|       0.0|                     0|           0.0|               Daily|     All Individuals|                            54|
|2011|European Union fr...|  null|       0.0|             

In [7]:
spark.sql(
    """
    DESCRIBE FORMATTED tabd_db.InternetUsageFrequency
    """
).toPandas()

Unnamed: 0,col_name,data_type,comment
0,year,int,
1,region,varchar(45),
2,gender,varchar(45),
3,usage_rate,double,
4,penetration_percentage,int,
5,internet_users,double,
6,frequency_of_access,varchar(45),
7,individuals,varchar(45),
8,frequency_of_access_percentage,int,
9,,,


In [8]:
# Let's put the files into HDFS

In [9]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.InternetUsageFrequency
    """
).toPandas()

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,frequency_of_access,individuals,frequency_of_access_percentage
0,2011,European Union from 2020,,0.0,0,0.0,Once a week,All Individuals,65
1,2011,European Union from 2020,,0.0,0,0.0,Once a week,All individuals who used internet in the last ...,94
2,2011,European Union from 2020,,0.0,0,0.0,Daily,All Individuals,54
3,2011,European Union from 2020,,0.0,0,0.0,Daily,All individuals who used internet in the last ...,78
4,2011,European Union from 2020,,0.0,0,0.0,At least once a week,All Individuals,11
...,...,...,...,...,...,...,...,...,...
5275,2020,Kosovo,,0.0,0,0.0,At least once a month,All individuals who used internet in the last ...,0
5276,2020,Kosovo,,0.0,0,0.0,Less than once a month,All Individuals,0
5277,2020,Kosovo,,0.0,0,0.0,Less than once a month,All individuals who used internet in the last ...,0
5278,2020,Kosovo,,0.0,0,0.0,Less than once a week,All Individuals,1


In [10]:


spark.sql(
    """
    SELECT *
    FROM tabd_db.InternetUsageFrequency
    """
).show()

+----+--------------------+------+----------+----------------------+--------------+--------------------+--------------------+------------------------------+
|year|              region|gender|usage_rate|penetration_percentage|internet_users| frequency_of_access|         individuals|frequency_of_access_percentage|
+----+--------------------+------+----------+----------------------+--------------+--------------------+--------------------+------------------------------+
|2011|European Union fr...|  null|       0.0|                     0|           0.0|         Once a week|     All Individuals|                            65|
|2011|European Union fr...|  null|       0.0|                     0|           0.0|         Once a week|All individuals w...|                            94|
|2011|European Union fr...|  null|       0.0|                     0|           0.0|               Daily|     All Individuals|                            54|
|2011|European Union fr...|  null|       0.0|             

In [11]:
spark.stop()