In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import *

# Root HDFS directory used as the default location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build (or reuse) a Hive-enabled session; the metastore is reached over Thrift.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# List the tables currently registered in the tabd_db database.
tables_df = spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
)
tables_df.show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...| 

In [3]:
# Drop any earlier definition first so this cell can be re-run.
drop_usage_table_sql = """
    DROP TABLE IF EXISTS tabd_db.P_InternetUsage
    """

# External Parquet table definition pointing at a fixed HDFS location.
create_usage_table_sql = """
    CREATE EXTERNAL TABLE tabd_db.P_InternetUsage (
        year INT,
        region VARCHAR(45),
        gender VARCHAR(45),
        usage_rate DOUBLE,
        penetration_percentage INT,
        internet_users DOUBLE,
        frequency_of_access VARCHAR(45),
        individuals VARCHAR(45),
        frequency_of_access_percentage INT

    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/tabd.db/P_InternetUsage/'
    """

spark.sql(drop_usage_table_sql)
# Kept as the last expression so the notebook still displays its result.
spark.sql(create_usage_table_sql)

        

# tblproperties('skip.header.line.count'='1')
# can be used for CSVs that have a header row,
# but Spark SQL does not understand that property at the moment when reading the data through SQL queries,
# so when creating Hive tables backed by CSVs, just avoid header rows.
# In this project we won't use Hive text tables, so all good.

DataFrame[]

In [4]:
# List the tables in tabd_db again, now that the table has been (re)created.
tables_after_create_df = spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
)
tables_after_create_df.show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db|individualsactivi...|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...| 

In [5]:
# Let's look into HDFS

In [6]:
# Preview the rows exposed through the external table.
usage_preview_df = spark.sql(
    """
    SELECT *
    FROM tabd_db.P_InternetUsage
    """
)
usage_preview_df.show()

+----+--------------------+------+----------+----------------------+--------------+--------------------+--------------------+------------------------------+
|year|              region|gender|usage_rate|penetration_percentage|internet_users| frequency_of_access|         individuals|frequency_of_access_percentage|
+----+--------------------+------+----------+----------------------+--------------+--------------------+--------------------+------------------------------+
|2011|                Asia|  null|      null|                  null|        1016.8|Less than once a ...|All individuals w...|                             3|
|2011|              Europe|  null|      null|                  null|        500.72|         Once a week|     All Individuals|                            65|
|2011|       North America|  null|      null|                  null|        273.07|               Daily|     All Individuals|                            55|
|2011|Latin America / C...|  null|      null|             

In [7]:
# Inspect the table's column metadata, rendered as a pandas DataFrame.
table_description_df = spark.sql(
    """
    DESCRIBE FORMATTED tabd_db.P_InternetUsage
    """
)
table_description_df.toPandas()

Unnamed: 0,col_name,data_type,comment
0,year,int,
1,region,varchar(45),
2,gender,varchar(45),
3,usage_rate,double,
4,penetration_percentage,int,
5,internet_users,double,
6,frequency_of_access,varchar(45),
7,individuals,varchar(45),
8,frequency_of_access_percentage,int,
9,,,


In [8]:
# Let's put the files into HDFS

In [9]:
# Collect the full table to the driver and display it as a pandas DataFrame.
usage_all_df = spark.sql(
    """
    SELECT *
    FROM tabd_db.P_InternetUsage
    """
)
usage_all_df.toPandas()

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,frequency_of_access,individuals,frequency_of_access_percentage
0,2011,Asia,,,,1016.80,Less than once a month,All individuals who used internet in the last ...,3.0
1,2011,Europe,,,,500.72,Once a week,All Individuals,65.0
2,2011,North America,,,,273.07,Daily,All Individuals,55.0
3,2011,Latin America / Caribbean,,,,235.82,Once a week,All individuals who used internet in the last ...,89.0
4,2011,Middle East,,,,77.02,Daily,All Individuals,63.0
...,...,...,...,...,...,...,...,...,...
12699,2019,Asia,female,,91.0,2300.47,Once a week,All individuals who used internet in the last ...,97.0
12700,2019,North America,male,,90.0,327.57,Daily,All Individuals,91.0
12701,2019,North America,female,,91.0,327.57,Once a week,All Individuals,87.0
12702,2019,Latin America / Caribbean,female,,91.0,453.70,Daily,All Individuals,82.0


In [10]:


# Show the first rows of the table using Spark's text table rendering.
usage_rows_df = spark.sql(
    """
    SELECT *
    FROM tabd_db.P_InternetUsage
    """
)
usage_rows_df.show()

+----+--------------------+------+----------+----------------------+--------------+--------------------+--------------------+------------------------------+
|year|              region|gender|usage_rate|penetration_percentage|internet_users| frequency_of_access|         individuals|frequency_of_access_percentage|
+----+--------------------+------+----------+----------------------+--------------+--------------------+--------------------+------------------------------+
|2011|                Asia|  null|      null|                  null|        1016.8|Less than once a ...|All individuals w...|                             3|
|2011|              Europe|  null|      null|                  null|        500.72|         Once a week|     All Individuals|                            65|
|2011|       North America|  null|      null|                  null|        273.07|               Daily|     All Individuals|                            55|
|2011|Latin America / C...|  null|      null|             

In [11]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()