In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# spark = SparkSession.builder.master('local[2]').getOrCreate()

from pyspark.sql import Row

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [5]:
spark.sql(
    """
    SHOW TABLES FROM tabd_db
    """
).show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| tabd_db|adultinternetusag...|      false|
| tabd_db|distributiongloba...|      false|
| tabd_db|globalinternetusa...|      false|
| tabd_db|globalmobilepenet...|      false|
| tabd_db|globalsocialnetworks|      false|
| tabd_db| internetactivities1|      false|
| tabd_db| internetactivities2|      false|
| tabd_db|       internetusage|      false|
| tabd_db|internetusagefreq...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|numberofworldwide...|      false|
| tabd_db|p_dailyinternetus...|      false|
| tabd_db|p_enterprisesocia...|      false|
| tabd_db|p_globalsocialmed...|      false|
| tabd_db|p_globalsocialnet...|      false|
| tabd_db|p_individualsacti...|      false|
| tabd_db|     p_internetusage|      false|
| tabd_db|parentawarenessof...|      false|
| tabd_db|socialmediausagef...|      false|
| tabd_db|socialmediausebyp...| 

In [6]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.GlobalInternetUsageRateByGenderRegion
    
    """
).toPandas()

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,frequency_of_access,individuals,frequency_of_access_percentage
0,2019,Europe,female,80.1,0,0.0,,,0
1,2019,Europe,male,85.1,0,0.0,,,0
2,2019,The Americas,female,77.0,0,0.0,,,0
3,2019,The Americas,male,76.3,0,0.0,,,0
4,2019,CIS,female,71.7,0,0.0,,,0
5,2019,CIS,male,74.0,0,0.0,,,0
6,2019,Arab States,female,47.3,0,0.0,,,0
7,2019,Arab States,male,61.3,0,0.0,,,0
8,2019,Asia Pacific,female,41.3,0,0.0,,,0
9,2019,Asia Pacific,male,48.3,0,0.0,,,0


In [7]:
spark.sql(
    """
    
    REFRESH TABLE tabd_db.NumberOfWorldwideInternetUsersByRegion
    
    """
)


spark.sql(
    """
    SELECT *
    FROM tabd_db.NumberOfWorldwideInternetUsersByRegion
    
    """
).toPandas()

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,frequency_of_access,individuals,frequency_of_access_percentage
0,2009,Asia,,0.0,0,764.40,,,0
1,2009,Europe,,0.0,0,425.80,,,0
2,2009,North America,,0.0,0,259.60,,,0
3,2009,Latin America / Caribbean,,0.0,0,186.90,,,0
4,2009,Africa,,0.0,0,86.20,,,0
...,...,...,...,...,...,...,...,...,...
72,2020,North America,,0.0,0,332.91,,,0
73,2020,Latin America / Caribbean,,0.0,0,467.82,,,0
74,2020,Africa,,0.0,0,566.14,,,0
75,2020,Middle East,,0.0,0,184.86,,,0


In [8]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.InternetUsageFrequency
    
    """
).toPandas()

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,frequency_of_access,individuals,frequency_of_access_percentage
0,2011,European Union from 2020,,0.0,0,0.0,Once a week,All Individuals,65
1,2011,European Union from 2020,,0.0,0,0.0,Once a week,All individuals who used internet in the last ...,94
2,2011,European Union from 2020,,0.0,0,0.0,Daily,All Individuals,54
3,2011,European Union from 2020,,0.0,0,0.0,Daily,All individuals who used internet in the last ...,78
4,2011,European Union from 2020,,0.0,0,0.0,At least once a week,All Individuals,11
...,...,...,...,...,...,...,...,...,...
5275,2020,Kosovo,,0.0,0,0.0,At least once a month,All individuals who used internet in the last ...,0
5276,2020,Kosovo,,0.0,0,0.0,Less than once a month,All Individuals,0
5277,2020,Kosovo,,0.0,0,0.0,Less than once a month,All individuals who used internet in the last ...,0
5278,2020,Kosovo,,0.0,0,0.0,Less than once a week,All Individuals,1


In [9]:
spark.sql(
    """
    SELECT *
    FROM tabd_db.AdultInternetUsagePenetration
    
    """
).toPandas()

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,frequency_of_access,individuals,frequency_of_access_percentage
0,2000,,male,0.0,54,0.0,,,0
1,2000,,female,0.0,50,0.0,,,0
2,2005,,male,0.0,69,0.0,,,0
3,2005,,female,0.0,67,0.0,,,0
4,2009,,male,0.0,77,0.0,,,0
5,2009,,female,0.0,75,0.0,,,0
6,2010,,male,0.0,77,0.0,,,0
7,2010,,female,0.0,76,0.0,,,0
8,2013,,male,0.0,84,0.0,,,0
9,2013,,female,0.0,84,0.0,,,0


In [10]:
internetusage = spark.sql(
    """
    
    SELECT DISTINCT tabd_db.NumberOfWorldwideInternetUsersByRegion.year, tabd_db.NumberOfWorldwideInternetUsersByRegion.region,
           tabd_db.AdultInternetUsagePenetration.gender, tabd_db.GlobalInternetUsageRateByGenderRegion.usage_rate,
           tabd_db.AdultInternetUsagePenetration.penetration_percentage, tabd_db.NumberOfWorldwideInternetUsersByRegion.internet_users,
           tabd_db.InternetUsageFrequency.frequency_of_access, tabd_db.InternetUsageFrequency.individuals, 
           tabd_db.InternetUsageFrequency.frequency_of_access_percentage
           
    
    FROM (tabd_db.NumberOfWorldwideInternetUsersByRegion 
    LEFT JOIN tabd_db.GlobalInternetUsageRateByGenderRegion 
    ON (tabd_db.NumberOfWorldwideInternetUsersByRegion.year = tabd_db.GlobalInternetUsageRateByGenderRegion.year and 
    tabd_db.NumberOfWorldwideInternetUsersByRegion.region = tabd_db.GlobalInternetUsageRateByGenderRegion.region)
         
    LEFT JOIN tabd_db.InternetUsageFrequency 
    ON tabd_db.InternetUsageFrequency.year = tabd_db.NumberOfWorldwideInternetUsersByRegion.year
    
    LEFT JOIN tabd_db.AdultInternetUsagePenetration 
    ON tabd_db.NumberOfWorldwideInternetUsersByRegion.year = tabd_db.AdultInternetUsagePenetration.year)
  
    """
)

In [11]:
spark.sql(
    """
    
    SELECT DISTINCT tabd_db.NumberOfWorldwideInternetUsersByRegion.year, tabd_db.NumberOfWorldwideInternetUsersByRegion.region,
           tabd_db.AdultInternetUsagePenetration.gender, tabd_db.GlobalInternetUsageRateByGenderRegion.usage_rate,
           tabd_db.AdultInternetUsagePenetration.penetration_percentage, tabd_db.NumberOfWorldwideInternetUsersByRegion.internet_users,
           tabd_db.InternetUsageFrequency.frequency_of_access, tabd_db.InternetUsageFrequency.individuals, 
           tabd_db.InternetUsageFrequency.frequency_of_access_percentage
           
    
    FROM (tabd_db.NumberOfWorldwideInternetUsersByRegion 
    LEFT JOIN tabd_db.GlobalInternetUsageRateByGenderRegion 
    ON (tabd_db.NumberOfWorldwideInternetUsersByRegion.year = tabd_db.GlobalInternetUsageRateByGenderRegion.year and 
    tabd_db.NumberOfWorldwideInternetUsersByRegion.region = tabd_db.GlobalInternetUsageRateByGenderRegion.region)
         
    LEFT JOIN tabd_db.InternetUsageFrequency 
    ON tabd_db.InternetUsageFrequency.year = tabd_db.NumberOfWorldwideInternetUsersByRegion.year
    
    LEFT JOIN tabd_db.AdultInternetUsagePenetration 
    ON tabd_db.NumberOfWorldwideInternetUsersByRegion.year = tabd_db.AdultInternetUsagePenetration.year)

  
    """
).toPandas()


         #FULL JOIN tabd_db.IndividualsActivities1 ON tabd_db.AdultInternetUsagePenetration.year = tabd_db.IndividualsActivities1.year
         #FULL JOIN tabd_db.IndividualsActivities2 ON tabd_db.IndividualsActivities1.year = tabd_db.IndividualsActivities2.year 
         #and tabd_db.IndividualsActivities1.region = tabd_db.IndividualsActivities2.region 

           #SELECT tabd_db.NumberOfWorldwideInternetUsersByRegion.year, tabd_db.NumberOfWorldwideInternetUsersByRegion.region,
           #tabd_db.AdultInternetUsagePenetration.gender, tabd_db.GlobalInternetUsageRateByGenderRegion.usage_rate,
           #tabd_db.AdultInternetUsagePenetration.penetration_percentage, tabd_db.NumberOfWorldwideInternetUsersByRegion.internet_users,
           #tabd_db.IndividualsActivities2.internet_activity, tabd_db.NumberOfWorldwideInternetUsersByRegion.type_of_individuals,
           #tabd_db.IndividualsActivities2.activity_percentage, tabd_db.InternetUsageFrequency.frequency_of_access,
           #tabd_db.InternetUsageFrequency.individuals, tabd_db.InternetUsageFrequency.frequency_of_access_percentage

         #FULL JOIN tabd_db.AdultInternetUsagePenetration ON tabd_db.InternetUsageFrequency.year = tabd_db.AdultInternetUsagePenetration.year
         #FULL JOIN tabd_db.IndividualsActivities1 FULL JOIN tabd_db.IndividualsActivities2 
         #ON tabd_db.IndividualsActivities1.year = tabd_db.IndividualsActivities2.year and 
         #tabd_db.IndividualsActivities1.region = tabd_db.IndividualsActivities2.region

Unnamed: 0,year,region,gender,usage_rate,penetration_percentage,internet_users,frequency_of_access,individuals,frequency_of_access_percentage
0,2011,Europe,,,,500.72,At least once a week,All individuals who used internet in the last ...,9.0
1,2011,Latin America / Caribbean,,,,235.82,Less than once a week,All Individuals,8.0
2,2011,Oceania / Australia,,,,23.93,Daily,All Individuals,59.0
3,2012,North America,,,,273.79,Daily,All individuals who used internet in the last ...,63.0
4,2012,Africa,,,,167.34,At least once a week,All individuals who used internet in the last ...,16.0
...,...,...,...,...,...,...,...,...,...
12699,2019,Africa,female,37.1,91.0,522.81,Daily,All Individuals,79.0
12700,2019,Africa,female,20.2,91.0,522.81,Daily,All Individuals,82.0
12701,2020,North America,,,,332.91,At least once a week,All Individuals,9.0
12702,2020,North America,,,,332.91,Once a week,All individuals who used internet in the last ...,97.0


In [12]:
internetusage \
    .write \
    .format("parquet") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/warehouse/tabd.db/P_InternetUsage/")