## Location of the Hive warehouse directory (`spark.sql.warehouse.dir`)

In [0]:
# Read the `spark.sql.warehouse.dir` setting from the session's runtime configuration.
spark.conf.get("spark.sql.warehouse.dir")

In [0]:
# Source file and its format
file_location = "/FileStore/tables/Customers.csv"
file_type = "csv"

# Reader options (passed as strings): infer column types, treat the first
# row as the header, and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options only matter for CSV input; other formats ignore them.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

# Render the loaded DataFrame as a table in the notebook.
display(df)

Customer ID,Customer Name,Segment
CG-12520,Claire Gute,Consumer
DV-13045,Darrin Van Huff,Corporate
SO-20335,Sean O'Donnell,Consumer
BH-11710,Brosina Hoffman,Consumer
AA-10480,Andrew Allen,Consumer
IM-15070,Irene Maddox,Consumer
HP-14815,Harold Pawlan,Home Office
PK-19075,Pete Kriz,Consumer
AG-10270,Alejandro Grove,Consumer
ZD-21925,Zuschuss Donatelli,Consumer


In [0]:
# List the tables/views visible in the Spark catalog before anything is registered.
# No database name is passed, so the session's current database is used
# (expected to be "default" here).
spark.catalog.listTables()

In [0]:
# Register the DataFrame loaded above as a temporary view named "Customers"
# (replacing any existing temp view of that name) so it can be queried with SQL.
df.createOrReplaceTempView("Customers")

In [0]:
# List the catalog again: the customers temp view should now appear.
# Note that a temp view is not assigned to any database (not even "default")
# and its table type is reported as temporary.
spark.catalog.listTables()

In [0]:
# Inspect the columns of the "customers" temp view registered above.
# (Previously this looked up "customers_csv", which this notebook never creates,
# so the cell failed on a fresh session.)
# Tip: type `spark.catalog.list` and press <Tab> to discover the other list* methods.
spark.catalog.listColumns("customers")

In [0]:
# List the databases known to the catalog.
# Tip: type `spark.catalog.list` and press <Tab> to see all list* methods.
spark.catalog.listDatabases()

In [0]:
# List the tables/views in the current database.
# Tip: type `spark.catalog.list` and press <Tab> to see all list* methods.
spark.catalog.listTables()

## Spark SQL Query

In [0]:
# Select only the Corporate-segment rows from the customers view.
myDF = spark.sql(
    "SELECT * FROM customers WHERE Segment = 'Corporate'"
)

In [0]:
# Print the filtered rows to the cell output.
myDF.show()

In [0]:
# Same rows as `select * from customers`, but naming the columns explicitly.
# Column names that contain a space must be wrapped in backticks.

spark.sql("select `Customer ID`, `Customer Name`, Segment from customers").show()

In [0]:
# Build a small second table to filter/join against.
# No column names are supplied, so the columns are addressed as _1, _2, _3
# (customer id, segment, currency code) in the cells below.
cust_sub = spark.createDataFrame(
    [
        ('CG-12520', 'Consumer', 'BRL'),
        ('DV-13045', 'Corporate', 'IND'),
    ]
)

# Expose it to SQL under the same name.
cust_sub.createOrReplaceTempView("cust_sub")

In [0]:
# The cust_sub temp view should now be listed alongside customers.
spark.catalog.listTables()

In [0]:
# What a more complex spark.sql query looks like, for reference. `pcodes_table` in this
# example plays the same role as the `cust_sub` view created above.
# maAgeDF = spark.sql("SELECT MEAN(age) as mean_age, \
#                             STDEV(age) as stdev_age \
#                      FROM people WHERE pcode IN \
#                      (SELECT pcode FROM pcodes_table WHERE state = 'MA')")

# Extract the first name: substring_index(str, ' ', 1) returns everything before the
# first space, or the whole value when there is no space. The earlier
# SUBSTRING(name, 0, CHARINDEX(' ', name)) form kept the trailing space in the result
# and produced an empty string for single-word names.
spark.sql("SELECT substring_index(`Customer Name`, ' ', 1) AS firstname FROM customers").show()

In [0]:
# First names of the customers whose id appears in cust_sub with currency (_3) 'BRL'.
# substring_index(str, ' ', 1) returns the text before the first space without the
# trailing space that SUBSTRING(name, 0, CHARINDEX(' ', name)) left in the result.
# A triple-quoted string replaces the backslash continuations inside the literal.
complexDF = spark.sql("""
    SELECT substring_index(`Customer Name`, ' ', 1) AS firstname
    FROM customers
    WHERE `Customer ID` IN
          (SELECT _1 FROM cust_sub WHERE _3 = 'BRL')
""")
complexDF.show()

--> The equivalent DataFrame API call produces the same result as the SQL query, as shown below

In [0]:
# SQL form: look up a single customer by id and print at most 5 rows.
spark.sql("SELECT * FROM customers WHERE `Customer ID` = 'DV-13045'").show(5)

In [0]:
# DataFrame API form of the same lookup: read the view as a DataFrame,
# apply the same predicate, and print at most 5 rows.
spark.read.table("customers").where("`Customer ID` = 'DV-13045'").show(5)

## Let's query some parquet files

In [0]:
# List the files under the webpage_files directory.
dbutils.fs.ls("/FileStore/webpage_files/")

In [0]:
# Peek at the start of one part-file (the second argument, 200, limits how much is read).
# NOTE(review): Parquet is a binary format, so this preview is presumably not human-readable.
dbutils.fs.head('dbfs:/FileStore/webpage_files/part-00000-tid-5635270027111026027-ad3dbd85-ffc6-4cf3-93b5-f021c4d37e22-27-1-c000.snappy.parquet', 200)

In [0]:
# Query the Parquet file directly by path (parquet.`<path>`), without registering a table first.
# Keeps the rows whose associated_files value starts with 'ifruit'.
spark.sql("SELECT * FROM parquet.`/FileStore/webpage_files/part-00000-tid-5635270027111026027-ad3dbd85-ffc6-4cf3-93b5-f021c4d37e22-27-1-c000.snappy.parquet` WHERE associated_files LIKE 'ifruit%'").show()

## Creating and querying a view

In [0]:
# Load the Parquet part-file (no format is given, so the session's default data source
# is used), keep only the `webpage` column and register it as a temp view.
# createOrReplaceTempView is used instead of createTempView so the cell can be re-run:
# createTempView raises when a view with that name already exists.
(
    spark.read
    .load("/FileStore/webpage_files/part-00000-tid-5635270027111026027-ad3dbd85-ffc6-4cf3-93b5-f021c4d37e22-27-1-c000.snappy.parquet")
    .select("webpage")
    .createOrReplaceTempView("webpages_temp")
)

# Query the view: rows whose webpage value starts with 'ifruit'.
spark.sql("SELECT * FROM webpages_temp WHERE `webpage` LIKE 'ifruit%'").show()

## Understanding global_temp

In [0]:
# List the current database; the webpages_temp temp view created above should be included.
spark.catalog.listTables()

In [0]:
# List what is visible under the `global_temp` database, where global temp views are registered.
spark.catalog.listTables("global_temp")

In [0]:
# Remove the session-scoped temp view so the name can be reused for a global temp view below.
spark.catalog.dropTempView("webpages_temp")

In [0]:
# webpages_temp should no longer be listed after the drop above.
spark.catalog.listTables()

In [0]:
# Same data as before, but registered as a *global* temp view this time.
# createOrReplaceGlobalTempView is used instead of createGlobalTempView so the cell can
# be re-run: createGlobalTempView raises when the view already exists.
(
    spark.read
    .load("/FileStore/webpage_files/part-00000-tid-5635270027111026027-ad3dbd85-ffc6-4cf3-93b5-f021c4d37e22-27-1-c000.snappy.parquet")
    .select("webpage")
    .createOrReplaceGlobalTempView("webpages_temp")
)

# Global temp views must be qualified with the `global_temp` database when queried.
spark.sql("SELECT * FROM global_temp.webpages_temp WHERE `webpage` LIKE 'ifruit%'").show()

In [0]:
# The global temp view webpages_temp should now be listed under `global_temp`.
spark.catalog.listTables("global_temp")

In [0]:
# Clean up: remove the global temp view created above.
spark.catalog.dropGlobalTempView("webpages_temp")

## The Catalog API

In [0]:
# Catalog API: list the available databases.
spark.catalog.listDatabases()

In [0]:
# Catalog API: list the tables/views in the current database.
spark.catalog.listTables()

In [0]:
# Walk every table/view in the current database and print each of its columns.
# Each output line pairs the full table metadata record with one column record.
for table_info in spark.catalog.listTables():
    table_columns = spark.catalog.listColumns(table_info.name)
    for column_info in table_columns:
        print(table_info, column_info)

## Persistence & Joins

In [0]:
# All customers, read back from the temp view.
customersDF = spark.sql("SELECT * FROM customers")

# Join customers with the currency lookup on "Customer ID" and cache the result.
# cust_sub's `_2` (segment) duplicates the Segment column customers already has; renaming
# it to "Segment" as before left the joined frame with two "Segment" columns, which makes
# any later reference to `Segment` ambiguous. It is dropped here, so the join adds only
# Currency. The matched rows are unchanged because the join key is the same.
joinedDF = customersDF.join(
    cust_sub.drop("_2")
            .withColumnRenamed("_1", "Customer ID")
            .withColumnRenamed("_3", "Currency"),
    "Customer ID",
).persist()
joinedDF.show()

In [0]:
# Filter the persisted join result down to the rows whose Currency is 'BRL'.
joinedDF.where("Currency = 'BRL'").show()

## Saving Dataframes

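There are multiple ways to save a DataFrame:

--> insertInto – save to an existing table in a database

--> saveAsParquetFile – save as a Parquet file (including schema); this is a legacy Spark 1.x API, superseded by `write.parquet(...)` / `write.save(..., format="parquet")`

--> saveAsTable – save as a Hive table

--> save – generic base function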

In [0]:
# Write the customers DataFrame as Parquet.
# mode="overwrite" keeps the cell re-runnable: with the default mode the write fails as
# soon as the target path exists. The redundant select("*") has been removed.
# NOTE(review): the column names here may still contain spaces (e.g. `Customer ID`), which
# the Parquet writer in older Spark releases rejects. If this cell fails for that reason,
# use the renamed-columns variant of this write instead; confirm against the runtime version.
customersDF.write.save("/FileStore/tables/Customers/", format="parquet", mode="overwrite")

In [0]:
# Write the customers as Parquet with space-free column names.
# mode="overwrite" is required here because this targets a fixed path: with the default
# mode the write fails whenever that path already exists (from another write or from an
# earlier run of this cell). The redundant select("*") has been removed.
(
    customersDF
    .withColumnRenamed("Customer ID", "Customer_ID")
    .withColumnRenamed("Customer Name", "CustName")
    .write
    .save("/FileStore/tables/Customers/", format="parquet", mode="overwrite")
)

In [0]:
# List the files written under the Customers output directory.
dbutils.fs.ls("/FileStore/tables/Customers/")

## Save table to hive

In [0]:
# Catalog contents before saving anything as a table.
spark.catalog.listTables()

In [0]:
# Rebind customersDF to a version whose column names contain no spaces
# ("Customer ID" -> "Customer_ID", "Customer Name" -> "CustName").
customersDF = (
    customersDF
    .withColumnRenamed("Customer ID", "Customer_ID")
    .withColumnRenamed("Customer Name", "CustName")
)

In [0]:
# Persist the DataFrame as the table `default.table`.
# mode="overwrite" keeps the cell re-runnable: with the default mode saveAsTable fails if
# the table is left over from an earlier run that did not reach the final DROP TABLE cell.
customersDF.write.saveAsTable("default.table", mode="overwrite")

In [0]:
# The newly saved table should now be listed in the catalog.
spark.catalog.listTables()

In [0]:
# Look at the columns of the saved table (arguments: table name, then database name).
spark.catalog.listColumns("table", "default")

In [0]:
# The saved table can be queried with SQL like any other table.

spark.sql("SELECT * FROM default.table").show()

In [0]:
# Insert the same rows into the existing table with overwrite=False.
# The table already holds these rows, so it will now contain duplicates.
# NOTE(review): insertInto resolves columns by position rather than by name - confirm the
# DataFrame's column order matches the table's.
customersDF.write.insertInto("default.table", overwrite=False)

In [0]:
# Row count after the insert above: duplicate rows exist.
spark.sql("SELECT * FROM default.table").count()

In [0]:
# Insert again, this time with overwrite=True so the existing rows are replaced.
customersDF.write.insertInto("default.table", overwrite=True)

In [0]:
# Row count after the overwrite: overwrite=True is what removes the duplicates.
spark.sql("SELECT * FROM default.table").count()

In [0]:
# Clean up the demo table. IF EXISTS makes the cell safe to re-run: a plain DROP TABLE
# raises once the table has already been removed.
spark.sql("DROP TABLE IF EXISTS default.table")

In [0]:
# The dropped table should no longer appear in the catalog.
spark.catalog.listTables()