In [0]:
# ADLS Gen2 storage account and container that hold the employee data,
# plus the DBFS path the container is mounted under.
account_name = 'datalake74'
container_name = 'employees'
mount_point = '/mnt/employees'

In [0]:
# Fetch the OAuth client id, tenant id and client secret from the Databricks
# secret scope; they are consumed by the mount configuration.
secret_scope = "databricks-assignment-2"
application_id = dbutils.secrets.get(scope=secret_scope, key='application-id')
tenant_id = dbutils.secrets.get(scope=secret_scope, key='tenant-id')
secret = dbutils.secrets.get(scope=secret_scope, key="secret1")

In [0]:
# Sanity-check that the client secret can be read WITHOUT revealing it.
# Never print a secret character by character: that sidesteps the notebook's
# output redaction and leaves the credential readable in the cell output.
secret_is_set = bool(dbutils.secrets.get(scope='databricks-assignment-2', key='secret1'))
print(f"secret1 retrieved: {secret_is_set}")

In [0]:
# Release the mount point before (re)mounting so the notebook can be re-run.
# Only unmount when `mount_point` is actually registered, and target the
# mount point itself rather than a sub-directory of it.
if any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

In [0]:
# OAuth client-credentials settings used to authenticate against the storage
# account, built from the values read from the secret scope.
token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": application_id,
    "fs.azure.account.oauth2.client.secret": secret,
    "fs.azure.account.oauth2.client.endpoint": token_endpoint,
}

# Optionally, you can add <directory-name> to the source URI of your mount point.
container_uri = f"abfss://{container_name}@{account_name}.dfs.core.windows.net/"
dbutils.fs.mount(source=container_uri, mount_point=mount_point, extra_configs=configs)

In [0]:
# List the currently registered mounts (mount point, source, encryption type)
# to confirm the container is mounted where expected.
display(dbutils.fs.mounts())

mountPoint,source,encryptionType
/mnt/employees,abfss://employees@datalake74.dfs.core.windows.net/,
/databricks-datasets,databricks-datasets,
/mnt/gold,abfss://silver@datalake74.dfs.core.windows.net/,
/mnt/silver,abfss://silver@datalake74.dfs.core.windows.net/,
/databricks/mlflow-tracking,databricks/mlflow-tracking,
/databricks-results,databricks-results,
/databricks/mlflow-registry,databricks/mlflow-registry,
/mnt/bronze,abfss://bronze@datalake74.dfs.core.windows.net/,
/,DatabricksRoot,


In [0]:
# Raw CSV inputs all live in the bronze folder of the mounted container.
bronze_dir = '/mnt/employees/bronze'
employees_path = f'{bronze_dir}/employees.csv'
departments_path = f'{bronze_dir}/departments.csv'
countries_path = f'{bronze_dir}/countries.csv'
locations_path = f'{bronze_dir}/locations.csv'

In [0]:
from pyspark.sql.types import IntegerType, StringType, DoubleType, StructField, StructType

# Explicit schemas for the bronze CSV files, declared field by field as
# (name, type, nullable). MANAGER_ID is the only field declared nullable.

employees_schema = (
    StructType()
    .add('EMPLOYEE_ID', IntegerType(), False)
    .add('FIRST_NAME', StringType(), False)
    .add('LAST_NAME', StringType(), False)
    .add('EMAIL', StringType(), False)
    .add('PHONE', StringType(), False)
    .add('DATE_HIRED', StringType(), False)  # kept as text here; parsed into a date later
    .add('JOB_ID', StringType(), False)
    .add('SALARY', IntegerType(), False)
    .add('MANAGER_ID', IntegerType(), True)
    .add('DEPARTMENT_ID', IntegerType(), False)
)

departments_schema = (
    StructType()
    .add('DEPARTMENT_ID', IntegerType(), False)
    .add('DEPARTMENT_NAME', StringType(), False)
)

countries_schema = (
    StructType()
    .add('COUNTRY_ID', StringType(), False)
    .add('COUNTRY_NAME', StringType(), False)
)

In [0]:
# Load the employees CSV: first row is the header, columns follow employees_schema.
employees = (
    spark.read
    .schema(employees_schema)
    .option('header', True)
    .csv(employees_path)
)

display(employees)

EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE,DATE_HIRED,JOB_ID,SALARY,MANAGER_ID,DEPARTMENT_ID
100,Steven,King,SKING,515.123.4567,09/01/2009,AD_PRES,24000,,90.0
101,Neena,Kochhar,NKOCHHAR,515.123.4568,12/07/2011,AD_VP,17000,100.0,90.0
102,Lex,De Haan,LDEHAAN,515.123.4569,03/31/2015,AD_VP,17000,100.0,90.0
103,Alexander,Hunold,AHUNOLD,590.423.4567,03/20/2012,IT_PROG,9000,102.0,60.0
104,Bruce,Ernst,BERNST,590.423.4568,08/05/2013,IT_PROG,6000,103.0,60.0
105,David,Austin,DAUSTIN,590.423.4569,09/10/2019,IT_PROG,4800,103.0,60.0
106,Valli,Pataballa,VPATABAL,590.423.4560,04/22/2020,IT_PROG,4800,103.0,60.0
107,Diana,Lorentz,DLORENTZ,590.423.5567,04/24/2021,IT_PROG,4200,103.0,60.0
108,Nancy,Greenberg,NGREENBE,515.124.4569,11/01/2016,FI_MGR,12000,101.0,100.0
109,Daniel,Faviet,DFAVIET,515.124.4169,10/31/2016,FI_ACCOUNT,9000,108.0,100.0


In [0]:
# Load the departments CSV: first row is the header, columns follow departments_schema.
departments = (
    spark.read
    .schema(departments_schema)
    .option('header', True)
    .csv(departments_path)
)

display(departments)

DEPARTMENT_ID,DEPARTMENT_NAME
10,Administration
20,Marketing
30,Purchasing
40,Human Resources
50,Shipping
60,IT
70,Public Relations
80,Sales
90,Executive
100,Finance


In [0]:
# Load the countries CSV: first row is the header, columns follow countries_schema.
countries = (
    spark.read
    .schema(countries_schema)
    .option('header', True)
    .csv(countries_path)
)

display(countries)

COUNTRY_ID,COUNTRY_NAME
AR,Argentina
AU,Australia
BE,Belgium
BR,Brazil
CA,Canada
CH,Switzerland
CN,China
DE,Germany
DK,Denmark
EG,Egypt


In [0]:
from pyspark.sql.functions import *

In [0]:
# Parse the MM/dd/yyyy text in DATE_HIRED into a date column named HIRE_DATE
hire_date = to_date(col('DATE_HIRED'), 'MM/dd/yyyy')
employees = employees.withColumn('HIRE_DATE', hire_date)
display(employees)

EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE,DATE_HIRED,JOB_ID,SALARY,MANAGER_ID,DEPARTMENT_ID,HIRE_DATE
100,Steven,King,SKING,515.123.4567,09/01/2009,AD_PRES,24000,,90.0,2009-09-01
101,Neena,Kochhar,NKOCHHAR,515.123.4568,12/07/2011,AD_VP,17000,100.0,90.0,2011-12-07
102,Lex,De Haan,LDEHAAN,515.123.4569,03/31/2015,AD_VP,17000,100.0,90.0,2015-03-31
103,Alexander,Hunold,AHUNOLD,590.423.4567,03/20/2012,IT_PROG,9000,102.0,60.0,2012-03-20
104,Bruce,Ernst,BERNST,590.423.4568,08/05/2013,IT_PROG,6000,103.0,60.0,2013-08-05
105,David,Austin,DAUSTIN,590.423.4569,09/10/2019,IT_PROG,4800,103.0,60.0,2019-09-10
106,Valli,Pataballa,VPATABAL,590.423.4560,04/22/2020,IT_PROG,4800,103.0,60.0,2020-04-22
107,Diana,Lorentz,DLORENTZ,590.423.5567,04/24/2021,IT_PROG,4200,103.0,60.0,2021-04-24
108,Nancy,Greenberg,NGREENBE,515.124.4569,11/01/2016,FI_MGR,12000,101.0,100.0,2016-11-01
109,Daniel,Faviet,DFAVIET,515.124.4169,10/31/2016,FI_ACCOUNT,9000,108.0,100.0,2016-10-31


In [0]:
# Remove the columns that are not carried forward (DATE_HIRED is superseded by HIRE_DATE)
unneeded_columns = ['EMAIL', 'PHONE', 'DATE_HIRED']
employees = employees.drop(*unneeded_columns)
display(employees)

EMPLOYEE_ID,FIRST_NAME,LAST_NAME,JOB_ID,SALARY,MANAGER_ID,DEPARTMENT_ID,HIRE_DATE
100,Steven,King,AD_PRES,24000,,90.0,2009-09-01
101,Neena,Kochhar,AD_VP,17000,100.0,90.0,2011-12-07
102,Lex,De Haan,AD_VP,17000,100.0,90.0,2015-03-31
103,Alexander,Hunold,IT_PROG,9000,102.0,60.0,2012-03-20
104,Bruce,Ernst,IT_PROG,6000,103.0,60.0,2013-08-05
105,David,Austin,IT_PROG,4800,103.0,60.0,2019-09-10
106,Valli,Pataballa,IT_PROG,4800,103.0,60.0,2020-04-22
107,Diana,Lorentz,IT_PROG,4200,103.0,60.0,2021-04-24
108,Nancy,Greenberg,FI_MGR,12000,101.0,100.0,2016-11-01
109,Daniel,Faviet,FI_ACCOUNT,9000,108.0,100.0,2016-10-31


In [0]:
# Recursively delete any previous silver output so the directory can be rewritten from scratch.
dbutils.fs.rm('/mnt/employees/silver', recurse=True)

In [0]:
# Persist the three DataFrames as parquet, one sub-folder each, under /mnt/employees/silver
save_path = '/mnt/employees/silver'
employees.write.format('parquet').save(f"{save_path}/employees")
departments.write.format('parquet').save(f"{save_path}/departments")
countries.write.format('parquet').save(f"{save_path}/countries")

### GOLD SECTION

In [0]:
# Build FULL_NAME as "<FIRST_NAME> <LAST_NAME>"
full_name = concat(col('FIRST_NAME'), lit(" "), col('LAST_NAME'))
employees = employees.withColumn('FULL_NAME', full_name)
display(employees)

EMPLOYEE_ID,FIRST_NAME,LAST_NAME,JOB_ID,SALARY,MANAGER_ID,DEPARTMENT_ID,HIRE_DATE,FULL_NAME
100,Steven,King,AD_PRES,24000,,90.0,2009-09-01,Steven King
101,Neena,Kochhar,AD_VP,17000,100.0,90.0,2011-12-07,Neena Kochhar
102,Lex,De Haan,AD_VP,17000,100.0,90.0,2015-03-31,Lex De Haan
103,Alexander,Hunold,IT_PROG,9000,102.0,60.0,2012-03-20,Alexander Hunold
104,Bruce,Ernst,IT_PROG,6000,103.0,60.0,2013-08-05,Bruce Ernst
105,David,Austin,IT_PROG,4800,103.0,60.0,2019-09-10,David Austin
106,Valli,Pataballa,IT_PROG,4800,103.0,60.0,2020-04-22,Valli Pataballa
107,Diana,Lorentz,IT_PROG,4200,103.0,60.0,2021-04-24,Diana Lorentz
108,Nancy,Greenberg,FI_MGR,12000,101.0,100.0,2016-11-01,Nancy Greenberg
109,Daniel,Faviet,FI_ACCOUNT,9000,108.0,100.0,2016-10-31,Daniel Faviet


In [0]:
# Target shape of the gold employees table:
#   employee_id, full_name, hire_date, job_id, salary, department_name

# Left-join departments so every employee row is kept, then drop the columns
# that are not part of the target table (both DEPARTMENT_ID columns included).
same_department = employees['DEPARTMENT_ID'] == departments['DEPARTMENT_ID']
employees = (
    employees
    .join(departments, on=same_department, how='left')
    .drop('FIRST_NAME', 'LAST_NAME', 'DEPARTMENT_ID', 'MANAGER_ID')
)
display(employees)

EMPLOYEE_ID,JOB_ID,SALARY,HIRE_DATE,FULL_NAME,DEPARTMENT_NAME
100,AD_PRES,24000,2009-09-01,Steven King,Executive
101,AD_VP,17000,2011-12-07,Neena Kochhar,Executive
102,AD_VP,17000,2015-03-31,Lex De Haan,Executive
103,IT_PROG,9000,2012-03-20,Alexander Hunold,IT
104,IT_PROG,6000,2013-08-05,Bruce Ernst,IT
105,IT_PROG,4800,2019-09-10,David Austin,IT
106,IT_PROG,4800,2020-04-22,Valli Pataballa,IT
107,IT_PROG,4200,2021-04-24,Diana Lorentz,IT
108,FI_MGR,12000,2016-11-01,Nancy Greenberg,Finance
109,FI_ACCOUNT,9000,2016-10-31,Daniel Faviet,Finance


In [0]:
# Put the columns into the final order used by the gold table
gold_columns = ['EMPLOYEE_ID', 'FULL_NAME', 'HIRE_DATE', 'JOB_ID', 'SALARY', 'DEPARTMENT_NAME']
employees = employees.select(*gold_columns)
display(employees)

EMPLOYEE_ID,FULL_NAME,HIRE_DATE,JOB_ID,SALARY,DEPARTMENT_NAME
100,Steven King,2009-09-01,AD_PRES,24000,Executive
101,Neena Kochhar,2011-12-07,AD_VP,17000,Executive
102,Lex De Haan,2015-03-31,AD_VP,17000,Executive
103,Alexander Hunold,2012-03-20,IT_PROG,9000,IT
104,Bruce Ernst,2013-08-05,IT_PROG,6000,IT
105,David Austin,2019-09-10,IT_PROG,4800,IT
106,Valli Pataballa,2020-04-22,IT_PROG,4800,IT
107,Diana Lorentz,2021-04-24,IT_PROG,4200,IT
108,Nancy Greenberg,2016-11-01,FI_MGR,12000,Finance
109,Daniel Faviet,2016-10-31,FI_ACCOUNT,9000,Finance


In [0]:
# Write the curated employees table as parquet under /mnt/employees/gold.
# Overwrite mode keeps the cell re-runnable: the default save mode raises an
# error when the target path already exists, and nothing clears the gold
# folder beforehand (unlike the silver folder).
save_path = '/mnt/employees/gold'
employees.write.mode("overwrite").parquet(f"{save_path}/employees")

In [0]:
%sql 
-- Create the employees database on first run; a no-op if it already exists.
CREATE DATABASE IF NOT EXISTS employees

In [0]:
%sql 
-- Register the gold parquet files as a table at a fixed location.
-- IF NOT EXISTS keeps the cell re-runnable once the table has been created.
-- Column names and types mirror the DataFrame written to the gold folder.
CREATE TABLE IF NOT EXISTS employees.employees
(
  EMPLOYEE_ID int, 
  FULL_NAME string, 
  HIRE_DATE date, 
  JOB_ID string, 
  SALARY int, 
  DEPARTMENT_NAME string
)
USING parquet 
LOCATION '/mnt/employees/gold/employees'

In [0]:
%sql 
-- Read the table back to check that it returns the gold employees data.
SELECT * FROM employees.employees;

EMPLOYEE_ID,FULL_NAME,HIRE_DATE,JOB_ID,SALARY,DEPARTMENT_NAME
100,Steven King,2009-09-01,AD_PRES,24000,Executive
101,Neena Kochhar,2011-12-07,AD_VP,17000,Executive
102,Lex De Haan,2015-03-31,AD_VP,17000,Executive
103,Alexander Hunold,2012-03-20,IT_PROG,9000,IT
104,Bruce Ernst,2013-08-05,IT_PROG,6000,IT
105,David Austin,2019-09-10,IT_PROG,4800,IT
106,Valli Pataballa,2020-04-22,IT_PROG,4800,IT
107,Diana Lorentz,2021-04-24,IT_PROG,4200,IT
108,Nancy Greenberg,2016-11-01,FI_MGR,12000,Finance
109,Daniel Faviet,2016-10-31,FI_ACCOUNT,9000,Finance


In [0]:
%sql 
-- Show the table's columns and types followed by its detailed table information.
DESCRIBE EXTENDED employees.employees

col_name,data_type,comment
EMPLOYEE_ID,int,
FULL_NAME,string,
HIRE_DATE,date,
JOB_ID,string,
SALARY,int,
DEPARTMENT_NAME,string,
,,
# Detailed Table Information,,
Database,employees,
Table,employees,
