In [1]:
import os

import findspark

# Raw string: backslashes in a Windows path must not be read as escape sequences
# ('\s', '\j', '\p' are invalid escapes and raise warnings on recent Pythons).
SPARK_HOME = r'C:\spark-3.4.1-bin-hadoop3'

# findspark.init() has to run before pyspark is imported so the Spark
# installation above is the one that ends up on sys.path.
findspark.init(SPARK_HOME)

import pyspark
# Wildcard kept on purpose: notebooks executed later with `%run -i` share this
# namespace and may rely on names exported by `delta` — TODO confirm, then
# narrow to explicit imports.
from delta import *

# Extra Maven artifacts needed to reach Azure Data Lake Storage (abfss://).
# They are handed to configure_spark_with_delta_pip() through `extra_packages`:
# that helper sets "spark.jars.packages" itself, so a value configured directly
# on the builder would be overwritten and these artifacts never downloaded.
EXTRA_PACKAGES = [
    "org.apache.hadoop:hadoop-azure:3.3.6",
    "com.microsoft.azure:azure-storage:8.6.6",
]

# Spark does not expand "%SPARK_HOME%" (that is cmd.exe syntax), so build the
# real path to the PostgreSQL JDBC driver from the installation directory.
POSTGRES_JDBC_JAR = os.path.join(SPARK_HOME, "jars", "postgresql-42.5.4.jar")

# Parenthesised chain instead of trailing backslashes: a dangling "\" before a
# blank line is easy to break when the cell is edited.
builder = (
    pyspark.sql.SparkSession.builder.appName("globant_test")
    # Enable Delta Lake SQL support and the Delta-aware catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Overwrite only the partitions present in the written data.
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .config("spark.jars", POSTGRES_JDBC_JAR)
)

spark = configure_spark_with_delta_pip(builder, extra_packages=EXTRA_PACKAGES).getOrCreate()

In [2]:
import configparser

CONFIG_FILE = 'config.ini'

config = configparser.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed; fail here with a clear message instead of a later
# "KeyError: 'azure'".
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Configuration file '{CONFIG_FILE}' was not found or could not be read")

# All settings (storage account and database) live in the [azure] section.
azure_cfg = config['azure']

account_key = azure_cfg['accountKey']
account_name = azure_cfg['accountName']
raw_container = azure_cfg['rawContainer']
silver_container = azure_cfg['silverContainer']
db_host = azure_cfg['dbHost']
db_name = azure_cfg['dbName']
db_user = azure_cfg['dbUser']
db_pass = azure_cfg['dbPass']

# Register the storage account key with Hadoop so abfss:// paths on this
# account can be accessed from the Spark session created earlier.
spark._jsc.hadoopConfiguration().set(f"fs.azure.account.key.{account_name}.dfs.core.windows.net", account_key)

# Root URIs of the raw and silver containers, plus the folder name passed
# alongside them to the harmonize / migrate calls further down.
raw_fs = f'abfss://{raw_container}@{account_name}.dfs.core.windows.net'
hmz_fs = f'abfss://{silver_container}@{account_name}.dfs.core.windows.net'
file_path = 'globant'

In [3]:
# Execute harmonize.ipynb inside this notebook's namespace (-i), so it can see
# the variables defined above and the names it defines become available here.
# Presumably this is where Harminozator (used below) is defined — confirm.
%run -i harmonize.ipynb

In [4]:
# Execute migration.ipynb inside this notebook's namespace (-i).
# Presumably this is where Migrator (used below) is defined — confirm.
%run -i migration.ipynb

In [5]:
# Create the harmonization helper; the class is not defined in this notebook.
# NOTE(review): "Harminozator" looks like a misspelling of "Harmonizer", but the
# name must match the definition loaded via %run, so it is left unchanged here.
hmz = Harminozator()

In [6]:
# Run harmonization against the raw container URI and the 'globant' folder
# name, then write the result using the silver container URI.
# Presumably harmonize() reads from raw_fs/file_path and writeSilver() writes
# to hmz_fs/file_path — confirm in harmonize.ipynb.
hmz.harmonize(raw_fs, file_path)
hmz.writeSilver(hmz_fs, file_path)

In [7]:
# Build the migrator from the harmonizer's `config` attribute, so this cell
# depends on `hmz` having been created above.
mig = Migrator(hmz.config)

In [8]:
# Run the migration with the silver container URI, the folder name and the
# database connection settings read from config.ini.
# Presumably this loads the silver data into the PostgreSQL database — confirm
# in migration.ipynb.
mig.migrate(hmz_fs, file_path, db_host, db_name, db_user, db_pass)

In [23]:
# Execute Reports.ipynb inside this notebook's namespace (-i).
# Presumably this is where Reporter (used below) is defined — confirm.
%run -i Reports.ipynb

In [24]:
# Create the reporting helper; the class is not defined in this notebook.
rpt = Reporter()

# Fetch the hires-per-quarter report using the database settings from
# config.ini and print it without truncating cell contents. The output below
# has the columns department, job, Q1, Q2, Q3 and Q4.
# Presumably the returned object is a Spark DataFrame — confirm in Reports.ipynb.
hired_by_q = rpt.getHiredByQuarter(db_host, db_name, db_user, db_pass)
hired_by_q.show(truncate=False)

+----------+-----------------------------+---+---+---+---+
|department|job                          |Q1 |Q2 |Q3 |Q4 |
+----------+-----------------------------+---+---+---+---+
|Accounting|Account Representative IV    |1  |0  |0  |0  |
|Accounting|Actuary                      |0  |1  |0  |0  |
|Accounting|Analyst Programmer           |0  |0  |1  |0  |
|Accounting|Budget/Accounting Analyst III|0  |1  |0  |0  |
|Accounting|Cost Accountant              |0  |1  |0  |0  |
|Accounting|Database Administrator III   |0  |0  |0  |1  |
|Accounting|Desktop Support Technician   |0  |0  |1  |0  |
|Accounting|Food Chemist                 |1  |0  |0  |0  |
|Accounting|Graphic Designer             |0  |1  |0  |0  |
|Accounting|Health Coach III             |0  |0  |0  |1  |
|Accounting|Health Coach IV              |0  |0  |1  |0  |
|Accounting|Help Desk Technician         |0  |0  |1  |0  |
|Accounting|Junior Executive             |0  |0  |1  |0  |
|Accounting|Legal Assistant              |0  |0  |1  |1 

In [25]:
# Fetch the hires-per-department report using the same database settings and
# print it without truncating cell contents. The output below has the columns
# id, department and hired. Requires `rpt` from the previous cell.
hired_by_dep = rpt.getHiredByDepartment(db_host, db_name, db_user, db_pass)
hired_by_dep.show(truncate=False)

+---+------------------------+-----+
|id |department              |hired|
+---+------------------------+-----+
|8  |Support                 |221  |
|5  |Engineering             |208  |
|7  |Services                |204  |
|6  |Human Resources         |204  |
|4  |Business Development    |187  |
|3  |Research and Development|151  |
|9  |Marketing               |143  |
|10 |Training                |114  |
|2  |Sales                   |94   |
|11 |Legal                   |57   |
|1  |Product Management      |49   |
|12 |Accounting              |38   |
|-1 |N/D                     |15   |
+---+------------------------+-----+

