In [0]:
%sql
-- Create the db_bronze database if it does not already exist.
CREATE DATABASE IF NOT EXISTS db_bronze;

In [0]:
%sql
-- Recreate the three db_bronze tables from scratch.
-- Each table is dropped (when present) and created again with an explicit schema.

-- Hired employees; datetime is stored as a string.
DROP TABLE IF EXISTS db_bronze.table_hired_employees;
CREATE TABLE db_bronze.table_hired_employees (
  id            BIGINT,
  name          STRING,
  datetime      STRING,
  department_id BIGINT,
  job_id        BIGINT
);

-- Departments.
DROP TABLE IF EXISTS db_bronze.table_departments;
CREATE TABLE db_bronze.table_departments (
  id         BIGINT,
  department STRING
);

-- Jobs.
DROP TABLE IF EXISTS db_bronze.table_jobs;
CREATE TABLE db_bronze.table_jobs (
  id  BIGINT,
  job STRING
);


In [0]:
%python
# coding: utf-8

# ||********************************************************************************************************
# || PROJECT        : POC - CHALLENGE GLOBANT
# || NAME           : challenge.py
# || TARGET TABLES  : db_bronze.table_hired_employees
# ||                  db_bronze.table_departments
# ||                  db_bronze.table_jobs
# || SOURCE FILES   : departments.csv
# ||                  hired_employees.csv
# ||                  jobs.csv
# || PURPOSE        : ETL - big data migration
# || TYPE           : pyspark
# || REPROCESSABLE  : NA
# || SCHEDULER      : NA
# || JOB            : NA
# || VERSION   DEVELOPER            DATE         DESCRIPTION
# || 1.1       ALEXIS DAVILA        21/03/23     Process creation
# *************************************************************************************************************

###
 # @section Import
 ##
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,DoubleType,FloatType,LongType

###
 # @section Resource configuration
 ##
# Reuse the active SparkSession if one exists; otherwise build one named "CSV to Database".
spark = SparkSession.builder.appName("CSV to Database").getOrCreate()

###
 # @section Functions
 ##


def insert_csv_to_db(csv_file,table_name,nombre_columnas):
    """
    Load a CSV file into a Delta table, renaming its columns positionally.

    The CSV is read with pandas, converted to a Spark DataFrame, renamed to
    ``nombre_columnas`` and written to ``table_name`` in overwrite mode.
    The foreign-key columns ``department_id`` and ``job_id`` are cast to
    ``LongType`` whenever they are present; missing values (NaN) in those
    columns are stored as NULL.

    :param csv_file: path or URL of the CSV file (anything ``pandas.read_csv`` accepts)
    :param table_name: name (including schema) of the table the data is written to
    :param nombre_columnas: column names to assign to the data, in file order
    """
    # Local import: the file-level import only brings `col` into scope.
    from pyspark.sql.functions import lit, nanvl

    # NOTE(review): header='infer' consumes the first CSV row as column names, which are
    # then discarded by toDF() below. Confirm the source files really have a header row;
    # if they do not, the first record is silently lost.
    data_test = pd.read_csv(csv_file, sep=',', header='infer')

    # Convert to Spark and assign the caller-provided column names positionally.
    df = spark.createDataFrame(data_test).toDF(*nombre_columnas)

    # Cast each key column independently, and only when the table actually has it.
    for key_column in ("department_id", "job_id"):
        if key_column not in nombre_columnas:
            continue
        key = col(key_column)
        # pandas represents missing integers as float NaN; map NaN to NULL before the
        # integer cast so that a missing key does not become a numeric value.
        if isinstance(df.schema[key_column].dataType, (DoubleType, FloatType)):
            key = nanvl(key, lit(None).cast(DoubleType()))
        df = df.withColumn(key_column, key.cast(LongType()))

    # Preview the rows that are about to be written.
    df.show()

    # Overwrite the destination table in Delta format.
    # NOTE(review): "batchsize" looks like a JDBC writer option; confirm it has any
    # effect on a Delta write.
    df.write.format("delta").mode("overwrite").option("batchsize", "1000").option("mergeSchema", "true").saveAsTable(table_name)

def backup_table(table_name, backup_path, backup_format="avro"):
    """
    Write a copy of a table to the file system.

    :param table_name: name of the table to back up
    :param backup_path: destination path of the backup
    :param backup_format: Spark data source format used for the backup files ("avro" by default)
    """
    # Obtain a writer over the table's current contents.
    table_writer = spark.table(table_name).write
    # Persist those contents at the backup location in the requested format.
    table_writer.format(backup_format).save(backup_path)
    
def restore_table(table_name, backup_path, backup_format="avro"):
    """
    Rebuild a table from a backup previously written to the file system.

    :param table_name: name of the table to restore
    :param backup_path: path where the backup is stored
    :param backup_format: Spark data source format of the backup files ("avro" by default)
    """
    # Load the backup into a DataFrame.
    backup_reader = spark.read.format(backup_format)
    restored = backup_reader.load(backup_path)
    # Replace the table contents with the restored data.
    restored.write.mode("overwrite").saveAsTable(table_name)

def main():
    """
    Load the three challenge CSV files into their db_bronze tables.

    All three tables are loaded: the report queries join hired employees with
    departments and jobs, so leaving the latter two empty makes every
    department and job come back as NULL.
    """
    base_url = "https://raw.githubusercontent.com/alexis18daes/databricks_challenge_repo/dev"

    # (source CSV, destination table, column names in file order)
    loads = [
        (
            f"{base_url}/hired_employees.csv",
            "db_bronze.table_hired_employees",
            ["id", "name", "datetime", "department_id", "job_id"],
        ),
        (
            f"{base_url}/departments.csv",
            "db_bronze.table_departments",
            ["id", "department"],
        ),
        (
            f"{base_url}/jobs.csv",
            "db_bronze.table_jobs",
            ["id", "job"],
        ),
    ]

    for csv_file, table_name, nombre_columnas in loads:
        insert_csv_to_db(csv_file, table_name, nombre_columnas)
    
# Run the load
main()

#spark.stop()

# Exit
#exit()



In [0]:
%sql
-- Number of employees hired for each job and department in 2021, divided by quarter.
-- The table must be ordered alphabetically by department and job.
--
-- final_date is derived by casting the datetime string to a timestamp and then to a date.
-- (to_timestamp('second', ts) would parse the literal 'second' using ts as a format and
-- yield NULL, which makes the 2021 filter discard every row.)
-- LEFT JOINs keep employees whose department or job is missing; they are grouped under NULL.

SELECT
  x.department,
  x.job,
  COUNT(CASE WHEN QUARTER(x.final_date) = 1 THEN x.id END) AS Q1,
  COUNT(CASE WHEN QUARTER(x.final_date) = 2 THEN x.id END) AS Q2,
  COUNT(CASE WHEN QUARTER(x.final_date) = 3 THEN x.id END) AS Q3,
  COUNT(CASE WHEN QUARTER(x.final_date) = 4 THEN x.id END) AS Q4
FROM (
  SELECT
    a.id,
    a.name,
    CAST(a.datetime::timestamp AS DATE) AS final_date,
    b.department,
    c.job
  FROM db_bronze.table_hired_employees a
  LEFT JOIN db_bronze.table_departments b
    ON a.department_id = b.id
  LEFT JOIN db_bronze.table_jobs c
    ON a.job_id = c.id
) x
WHERE YEAR(x.final_date) = 2021
GROUP BY x.department, x.job
ORDER BY x.department, x.job;




In [0]:
%sql
-- List of ids, name and number of employees hired of each department that hired more
-- employees than the mean of employees hired in 2021 for all the departments, ordered
-- by the number of employees hired (descending).
--
-- hires_2021 counts the 2021 hires per department once; it is reused both for the listing
-- and for the mean, instead of repeating the same subquery.
-- The hire date is obtained by casting the datetime string to a timestamp and then to a
-- date. (to_timestamp('second', ts) would parse the literal 'second' using ts as a format
-- and yield NULL, which makes the 2021 filter discard every row.)
-- The department id is returned as well, as the requirement above asks for it.

WITH hires_2021 AS (
  SELECT
    b.id         AS id,
    b.department AS department,
    COUNT(a.id)  AS hired
  FROM db_bronze.table_hired_employees a
  LEFT JOIN db_bronze.table_departments b
    ON a.department_id = b.id
  WHERE YEAR(CAST(a.datetime::timestamp AS DATE)) = 2021
  GROUP BY b.id, b.department
)
SELECT
  h.id,
  h.department,
  h.hired
FROM hires_2021 h
WHERE h.hired > (SELECT AVG(m.hired) FROM hires_2021 m)
ORDER BY h.hired DESC;

In [0]:
%sql
-- Scratch check of the string-to-date conversion: the ISO-8601 literal is cast to a
-- timestamp, truncated to the second, and finally cast to a date.
SELECT cast(to_timestamp(date_trunc('second','2025-12-15T08:27:34Z'::timestamp)) as date); 

"CAST(to_timestamp(date_trunc(second, 2025-12-15T08:27:34Z)) AS DATE)"
2025-12-15
