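The first cell of this notebook sets up the foundational imports required for data engineering tasks using PySpark and Pandas. It includes utilities for data manipulation, schema definition, and Spark session access. The cells that follow create the `bronze` schema, load the raw CSV files from `Files/raw/`, add metadata columns to each dataset, and write the results as Delta tables in the `bronze` schema.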

In [None]:
import pyspark.sql.functions as F
import pandas as pd
from pyspark.sql.types import *
from notebookutils import mssparkutils as mu
from pyspark.sql import SparkSession

StatementMeta(, 78ecfaea-3ea8-4e67-992e-4915993dbd53, 4, Finished, Available, Finished)

In [None]:
# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# The tables written later in this notebook are saved under the "bronze"
# schema, so a failure here is reported with its cause and re-raised
# instead of being hidden behind a generic message.
try:
    spark.sql("CREATE SCHEMA if NOT exists bronze")
except Exception as ex:
    print(f"error in schemas: {ex}")
    raise
else:
    # Only reached when the statement above completed without raising.
    print("Schema created Bronze")

StatementMeta(, 78ecfaea-3ea8-4e67-992e-4915993dbd53, 5, Finished, Available, Finished)

Schema created Bronze


In [None]:
def _read_raw_csv(dataset_name):
    """Load ``Files/raw/<dataset_name>.csv`` as a Spark DataFrame.

    The first row is used as the header and column types are inferred by
    Spark (``inferSchema`` is enabled).
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(f"Files/raw/{dataset_name}.csv")
    )


# Load every raw dataset into its own DataFrame. A failed read is reported
# with its cause and re-raised; swallowing it would leave the DataFrame
# undefined and make later cells fail with an unrelated NameError.
try:
    df_brokers = _read_raw_csv("brokers")
    df_campaigns = _read_raw_csv("campaigns")
    df_clients = _read_raw_csv("clients")
    df_leads = _read_raw_csv("leads")
    df_projects = _read_raw_csv("projects")
    df_properties = _read_raw_csv("properties")
    df_sales = _read_raw_csv("sales")
except Exception as ex:
    print(f"error in Read: {ex}")
    raise

StatementMeta(, 78ecfaea-3ea8-4e67-992e-4915993dbd53, 6, Finished, Available, Finished)

In [None]:
def _add_metadata_columns(df):
    """Return *df* with the three bronze metadata columns set.

    Columns are added in this order:
    ``_created_``   - ``F.current_timestamp()``
    ``_server_``    - the constant ``"Azure_adsl"``
    ``_last_user_`` - the constant ``"notebookMC1"``
    """
    return (
        df.withColumn("_created_", F.current_timestamp())
        .withColumn("_server_", F.lit("Azure_adsl"))
        .withColumn("_last_user_", F.lit("notebookMC1"))
    )


# Build the "*_bronze" DataFrames from the raw DataFrames loaded earlier.
try:
    df_brokers_bronze = _add_metadata_columns(df_brokers)
    df_campaigns_bronze = _add_metadata_columns(df_campaigns)
    df_clients_bronze = _add_metadata_columns(df_clients)
    df_leads_bronze = _add_metadata_columns(df_leads)
    df_projects_bronze = _add_metadata_columns(df_projects)
    df_properties_bronze = _add_metadata_columns(df_properties)
    df_sales_bronze = _add_metadata_columns(df_sales)
except Exception as ex:
    # This cell performs no reads, so the message names the actual step
    # and includes the cause before re-raising.
    print(f"error in adding metadata columns: {ex}")
    raise

StatementMeta(, 78ecfaea-3ea8-4e67-992e-4915993dbd53, 7, Finished, Available, Finished)

In [None]:
def _overwrite_delta_table(frame, table_name):
    """Save *frame* as the Delta table *table_name* using overwrite mode."""
    delta_writer = frame.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(table_name)


# Persist each "*_bronze" DataFrame to its table in the bronze schema,
# one statement per table, in the same order as before.
_overwrite_delta_table(df_brokers_bronze, "bronze.brokers")
_overwrite_delta_table(df_campaigns_bronze, "bronze.campaigns")
_overwrite_delta_table(df_clients_bronze, "bronze.clients")
_overwrite_delta_table(df_leads_bronze, "bronze.leads")
_overwrite_delta_table(df_projects_bronze, "bronze.projects")
_overwrite_delta_table(df_properties_bronze, "bronze.properties")
_overwrite_delta_table(df_sales_bronze, "bronze.sales")



StatementMeta(, 78ecfaea-3ea8-4e67-992e-4915993dbd53, 8, Finished, Available, Finished)