In [1]:
import sys
import os
from random import random
from operator import add
from tabulate import tabulate
import logging
import argparse
import pandas as pd
from threading import Thread
from queue import Queue
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, regexp_replace, lit
from pyspark.sql.types import *

def pandas_read_csv(file_path,**options):
    """
        Read small volume of data only using read.csv
        Args:
            **Options ----> Any
    """
    try:
        df = pd.read_csv(file_path,**options)
        return df
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return None
    except Exception as e:  # Catch other potential exceptions (e.g., parsing errors)
        print(f"An error occurred while reading the CSV: {e}")
        return None

def loadTable(**kwargs):
    pathCheck = kwargs["path"].replace("/part*","")
    if not os.path.exists(pathCheck):
        return None
    try:
        sparkDqc.sql(f"""
        CREATE EXTERNAL TABLE IF NOT EXISTS {kwargs["tableName"]}
        USING PARQUET LOCATION '{kwargs["path"]}'
        """)
        return True
    except Exception as e:
        return None
    
def ListTable():
    listCreatedTable = []
    tables = sparkDqc.sql("SHOW TABLES").collect()
    for table in tables:
       listCreatedTable.append(table.tableName)
    return listCreatedTable

def getTables(**kwargs):
    df = pandas_read_csv(kwargs["path"], sep="|")
    df = df.query(f"BatchName == '{batchname}'")
    print(tabulate(df.head(), headers='keys', tablefmt='pretty'))
    df_refTable = df.query("RefTable.notnull()")
    distinct_reftable = df_refTable[["BatchName","RefTable"]].drop_duplicates().rename(columns={"RefTable": "JobName"})
    df_job = pandas_read_csv(kwargs["LoadPath"], sep="|")
    df_job = df_job[["BatchName","JobName"]].query(f"BatchName == '{batchname}'")
    joined_df = pd.concat([df_job, distinct_reftable], ignore_index=True).drop_duplicates()
    print(tabulate(joined_df.head(), headers='keys', tablefmt='pretty'))
    return joined_df

if __name__ == "__main__":
    path = "/mnt/apps/Files/Config/DataQuality_Config.csv"
    LoadPath = "/mnt/apps/Files/Config/master_job.csv"
    parquetOutput = "/mnt/apps/Files/data-movement/Parquet/"
    batchname = "BATCH_ACT_VAL"
    testWrite = "/mnt/apps/Files/Test"

    getDf = getTables(path=path, LoadPath=LoadPath)

    sparkDqc = (
        SparkSession
            .builder
            .appName(f"{batchname}_DQC")
            .master("local[*]")
            .config("spark.ui.port", "4222")
            .getOrCreate()
    )

    listJob = []
    for row in getDf.itertuples():
        listJob.append(row.JobName.lower())
        df_output = loadTable(path=parquetOutput + '/' + row.JobName + '/part*', tableName=row.JobName)
        
    print(ListTable())
    print(listJob)

    if all(item in ListTable() for item in listJob):
        df = sparkDqc.sql("""
                        SELECT COUNT(1) JML FROM PEOPLEPF
                        """)
        
        df.repartition(1).write.parquet(testWrite)
        print(df.show())
    else:
        sys.exit(1)

    #sparkDqc.stop()

+---+---------------+----------+------------------------------------+-----+----------+
|   |   BatchName   |  DqcId   |              Scripts               | Run | RefTable |
+---+---------------+----------+------------------------------------+-----+----------+
| 0 | BATCH_ACT_VAL | DQ000001 |  SELECT COUNT(1) CNT FROM ACMVPF   |  1  |   nan    |
| 1 | BATCH_ACT_VAL | DQ000002 |  SELECT COUNT(1) CNT FROM GENLPF   |  1  |   nan    |
| 2 | BATCH_ACT_VAL | DQ000003 | SELECT COUNT(1) CNT FROM PEOPLEPLF |  1  | PEOPLEPF |
+---+---------------+----------+------------------------------------+-----+----------+
+---+---------------+----------+
|   |   BatchName   | JobName  |
+---+---------------+----------+
| 0 | BATCH_ACT_VAL |  ACMVPF  |
| 1 | BATCH_ACT_VAL |  GENLPF  |
| 2 | BATCH_ACT_VAL | PEOPLEPF |
+---+---------------+----------+


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/28 17:14:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


['acmvpf', 'genlpf', 'peoplepf']
['acmvpf', 'genlpf', 'peoplepf']


AnalysisException: [PATH_ALREADY_EXISTS] Path file:/mnt/apps/Files/Test already exists. Set mode as "overwrite" to overwrite the existing path.

In [1]:
sparkDqc.stop()

NameError: name 'sparkDqc' is not defined

In [7]:
from pyspark.sql import SparkSession

spark = (
        SparkSession
        .builder
        .appName("Locally")
        .master("local[*]")
        .config("spark.ui.port", "4222")
        .config("spark.executor.instances", "2")  # Initial number of executors (if dynamicAllocation is false)
        .config("spark.executor.cores", "2")  # Cores per executor
        .config("spark.cores.max", "3") # Maximum total cores for the application
        .config("spark.dynamicAllocation.enabled", "true") #Enable dynamic allocation
        .config("spark.dynamicAllocation.minExecutors", "1") #minimum executors
        .config("spark.dynamicAllocation.maxExecutors", "5") #maximum executors
        .getOrCreate()
)

spark

In [9]:
spark.stop()