In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, regexp_replace

def spark_read_csv_from_os(spark, file_path, schema, *, keep_rejected=False, **kwargs):
    """Read a delimited text file into a Spark DataFrame using an explicit schema.

    Args:
        spark: Active SparkSession (any object exposing
            ``read.options(...).schema(...).csv(...)`` works).
        file_path: Path of the CSV file(s) to read.
        schema: StructType describing the expected columns.
        keep_rejected: When True, a nullable string column named after the
            ``columnNameOfCorruptRecord`` option is appended to a copy of
            ``schema`` (unless it is already there). In PERMISSIVE mode Spark
            only keeps the raw text of malformed rows when the schema contains
            that column; without it the rows are kept with nulls and the
            original text is lost. Defaults to False, which leaves the schema
            untouched.
        **kwargs: Extra reader options; they override the defaults below
            (for example ``sep="|"``).

    Returns:
        The DataFrame, or None when building it raised an exception (the
        error message is printed).
    """
    read_options = {
        "inferSchema": "False",
        "header": "True",
        "quote": '"',
        "columnNameOfCorruptRecord": "rejected_records",
        "mode": "PERMISSIVE",
    }
    # Caller-supplied options win over the defaults.
    read_options.update(kwargs)

    try:
        if keep_rejected:
            rejected_col = read_options["columnNameOfCorruptRecord"]
            if rejected_col not in schema.fieldNames():
                # Build a new StructType so the caller's schema object is not mutated.
                schema = StructType(
                    schema.fields + [StructField(rejected_col, StringType(), True)]
                )

        return spark.read.options(**read_options).schema(schema).csv(file_path)
    except FileNotFoundError:
        # NOTE(review): a real SparkSession appears to report a missing path as
        # an AnalysisException, which is handled by the generic branch below;
        # this branch is kept for backward compatibility.
        print(f"Error: File not found at path: {file_path}")
        return None
    except Exception as e:
        # Only errors raised while building the DataFrame are caught here.
        # NOTE(review): Spark evaluates lazily, so row-level parse problems
        # presumably surface later, when an action such as show() runs.
        print(f"An error occurred while reading the CSV: {e}")
        return None

if __name__ == "__main__":
    # Demo: read a pipe-delimited test file with an explicit schema and show it.
    path = "/mnt/apps/Files/ETL4/TMP/test.csv"

    # Expected column layout of the test file.
    schema = StructType([
        StructField("Int", LongType(), True),
        StructField("Decimal", DecimalType(18, 2), True),
        # NOTE(review): the column is named "Float" but typed as a decimal — confirm this is intended.
        StructField("Float", DecimalType(18, 2), True),
        # Read as text because the value uses a decimal comma; normalised below.
        StructField("Money", StringType(), True),
        StructField("Bigint", LongType(), True),
        StructField("DateTime", TimestampType(), True),
        StructField("Date", DateType(), True),
    ])

    spark = (
        SparkSession.builder
        .appName("Testing")
        .master("local[*]")
        .config("spark.ui.port", "4222")
        .getOrCreate()
    )

    # try/finally guarantees the session is stopped even when the read or a
    # later step fails.
    try:
        df = spark_read_csv_from_os(spark, path, schema, sep="|")
        if df is None:
            # The helper returns None after printing the error; fail with a
            # clear message instead of an AttributeError on the next line.
            raise RuntimeError(f"Could not read CSV at {path}")

        # Swap the decimal comma for a dot; the column stays a string.
        df = df.withColumn("Money", regexp_replace(col("Money"), ",", "."))

        df.printSchema()
        df.show()
    finally:
        spark.stop()
    

root
 |-- Int: long (nullable = true)
 |-- Decimal: decimal(18,2) (nullable = true)
 |-- Float: decimal(18,2) (nullable = true)
 |-- Money: string (nullable = true)
 |-- Bigint: long (nullable = true)
 |-- DateTime: timestamp (nullable = true)
 |-- Date: date (nullable = true)

+---+-------+------+-------+------+-------------------+----------+
|Int|Decimal| Float|  Money|Bigint|           DateTime|      Date|
+---+-------+------+-------+------+-------------------+----------+
|  1| 141.23|141.23|4141.32|     0|2025-03-22 10:00:00|2025-03-22|
+---+-------+------+-------+------+-------------------+----------+



In [2]:
import os
from pyspark.sql import SparkSession


def loadTable(**kwargs):
    """Register an external Spark SQL table over files on the local filesystem.

    Keyword Args:
        path: Location of the data. A "/part*" glob suffix is stripped before
            the existence check; the value is used unchanged in the DDL.
        loadType: "Parquet" creates a Parquet-backed table. Any other value
            creates a pipe-delimited, gzip-compressed CSV table with a header
            row.
        tableName: Name of the table to create (no-op if it already exists).
        spark: Optional SparkSession to run the statement on. When omitted,
            the module-level ``sparkDqc`` session is used.

    Returns:
        True when the CREATE statement ran; None when the path does not exist
        or the statement failed (the error is printed).

    Note:
        ``tableName`` and ``path`` are interpolated verbatim into the SQL
        text, so only pass trusted values.
    """
    path = kwargs["path"]
    # Only the containing directory has to exist, so drop the part-file glob.
    if not os.path.exists(path.replace("/part*", "")):
        return None

    try:
        table_name = kwargs["tableName"]
        if kwargs["loadType"] == "Parquet":
            ddl = f"""
            CREATE EXTERNAL TABLE IF NOT EXISTS {table_name}
            USING PARQUET LOCATION '{path}'
            """
        else:
            ddl = f"""
            CREATE EXTERNAL TABLE IF NOT EXISTS {table_name}
            USING CSV
            OPTIONS (
                'path' '{path}',
                'delimiter' '|',
                'compression' 'gzip',
                'header' 'true'
            )
            """

        # Prefer an explicitly passed session; fall back to the notebook
        # global so existing callers keep working.
        session = kwargs.get("spark")
        if session is None:
            session = sparkDqc

        session.sql(ddl)
        return True
    except Exception as e:
        # Still return None, but report the cause instead of hiding it.
        print(f"An error occurred while creating table {kwargs.get('tableName')}: {e}")
        return None

if __name__ == "__main__":
    # Demo: expose a Parquet directory as a SQL table and query it.
    path = "/mnt/apps/Files/data-movement/Parquet/RTRNPF"

    # Must stay a module-level name: loadTable() reads the global `sparkDqc`.
    sparkDqc = (
        SparkSession.builder
        .appName("parquet")
        .master("local[*]")
        .config("spark.ui.port", "4222")
        .getOrCreate()
    )

    # try/finally guarantees the session is stopped even when the table
    # registration or the query fails.
    try:
        table_loaded = loadTable(path=path, loadType="Parquet", tableName="RTRNPF")
        if not table_loaded:
            # loadTable returns None for a missing path or a failed CREATE;
            # stop here rather than failing later with "table not found".
            raise RuntimeError(f"Table RTRNPF could not be registered from {path}")

        # Budget is stored with a decimal comma; convert it before casting.
        # The window has no PARTITION BY, so Spark moves all rows to a single
        # partition and logs a WindowExec warning.
        df_sql = sparkDqc.sql("SELECT CAST(REPLACE(Budget,',','.') AS DECIMAL(18,2)), Budget, SUM(CAST(REPLACE(Budget,',','.') AS DECIMAL(18,2))) OVER(ORDER BY NULL) SM FROM RTRNPF")
        # show() returns None, so keep the DataFrame and display it separately.
        df_sql.show(truncate=False)
    finally:
        sparkDqc.stop()

25/03/23 11:32:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/23 11:32:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/23 11:32:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/23 11:32:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/23 11:32:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------------------------------------+----------+-----------+
|CAST(replace(Budget, ,, .) AS DECIMAL(18,2))|Budget    |SM         |
+--------------------------------------------+----------+-----------+
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32                                  |1000232,32|18004181.76|
|1000232.32         