In [237]:
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, length, regexp_replace

def spark_read_csv_from_os(spark, file_path, schema, header=True, **options):
    """
    Load a CSV file into a Spark DataFrame using an explicit schema.

    Schema inference is always switched off; the caller-supplied ``schema``
    is handed to the reader instead.

    Args:
        spark: The SparkSession used to build the reader.
        file_path: Location of the CSV file, as understood by the Spark reader.
        schema: Schema passed through to ``spark.read.csv`` unchanged.
        header (bool, optional): Whether the CSV file has a header row. Defaults to True.
        **options: Extra keyword options forwarded to ``spark.read.csv``
                   (for example ``sep`` or ``quote``).

    Returns:
        The DataFrame produced by the reader, or None when the read raised.

    Note:
        Nothing is raised to the caller: every exception is caught, reported
        with ``print`` and converted into a ``None`` return value.
    """
    loaded = None
    try:
        loaded = spark.read.csv(
            file_path,
            schema=schema,
            header=header,
            inferSchema=False,
            **options,
        )
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
    except Exception as exc:  # anything else the reader raised (e.g. bad options)
        print(f"An error occurred while reading the CSV: {exc}")
    return loaded
    
def pandas_read_csv(file_path,**options):
    """
    Load a delimited file into a pandas DataFrame (intended for small files).

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.
        **options: Keyword options forwarded to ``pd.read_csv`` (e.g. ``sep``).

    Returns:
        The parsed DataFrame, or None when reading failed. Failures are
        reported with ``print`` instead of being raised.
    """
    try:
        return pd.read_csv(file_path, **options)
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
    except Exception as exc:  # any other failure raised by pd.read_csv
        print(f"An error occurred while reading the CSV: {exc}")
    return None
    
def construct_sql_schema(**kwargs):
    """
    Build a Spark ``StructType`` from a schema-definition file.

    The definition file is loaded with ``pandas_read_csv`` and must expose a
    ``ColumnName`` and a ``DataType`` column, e.g. ``AMT_RPT|Decimal(18,2)``.
    Passing the resulting schema to the CSV reader avoids schema inference on
    the data file.

    Args:
        **kwargs: ``path`` (schema-definition file) and ``sep`` (its
                  delimiter). Both keys are required.

    Returns:
        A StructType with one nullable field per definition row, in file
        order, or None when the definition file cannot be read or a row
        cannot be parsed (the problem is reported with ``print``).

    Notes:
        * Type keywords are matched as substrings of the declared type name,
          longest keyword first, so ``bigint`` resolves to ``LongType`` rather
          than being captured by the shorter ``int``.
        * A declared type that matches no keyword falls back to ``StringType``
          (with a printed warning) instead of being dropped; dropping a column
          would shift every following field of the schema.
        * ``Decimal`` without arguments yields ``DecimalType()``;
          ``Decimal(p)`` is read as scale 0.
    """
    type_mapping = {
        "varchar": StringType(),
        "nvarchar": StringType(),
        "int": IntegerType(),
        "bigint": LongType(),
        "date": DateType(),
        "decimal": DecimalType
    }
    # Longest keyword first, otherwise "int" would win over "bigint".
    keywords = sorted(type_mapping, key=len, reverse=True)

    path = kwargs["path"]
    df = pandas_read_csv(path, sep=kwargs["sep"])
    if df is None:
        # pandas_read_csv already reported why the file could not be loaded.
        return None

    fields = []
    for row in df.itertuples():
        try:
            # "Decimal(18,2)" -> base "decimal", arguments "18,2"
            base, _, arg_str = row.DataType.partition("(")
            base = base.strip().lower()
            arg_str = arg_str.strip().rstrip(")").strip()

            keyword = next((k for k in keywords if k in base), None)
            if keyword is None:
                print(
                    f"Warning: unsupported data type '{row.DataType}' for column "
                    f"'{row.ColumnName}' in {path}; falling back to StringType"
                )
                data_type = StringType()
            elif type_mapping[keyword] is DecimalType:
                if arg_str:
                    parts = [part.strip() for part in arg_str.split(",")]
                    precision = int(parts[0])
                    scale = int(parts[1]) if len(parts) > 1 else 0
                    data_type = DecimalType(precision, scale)
                else:
                    data_type = DecimalType()
            else:
                data_type = type_mapping[keyword]

            fields.append(StructField(row.ColumnName, data_type, True))
        except Exception as e:  # malformed row (e.g. missing DataType, non-numeric precision)
            print(f"Error processing file in construct schema {path}: {e}")
            return None
    return StructType(fields)

def validateDecimal(**kwargs):
    """
    Data-quality check: report decimal columns that contain NULL values.

    The DataFrame is expected to have been read with an explicit schema. In
    that setup a source value that cannot be parsed as a decimal is already
    NULL in the typed column, so a NULL count is used as the signal for
    invalid input. Values that were empty in the source are counted as well.

    Args:
        **kwargs:
            dtypes: Iterable of schema fields exposing ``name`` and
                ``dataType`` (e.g. a Spark ``StructType``). Required.
            df_contents: The Spark DataFrame to check. Required.
            source (optional): Label of the data/schema source used in the
                messages. When omitted, the module-level ``FullPathSchema``
                is used if it exists (kept for existing callers), otherwise
                ``"<unknown source>"``.

    Returns:
        tuple ``(has_invalid, msg, df_contents)``:
            has_invalid: True when at least one decimal column contains NULLs.
            msg: One line per offending column joined with newlines, or the
                string ``"True"`` when nothing was found.
            df_contents: The input DataFrame, unchanged.
    """
    df_contents = kwargs["df_contents"]

    source = kwargs.get("source")
    if source is None:
        # Backward compatibility: earlier callers relied on this global.
        source = globals().get("FullPathSchema", "<unknown source>")

    errors = []
    for field in kwargs["dtypes"]:
        if field.dataType.typeName() != "decimal":
            continue
        colName = field.name
        # One Spark job per decimal column: count rows where the typed value is NULL.
        null_count = df_contents.filter(df_contents[colName].isNull()).count()
        if null_count > 0:
            errors.append(
                f"Invalid {colName} values (containing only non-numeric characters) in {source}. Total count: {null_count}"
            )

    msg = "\n".join(errors) if errors else "True"
    return bool(errors), msg, df_contents
            
if __name__ == "__main__":
    # Driver: for every 'PAT' job in the job configuration, build the Spark
    # schema from its schema-definition file, read the data file with that
    # schema and run the decimal data-quality check.
    path = "/mnt/apps/Files/Config/master_job.csv"
    pathSchema = "/mnt/apps/Files/Schema/"

    # The job configuration is a pandas frame; keep it under its own name so
    # it is not shadowed by the Spark DataFrame created inside the loop.
    job_config = pandas_read_csv(path, sep="|")
    pat_jobs = [] if job_config is None else list(job_config.query("JobName == 'PAT'").itertuples())

    spark = None
    try:
        for row in pat_jobs:
            filePath = f"{row.SourceDirectory}/{row.FileName}.{row.FileType}"
            FullPathSchema = f"{pathSchema}{row.FileName}.{row.FileType}"
            spark = SparkSession.builder.appName(f"{row.JobName}").getOrCreate()

            df_dtype = construct_sql_schema(path=FullPathSchema, sep="|")
            if df_dtype is None:
                print(f"Skipping {filePath}: schema could not be built from {FullPathSchema}")
                continue

            df = spark_read_csv_from_os(spark, filePath, schema=df_dtype, quote='"', sep="|")
            if df is None:
                print(f"Skipping {filePath}: file could not be read")
                continue

            result, dqc_msg, df_final = validateDecimal(
                dtypes=df_dtype, df_contents=df, source=FullPathSchema
            )
            df_final.show()
            # The message is printed whether or not problems were found
            # ("True" means no decimal column contained NULLs).
            print(dqc_msg)
    finally:
        # Only stop a session that was actually created, even if a job failed.
        if spark is not None:
            spark.stop()

+--------------------+-------+--------------------+-------+
|                 COA|AMT_RPT|         Description|AMT_ORG|
+--------------------+-------+--------------------+-------+
|11111111111111111...|2000.32|Does this type ne...| 123.00|
|      11111111111111|   NULL|Does this type ne...|1111.00|
|         12211111111|3000.00|Does this type ne...|   NULL|
|11111111111111111...|2000.00|Does this type ne...|   NULL|
+--------------------+-------+--------------------+-------+

Invalid AMT_RPT values (containing only non-numeric characters) in /mnt/apps/Files/Schema/etl4pat.csv. Total count: 1
Invalid AMT_ORG values (containing only non-numeric characters) in /mnt/apps/Files/Schema/etl4pat.csv. Total count: 2


In [None]:
# Preview cell: read every 'PAT' job's data file with the schema built from
# its schema-definition file and display the result.
path = "/mnt/apps/Files/Config/master_job.csv"
pathSchema = "/mnt/apps/Files/Schema/"

# Keep the pandas job configuration separate from the Spark DataFrame `df`.
job_config = pandas_read_csv(path, sep="|")
pat_jobs = [] if job_config is None else list(job_config.query("JobName == 'PAT'").itertuples())

spark = None
try:
    for row in pat_jobs:
        filePath = f"{row.SourceDirectory}/{row.FileName}.{row.FileType}"
        FullPathSchema = f"{pathSchema}{row.FileName}.{row.FileType}"
        spark = SparkSession.builder.appName(f"{row.JobName}").getOrCreate()

        df_dtype = construct_sql_schema(path=FullPathSchema, sep="|")
        if df_dtype is None:
            print(f"Skipping {filePath}: schema could not be built from {FullPathSchema}")
            continue

        df = spark_read_csv_from_os(spark, filePath, schema=df_dtype, quote='"', sep="|")
        if df is None:
            # The reader already printed the reason; move on to the next job.
            print(f"Skipping {filePath}: file could not be read")
            continue

        df.show()
        # validateDecimal is not run in this cell; the earlier call was disabled.
finally:
    # Only stop a session that was actually created, even if a job failed.
    if spark is not None:
        spark.stop()


root
 |-- COA: string (nullable = true)
 |-- AMT_RPT: decimal(18,2) (nullable = true)
 |-- Description: string (nullable = true)

+--------------------+----------+--------------------+
|                 COA|   AMT_RPT|         Description|
+--------------------+----------+--------------------+
|11111111111111111...|   2000.32|Does this type ne...|
|      11111111111111|2000000.00|Does this type ne...|
|         12211111111|   3000.00|Does this type ne...|
|11111111111111111...|   2000.00|Does this type ne...|
+--------------------+----------+--------------------+

+--------------------+----------+--------------------+
|                 COA|   AMT_RPT|         Description|
+--------------------+----------+--------------------+
|11111111111111111...|   2000.32|Does this type ne...|
|      11111111111111|2000000.00|Does this type ne...|
|         12211111111|   3000.00|Does this type ne...|
|11111111111111111...|   2000.00|Does this type ne...|
+--------------------+----------+--------------------+

25/02/09 15:56:11 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 4, schema size: 3
CSV file: file:///mnt/apps/Files/ETL4/etl4pat.csv
25/02/09 15:56:11 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 4, schema size: 3
CSV file: file:///mnt/apps/Files/ETL4/etl4pat.csv


In [None]:
# Show the job-configuration rows whose Flag column equals 1.
path = "/mnt/apps/Files/Config/master_job.csv"

all_jobs = pandas_read_csv(path, sep="|")
df = all_jobs.query("Flag == 1")
df.head()


Unnamed: 0,JobName,SourceDirectory,FileName,FileType,Flag
0,Renova,/mnt/apps/Files/ETL4,customers,zip,1
1,PAT,/mnt/apps/Files/ETL4,etl4pat,csv,1


In [26]:
from pyspark.sql.types import *

# Prototype: turn the rows of a schema-definition file (ColumnName | DataType)
# into a list of Spark StructFields, keeping varchar lengths as VarcharType.
path = "/mnt/apps/Files/Schema/etl4pat.csv"
fields = []
type_mapping = {
    "varchar": VarcharType,
    "nvarchar": VarcharType,
    "int": IntegerType(),
    "bigint": LongType(),
    "date": DateType(),
    "decimal": DecimalType
}
# Keywords are matched as substrings; try the longest first so that "bigint"
# is not captured by the shorter "int".
keywords = sorted(type_mapping, key=len, reverse=True)

df = pandas_read_csv(path,sep="|")
print(df)

# pandas_read_csv returns None (after printing the reason) when the file is unreadable.
rows = [] if df is None else df.itertuples()

for row in rows:
    # "Decimal(18,2)" -> name "Decimal", data_type_str "18,2"
    name, _, data_type_str = row.DataType.partition("(")
    name = name.strip()
    data_type_str = data_type_str.strip().rstrip(")").strip()
    parts = [part.strip() for part in data_type_str.split(",")]
    name_lower = name.lower()
    print(data_type_str)

    keyword = next((k for k in keywords if k in name_lower), None)
    spark_type = type_mapping.get(keyword)

    if keyword is None:
        # Unknown type: keep the column (as a string) so field positions stay aligned.
        print(f"Warning: unsupported data type '{row.DataType}' for column '{row.ColumnName}'; using StringType")
        data_type = StringType()
    elif spark_type is VarcharType:
        if data_type_str.upper() == "MAX":
            data_type = VarcharType(4000)
        elif data_type_str:
            data_type = VarcharType(int(data_type_str))
        else:
            # No length declared: VarcharType needs one, so use an unbounded string.
            data_type = StringType()
    elif spark_type is DecimalType:
        if not data_type_str:
            data_type = DecimalType()
        else:
            # "Decimal(p)" without a scale is read as scale 0.
            data_type = DecimalType(int(parts[0]), int(parts[1]) if len(parts) > 1 else 0)
    else:
        data_type = spark_type

    fields.append(StructField(row.ColumnName, data_type, True))

print(fields)


    ColumnName       DataType
0          COA    Varchar(50)
1      AMT_RPT  Decimal(18,2)
2  Description   Varchar(MAX)
50
18,2
MAX
[StructField('COA', VarcharType(50), True), StructField('AMT_RPT', DecimalType(18,2), True), StructField('Description', VarcharType(4000), True)]


In [188]:
# Explore str.split(")") on a type string with arguments and one without.
# With "Decimal(2,0)" the head is "Decimal(2,0" and the rest is [''];
# with "VARCHAR" the head is "VARCHAR" and the rest is [].
for data_type_str in ("Decimal(2,0)", "VARCHAR"):
    data_type, *args = data_type_str.split(")")

    print(f"data_type: {data_type}")
    print(f"args: {args}")

data_type: Decimal(2,0
args: ['']
data_type: VARCHAR
args: []


In [374]:
# Scratch check: character count of a short comma-separated string.
data = "A,2"

data_length = len(data)
print(data_length)

3


In [521]:
# Smoke test: build the Spark schema for one schema-definition file and print it.
path = "/mnt/apps/Files/Schema/etl4pat.csv"
test = construct_sql_schema(sep="|", path=path)
print(test)

StructType([StructField('COA', VarcharType(100), True), StructField('AMT_RPT', DecimalType(2,0), True), StructField('Description', VarcharType(4000), True)])


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *  # Import data types for clarity

# Decimal columns must be fed decimal.Decimal values: createDataFrame's schema
# verification does not accept Python floats for a DecimalType field.
from decimal import Decimal

# Create a SparkSession (if you don't have one already)
spark = SparkSession.builder.appName("DataTypeExample").getOrCreate()

try:
    # Sample data (replace with your actual data)
    data = [
        ("Alice", 25, Decimal("2000.00")),
        ("Bob", 30, Decimal("2000.00")),
        ("Charlie", 22, Decimal("2000.00")),
    ]

    # Define the schema explicitly (best practice).
    # The precision/scale must be able to hold the sample values: 2000.00
    # does not fit DecimalType(2,0), so decimal(18,2) is used instead.
    schema = StructType([
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("height", DecimalType(18, 2), True)
    ])

    df = spark.createDataFrame(data, schema=schema)

    # Display the schema
    df.printSchema()
finally:
    # Stop the SparkSession even if building the DataFrame failed
    spark.stop()