In [None]:
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, length, regexp_replace, lit, udf, count, trim
from datetime import datetime

def spark_read_csv_from_os(spark, file_path, schema, **kwargs):
    """
    Read CSV file(s) into a Spark DataFrame and separate the rejected rows.

    The supplied schema is extended with a ``rejected_records`` string column,
    which is configured as the reader's ``columnNameOfCorruptRecord`` in
    PERMISSIVE mode. Rows where that column is populated are annotated with a
    best-effort explanation of which typed fields failed to parse.

    Args:
        spark: The SparkSession object.
        file_path: Path (or glob) of the CSV file(s); must be reachable by Spark.
        schema: StructType describing the expected columns of the file.
        **kwargs: Extra options for the Spark CSV reader (e.g. ``sep``,
                  ``quote``, ``escape``). They override the defaults below.

    Returns:
        A tuple ``(df, rejected_df)``:
          * ``df`` - all rows read, without the ``rejected_records`` column.
          * ``rejected_df`` - only rows whose ``rejected_records`` was set,
            with an ``error_details`` column replacing it.
        ``None`` is returned (after printing the error) when an exception is
        raised while the DataFrames are being set up, so callers must check
        the result before unpacking it.
    """
    base_options = {
        "inferSchema": "False",
        "header": "True",
        "quote": '"',
        "columnNameOfCorruptRecord": "rejected_records",
        "mode": "PERMISSIVE"
    }
    base_options.update(kwargs)

    # The raw rejected line has to be split with the same delimiter/quote the
    # reader was configured with, not a hard-coded comma.
    delimiter = str(base_options.get("sep", base_options.get("delimiter", ",")))
    quote = base_options.get("quote", '"')

    try:
        data_fields = list(schema.fields)
        schema = StructType(data_fields + [StructField("rejected_records", StringType(), True)])
        df = spark.read.options(**base_options).schema(schema).csv(file_path)

        def parse_and_identify_errors(rejected_str, delimiter=delimiter, quote=quote):
            """Return 'column: reason' pairs for a rejected line, or None."""
            import csv  # imported here so the UDF carries its own dependency

            if not rejected_str:
                return None
            try:
                if len(delimiter) == 1 and quote and len(quote) == 1:
                    # csv honours quoted fields that contain the delimiter
                    parts = next(
                        csv.reader([rejected_str], delimiter=delimiter, quotechar=quote),
                        [],
                    )
                else:
                    parts = rejected_str.split(delimiter)

                problems = []
                for i, field in enumerate(data_fields):
                    try:
                        if field.dataType == IntegerType():
                            int(parts[i])
                        elif field.dataType == DateType():
                            datetime.strptime(parts[i], "%Y-%m-%d").date() #change date format
                        # Add more data type checks as needed
                    except (ValueError, IndexError):
                        problems.append(f"{field.name}: Invalid {field.dataType.typeName()}")

                # None means none of the checks above could explain the rejection
                return ", ".join(problems) if problems else None
            except Exception as e:
                print(f"Error parsing record: {e}, Rejected String: {rejected_str}") #debugging print
                return f"Error parsing record: {e}"

        parse_errors_udf = udf(parse_and_identify_errors, StringType())

        rejected_df = df.filter(col("rejected_records").isNotNull()).withColumn(
            "error_details", parse_errors_udf(col("rejected_records"))
        )

        df = df.drop("rejected_records")
        rejected_df = rejected_df.drop("rejected_records")

        return df, rejected_df
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return None
    except Exception as e:  # Catch other potential exceptions (e.g., parsing errors)
        print(f"An error occurred while reading the CSV: {e}")
        return None
    
def pandas_read_csv(file_path,**options):
    """
    Load a small delimited file into a pandas DataFrame.

    Args:
        file_path: Location of the file handed to ``pd.read_csv``.
        **options: Forwarded unchanged to ``pd.read_csv`` (e.g. ``sep``).

    Returns:
        The parsed DataFrame, or None when reading fails; the failure is
        printed rather than raised.
    """
    frame = None
    try:
        frame = pd.read_csv(file_path, **options)
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
    except Exception as e:  # any other read/parse problem
        print(f"An error occurred while reading the CSV: {e}")
    return frame
    
def construct_sql_schema(**kwargs):
    """
    Build a Spark StructType from a schema-definition file.

    The definition file is read with ``pandas_read_csv`` and must provide a
    ``ColumnName`` and a ``DataType`` column (e.g. ``Varchar(100)``,
    ``Decimal(18,2)``, ``BigInt``). Supplying an explicit schema avoids
    schema inference when the data files are read.

    Args:
        **kwargs:
            path: Location of the schema-definition file.
            sep: Delimiter used in that file.

    Returns:
        A StructType with one nullable field per definition row, in file
        order, or None (after printing the reason) when the file cannot be
        read or a row cannot be processed.
    """
    # More specific keywords come first because matching is by substring:
    # "bigint" contains "int" and "nvarchar" contains "varchar".
    type_mapping = [
        ("nvarchar", StringType()),
        ("varchar", StringType()),
        ("bigint", LongType()),
        ("int", IntegerType()),
        ("date", DateType()),
        ("decimal", DecimalType),
    ]

    df = pandas_read_csv(kwargs["path"],sep=kwargs["sep"])
    if df is None:
        print(f"Error processing file in construct schema {kwargs['path']}: schema file could not be read")
        return None

    fields = []
    for row in df.itertuples():
        try:
            base_name, _, type_args = row.DataType.partition("(")
            name_lower = base_name.strip().lower()
            type_args = type_args.strip().rstrip(")").strip()

            data_type = None
            for keyword, spark_type in type_mapping:
                if keyword not in name_lower:
                    continue
                if spark_type is DecimalType:
                    if type_args:
                        parts = [part.strip() for part in type_args.split(",")]
                        # Decimal(p) without a scale means scale 0
                        scale = int(parts[1]) if len(parts) > 1 else 0
                        data_type = DecimalType(int(parts[0]), scale)
                    else:
                        data_type = DecimalType()
                else:
                    data_type = spark_type
                break

            if data_type is None:
                # Dropping the column would shift every later column of a
                # positional CSV schema, so keep it as a string instead.
                print(f"Warning: unsupported data type '{row.DataType}' for column {row.ColumnName}; using string")
                data_type = StringType()

            fields.append(StructField(row.ColumnName, data_type, True))
        except Exception as e:  # Catch other potential errors
            print(f"Error processing file in construct schema {kwargs['path']}: {e}")
            return None
    return StructType(fields)

def validateDecimal(**kwargs):
    """
    Count NULL values in every decimal/int column of a DataFrame.

    A column is checked when the string form of its data type contains
    "decimal" or "int". The check counts NULLs only, so it cannot tell a
    value that failed to parse from one that was genuinely empty
    (NOTE(review): presumably the reader turns unparseable values into NULL
    because the schema is typed - confirm against the read options used).

    Args:
        **kwargs:
            df_contents: Spark DataFrame to check.
            dtypes: Iterable of StructField (e.g. the StructType used to
                    read ``df_contents``).

    Returns:
        A tuple ``(has_errors, messages, df_contents, dqcId)``. Note that the
        first element is True when at least one column FAILED the check.
        ``messages`` holds one entry per failing column, or a single
        "passed" message when nothing failed.
    """
    df_contents = kwargs["df_contents"]
    dqcId = "DQ000001"
    errors = []

    for field in kwargs["dtypes"]:
        dType = str(field.dataType).lower()
        if "decimal" not in dType and "int" not in dType:
            continue

        colName = field.name
        # One Spark action per checked column
        empty_count = df_contents.filter(col(colName).isNull()).count()
        if empty_count > 0:
            errors.append(f"Invalid {colName} values (containing only non-numeric characters). Total count: {empty_count}")

    has_errors = bool(errors)
    if not has_errors:
        errors.append("DDL Decimal/Int Data Type Structure Checks Passed.")
    return has_errors, errors, df_contents, dqcId

def writeOptions(df, dataMovement, **kwargs):
    """
    Write ``df`` as CSV to ``dataMovement``, replacing any existing output.

    The DataFrame is coalesced to one partition before writing. Defaults are
    a header row, "|" as delimiter and '"' as quote character; any keyword
    argument overrides or extends these writer options.
    """
    writer_options = {
        "header": "true",
        "delimiter": "|",
        "quote": '"',
        **kwargs,  # caller-supplied options win over the defaults
    }
    single_partition = df.coalesce(1)
    writer = single_partition.write.format("csv").mode("overwrite")
    writer.options(**writer_options).save(dataMovement)

def writeToParquet(df, path):
    """
    Write ``df`` to ``path`` as snappy-compressed Parquet.

    The DataFrame is coalesced to 50 partitions first and existing output at
    ``path`` is overwritten.
    """
    coalesced = df.coalesce(50)
    coalesced.write.parquet(path, mode="overwrite", compression="snappy")

def loadTable(**kwargs):
    """
    (Re)create a CSV-backed table over a location and return its contents.

    Uses the module-level ``spark`` session.

    Args:
        **kwargs:
            tableName: Bare table name; the table lives in
                       ``spark_catalog.default``.
            path: Location of the pipe-delimited CSV files.

    Returns:
        A DataFrame selecting every row of the newly created table.
    """
    # One fully-qualified name for DROP, CREATE and SELECT, so the table that
    # is dropped is always the one that is recreated and queried.
    table = f"spark_catalog.default.{kwargs['tableName']}"
    location = kwargs["path"]

    spark.sql(f"DROP TABLE IF EXISTS {table}")
    spark.sql(f"""
    CREATE EXTERNAL TABLE {table}
    USING CSV
    OPTIONS (
        header 'true',  -- If your CSV has a header row
        inferSchema 'false', -- No type inference is requested
        delimiter '|' -- Specify the delimiter if it's not a comma
    )
        LOCATION '{location}'
    """)
    return spark.sql(f"SELECT * FROM {table}")


if __name__ == "__main__":
    # Driver: run the DDL data-quality check for the selected jobs and collect
    # the outcome of every job in dqcOutput (one record per rejection reason
    # when rows were rejected, otherwise one "passed" record).
    path = "/mnt/apps/Files/Config/master_job.csv"
    pathSchema = "/mnt/apps/Files/Schema/"
    outputFile = "/mnt/apps/Files/data-movement/"
    parquetOutput = "/mnt/apps/Files/data-movement/Parquet/"
    dqcOutput = []
    dqcId = "DQ000001"

    def _record_dqc(job, records, message, status):
        """Append one data-quality result for ``job`` to dqcOutput."""
        dqcOutput.append({"JobName":job.JobName, "Path":job.SourceDirectory, "dqID":dqcId, "CountRecords":records, "Message":message, "Status":status})

    spark = SparkSession. \
        builder. \
        appName("Testing") \
        .master("local[*]") \
        .config("spark.ui.port", "4222") \
        .getOrCreate()

    try:
        job_config = pandas_read_csv(path,sep="|")
        if job_config is None:
            raise RuntimeError(f"Job configuration could not be read: {path}")
        job_config = job_config.query("JobName == 'RTRNPF' | JobName == 'ACMVPF'")

        # The job list is a small pandas frame, so iterating it locally is fine
        for row in job_config.itertuples():
            filePath = row.SourceDirectory + '/' + row.FileName + '*.' + row.FileType
            filePath = filePath.replace("/gcs", "/Files")
            dataMovement = outputFile + row.JobName + '/'
            dataMovementParquet = parquetOutput + row.JobName
            FullPathSchema = pathSchema + row.FileName + '.' + row.FileType

            df_dtype = construct_sql_schema(path=FullPathSchema, sep="|")
            if df_dtype is None:
                _record_dqc(row, 0, f"Schema could not be built from {FullPathSchema}", "Failed")
                continue

            read_result = spark_read_csv_from_os(spark, filePath, schema=df_dtype, sep=row.Delimiter)
            if read_result is None:
                # The reader already printed the underlying error
                _record_dqc(row, 0, f"File could not be read: {filePath}", "Failed")
                continue
            df, rejected_df = read_result

            # Cached because it is counted, shown and aggregated below
            rejected_df.cache()
            try:
                rejected_count = rejected_df.count()
                if rejected_count > 0:
                    rejected_df.show(truncate=False)
                    # Group by error_details
                    rejCnt = rejected_df.groupBy("error_details").agg(count("*").alias("total"))
                    rejCnt.show(truncate=False)
                    # Report every rejection reason, not just the first group
                    for err_msg in rejCnt.collect():
                        _record_dqc(row, err_msg.total, err_msg.error_details, "Failed")
                else:
                    df_count = df.count()
                    #writeOptions(df, dataMovement)
                    _record_dqc(row, df_count, "DDL Data Quality Check Passed !!!!", "Successful")
            finally:
                rejected_df.unpersist()
    finally:
        # Always report what was collected and release the session
        print(dqcOutput)
        spark.stop()

                                                                                

+-----+---------------+----------+---------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+
|Index|    Customer Id|First Name|Last Name|             Company|            City|             Country|             Phone 1|             Phone 2|               Email|Subscription Date|             Website|       error_details|
+-----+---------------+----------+---------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+
| NULL|4962fdbE6Bfee6D|       Pam|   Sparks|        Patel-Deleon|      Blakemouth|British Indian Oc...|    267-243-9490x035|    480-078-0535x889|nicolas00@faulkne...|       2020-11-29| https://nelson.com/|DDL Error please ...|
|    2|9b12Ae76fdBc9bE|      Gina|    Rocha|Acosta, Paul and ...|East Lynnchester|          

                                                                                

+-----+---------------+----------+---------+-----------------------+----------------+---------------------------------------------------+---------------------+---------------------+--------------------------------+-----------------+----------------------------+------------------------------------+
|Index|Customer Id    |First Name|Last Name|Company                |City            |Country                                            |Phone 1              |Phone 2              |Email                           |Subscription Date|Website                     |error_details                       |
+-----+---------------+----------+---------+-----------------------+----------------+---------------------------------------------------+---------------------+---------------------+--------------------------------+-----------------+----------------------------+------------------------------------+
|NULL |4962fdbE6Bfee6D|Pam       |Sparks   |Patel-Deleon           |Blakemouth      |British Indian Oce

25/03/01 09:48:01 WARN JavaUtils: Attempt to delete using native Unix OS command failed for path = /tmp/spark-a54f9980-7041-4384-92c9-2845a307b762/pyspark-fd0ef3d4-5c74-43dc-a464-829976b693ea. Falling back to Java IO way
java.io.IOException: Failed to delete: /tmp/spark-a54f9980-7041-4384-92c9-2845a307b762/pyspark-fd0ef3d4-5c74-43dc-a464-829976b693ea
	at org.apache.spark.network.util.JavaUtils.deleteRecursivelyUsingUnixNative(JavaUtils.java:174)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:109)
	at org.apache.spark.network.util.JavaUtils.deleteRecursively(JavaUtils.java:90)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively(SparkFileUtils.scala:121)
	at org.apache.spark.util.SparkFileUtils.deleteRecursively$(SparkFileUtils.scala:120)
	at org.apache.spark.util.Utils$.deleteRecursively(Utils.scala:1126)
	at org.apache.spark.util.ShutdownHookManager$.$anonfun$new$4(ShutdownHookManager.scala:65)
	at org.apache.spark.util.ShutdownHookManager$.$anonfun

In [291]:
# Manually stop the Spark session referenced by the notebook-level `spark`.
spark.stop()
# Uncomment to view the reader's documentation:
#help(spark.read.csv)

In [278]:
# Drop the cached copy of the notebook-level rejected-rows DataFrame.
rejected_df.unpersist()

DataFrame[Index: int, Customer Id: string, First Name: string, Last Name: string, Company: string, City: string, Country: string, Phone 1: string, Phone 2: string, Email: string, Subscription Date: date, Website: string, error_details: string]

In [249]:
from pyspark.sql import SparkSession

# Scratch cell: shows what a collected Row looks like for an
# (error_details, total) aggregate.

# Initialize SparkSession
spark1 = SparkSession.builder.appName("CreateDataFrameExample").getOrCreate()

# A single sample row
data = [
    ("invalid",6)
]

# Column names only; the column types are not declared here
schema = ["error_details", "total"]

df = spark1.createDataFrame(data, schema=schema)

# collect() returns a list of Row objects; take the first one
a = df.collect()[0]

print(a)

Row(error_details='invalid', total=6)


In [168]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Get (or reuse) the session for this aggregation demo
spark = SparkSession.builder.appName("GroupByErrorDetails").getOrCreate()

# Every column is a nullable string, so derive the fields from their names
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "Index",
        "Customer Id",
        "First Name",
        "Last Name",
        "Company",
        "City",
        "Country",
        "Phone 1",
        "Phone 2",
        "Email",
        "Subscription Date",
        "Website",
        "error_details",
    )
])

# Sample rejected rows (stand-in for the real data source)
data = [
    (None, "4962fdbE6Bfee6D", "Pam", "Sparks", "Patel-Deleon", "Blakemouth", "British Indian Ocean Territory (Chagos Archipelago)", "267-243-9490x035", "480-078-0535x889", "nicolas00@faulkner-kramer.com", "2020-11-29", "https://nelson.com/", "Index: Invalid integer, Subscription Date: Invalid date"),
    ("2", "9b12Ae76fdBc9bE", "Gina", "Rocha", "Acosta, Paul and Barber", "East Lynnchester", "Costa Rica", "027.142.0940", "+1-752-593-4777x07171", "yfarley@morgan.com", None, "https://pineda-rogers.biz/", "Index: Invalid integer, Subscription Date: Invalid date"),
    (None, "Fc2c8D2BE1AEfDb", "Kristina", "Andrade", "Mann Ltd", "Port Taraton", "Pitcairn Islands", "(640)067-7023x66846", "001-367-405-8096x592", "ivillarreal@fowler.biz", "2020-09-11", "https://foley.com/", "Index: Invalid integer, Subscription Date: Invalid date"),
    (None, "9468BBc926AaAB3", "Zoe", "Hansen", "Tanner PLC", "Kimberlyfort", "Benin", "638-798-9796x0247", "(265)475-2386x9812", "duransheena@hughes.com", "2021-10-09", "https://franco-galloway.com/", "Index: Invalid integer, Subscription Date: Invalid date"),
    (None, "A1505BF376CC5Ed", "Aimee", "Brooks", "Walker Ltd", "Mitchellview", "Malaysia", "112.920.9961x77753", "471.896.6847x82788", "chambersdanielle@good-cannon.com", "2022-03-28", "http://solis.org/", "Index: Invalid integer, Subscription Date: Invalid date"),
    (None, "a24eB840950dac7", "Mackenzie", "Leonard", "Abbott Inc", "Bauerfort", "Ukraine", "+1-010-716-9313x74577", "315.423.2995", "bwheeler@hickman-acevedo.com", "2022-05-02", "http://www.pacheco.net/", "Index: Invalid integer, Subscription Date: Invalid date")
]

df = spark.createDataFrame(data, schema)

# Number of rows per distinct error_details value
result_df = (
    df.groupBy("error_details")
    .agg(F.count("*").alias("count"))
)

result_df.show(truncate=False)

# Release the session once the demo is done
spark.stop()

25/03/01 05:42:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+-------------------------------------------------------+-----+
|error_details                                          |count|
+-------------------------------------------------------+-----+
|Index: Invalid integer, Subscription Date: Invalid date|6    |
+-------------------------------------------------------+-----+



In [None]:
# Prepare the per-file work list for the threaded load in the following cell.
path = "/mnt/apps/Files/Config/master_job.csv"
pathSchema = "/mnt/apps/Files/Schema/"
df = pandas_read_csv(path,sep="|")
df = df.query("BatchName == 'BATCH_ACT_VAL'")
print(df)

spark = SparkSession. \
        builder. \
        appName("Thread"). \
        getOrCreate()

# loadTable appends its results here; reuse the list from an earlier cell if
# one exists so previous results are kept.
dqcOutput = globals().get("dqcOutput", [])

tables = []
# Job row for each file path, so loadTable reports against the right job
# instead of whatever `row` happened to be when the loop finished.
jobs_by_path = {}
for row in df.itertuples():
    filePath = row.SourceDirectory + '/' + row.FileName + '.' + row.FileType
    filePath = filePath.replace("/gcs", "/Files")
    tables.append(filePath)
    jobs_by_path[filePath] = row

def loadTable(path):
    """
    Validate one CSV file and, when it passes, write it out as Parquet.

    Args:
        path: One of the file paths collected in ``tables``.

    The outcome is appended to the module-level ``dqcOutput`` list; nothing
    is returned.
    """
    job = jobs_by_path[path]
    # Sixth path component, e.g. "ACMVPF" in /mnt/apps/Files/ETL4/ACMVPF/ACMVPF.csv
    sc = path.split("/")[5]
    pathParquet = f"/mnt/apps/Files/data-movement/Parquet/{sc}"
    print(pathParquet)

    schema_path = pathSchema + job.FileName + '.' + job.FileType
    df_dtype = construct_sql_schema(path=schema_path, sep="|")
    if df_dtype is None:
        dqcOutput.append({"JobName":job.JobName, "Path":job.SourceDirectory, "dqID":"DQ000001", "CountRecords":0, "Message":f"Schema could not be built from {schema_path}", "Status":"Failed"})
        return

    df = spark.read.csv(path, header=True, inferSchema=False, schema=df_dtype, sep="|")
    # `result` is True when the check FAILED
    result, dqc_msg, df_final, dqcId = validateDecimal(dtypes=df_dtype, df_contents=df)
    df_count = df_final.count()
    if result:
        status = "Failed"
    else:
        writeToParquet(df_final, pathParquet)
        status = "Successful"
    print(dqc_msg)
    dqcOutput.append({"JobName":job.JobName, "Path":job.SourceDirectory, "dqID":dqcId, "CountRecords":df_count, "Message":dqc_msg, "Status":status})

print(tables)

       BatchName JobName            SourceDirectory FileName FileType  Flag  \
1  BATCH_ACT_VAL  ACMVPF  /mnt/apps/gcs/ETL4/ACMVPF   ACMVPF      csv     1   
2  BATCH_ACT_VAL  RTRNPF  /mnt/apps/gcs/ETL4/RTRNPF   RTRNPF      csv     1   

  Delimiter  
1         |  
2         |  
['/mnt/apps/Files/ETL4/ACMVPF/ACMVPF.csv', '/mnt/apps/Files/ETL4/RTRNPF/RTRNPF.csv']


In [None]:
from threading import Thread
from queue import Queue

# Work queue drained by the worker threads started below.
q = Queue()

# Number of worker threads to start.
workerCount = 2

def run_task(function, q):
    """
    Drain ``q``, calling ``function`` on every queued value.

    Args:
        function: Callable taking a single queued value.
        q: ``queue.Queue`` holding the values to process.

    The worker returns as soon as the queue is empty. A failing task is
    reported with ``print`` and does not stop the worker; ``task_done()`` is
    always called so that ``q.join()`` cannot wait forever on a failed item.
    """
    from queue import Empty  # local import: the cell only imports Queue

    while True:
        try:
            # Non-blocking get: another worker may take the last item between
            # an empty() check and a blocking get().
            value = q.get_nowait()
        except Empty:
            return
        try:
            function(value)
        except Exception as e:
            print(f"Task failed for {value}: {e}")
        finally:
            q.task_done()

# Queue every table path before any worker starts
for table in tables:
    q.put(table)

# Daemon workers, so they never keep the interpreter alive on their own
for i in range(workerCount):
    t = Thread(target=run_task, args=(loadTable, q), daemon=True)
    t.start()

print("running load")
# Wait until every queued item has been marked done
q.join()
spark.stop()
print("running completed")

In [None]:
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name
import dask.dataframe as dd

# Scratch cell: paths and session for the CSV/Parquet read experiments that
# are commented out below.
#outputFile = "/mnt/apps/Files/ETL4/LAS/customers.csv"
outputFile = "/mnt/apps/Files/data-movement/Renova/part-00000*"
FullPathSchema = "/mnt/apps/Files/Schema/customers.csv"
ParquetPath = "/mnt/apps/Files/data-movement/Parquet/Renova/part-000*"
PdoutputFile = "/mnt/apps/Files/ETL4/LAS/customers.csv"

# Schema construction is currently disabled along with the readers below.
#df_dtype = construct_sql_schema(path=FullPathSchema, sep="|")
#print(df_dtype)

spark = SparkSession.builder.appName("TestReadCSV").getOrCreate()

#CSV reader
#df = spark.read.csv(outputFile, sep="|", header=True, schema=df_dtype, inferSchema=False).repartition(10)

# df_with_filename = df.withColumn("filename", input_file_name())
# null_count = df_with_filename.filter(col("Index").isNotNull())
# df_count = null_count.count()
# print(df_count)
# #null_count.limit(10).show(truncate=False)
# null_count.createOrReplaceTempView("readCSV")
# #df_parquet_count = null_count_parquet.count()
# #print(df_parquet_count)
# null_count = spark.sql(
#     """
#     SELECT country, cnt, sum(cnt) over () total_all
#     FROM (
#     SELECT Country, COUNT(1) AS cnt
#         FROM readCSV
#         GROUP BY Country
#     ) Z
# """)
# print(null_count.count())
#df.show(n=5, truncate=False) #default 20

#parquet part
# df_read_parquet = spark.read.parquet(outputFile, schema=df_dtype)
# null_count_parquet = df_read_parquet.filter(col("Index").isNotNull())
# null_count_parquet.createOrReplaceTempView("my_parquet_table")
# #df_parquet_count = null_count_parquet.count()
# #print(df_parquet_count)
# null_count_parquet = spark.sql(
#     """
#     SELECT country, cnt, sum(cnt) over () total_all
#     FROM (
#     SELECT Country, COUNT(1) AS cnt
#         FROM my_parquet_table
#         GROUP BY Country
#     ) Z
# """)
# print(null_count_parquet.count())
# null_count_parquet.show(n=5, truncate=False) #default 20

# Stop the session created at the top of this cell.
spark.stop()


In [None]:
# Generate a 240-column schema-definition file (Cloned_1 .. Cloned_240, all
# Varchar(100)) with the ColumnName and DataType columns that
# construct_sql_schema reads from a schema file.
list_of = [(f"Cloned_{i}", "Varchar(100)") for i in range(1, 241)]

df = pd.DataFrame(list_of, columns=["ColumnName", "DataType"])

# Pipe-delimited, so the header is "ColumnName|DataType" and the file can be
# read back with sep="|" like the other schema files.
df.to_csv("/mnt/apps/Files/NewSChema/new_schema_cust.csv", sep="|", header=True, index=False)

In [None]:
# Read the file(s) configured for the PAT job with the schema-driven reader
# and display the result.
path = "/mnt/apps/Files/Config/master_job.csv"
pathSchema = "/mnt/apps/Files/Schema/"

job_config = pandas_read_csv(path,sep="|")
job_config = job_config.query("JobName == 'PAT'")

for row in job_config.itertuples():
    # NOTE(review): unlike the other cells, "/gcs" is not replaced with
    # "/Files" in this path - confirm that is intended.
    filePath = row.SourceDirectory + '/' + row.FileName + '.' + row.FileType
    FullPathSchema = pathSchema + row.FileName + '.' + row.FileType
    spark = SparkSession.builder.appName(f"{row.JobName}").getOrCreate()

    df_dtype = construct_sql_schema(path=FullPathSchema, sep="|")
    if df_dtype is None:
        print(f"Skipping {filePath}: schema could not be built from {FullPathSchema}")
        continue

    read_result = spark_read_csv_from_os(spark, filePath, schema=df_dtype, quote='"', sep="|")
    if read_result is None:
        # The reader already printed the error; move on to the next file
        continue

    # The reader returns the full frame and the rejected rows as a pair
    df, rejected_df = read_result
    df.show()
    # The validateDecimal step is currently disabled for this job.
spark.stop()


In [12]:
# Scratch cell: check which dtypes pandas infers for the people file.
path = "/mnt/apps/Files/ETL4/PEOPLEPF/people.csv"

df = pandas_read_csv(path, sep=",")
#df.head(1)
print(df.dtypes)


Index             int64
User Id          object
First Name       object
Last Name        object
Sex              object
Email            object
Phone            object
Date of birth    object
Job Title        object
dtype: object


In [None]:
from pyspark.sql.types import *

# Scratch cell: build StructFields from a schema file, keeping the declared
# VARCHAR length instead of mapping every varchar to a plain string.
path = "/mnt/apps/Files/Schema/etl4pat.csv"
fields = []
# Matching is by substring, so the more specific keyword must come first
# ("bigint" contains "int", "nvarchar" contains "varchar").
type_mapping = {
    "nvarchar": VarcharType,
    "varchar": VarcharType,
    "bigint": LongType(),
    "int": IntegerType(),
    "date": DateType(),
    "decimal": DecimalType
}

df = pandas_read_csv(path,sep="|")
print(df)

for row in df.itertuples():
    name, data_type_str = row.DataType.split("(", 1) if "(" in row.DataType else (row.DataType,"")
    name = name.strip()
    # Drop the closing ")" and surrounding whitespace from the arguments
    data_type_str = data_type_str.strip().rstrip(")").strip()
    parts = [part.strip() for part in data_type_str.split(",")]
    name_lower = name.lower()
    print(data_type_str)

    for keyword,spark_type in type_mapping.items():
        if keyword in name_lower:
            if spark_type is VarcharType:
                if not data_type_str:
                    # No length declared: an unbounded string is the closest match
                    data_type = StringType()
                elif data_type_str.upper() == "MAX":
                    data_type = VarcharType(4000)
                else:
                    data_type = VarcharType(int(data_type_str))
            elif spark_type is DecimalType:
                if not data_type_str:
                    data_type = DecimalType()
                else:
                    # Decimal(p) without a scale means scale 0
                    scale = int(parts[1]) if len(parts) > 1 else 0
                    data_type = DecimalType(int(parts[0]), scale)
            else:
                data_type = spark_type
            fields.append(StructField(row.ColumnName, data_type, True))
            break

print(fields)


In [None]:
# Scratch cell: what star-unpacking yields after splitting a type string on ")".
# With a ")" present the split leaves a trailing empty string in args;
# without one, args is empty.
for data_type_str in ("Decimal(2,0)", "VARCHAR"):
    data_type, *args = data_type_str.split(")")

    print(f"data_type: {data_type}")  # Output: "Decimal(2,0", then "VARCHAR"
    print(f"args: {args}")          # Output: [''], then []

In [None]:
# Scratch cell: the length counts the comma as well as the two characters
data = "A,2"

data_length = len(data)
print(data_length)

In [None]:
# Scratch cell: build the Spark schema for the etl4pat definition file and
# print it (construct_sql_schema returns None when it fails).
path = "/mnt/apps/Files/Schema/etl4pat.csv"
test = construct_sql_schema(path=path,sep="|")
print(test)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *  # Import data types for clarity

from decimal import Decimal

# Create a SparkSession (if you don't have one already)
spark = SparkSession.builder.appName("DataTypeExample").getOrCreate()

# Sample data (replace with your actual data).
# DecimalType columns must be given decimal.Decimal values, not floats.
data = [
    ("Alice", 25, Decimal("2000.00")),
    ("Bob", 30, Decimal("2000.00")),
    ("Charlie", 22, Decimal("2000.00")),
]

# Define the schema explicitly (best practice)
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    # precision 6 / scale 2 is wide enough for 2000.00
    StructField("height", DecimalType(6, 2), True)
])

df = spark.createDataFrame(data, schema=schema)

# Display the schema
df.printSchema()

# Stop the SparkSession (good practice)
spark.stop()

In [None]:
# Scratch cell: glob pattern for the etl4pat config files (not used further in this cell).
data = "/mnt/apps/gcs/ETL4/CONFIG/etl4pat*.csv"



In [None]:
from pyspark.sql import SparkSession

def writeOptions(df, path, **kwargs):
    """
    Write ``df`` to ``path`` using default options that callers may override.

    Defaults: CSV format, overwrite mode, header row, "|" delimiter and '"'
    quote character.

    Args:
        df: Spark DataFrame to write.
        path: Output location.
        **kwargs: Overrides or additions. ``format`` and ``mode`` select the
                  writer format and save mode; everything else is passed on
                  as a data-source option.
    """
    base_options = {  # Keep this separate
        "header": "true",
        "delimiter": "|",
        "quote": '"',
        "mode":"overwrite",
        "format":"csv"
    }

    # Merge into a new dict so base_options is left unmodified
    all_options = {**base_options, **kwargs}
    print(all_options)

    # "format" and "mode" are settings of the writer itself; passed through
    # options() they would be ignored and the write would fall back to the
    # default format and save mode.
    file_format = all_options.pop("format")
    save_mode = all_options.pop("mode")

    df.write.format(file_format).mode(save_mode).options(**all_options).save(path)

# Example usage (important: complete example):
# build a two-row DataFrame and write it with the default options.
spark = SparkSession.builder.appName("Example").getOrCreate()
data = [("Alice", 25), ("Bob", 30)]
df = spark.createDataFrame(data, ["name", "age"])

dataMovement = "path/to/save.csv" # Or your actual path

writeOptions(df, dataMovement)  # Now works correctly

spark.stop()  # Don't forget to stop the SparkSession

In [7]:
from pyspark.sql import SparkSession

# Glob covering the .csv.gz part files under the ACMVPF output directory
path = "/mnt/apps/Files/data-movement/ACMVPF/part*.csv.gz"

spark = SparkSession.builder.appName("READCSVGZ").getOrCreate()

# Register the files as a CSV-backed table, unless it is already defined
create_table_sql = f"""
            CREATE EXTERNAL TABLE IF NOT EXISTS ACMVPF
            USING CSV 
            OPTIONS (
                    path '{path}',
                    delimiter '|',
                    header 'true',
                    compression 'gzip'
            )
            """
spark.sql(create_table_sql)

# Inspect the table: metadata first, then the row count, then a small sample
table_description = spark.sql("DESCRIBE EXTENDED ACMVPF")
table_description.show(truncate=False)

row_count = spark.sql("SELECT COUNT(*) FROM ACMVPF")
row_count.show()

sample_rows = spark.sql("SELECT * FROM ACMVPF LIMIT 10")
sample_rows.show()

spark.stop()

+----------------------------+----------------------------+-------+
|col_name                    |data_type                   |comment|
+----------------------------+----------------------------+-------+
|Index                       |string                      |NULL   |
|Customer Id                 |string                      |NULL   |
|First Name                  |string                      |NULL   |
|Last Name                   |string                      |NULL   |
|Company                     |string                      |NULL   |
|City                        |string                      |NULL   |
|Country                     |string                      |NULL   |
|Phone 1                     |string                      |NULL   |
|Phone 2                     |string                      |NULL   |
|Email                       |string                      |NULL   |
|Subscription Date           |string                      |NULL   |
|Website                     |string            

                                                                                

+--------+
|count(1)|
+--------+
|18000000|
+--------+

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|             Company|             City|             Country|             Phone 1|             Phone 2|               Email|Subscription Date|             Website|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    1|4962fdbE6Bfee6D|       Pam|   Sparks|        Patel-Deleon|       Blakemouth|British Indian Oc...|    267-243-9490x035|    480-078-0535x889|nicolas00@faulkne...|       2020-11-29| https://nelson.com/|
|    2|9b12Ae76fdBc9bE|      Gina|    Rocha|Acosta, Paul and ...| East Lynnchester|          Costa Rica|        027.