In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, regexp_replace

def spark_read_csv_from_os(spark, file_path, schema, keep_corrupt_records=False, **kwargs):
    """Read a delimited text file into a Spark DataFrame using an explicit schema.

    Args:
        spark: Session object exposing ``read.options(...).schema(...).csv(...)``.
        file_path: Location handed to the CSV reader.
        schema: Schema applied to the file. Must be a ``StructType`` when
            ``keep_corrupt_records`` is True.
        keep_corrupt_records: When True, a nullable string column named after
            the ``columnNameOfCorruptRecord`` option is appended to a copy of
            ``schema`` (unless a field with that name already exists). Per the
            Spark CSV reader documentation, the raw text of malformed rows is
            only retained when the schema contains such a field; without it
            the option below has no visible effect. Defaults to False so the
            returned columns are exactly those of ``schema``.
        **kwargs: Additional reader options; they override the defaults below.

    Returns:
        The DataFrame produced by the reader, or None if the reader raised
        (the error is printed).
    """
    base_options = {
        "inferSchema": "False",
        "header": "True",
        "quote": '"',
        "columnNameOfCorruptRecord": "rejected_records",
        "mode": "PERMISSIVE"
    }
    base_options.update(kwargs)

    try:
        if keep_corrupt_records:
            # Honour a caller-supplied column name rather than the default one.
            corrupt_col = base_options["columnNameOfCorruptRecord"]
            if corrupt_col not in [field.name for field in schema.fields]:
                # Build a new schema so the caller's object is left untouched.
                schema = StructType(
                    schema.fields + [StructField(corrupt_col, StringType(), True)]
                )

        return spark.read.options(**base_options).schema(schema).csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at path: {file_path}")
        return None
    except Exception as e:  # Any other failure raised while setting up the read
        print(f"An error occurred while reading the CSV: {e}")
        return None

if __name__ == "__main__":
    path = "/mnt/apps/Files/ETL4/TMP/test.csv"

    # Column types applied to the pipe-delimited test file.
    schema = StructType([
        StructField("Int", LongType(), True),
        StructField("Decimal", DecimalType(18, 2), True),
        StructField("Float", DecimalType(18, 2), True),
        StructField("Money", StringType(), True),
        StructField("Bigint", LongType(), True),
        StructField("DateTime", TimestampType(), True),
        StructField("Date", DateType(), True)
    ])

    spark = (
        SparkSession.builder
        .appName("Testing")
        .master("local[*]")
        .config("spark.ui.port", "4222")
        .getOrCreate()
    )

    try:
        df = spark_read_csv_from_os(spark, path, schema, sep="|")
        if df is None:
            # The reader returns None after printing the failure reason;
            # calling DataFrame methods on it would raise AttributeError.
            print(f"Nothing to show: could not load {path}")
        else:
            # "Money" is read as a string; swap the decimal comma for a dot.
            df = df.withColumn("Money", regexp_replace(col("Money"), ",", "."))

            df.printSchema()
            df.show()
    finally:
        # Always release the local session, even when the read or show fails.
        spark.stop()
    

root
 |-- Int: long (nullable = true)
 |-- Decimal: decimal(18,2) (nullable = true)
 |-- Float: decimal(18,2) (nullable = true)
 |-- Money: string (nullable = true)
 |-- Bigint: long (nullable = true)
 |-- DateTime: timestamp (nullable = true)
 |-- Date: date (nullable = true)

+---+-------+------+-------+------+-------------------+----------+
|Int|Decimal| Float|  Money|Bigint|           DateTime|      Date|
+---+-------+------+-------+------+-------------------+----------+
|  1| 141.23|141.23|4141.32|     0|2025-03-22 10:00:00|2025-03-22|
+---+-------+------+-------+------+-------------------+----------+



In [8]:
import os
from pyspark.sql import SparkSession
from IPython.display import HTML
import pandas as pd


def loadTable(**kwargs):
    """Register the files under ``path`` as a Spark SQL table.

    Keyword Args:
        path: Location of the data; a trailing ``/part*`` glob is stripped
            before checking that the location exists on the local filesystem.
        loadType: ``"Parquet"`` registers a Parquet table; any other value
            registers a pipe-delimited, gzip-compressed CSV table with header.
        tableName: Name of the table to create if it does not exist yet.
        spark: Optional session whose ``sql`` method runs the statement.
            Defaults to the module-level ``sparkDqc``.

    Returns:
        True when the CREATE TABLE statement ran, None when the path does not
        exist or the statement failed (the error is printed).
    """
    pathCheck = kwargs["path"].replace("/part*", "")
    if not os.path.exists(pathCheck):
        return None
    try:
        session = kwargs.get("spark")
        if session is None:
            session = sparkDqc

        # NOTE: tableName and path are interpolated directly into the SQL
        # text, so they must come from trusted code, never from user input.
        # The statements use plain CREATE TABLE: Spark's data-source syntax
        # (USING ...) has no EXTERNAL keyword, and supplying LOCATION / path
        # already makes the table point at existing files.
        if kwargs["loadType"] == "Parquet":
            ddl = f"""
            CREATE TABLE IF NOT EXISTS {kwargs["tableName"]}
            USING PARQUET LOCATION '{kwargs["path"]}'
            """
        else:
            ddl = f"""
            CREATE TABLE IF NOT EXISTS {kwargs["tableName"]}
            USING CSV
            OPTIONS (
                'path' '{kwargs["path"]}',
                'delimiter' '|',
                'compression' 'gzip',
                'header' 'true'
            )
            """
        session.sql(ddl)
        return True
    except Exception as e:
        # Still best-effort (returns None), but no longer hides the reason.
        print(f"An error occurred while creating table {kwargs.get('tableName')}: {e}")
        return None

if __name__ == "__main__":
    # A bare HTML(...) expression only renders when it is the last expression
    # of a cell; inside this block it has to be passed to display().
    from IPython.display import display

    path = "/mnt/apps/Files/data-movement/Parquet/RTRNPF"

    # loadTable reads this module-level session by name.
    sparkDqc = (
        SparkSession.builder
        .appName("parquet")
        .master("local[*]")
        .config("spark.ui.port", "4222")
        .getOrCreate()
    )

    # Table styling placed in front of the pandas-generated markup.
    table_css = """
    <style>
      table {width: 100%; border-collapse: collapse;}
      th, td {border: 1px solid black; padding: 8px; text-align: left;}
    </style>
    """

    try:
        df_table = loadTable(path=path, loadType="Parquet", tableName="RTRNPF")
        if df_table is None:
            # loadTable returns None when the path is missing or the DDL failed.
            print(f"Table RTRNPF could not be registered from {path}")
        else:
            df_sql = sparkDqc.sql("SELECT * FROM RTRNPF LIMIT 100")

            html = df_sql.toPandas().to_html()  # Convert to HTML
            styled_html = f"{table_css}{html}\n    "
            display(HTML(styled_html))
    finally:
        # Release the session even if registration or the query fails.
        sparkDqc.stop()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, isnull, count, length

def count_non_numeric(df, colName):
    """Count non-null values of ``colName`` that contain no digit and no dot.

    Every character other than ``0-9`` and ``.`` is stripped from the value;
    a row is counted when nothing remains and the original value is not null.
    Values that mix digits with other characters are therefore not counted,
    while empty strings are.

    Args:
        df: DataFrame holding the column to inspect.
        colName: Name of the column to inspect.

    Returns:
        Number of matching rows (triggers a Spark ``count`` action).
    """
    cleaned_name = f"{colName}_cleaned"
    digits_and_dots_only = regexp_replace(col(colName), "[^0-9.]", "")

    nothing_left = length(col(cleaned_name)) == 0
    has_value = col(colName).isNotNull()

    return (
        df.withColumn(cleaned_name, digits_and_dots_only)
        .filter(nothing_left & has_value)
        .count()
    )

# Demo run: build a small single-column frame and report how many of its rows
# count_non_numeric flags.
spark = (
    SparkSession.builder
    .appName("CountNonNumeric")
    .getOrCreate()
)

# One numeric string, one NULL and three purely alphabetic strings.
data = [("1000.31",), (None,)] + [("AGAM",)] * 3
df = spark.createDataFrame(data, ["AMT"])

non_numeric_count = count_non_numeric(df, "AMT")
print(f"Number of non-numeric rows: {non_numeric_count}")

spark.stop()

                                                                                

Number of non-numeric rows: 3


In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from IPython.display import HTML

# Only the last expression of a cell is rendered automatically; this cell ends
# with spark.stop(), so the HTML objects must be passed to display() explicitly.
from IPython.display import display

spark = SparkSession.builder.appName("PrettySparkOutput").getOrCreate()

# Sample customer rows; the last row has no subscription date, website or budget.
data = [
    (1, "4962fdbE6Bfee6D", "Pam", "Sparks", "Patel-Deleon", "Blakemouth", "British Indian Ocean Territory (Chagos Archipelago)", "267-243-9490x035", "480-078-0535x889", "nicolas00@faulkner-kramer.com", "2020-11-29", "https://nelson.com/", "1000232,32"),
    (2, "9b12Ae76fdBc9bE", "Gina", "Rocha", "Acosta, Paul and Barber", "East Lynnchester", "Costa Rica", "027.142.0940", "+1-752-593-4777x07171", "yfarley@morgan.com", "2021-01-03", "https://pineda-rogers.biz/", "1000232,32"),
    (3, "39edFd2F60C85BC", "Kristie", "Greer", "Ochoa PLC", "West Pamela", "Ecuador", "+1-049-168-7497x5053", "+1-311-216-7855", "jennyhayden@p", None, None, None)
]

columns = ["Index", "Customer Id", "First Name", "Last Name", "Company", "City", "Country", "Phone 1", "Phone 2", "Email", "Subscription Date", "Website", "Budget"]
df = spark.createDataFrame(data, columns)

pandas_df = df.toPandas()
html = pandas_df.to_html(index=False) # index=False removes the index column from the HTML
styled_html = f"""
<style>
  table {{width: 100%; border-collapse: collapse;}}
  th, td {{border: 1px solid black; padding: 8px; text-align: left; word-wrap: break-word;}}
</style>
{html}
"""
display(HTML(styled_html))
display(HTML("<h1>Hello, World!</h1>"))
spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/25 12:46:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from IPython.display import HTML, display

# Diagnostic cell: checks each stage of the Spark -> pandas -> HTML rendering path.
spark = SparkSession.builder.appName("PrettySparkOutput").getOrCreate()

# Sample customer rows; the last row has no subscription date, website or budget.
data = [
    (1, "4962fdbE6Bfee6D", "Pam", "Sparks", "Patel-Deleon", "Blakemouth", "British Indian Ocean Territory (Chagos Archipelago)", "267-243-9490x035", "480-078-0535x889", "nicolas00@faulkner-kramer.com", "2020-11-29", "https://nelson.com/", "1000232,32"),
    (2, "9b12Ae76fdBc9bE", "Gina", "Rocha", "Acosta, Paul and Barber", "East Lynnchester", "Costa Rica", "027.142.0940", "+1-752-593-4777x07171", "yfarley@morgan.com", "2021-01-03", "https://pineda-rogers.biz/", "1000232,32"),
    (3, "39edFd2F60C85BC", "Kristie", "Greer", "Ochoa PLC", "West Pamela", "Ecuador", "+1-049-168-7497x5053", "+1-311-216-7855", "jennyhayden@p", None, None, None)
]

columns = ["Index", "Customer Id", "First Name", "Last Name", "Company", "City", "Country", "Phone 1", "Phone 2", "Email", "Subscription Date", "Website", "Budget"]
df = spark.createDataFrame(data, columns)

pandas_df = df.toPandas()
pandas_df = pandas_df.fillna("")

print(pandas_df)  # Check Pandas DataFrame content

# A bare HTML(...) expression in the middle of a cell is discarded; only the
# last expression of a cell renders on its own, so display() is called explicitly.
display(HTML("<h1>Hello, World!</h1>"))  # Basic HTML test

html = pandas_df.to_html(index=False)
print(html) # check the html string.

display(HTML(html))  # Try displaying basic HTML

display(pandas_df) # test the display method.

spark.stop()