In [None]:

# Uncomment to install the dependency into the kernel's environment:
# %pip install great-expectations

In [1]:
import findspark

# Locate the local Spark installation and make its Python bindings importable.
# This has to run before `import pyspark` below, which is why that import is
# not grouped with the others at the top of the cell.
findspark.init()
import pyspark

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import sys
sys.path.append('src')
from utils import *

# Start (or reuse) the Spark session; the application is registered as "ReadCSV".
spark = (
    SparkSession.builder
    .appName("ReadCSV")
    .getOrCreate()
)

# Load the CSV: the first line supplies the column names and the column
# types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("online retail.csv")
)

# Preview the leading rows, then print the DataFrame's own repr.
df.show()
print(df)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [3]:



# Prepare the loaded frame for the expect_* helpers. Every helper used in this
# cell arrives through the star import from utils, so its exact behavior is
# not visible from this notebook.
# NOTE(review): `df` is rebound here and again at the bottom of the cell, so
# re-running this cell without first re-running the load cell hands the
# summary frame to initialize_df instead of the retail data.
df = initialize_df(df)

# Table-level checks: column count, row count, a required column, column order.
expectation_1 = expect_table_column_count(df, min_value=1, max_value=8)
expectation_2 = expect_table_row_count(df, min_value=1, max_value=600000)
expectation_3 = expect_column_existence(df, 'InvoiceDate')
expectation_4 = expect_ordered_column_list(df, ['InvoiceNo', 'StockCode'])

# Column-level checks: value bounds, min/max bounds, and uniqueness.
expectation_5 = expect_column_value_range(df, 'Quantity', min_value=2, max_value=32)
expectation_6 = expect_column_min_max_range(df, 'UnitPrice', min_value=2.55, max_value=7.95)
expectation_7 = expect_unique_column_values(df, 'CustomerID')

# Map each human-readable label to the 'success' entry of its result,
# keeping the labels in the order the checks were run.
expectation_results = {
    label: outcome['success']
    for label, outcome in (
        ('Table Column Count', expectation_1),
        ('Table Row Count', expectation_2),
        ('Column Existence', expectation_3),
        ('Ordered Column List', expectation_4),
        ('Column Value Range', expectation_5),
        ('Column Min/Max Range', expectation_6),
        ('Unique Column Values', expectation_7),
    )
}

# Turn the outcomes into a summary frame and display it. From this point on
# `df` refers to that summary rather than the retail data.
df = show_expectation_results(expectation_results)
df.show()


+--------------------+------+
|         Expectation|Status|
+--------------------+------+
|  Table Column Count|Passed|
|     Table Row Count|Passed|
|    Column Existence|Passed|
| Ordered Column List|Failed|
|  Column Value Range|Failed|
|Column Min/Max Range|Failed|
|Unique Column Values|Failed|
+--------------------+------+



In [None]:
# Destination path for the data quality report.
output_file = "data_quality_report.csv"

# Save whatever frame `df` currently refers to as CSV with a header row,
# replacing any output already present at that path.
df.write.mode("overwrite").option("header", True).csv(output_file)


