In [1]:
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when

logging.basicConfig(
    filename="/home/jovyan/work/week5/data_quality.log",
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s'
)

In [2]:
spark = SparkSession.builder.appName("Week5_Data_Quality").getOrCreate()

In [3]:
def extract(path):
    logging.info(f"Extracting {path}")
    return spark.read.csv(path, header=True, inferSchema=True)

sales = extract("/home/jovyan/work/data/sales.csv")
customers = extract("/home/jovyan/work/data/customers.csv")

In [5]:
def count_missing_simple(df):
    logging.info("Checking for missing/null values")
    
    # Only check for NULL values (works for all data types)
    return df.select([
        count(when(col(c).isNull(), c)).alias(c) for c in df.columns
    ])

missing_sales = count_missing_simple(sales)
missing_sales.show()

missing_customers = count_missing_simple(customers)
missing_customers.show()

+--------+-----------+----------+-------+--------+----------+
|order_id|customer_id|order_date|product|quantity|unit_price|
+--------+-----------+----------+-------+--------+----------+
|       0|          0|         0|      0|       0|         0|
+--------+-----------+----------+-------+--------+----------+

+-----------+-------------+------+
|customer_id|customer_name|region|
+-----------+-------------+------+
|          0|            0|     0|
+-----------+-------------+------+



In [7]:
def count_duplicates(df, key_cols):
    logging.info("Checking duplicates")
    return df.count() - df.dropDuplicates(key_cols).count()

# First, let's see what columns we actually have
print("Sales columns:", sales.columns)
print("Customers columns:", customers.columns)

# Use the correct column names from your data
dup_sales = count_duplicates(sales, ["order_id"])  # Changed from "sale_id" to "order_id"
print("Duplicate sales:", dup_sales)
logging.info(f"Duplicate sales: {dup_sales}")

dup_customers = count_duplicates(customers, ["customer_id"])
print("Duplicate customers:", dup_customers)
logging.info(f"Duplicate customers: {dup_customers}")

Sales columns: ['order_id', 'customer_id', 'order_date', 'product', 'quantity', 'unit_price']
Customers columns: ['customer_id', 'customer_name', 'region']
Duplicate sales: 0
Duplicate customers: 0


In [11]:
def detect_outliers(df, colname):
    logging.info(f"Detecting outliers in {colname}")
    desc = df.select(colname).describe().toPandas()
    print(desc)  # Debug print
    # Check if 25% and 75% exist
    if not ("25%" in desc['summary'].values and "75%" in desc['summary'].values):
        print(f"Warning: Could not compute quartiles for column {colname}.")
        logging.warning(f"Could not compute quartiles for column {colname}.")
        return 0, None, None
    try:
        q1 = float(desc[desc['summary'] == '25%'][colname].values[0])
        q3 = float(desc[desc['summary'] == '75%'][colname].values[0])
        iqr = q3 - q1
        lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
        outlier_count = df.filter((col(colname) < lower) | (col(colname) > upper)).count()
        return outlier_count, lower, upper
    except Exception as e:
        print(f"Error during outlier calc: {e}")
        logging.warning(f"Error during outlier calc for {colname}: {e}")
        return 0, None, None

# Usage:
if "unit_price" in sales.columns:
    out_count, low, up = detect_outliers(sales, "unit_price")
    print(f"Outliers in unit_price: {out_count} (range: {low}–{up})")

  summary         unit_price
0   count                 10
1    mean             15.892
2  stddev  9.498211293595112
3     min                4.5
4     max              29.99
Outliers in unit_price: 0 (range: None–None)


In [12]:
from pyspark.sql.functions import col
sales = sales.withColumn("unit_price", col("unit_price").cast("double"))