# Profiles: Retail Sales


Yasmin Johana Garcia
Yesid Humberto Montaño Cuero

In [None]:
import os

# Environment variables set before Spark / PyDeequ are used:
#   SPARK_VERSION - the Spark release PyDeequ is told to target ('3.2' here).
#                   NOTE(review): confirm this matches the installed Spark release.
#   HADOOP_HOME   - machine-specific Windows location of the Hadoop winutils install.
os.environ.update({
    "SPARK_VERSION": '3.2',
    "HADOOP_HOME": "C:\\hadoop\\winutils\\hadoop-3.3.6",
})

In [None]:
from pyspark.sql import SparkSession, Row, DataFrame
import json
import pandas as pd
import sagemaker_pyspark

import pydeequ

# Build the driver classpath from the jars bundled with sagemaker_pyspark.
# Join with os.pathsep (":" on POSIX, ";" on Windows) rather than a hard-coded
# ":" -- HADOOP_HOME above is a Windows path, and on Windows a ":"-joined list
# of "C:\..." jar paths cannot be split back into its entries.
classpath = os.pathsep.join(sagemaker_pyspark.classpath_jars())

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    # Make the sagemaker_pyspark jars visible to the driver JVM.
    .config("spark.driver.extraClassPath", classpath)
    # Pull in the Deequ artifact PyDeequ selected, minus the f2j dependency.
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .config("spark.driver.memory", "15g")
    # Rebase mode applied when reading INT96 timestamps from parquet files.
    .config("spark.sql.parquet.int96RebaseModeInRead", "CORRECTED")
    .getOrCreate()
)

In [None]:
# Load the retail-sales parquet file into a Spark DataFrame, then print the
# column names and types Spark read from it.
retail_sales_path = "Retail_sales.parquet"
df = spark.read.parquet(retail_sales_path)

df.printSchema()

root
 |-- Store ID: string (nullable = true)
 |-- Product ID: long (nullable = true)
 |-- Date: string (nullable = true)
 |-- Units Sold: long (nullable = true)
 |-- Sales Revenue (USD): double (nullable = true)
 |-- Discount Percentage: long (nullable = true)
 |-- Marketing Spend (USD): long (nullable = true)
 |-- Store Location: string (nullable = true)
 |-- Product Category: string (nullable = true)
 |-- Day of the Week: string (nullable = true)
 |-- Holiday Effect: boolean (nullable = true)



In [None]:
from pydeequ.profiles import *

# Run PyDeequ's column profiler over `df`; no column restriction or other
# profiler options are set, so the runner's defaults apply.
result = (
    ColumnProfilerRunner(spark)
    .onData(df)
    .run()
)

In [None]:
# Print every column profile. The printed text already names its column,
# so only the profile objects are iterated.
for column_profile in result.profiles.values():
    print(column_profile)

NumericProfiles for column: Marketing Spend (USD): {
    "completeness": 1.0,
    "approximateNumDistinctValues": 195,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": null,
    "kll": "None",
    "mean": 49.94403333333333,
    "maximum": 199.0,
    "minimum": 0.0,
    "sum": 1498321.0,
    "stdDev": 64.4005815273869,
    "approxPercentiles": []
}
StandardProfiles for column: Store ID: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 1,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {
        "Boolean": 0,
        "Fractional": 0,
        "Integral": 0,
        "Unknown": 0,
        "String": 30000
    },
    "histogram": [
        [
            "Spearsland",
            30000,
            1.0
        ]
    ]
}
NumericProfiles for column: Units Sold: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 32,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts"

In [None]:
# Look up the profile computed for the 'Units Sold' column and report its
# summary statistics. Each statistic is printed exactly once (the mean line
# was previously duplicated).
Units_Sold_profile = result.profiles['Units Sold']

print(f'Statistics of \'Units Sold\':')
for label, value in (
    ("minimum", Units_Sold_profile.minimum),
    ("maximum", Units_Sold_profile.maximum),
    ("mean", Units_Sold_profile.mean),
    ("standard deviation", Units_Sold_profile.stdDev),
):
    # Passing '\t' as a separate argument keeps the original "tab + space" prefix.
    print('\t', f"{label}: {value}")

Statistics of 'Units Sold':
	 minimum: 0.0
	 maximum: 56.0
	 mean: 6.161966666666666
	 standard deviation: 3.3238732625591485
