In [2]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark.sql import udf
from pyspark.sql import DataFrame
from pyspark.sql import Row, Column
from pyspark.sql.utils import AnalysisException
from pyspark.pandas.typedef import as_spark_type

from pyspark.ml.feature import Bucketizer



In [3]:
from typing import Tuple
from enum import Enum, auto
from itertools import chain
from decimal import Decimal
from datetime import datetime, date
import contextlib
import sys
import math
import io
import re
import math
import pandas as pd
import numpy as np

In [4]:
spark = (
    SparkSession.builder.appName("Testes")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.optimizerEnabled', 'true')
    .config('spark.sql.execution.arrow.pyspark.enabled', 'true')
    .config("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.legacy.timeParserPolicy", "CORRECTED")
    .config("spark.sql.repl.eagerEval.enabled", "true")
    .config("spark.sql.debug.maxToStringFields", "100000")
    .enableHiveSupport()
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
sc = spark.sparkContext

24/08/24 11:54:56 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface wlp0s20f3)
24/08/24 11:54:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/24 11:54:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [48]:
# Sample DataFrames for expected (reference) and actual distributions
expected_data = [(1,), (2,), (2,), (3,), (4,), (4,), (5,), (5,), (6,), (7,)]
actual_data = [(1,), (2,), (2,), (3,), (4,), (4,), (5,), (5,), (6,), (8,)]

expected_df = spark.createDataFrame(expected_data, ["value"])
actual_df = spark.createDataFrame(actual_data, ["value"])

In [49]:
actual_df.show()

+-----+
|value|
+-----+
|    1|
|    2|
|    2|
|    3|
|    4|
|    4|
|    5|
|    5|
|    6|
|    8|
+-----+



In [50]:
# Calculate CDF for the expected distribution
expected_window = Window.orderBy(F.col("value")).rangeBetween(Window.unboundedPreceding, 0)
expected_cdf = expected_df.withColumn("expected_cdf", F.cume_dist().over(expected_window))

In [51]:
# Calculate CDF for the actual distribution
actual_window = Window.orderBy(F.col("value")).rangeBetween(Window.unboundedPreceding, 0)
actual_cdf = actual_df.withColumn("actual_cdf", F.cume_dist().over(actual_window))

In [52]:
actual_cdf.show()

+-----+----------+
|value|actual_cdf|
+-----+----------+
|    1|       0.1|
|    2|       0.3|
|    2|       0.3|
|    3|       0.4|
|    4|       0.6|
|    4|       0.6|
|    5|       0.8|
|    5|       0.8|
|    6|       0.9|
|    8|       1.0|
+-----+----------+



In [53]:
# Merge the two CDFs by value
joined_cdf = expected_cdf.alias("e")\
    .join(other=actual_cdf.alias("a"), on=F.col("e.value") == F.col("a.value"), how="outer")\
    .select(
        F.col("e.value").alias("value"),
        F.col("e.expected_cdf"),
        F.col("a.actual_cdf")
    ).orderBy("value")

In [54]:
joined_cdf.show()

+-----+------------+----------+
|value|expected_cdf|actual_cdf|
+-----+------------+----------+
| null|        null|       1.0|
|    1|         0.1|       0.1|
|    2|         0.3|       0.3|
|    2|         0.3|       0.3|
|    2|         0.3|       0.3|
|    2|         0.3|       0.3|
|    3|         0.4|       0.4|
|    4|         0.6|       0.6|
|    4|         0.6|       0.6|
|    4|         0.6|       0.6|
|    4|         0.6|       0.6|
|    5|         0.8|       0.8|
|    5|         0.8|       0.8|
|    5|         0.8|       0.8|
|    5|         0.8|       0.8|
|    6|         0.9|       0.9|
|    7|         1.0|      null|
+-----+------------+----------+



In [55]:
# Fill missing values with the last seen value
# joined_cdf = joined_cdf.fillna(method='ffill')

# Replace any remaining nulls with 0 (in case the first value was null)
joined_cdf = joined_cdf.fillna(0)

In [56]:
joined_cdf.show()

+-----+------------+----------+
|value|expected_cdf|actual_cdf|
+-----+------------+----------+
|    0|         0.0|       1.0|
|    1|         0.1|       0.1|
|    2|         0.3|       0.3|
|    2|         0.3|       0.3|
|    2|         0.3|       0.3|
|    2|         0.3|       0.3|
|    3|         0.4|       0.4|
|    4|         0.6|       0.6|
|    4|         0.6|       0.6|
|    4|         0.6|       0.6|
|    4|         0.6|       0.6|
|    5|         0.8|       0.8|
|    5|         0.8|       0.8|
|    5|         0.8|       0.8|
|    5|         0.8|       0.8|
|    6|         0.9|       0.9|
|    7|         1.0|       0.0|
+-----+------------+----------+



In [57]:
# Calculate the absolute difference between the expected and actual CDFs
joined_cdf = joined_cdf.withColumn("ks_difference", F.abs(F.col("expected_cdf") - F.col("actual_cdf")))

In [58]:
joined_cdf.show()

+-----+------------+----------+-------------+
|value|expected_cdf|actual_cdf|ks_difference|
+-----+------------+----------+-------------+
|    0|         0.0|       1.0|          1.0|
|    1|         0.1|       0.1|          0.0|
|    2|         0.3|       0.3|          0.0|
|    2|         0.3|       0.3|          0.0|
|    2|         0.3|       0.3|          0.0|
|    2|         0.3|       0.3|          0.0|
|    3|         0.4|       0.4|          0.0|
|    4|         0.6|       0.6|          0.0|
|    4|         0.6|       0.6|          0.0|
|    4|         0.6|       0.6|          0.0|
|    4|         0.6|       0.6|          0.0|
|    5|         0.8|       0.8|          0.0|
|    5|         0.8|       0.8|          0.0|
|    5|         0.8|       0.8|          0.0|
|    5|         0.8|       0.8|          0.0|
|    6|         0.9|       0.9|          0.0|
|    7|         1.0|       0.0|          1.0|
+-----+------------+----------+-------------+



In [59]:
# Find the maximum difference, which is the KS statistic
ks_statistic = joined_cdf.agg({"ks_difference": "max"}).collect()[0][0]
print(f"KS Statistic: {ks_statistic}")

KS Statistic: 1.0
