In [148]:
import yaml
import json
import pandas as pd
import numpy as np
from pprint import pprint
from typing import Any, Union, Callable
from enum import Enum, auto
from collections import namedtuple, defaultdict
from random import choice
from abc import ABC, ABCMeta, abstractmethod
from sklearn import datasets
from pydantic import (
    BaseModel,
    ValidationError,
    field_validator,
    field_serializer,
    model_validator,
    computed_field,
    ValidatorFunctionWrapHandler,
    ValidationInfo,
    Field,
    ConfigDict,
)

In [3]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark.sql import udf
from pyspark.sql import DataFrame
from pyspark.sql import Row, Column
from pyspark.sql.utils import AnalysisException
from pyspark.pandas.typedef import as_spark_type

In [4]:
pd.DataFrame.iteritems = pd.DataFrame.items

In [5]:
spark = (
    SparkSession.builder.appName("Testes")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.optimizerEnabled', 'true')
    .config('spark.sql.execution.arrow.enabled', 'true')
    .config('spark.sql.execution.arrow.pyspark.enabled', 'true')
    .config("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.legacy.timeParserPolicy", "CORRECTED")
    .config("spark.sql.repl.eagerEval.enabled", "true")
    .config("spark.sql.debug.maxToStringFields", "100000")
    .config("park.sql.execution.arrow.pyspark.fallback.enabled", "false")
    .enableHiveSupport()
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")



24/08/17 11:24:55 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface wlp0s20f3)
24/08/17 11:24:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/17 11:24:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load data

In [6]:
iris_data = datasets.load_iris(as_frame=True) # classification
df_iris = iris_data.frame

# bcancer_data = datasets.load_breast_cancer(as_frame=True) # classification
# df_bcancer = bcancer_data.frame

# diabetes_data = datasets.load_diabetes(as_frame=True) # regression
# df_diabetes = diabetes_data.frame

# wine_data = datasets.load_wine(as_frame=True) # classification
# df_wine = wine_data.frame


In [7]:
dfp = df_iris.rename({
    "sepal length (cm)": "sepal_length",
    "sepal width (cm)": "sepal_width",
    "petal length (cm)": "petal_length",
    "petal width (cm)": "petal_width",
}, axis=1)

In [8]:
dfp.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [9]:
# https://stackoverflow.com/questions/39109045/numpy-where-with-multiple-conditions

def energy_class(x: float):
    if x > 6:
        return 'high'
    elif x > 5:
        return 'medium'
    else:
        return 'low'


dfp['tipo'] = pd.cut(dfp['sepal_length'], bins=[0, 5, 6, np.inf], labels=['low', 'medium', 'high'])
# dfp['tipo'] = np.where(dfp['sepal_length'] > 7, 'high', np.where(dfp['sepal_length'] > 5, 'medium', 'low'))
# dfp['tipo'] = dfp['sepal_length'].apply(energy_class)
# dfp['tipo'] = np.select([dfp['sepal_length'] > 7, dfp['sepal_length'] > 5], ['high', 'medium'], default='low')
# dfp['tipo'] = np.vectorize(lambda x: 'high' if x > 5 else ('medium' if x > 3 else 'low'))(dfp['sepal_length'])
# dfp['tipo'] = dfp['sepal_length'].apply(lambda x: 'high' if x > 6 else ('medium' if x > 5 else 'low'))

In [10]:
dfp['tipo'].value_counts()

tipo
high      61
medium    57
low       32
Name: count, dtype: int64

In [11]:
def suit():
    return choice(('Spade', 'Heart', 'Diamond', 'Club'))

dfp['suit'] = [suit() for _ in range(len(dfp))]

In [13]:
dfp['id'] = [i for i, _ in enumerate(range(len(dfp)), start=1)]

In [15]:
# Missing
dfp.iloc[3, 6] = None
dfp.iloc[9, 6] = None

# Duplicates
dfp.iloc[1, 7] = 1

In [17]:
dfp.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,tipo,suit,id
0,5.1,3.5,1.4,0.2,0,medium,Heart,1
1,4.9,3.0,1.4,0.2,0,low,Diamond,1
2,4.7,3.2,1.3,0.2,0,low,Spade,3
3,4.6,3.1,1.5,0.2,0,low,,4
4,5.0,3.6,1.4,0.2,0,low,Heart,5
5,5.4,3.9,1.7,0.4,0,medium,Spade,6
6,4.6,3.4,1.4,0.3,0,low,Diamond,7
7,5.0,3.4,1.5,0.2,0,low,Heart,8
8,4.4,2.9,1.4,0.2,0,low,Heart,9
9,4.9,3.1,1.5,0.1,0,low,,10


In [18]:
df = spark.createDataFrame(dfp)

## Dev

In [198]:
class ExtendEnum(Enum):

    def _generate_next_value_(name: str, start, count, last_values):
        return name.lower()
    
    @classmethod
    def __getitem__(cls, name: str):
        return cls(name.lower())
    
    @classmethod
    def to_dict(cls):
        return {e.name: e.value for e in cls}
    
    @classmethod
    def keys(cls):
        return cls._member_names_
    
    @classmethod
    def values(cls):
        return list(map(lambda c: c.value, cls))
    
    def __str__(self):
        return str(self.value)
    
    def __repr__(self) -> str:
        return str(self.value)

In [199]:
class MetricType(ExtendEnum):
    COLUMN = "COLUMN"
    TABLE = "TABLE"

In [200]:
class MetricName(ExtendEnum):
    MISSING = auto()
    VOLUMETRY = auto()
    DUPLICITY = auto()
    MEAN = auto()
    TESTE = auto()

In [126]:
class MetricMetaClass(type):

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls, *args, **kwargs)
    
    def __call__(cls, *args, **kwargs):
        return cls.evaluate(*args, **kwargs)
    
    @abstractmethod
    def evaluate(self):
        raise NotImplementedError

In [127]:
class CombineMeta(ABCMeta, MetricMetaClass):
    pass

In [128]:
class MetricBase(ABC):

    spark_session = SparkSession.getActiveSession()

    @classmethod
    def all_metrics(cls) -> list:
        return list(set([subclass.name for subclass in cls.__subclasses__()]))

    @classmethod
    def metrics_by_type(cls, metric_type: MetricType) -> list:
        metrics = []
        for subclass in cls.__subclasses__():
            if metric_type == subclass.type:
                metrics.append(subclass.name)
        return metrics

    @classmethod
    def get_metric(cls, metric_name: str) -> 'MetricBase':
        for subclass in cls.__subclasses__():
            if metric_name == subclass.name:
                return subclass

    @property
    @abstractmethod
    def name(self) -> str:
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def calculate(self, df: DataFrame, col: str | list) -> Any:
        "Metodo executa metrica"
    
    @staticmethod
    @abstractmethod
    def evaluate(self) -> Any:
        "Metodo executa metrica"


In [129]:
class ColumnMissing(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.MISSING.value
    type: MetricType = MetricType.COLUMN
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, col: str) -> int:
        missing = df.select(col).where(F.col(col).isNull()).count()
        return missing 

In [130]:
class TableVolumetry(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.VOLUMETRY.value
    type: MetricType = MetricType.TABLE
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, *args, **kwargs) -> int:
        count = df.count()
        return count 

In [131]:
class ColumnDuplicity(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.DUPLICITY.value
    type: MetricType = MetricType.COLUMN
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, col: Union[str, list]) -> int:
        if isinstance(col, str):
            col = col = [col]
        distinct = df.select(col).dropDuplicates(subset=col).count()
        total = df.count()
        return total - distinct

In [132]:
class ColumnTeste(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.TESTE.value
    type: MetricType = MetricType.COLUMN
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, *args, **kwargs) -> int:
        ...

In [133]:
class ColumnMean(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.MEAN.value
    type: MetricType = MetricType.COLUMN
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, col: str) -> int:
        mean = df.select(F.mean(col)).first()[0]
        return mean

In [134]:
all_metrics = MetricBase.all_metrics()
all_metrics

['missing', 'volumetry', 'mean', 'duplicity', 'teste']

In [135]:
# metrics_schema = {
#     "COLUNA": T.StructField("COLUNA", T.FloatType(), True),
#     MetricName.MEAN.value: T.StructField(MetricName.MEAN.value, T.FloatType(), True),
#     MetricName.DUPLICITY.value: T.StructField(MetricName.DUPLICITY.value, T.IntegerType(), True),
# }

In [92]:
# spark.createDataFrame([], schema=T.StructType(list(metrics_schema.values()))).createOrReplaceTempView("tb_metrics")
# spark.table("tb_metrics").show()

+------+----+---------+
|COLUNA|mean|duplicity|
+------+----+---------+
+------+----+---------+



## Config

In [260]:
file_path = "mrm.yaml"

with open(file_path) as f:
    config = yaml.safe_load(f)

In [261]:
config

{'reference': {'database': 'workspace_db',
  'table': 'tb_spec_dataset',
  'train_data': {'start': 202001, 'end': 202212}},
 'metrics': {'table': ['volumetry'],
  'keys': {'names': ['id'],
   '_keys': ['duplicity'],
   'individual': {'id': ['missing']}},
  'features': {'numerigcal': {'sepal_length': ['mean', 'missing'],
    'sepal_width': ['mean'],
    'petal_length': ['mean']},
   'categorical': {'tipo': ['missing'], 'suit': ['missing']}},
  'target': {'target': ['missing']}}}

## Measures

In [247]:
column_metrics = MetricBase.metrics_by_type(MetricType.COLUMN)
table_metrics = MetricBase.metrics_by_type(MetricType.TABLE)

In [268]:
def process_dict(config: dict, condition: Callable, output: dict | None = None):
    if output is None:
        output: dict[str, list] = {}
    for key, value in config.items():
        if condition(key):
            if isinstance(value, dict):
                process_dict(value, condition, output)
            if isinstance(value, list):
                for item in value:
                    if key not in output:
                        output[key] = []
                    output[key].append(item)
    return output


def process_config_metrics(metric_config: dict, type: MetricType) -> dict:
    if type == MetricType.COLUMN:
        condition = lambda x: x not in {"table", "names"}
    else:
        condition = lambda x: x == "table"

    metrics_output: dict[str, list[str]] = process_dict(metric_config, condition)
    return {key: list(set(value)) for key, value in metrics_output.items()}

In [270]:
process_config_metrics(config['metrics'], MetricType.TABLE)

{'table': ['volumetry']}

In [139]:
column_map = {item: [] for item in column_metrics}
column_map = defaultdict(list, column_map)

table_map = {item: [] for item in table_metrics}
table_map = defaultdict(list, table_map)

In [217]:
column_metrics

['missing', 'duplicity', 'teste', 'mean']

In [218]:
column_map

defaultdict(list,
            {'missing': ['id', 'tipo', 'suit', 'target'],
             'duplicity': ['_pk'],
             'teste': [],
             'mean': ['sepal_length', 'sepal_width', 'petal_length']})

In [140]:
# def process_dict(dict_value: dict, output: dict, condition: Callable):
#     for key, value in dict_value.items():
#         if condition(key):
#             if isinstance(value, dict):
#                 process_dict(value, output, condition)
#             if isinstance(value, list):
#                 for item in value:
#                     if item not in output:
#                         output[item] = []
#                     output[item].append(key)
#     return output


# def process_config_metrics(metric_config: dict, metric_map: dict, type: MetricType) -> dict:
#     if type == MetricType.COLUMN:
#         condition = lambda x: x not in {"table", "names"}
#     else:
#         condition = lambda x: x == "table"

#     metrics_output: dict[str, list[str]] = process_dict(metric_config, metric_map, condition)

#     for key, value in metrics_output.items():
#         if "together" in value:
#             new_value = value.copy()
#             new_value.remove("together")
#             new_value.append("_pk")
#             metrics_output[key] = new_value
    
#     return {key: list(set(value)) for key, value in metrics_output.items()}

In [141]:
metrics_column_map = process_config_metrics(config['metrics'], column_map, MetricType.COLUMN)
metrics_column_map

{'missing': ['suit', 'id', 'tipo', 'target'],
 'duplicity': ['_pk'],
 'teste': [],
 'mean': ['sepal_width', 'petal_length', 'sepal_length']}

In [142]:
metrics_table_map = process_config_metrics(config['metrics'], table_map, MetricType.TABLE)
metrics_table_map

{'volumetry': ['table']}

In [143]:
metrics_map = {
    "table": metrics_table_map,
    "column": metrics_column_map
}

In [146]:
struct_columns_metrics = defaultdict(dict[str, str])

for metric_name, columns in metrics_map['column'].items():
    print(f"{metric_name = }")
    for col in columns:
        print(f"  {col = }")
        # metric_id = MetricName.__getitem__(metric_name)
        Metric = MetricBase.get_metric(metric_name)
        real_value = col
        if col == "_pk":
            real_value = config['metrics']['keys']['names']
        print(f"  {real_value = }")
        measure = Metric(df, real_value)
        struct_columns_metrics[col][metric_name] = measure
    print("=======================")

metric_name = 'missing'
  col = 'suit'
  real_value = 'suit'
  col = 'id'
  real_value = 'id'
  col = 'tipo'
  real_value = 'tipo'
  col = 'target'
  real_value = 'target'
metric_name = 'duplicity'
  col = '_pk'
  real_value = ['id']
metric_name = 'teste'
metric_name = 'mean'
  col = 'sepal_width'
  real_value = 'sepal_width'
  col = 'petal_length'
  real_value = 'petal_length'
  col = 'sepal_length'
  real_value = 'sepal_length'


In [147]:
struct_columns_metrics

defaultdict(dict[str, str],
            {'suit': {'missing': 2},
             'id': {'missing': 0},
             'tipo': {'missing': 0},
             'target': {'missing': 0},
             '_pk': {'duplicity': 1},
             'sepal_width': {'mean': 3.0573333333333332},
             'petal_length': {'mean': 3.7579999999999996},
             'sepal_length': {'mean': 5.843333333333334}})

In [149]:
template = {item: None for item in all_metrics}
final = {}

for key, value in struct_columns_metrics.items():
    final[key] = template.copy() | value

In [150]:
pprint(final)

{'_pk': {'duplicity': 1,
         'mean': None,
         'missing': None,
         'teste': None,
         'volumetry': None},
 'id': {'duplicity': None,
        'mean': None,
        'missing': 0,
        'teste': None,
        'volumetry': None},
 'petal_length': {'duplicity': None,
                  'mean': 3.7579999999999996,
                  'missing': None,
                  'teste': None,
                  'volumetry': None},
 'sepal_length': {'duplicity': None,
                  'mean': 5.843333333333334,
                  'missing': None,
                  'teste': None,
                  'volumetry': None},
 'sepal_width': {'duplicity': None,
                 'mean': 3.0573333333333332,
                 'missing': None,
                 'teste': None,
                 'volumetry': None},
 'suit': {'duplicity': None,
          'mean': None,
          'missing': 2,
          'teste': None,
          'volumetry': None},
 'target': {'duplicity': None,
            'mean': None,
 

In [151]:
dfp_final = pd.DataFrame.from_dict(final, orient="index")
dfp_final = dfp_final.reset_index()
dfp_final = dfp_final.rename({"index": "columns"}, axis=1)

In [152]:
dfp_final.head(10)

Unnamed: 0,columns,missing,volumetry,mean,duplicity,teste
0,suit,2.0,,,,
1,id,0.0,,,,
2,tipo,0.0,,,,
3,target,0.0,,,,
4,_pk,,,,1.0,
5,sepal_width,,,3.057333,,
6,petal_length,,,3.758,,
7,sepal_length,,,5.843333,,


In [153]:
df_final = spark.createDataFrame(dfp_final)

In [154]:
df_final.show(10)

+------------+-------+---------+------------------+---------+-----+
|     columns|missing|volumetry|              mean|duplicity|teste|
+------------+-------+---------+------------------+---------+-----+
|        suit|    2.0|     null|              null|     null| null|
|          id|    0.0|     null|              null|     null| null|
|        tipo|    0.0|     null|              null|     null| null|
|      target|    0.0|     null|              null|     null| null|
|         _pk|   null|     null|              null|      1.0| null|
| sepal_width|   null|     null|3.0573333333333332|     null| null|
|petal_length|   null|     null|3.7579999999999996|     null| null|
|sepal_length|   null|     null| 5.843333333333334|     null| null|
+------------+-------+---------+------------------+---------+-----+



In [156]:
ColumnDuplicity.name

'duplicity'

## Measures

In [157]:
# class Descriptor:

#     def __set_name__(self, owner,  name):
#         self.name = name


# class Measure:

#     def __init__(self):
#         for metric in MetricName.values():
#             setattr(self, metric, None)

#     def set_attribute(self, attr_name, value):
#         if hasattr(self, attr_name):
#             setattr(self, attr_name, value)
#         else:
#             raise AttributeError(f"Attribute {attr_name} does not exist.")


# class TypedDescriptor:

#     def __init__(self, name, data_type):
#         self.name = name
#         self.data_type = data_type

#     def __get__(self, instance, owner):
#         if instance is None:
#             return self
#         return instance.__dict__.get(self.name, None)

#     def __set__(self, instance, value):
#         if not isinstance(value, self.data_type) and value is not None:
#             raise TypeError(f"Expected {self.data_type.__name__} for {self.name}, got {type(value).__name__}")
#         instance.__dict__[self.name] = value


# class DynamicClass:

#     def __init__(self):
#         for enum_member in MetricName:
#             setattr(self.__class__, enum_member.value, TypedDescriptor(enum_member.value, MetricName))
#             setattr(self, enum_member.value, None)

In [None]:
class Measure:

    def __init__(self, name: str):
        self._name = name
        self._metrics: list[MetricBase] = []
        for metric in MetricName.values():
            setattr(self, metric, None)

    def set_attribute(self, attr_name, value):
        if hasattr(self, attr_name):
            setattr(self, attr_name, value)
        else:
            raise AttributeError(f"Attribute {attr_name} does not exist.")

    def add(self, metric: MetricBase):
        self._metrics.append(metric)

    def run(self, df: DataFrame):
        results = [metric() for metric in self._metrics]
        

In [210]:
m1 = Measure()

In [214]:
m1.__dict__

{'missing': 2,
 'volumetry': None,
 'duplicity': None,
 'mean': None,
 'teste': None}

In [212]:
# m1.missing = 1
# m1.set_attribute("missing", 2)

In [215]:
metrics_map

{'table': {'volumetry': ['table']},
 'column': {'missing': ['suit', 'id', 'tipo', 'target'],
  'duplicity': ['_pk'],
  'teste': [],
  'mean': ['sepal_width', 'petal_length', 'sepal_length']}}

In [None]:
struct_columns_metrics = defaultdict(dict[str, str])


for metric_name, columns in metrics_map['column'].items():
    for col in columns:
        Metric = MetricBase.get_metric(metric_name)

        real_value = col
        if col == "_pk":
            real_value = config['metrics']['keys']['names']

        measure = Metric(df, real_value)


        struct_columns_metrics[col][metric_name] = measure
