In [140]:
import yaml
import json
import pandas as pd
import numpy as np
from pprint import pprint
from types import SimpleNamespace
from typing import Any, Union, Callable, NamedTuple
from functools import wraps
from enum import Enum, auto
from collections import namedtuple, defaultdict
from random import choice
from abc import ABC, ABCMeta, abstractmethod
from sklearn import datasets
from pydantic import (
    BaseModel,
    ValidationError,
    field_validator,
    field_serializer,
    model_validator,
    computed_field,
    ValidatorFunctionWrapHandler,
    ValidationInfo,
    Field,
    ConfigDict,
)

In [2]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark.sql import udf
from pyspark.sql import DataFrame
from pyspark.sql import Row, Column
from pyspark.sql.utils import AnalysisException
from pyspark.pandas.typedef import as_spark_type



In [3]:
pd.DataFrame.iteritems = pd.DataFrame.items

In [4]:
spark = (
    SparkSession.builder.appName("Testes")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.optimizerEnabled', 'true')
    .config('spark.sql.execution.arrow.enabled', 'true')
    .config('spark.sql.execution.arrow.pyspark.enabled', 'true')
    .config("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.legacy.timeParserPolicy", "CORRECTED")
    .config("spark.sql.repl.eagerEval.enabled", "true")
    .config("spark.sql.debug.maxToStringFields", "100000")
    .config("park.sql.execution.arrow.pyspark.fallback.enabled", "false")
    .enableHiveSupport()
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")



24/08/18 09:06:49 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface wlp0s20f3)
24/08/18 09:06:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/18 09:06:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load data

In [5]:
iris_data = datasets.load_iris(as_frame=True) # classification
df_iris = iris_data.frame

# bcancer_data = datasets.load_breast_cancer(as_frame=True) # classification
# df_bcancer = bcancer_data.frame

# diabetes_data = datasets.load_diabetes(as_frame=True) # regression
# df_diabetes = diabetes_data.frame

# wine_data = datasets.load_wine(as_frame=True) # classification
# df_wine = wine_data.frame


In [6]:
dfp = df_iris.rename({
    "sepal length (cm)": "sepal_length",
    "sepal width (cm)": "sepal_width",
    "petal length (cm)": "petal_length",
    "petal width (cm)": "petal_width",
}, axis=1)

In [7]:
dfp.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [8]:
# https://stackoverflow.com/questions/39109045/numpy-where-with-multiple-conditions

def energy_class(x: float):
    if x > 6:
        return 'high'
    elif x > 5:
        return 'medium'
    else:
        return 'low'


dfp['tipo'] = pd.cut(dfp['sepal_length'], bins=[0, 5, 6, np.inf], labels=['low', 'medium', 'high'])
# dfp['tipo'] = np.where(dfp['sepal_length'] > 7, 'high', np.where(dfp['sepal_length'] > 5, 'medium', 'low'))
# dfp['tipo'] = dfp['sepal_length'].apply(energy_class)
# dfp['tipo'] = np.select([dfp['sepal_length'] > 7, dfp['sepal_length'] > 5], ['high', 'medium'], default='low')
# dfp['tipo'] = np.vectorize(lambda x: 'high' if x > 5 else ('medium' if x > 3 else 'low'))(dfp['sepal_length'])
# dfp['tipo'] = dfp['sepal_length'].apply(lambda x: 'high' if x > 6 else ('medium' if x > 5 else 'low'))

In [9]:
dfp['tipo'].value_counts()

tipo
high      61
medium    57
low       32
Name: count, dtype: int64

In [10]:
def suit():
    return choice(('Spade', 'Heart', 'Diamond', 'Club'))

dfp['suit'] = [suit() for _ in range(len(dfp))]

In [11]:
dfp['id'] = [i for i, _ in enumerate(range(len(dfp)), start=1)]

In [12]:
# Missing
dfp.iloc[3, 6] = None
dfp.iloc[9, 6] = None

# Duplicates
dfp.iloc[1, 7] = 1

In [13]:
dfp.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,tipo,suit,id
0,5.1,3.5,1.4,0.2,0,medium,Diamond,1
1,4.9,3.0,1.4,0.2,0,low,Spade,1
2,4.7,3.2,1.3,0.2,0,low,Diamond,3
3,4.6,3.1,1.5,0.2,0,low,,4
4,5.0,3.6,1.4,0.2,0,low,Diamond,5
5,5.4,3.9,1.7,0.4,0,medium,Club,6
6,4.6,3.4,1.4,0.3,0,low,Spade,7
7,5.0,3.4,1.5,0.2,0,low,Club,8
8,4.4,2.9,1.4,0.2,0,low,Spade,9
9,4.9,3.1,1.5,0.1,0,low,,10


In [14]:
df = spark.createDataFrame(dfp)

## Dev

In [15]:
class ExtendEnum(Enum):

    def _generate_next_value_(name: str, start, count, last_values):
        return name.lower()
    
    @classmethod
    def __getitem__(cls, name: str):
        return cls(name.lower())
    
    @classmethod
    def to_dict(cls):
        return {e.name: e.value for e in cls}
    
    @classmethod
    def keys(cls):
        return cls._member_names_
    
    @classmethod
    def values(cls):
        return list(map(lambda c: c.value, cls))
    
    def __str__(self):
        return str(self.value)
    
    def __repr__(self) -> str:
        return str(self.value)

In [16]:
class MetricType(ExtendEnum):
    COLUMN = "COLUMN"
    TABLE = "TABLE"

In [17]:
class MetricName(ExtendEnum):
    MISSING = auto()
    VOLUMETRY = auto()
    DUPLICITY = auto()
    MEAN = auto()
    TESTE = auto()

In [None]:
def replace_key_value(func: Callable) -> Callable:
    @wraps
    def wrapper(*args, **kwargs) -> Any:
        return func(*args, **kwargs)
    return wrapper

In [148]:
def teste(*args, **kwargs):
    print(f"{args = }")
    new_args = list(map(lambda x: x.replace('a', 'A'), args))
    print(f"{new_args = }")

In [149]:
teste("a", "b")

args = ('a', 'b')
new_args = ['A', 'b']


In [152]:
def replace_key_value(replace_map: dict):
    def decorator(func: Callable):
        @wraps
        def wrapper(*args, **kwargs) -> Any:
            new_args = []
            new_kwargs = {}

            for item in args:
                if item in replace_map.keys():
                    new_args.append(replace_map[item])
                else:
                     new_args.append(item)
            
            for key, value in kwargs.items():
                if key in replace_map.keys():
                    new_kwargs[key] = replace_map[key]
                else:
                    new_kwargs[key] = value

            # if set(replace_map.keys()).intersection(args):
            #     args = list(map(lambda x: x.replace('_key', replace_map.get('_key')), args))
            # if set(replace_map.keys()).intersection(kwargs.keys()):
            #     kwargs[param_name] = new_value
            return func(*new_args, **new_kwargs)
        return wrapper
    return decorator

In [105]:
class MetricMetaClass(type):

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls, *args, **kwargs)
    
    def __call__(cls, df: DataFrame, *args, **kwargs):
        print(f"{type(df) = }")
        print(f"{args = }")
        print(f"{kwargs = }")
        return cls.evaluate(df, *args, **kwargs)
    
    @abstractmethod
    def evaluate(self):
        raise NotImplementedError

In [106]:
class CombineMeta(ABCMeta, MetricMetaClass):
    pass

In [107]:
class MetricBase(ABC):

    spark_session = SparkSession.getActiveSession()

    @classmethod
    def all_metrics(cls) -> list:
        return list(set([subclass.name for subclass in cls.__subclasses__()]))

    @classmethod
    def metrics_by_type(cls, metric_type: MetricType) -> list:
        metrics = []
        for subclass in cls.__subclasses__():
            if metric_type == subclass.type:
                metrics.append(subclass.name)
        return metrics

    @classmethod
    def get_metric(cls, metric_name: str) -> 'MetricBase':
        for subclass in cls.__subclasses__():
            if metric_name == subclass.name:
                return subclass

    @property
    @abstractmethod
    def name(self) -> str:
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def calculate(self, df: DataFrame, col: str | list) -> Any:
        "Metodo executa metrica"
    
    @staticmethod
    @abstractmethod
    def evaluate(self) -> Any:
        "Metodo executa metrica"


In [108]:
class ColumnMissing(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.MISSING.value
    type: MetricType = MetricType.COLUMN
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, col: str) -> int:
        missing = df.select(col).where(F.col(col).isNull()).count()
        return missing 

In [109]:
class TableVolumetry(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.VOLUMETRY.value
    type: MetricType = MetricType.TABLE
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, *args, **kwargs) -> int:
        count = df.count()
        return count 

In [110]:
class ColumnDuplicity(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.DUPLICITY.value
    type: MetricType = MetricType.COLUMN
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, col: Union[str, list]) -> int:
        print(f"{col = }")
        if isinstance(col, str):
            col = col = [col]
        distinct = df.select(col).dropDuplicates(subset=col).count()
        total = df.count()
        return total - distinct

In [111]:
class ColumnTeste(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.TESTE.value
    type: MetricType = MetricType.COLUMN
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, *args, **kwargs) -> int:
        ...

In [112]:
class ColumnMean(MetricBase, metaclass=CombineMeta):
    
    name: MetricName = MetricName.MEAN.value
    type: MetricType = MetricType.COLUMN
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        print(type(cls.spark))

    @staticmethod
    def evaluate(df: DataFrame, col: str) -> int:
        mean = df.select(F.mean(col)).first()[0]
        return mean

In [113]:
all_metrics = MetricBase.all_metrics()
all_metrics

['mean', 'duplicity', 'volumetry', 'missing', 'teste']

## Config

In [114]:
file_path = "mrm.yaml"

with open(file_path) as f:
    config = yaml.safe_load(f)

In [115]:
config

{'reference': {'database': 'workspace_db',
  'table': 'tb_spec_dataset',
  'train_data': {'start': 202001, 'end': 202212}},
 'metrics': {'table': ['volumetry'],
  'keys': {'names': ['id'],
   '_keys': ['duplicity'],
   'individual': {'id': ['missing']}},
  'features': {'numerical': {'sepal_length': ['mean', 'missing'],
    'sepal_width': ['mean'],
    'petal_length': ['mean']},
   'categorical': {'tipo': ['missing'], 'suit': ['missing']}},
  'target': {'target': ['missing']}}}

## Measures

In [133]:
def key_columns(config: dict) -> dict:
    return {"_keys": config['keys']['names']}

In [134]:
map_key_columns = key_columns(config['metrics'])
print(map_key_columns)

{'_keys': ['id']}


In [151]:
"_keyss" in map_key_columns.keys()

False

In [116]:
def columns_classification(config: dict):
    response = {}
    temp = {
        # "chaves": config['keys']['names'],
        "chave": ["_keys"],
        "numerica": list(config['features']['numerical'].keys()),
        "categorical": list(config['features']['categorical'].keys()),
        "target": list(config['target'].keys())
    }
    for key, values in temp.items():
        if isinstance(values, list):
            for value in values:
                response[value] = key
        else:
            response[values] = key
    return response

In [117]:
cols_classification = columns_classification(config['metrics'])
cols_classification

{'_keys': 'chave',
 'sepal_length': 'numerica',
 'sepal_width': 'numerica',
 'petal_length': 'numerica',
 'tipo': 'categorical',
 'suit': 'categorical',
 'target': 'target'}

In [118]:
column_metrics = MetricBase.metrics_by_type(MetricType.COLUMN)
table_metrics = MetricBase.metrics_by_type(MetricType.TABLE)

In [119]:
def process_dict(config: dict, condition: Callable, output: dict | None = None):
    if output is None:
        output: dict[str, list] = {}
    for key, value in config.items():
        if condition(key):
            if isinstance(value, dict):
                process_dict(value, condition, output)
            if isinstance(value, list):
                for item in value:
                    if key not in output:
                        output[key] = []
                    output[key].append(item)
    return output


def process_config_metrics(metric_config: dict, type: MetricType) -> dict:
    if type == MetricType.COLUMN:
        condition = lambda x: x not in {"table", "names"}
    else:
        condition = lambda x: x == "table"

    metrics_output: dict[str, list[str]] = process_dict(metric_config, condition)
    return {key: list(set(value)) for key, value in metrics_output.items()}

In [120]:
metrics_column_map = process_config_metrics(config['metrics'], MetricType.COLUMN)
metrics_table_map = process_config_metrics(config['metrics'], MetricType.TABLE)

In [122]:
metrics_map = metrics_column_map | {"_table": metrics_table_map["table"]}

In [123]:
pprint(metrics_map)

{'_keys': ['duplicity'],
 '_table': ['volumetry'],
 'id': ['missing'],
 'petal_length': ['mean'],
 'sepal_length': ['mean', 'missing'],
 'sepal_width': ['mean'],
 'suit': ['missing'],
 'target': ['missing'],
 'tipo': ['missing']}


## Measures

In [124]:
metrics_map

{'_keys': ['duplicity'],
 'id': ['missing'],
 'sepal_length': ['mean', 'missing'],
 'sepal_width': ['mean'],
 'petal_length': ['mean'],
 'tipo': ['missing'],
 'suit': ['missing'],
 'target': ['missing'],
 '_table': ['volumetry']}

In [125]:
# reponse_fileds = ["atributo", "classificacao", "tipo", "metrica_medida", "valor_medido"]
# ResponseTemplate = namedtuple("ResponseTemplate", reponse_fileds)
# ResponseTemplate.__new__.__defaults__ = (None, ) * len(reponse_fileds)

In [126]:
class Measures:

    def __init__(self, name: str, classification: str):
        self._name = name
        self._classification = classification
    #     # self._metrics: list[MetricBase] = []
    #     for metric in MetricName.values():
    #         setattr(self, metric, None)

    def set_attribute(self, attr_name, value) -> None:
        setattr(self, attr_name, value)
        # if hasattr(self, attr_name):
        #     setattr(self, attr_name, value)
        # else:
        #     raise AttributeError(f"Attribute {attr_name} does not exist.")

    def calculate(self, df: DataFrame) -> list[dict]:
        # results = [metric() for metric in self._metrics]
        response: list[dict] = []
        for value in self.__dict__.values():
            if isinstance(value, CombineMeta):
                # if _key
                metric_result = value(df, self._name)
                result = {
                    "atributo": self._name,
                    "classificacao": self._classification,
                    "tipo": value.type.value,
                    "metrica_medida": value.name,
                    "valor_medido": metric_result
                }
                response.append(result)
            # elif isinstance(value, str):
            #     if value == "_key":
            #         result = {
            #             "atributo": self._name,
            #             "classificacao": self._classification,
            #             "tipo": value.type.value,
            #             "metrica_medida": value.name,
            #             "valor_medido": metric_result
            #         }
            #     elif value == "_table":
            #         result = {
            #             "atributo": self._name,
            #             "classificacao": self._classification,
            #             "tipo": value.type.value,
            #             "metrica_medida": value.name,
            #             "valor_medido": metric_result
            #         } 
        return response

In [127]:
class HandlerMeasure:

    def __init__(self, metrics_map: dict):
        self._metrics_map = self.clean_metrics(metrics_map)

    @staticmethod
    def clean_metrics(metrics_map: dict) -> dict[str, list]:
        return {key: [] for key in metrics_map.keys()}

    def add(self, column: str, measure: Measures) -> None:
        self._metrics_map[column] = measure

    def run(self, df: DataFrame):
        # response_template = {metric_name: None for metric_name in all_metrics}
        response = {}
        for column, measure in self._metrics_map.items():
            print(f"--> {column = }")
            response[column] = measure.calculate(df)
            print("==================================================")


In [128]:
metrics_map

{'_keys': ['duplicity'],
 'id': ['missing'],
 'sepal_length': ['mean', 'missing'],
 'sepal_width': ['mean'],
 'petal_length': ['mean'],
 'tipo': ['missing'],
 'suit': ['missing'],
 'target': ['missing'],
 '_table': ['volumetry']}

In [129]:
cols_classification

{'_keys': 'chave',
 'sepal_length': 'numerica',
 'sepal_width': 'numerica',
 'petal_length': 'numerica',
 'tipo': 'categorical',
 'suit': 'categorical',
 'target': 'target'}

In [130]:
handler_measures = HandlerMeasure(metrics_map)

for column, metrics in metrics_map.items():
    measures = Measures(column, cols_classification.get(column))
    for metric_name in metrics:       
        Metric = MetricBase.get_metric(metric_name)
        measures.set_attribute(metric_name, Metric)
    handler_measures.add(column, measures)

In [131]:
handler_measures._metrics_map['_keys'].__dict__

{'_name': '_keys',
 '_classification': 'chave',
 'duplicity': __main__.ColumnDuplicity}

In [132]:
handler_measures.run(df)

--> column = '_keys'
type(df) = <class 'pyspark.sql.dataframe.DataFrame'>
args = ('_keys',)
kwargs = {}
col = '_keys'


AnalysisException: Column '_keys' does not exist. Did you mean one of the following? [id, suit, tipo, target, petal_length, petal_width, sepal_length, sepal_width];
'Project ['_keys]
+- LogicalRDD [sepal_length#0, sepal_width#1, petal_length#2, petal_width#3, target#4L, tipo#5, suit#6, id#7L], false
