In [1]:
import yaml
import json
import pandas as pd
import numpy as np
from typing import Any
from enum import Enum
from collections import namedtuple
from random import choice
from abc import ABC, ABCMeta, abstractmethod
from sklearn import datasets
from pydantic import (
    BaseModel,
    ValidationError,
    field_validator,
    field_serializer,
    model_validator,
    computed_field,
    ValidatorFunctionWrapHandler,
    ValidationInfo,
    Field,
    ConfigDict,
)

In [2]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark.sql import udf
from pyspark.sql import DataFrame
from pyspark.sql import Row, Column
from pyspark.sql.utils import AnalysisException
from pyspark.pandas.typedef import as_spark_type



In [13]:
# Alias `iteritems` to `items` on pandas DataFrames.
# NOTE(review): presumably a compatibility shim for a pandas version that no longer
# ships `DataFrame.iteritems`, needed when converting a pandas frame with
# `spark.createDataFrame` — confirm against the installed pandas/PySpark versions.
pd.DataFrame.iteritems = pd.DataFrame.items

In [3]:
# Spark session shared by every cell below (Hive support enabled).
spark = (
    SparkSession.builder.appName("Testes")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.optimizerEnabled", "true")
    .config("spark.sql.execution.arrow.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
    .config("spark.sql.legacy.timeParserPolicy", "CORRECTED")
    .config("spark.sql.repl.eagerEval.enabled", "true")
    .config("spark.sql.debug.maxToStringFields", "100000")
    # Key fixed: it used to read "park.sql...", so the Arrow fallback option was
    # never actually set. With fallback disabled, a failed Arrow conversion now
    # raises instead of silently switching to the non-Arrow path.
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
    .enableHiveSupport()
    .getOrCreate()
)

# Keep the notebook output readable: only errors are logged by the driver.
spark.sparkContext.setLogLevel("ERROR")



24/08/16 09:22:45 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface wlp0s20f3)
24/08/16 09:22:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/16 09:22:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load data

In [14]:
# Load the iris dataset with pandas containers; `.frame` is the DataFrame used
# by the following cells.
iris_data = datasets.load_iris(as_frame=True) # classification
df_iris = iris_data.frame

# Other scikit-learn datasets, currently disabled:
# bcancer_data = datasets.load_breast_cancer(as_frame=True) # classification
# df_bcancer = bcancer_data.frame

# diabetes_data = datasets.load_diabetes(as_frame=True) # regression
# df_diabetes = diabetes_data.frame

# wine_data = datasets.load_wine(as_frame=True) # classification
# df_wine = wine_data.frame


In [15]:
# Swap the "<measure> (cm)" column headers for snake_case identifiers.
dfp = df_iris.rename(
    columns={
        "sepal length (cm)": "sepal_length",
        "sepal width (cm)": "sepal_width",
        "petal length (cm)": "petal_length",
        "petal width (cm)": "petal_width",
    }
)

In [16]:
dfp.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [17]:
# https://stackoverflow.com/questions/39109045/numpy-where-with-multiple-conditions

def energy_class(x: float):
    """Bucket a numeric value into 'high', 'medium' or 'low'.

    Values greater than 6 are 'high'; values greater than 5 (up to and
    including 6) are 'medium'; everything else is 'low'.
    """
    # Thresholds are checked from the highest down; the first one exceeded wins.
    for lower_bound, label in ((6, 'high'), (5, 'medium')):
        if x > lower_bound:
            return label
    return 'low'


# Bucket sepal_length into three labelled ranges:
# (0, 5] -> 'low', (5, 6] -> 'medium', (6, inf] -> 'high'.
dfp['tipo'] = pd.cut(dfp['sepal_length'], bins=[0, 5, 6, np.inf], labels=['low', 'medium', 'high'])
# Alternative implementations kept for reference.
# NOTE(review): some of them use different cut-offs (7 or 3) than the pd.cut call
# above, so they are not drop-in equivalents.
# dfp['tipo'] = np.where(dfp['sepal_length'] > 7, 'high', np.where(dfp['sepal_length'] > 5, 'medium', 'low'))
# dfp['tipo'] = dfp['sepal_length'].apply(energy_class)
# dfp['tipo'] = np.select([dfp['sepal_length'] > 7, dfp['sepal_length'] > 5], ['high', 'medium'], default='low')
# dfp['tipo'] = np.vectorize(lambda x: 'high' if x > 5 else ('medium' if x > 3 else 'low'))(dfp['sepal_length'])
# dfp['tipo'] = dfp['sepal_length'].apply(lambda x: 'high' if x > 6 else ('medium' if x > 5 else 'low'))

In [18]:
dfp['tipo'].value_counts()

tipo
high      61
medium    57
low       32
Name: count, dtype: int64

In [19]:
def suit():
    """Return the name of one playing-card suit chosen at random."""
    suits = ('Spade', 'Heart', 'Diamond', 'Club')
    return choice(suits)

# One random suit per row.
# NOTE(review): no random seed is set in the visible cells, so this column differs
# between runs — confirm whether reproducibility matters here.
dfp['suit'] = [suit() for _ in range(len(dfp))]

In [20]:
dfp.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target,tipo,suit
0,5.1,3.5,1.4,0.2,0,medium,Club
1,4.9,3.0,1.4,0.2,0,low,Diamond
2,4.7,3.2,1.3,0.2,0,low,Club
3,4.6,3.1,1.5,0.2,0,low,Club
4,5.0,3.6,1.4,0.2,0,low,Heart


In [22]:
# Convert the prepared pandas frame into a Spark DataFrame.
df = spark.createDataFrame(dfp)

## Dev

In [23]:
class MetricType(Enum):
    """Metric categories used in this notebook: COLUMN and TABLE.

    Each member's value is its own name as a string.
    """
    COLUMN = "COLUMN"
    TABLE = "TABLE"

In [24]:
class MetricName(Enum):
    """Names of the metrics defined in this notebook: MEAN and DUPLICITY.

    Each member's value is its own name as a string; the metric classes use
    that string as their `name` attribute.
    """
    MEAN = "MEAN"
    DUPLICITY = "DUPLICITY"

In [25]:
class MetricMetaClass(type):
    """Metaclass that turns "calling the class" into "running its ``evaluate``".

    For a class using this metaclass, ``SomeMetric(*args, **kwargs)`` does not
    build an instance: it prints a trace line and returns
    ``SomeMetric.evaluate(*args, **kwargs)``.
    """

    def __new__(cls, *args, **kwargs):
        # Pass-through to the parent metaclass; class creation is not customised.
        created = super().__new__(cls, *args, **kwargs)
        return created

    def __call__(cls, *args, **kwargs):
        print("MetaClass Mae")
        evaluator = cls.evaluate
        return evaluator(*args, **kwargs)

    @abstractmethod
    def evaluate(self):
        """Fallback used when the class itself provides no ``evaluate``."""
        raise NotImplementedError

In [26]:
class CombineMeta(ABCMeta, MetricMetaClass):
    """Metaclass combining ``ABCMeta`` with ``MetricMetaClass``.

    Lets a class derive from an ``ABC``-based base while also using the call
    behaviour of ``MetricMetaClass``, without a metaclass conflict.
    """

In [27]:
class MetricBase(ABC):
    """Abstract base declaring what a metric class must provide.

    Subclasses are expected to supply ``name``, ``calculate`` and ``evaluate``.
    """

    @classmethod
    def all_metrics(cls) -> list:
        """Return the ``name`` of every direct subclass, in ``__subclasses__()`` order."""
        names = []
        for metric_cls in cls.__subclasses__():
            names.append(metric_cls.name)
        return names

    @property
    @abstractmethod
    def name(self) -> str:
        """Identifier of the metric; must be provided by each subclass."""
        raise NotImplementedError

    # @property
    # @abstractmethod
    # def type(self) -> str:
    #     raise NotImplementedError

    # @property
    # @abstractmethod
    # def schema(self) -> str:
    #     raise NotImplementedError

    # NOTE(review): declared as a static method yet takes `self`; the subclasses in
    # this notebook override it as a classmethod with no extra parameters.
    @staticmethod
    @abstractmethod
    def calculate(self, df: DataFrame, col) -> Any:
        """Run the metric."""

    @staticmethod
    @abstractmethod
    def evaluate(self) -> Any:
        """Run the metric."""


In [28]:
class ColumnMean(MetricBase, metaclass=CombineMeta):
    """Metric class named "MEAN" with type COLUMN.

    Because the metaclass routes calls to ``evaluate``, ``ColumnMean(...)``
    runs ``ColumnMean.evaluate(...)`` instead of creating an instance.
    """
    
    # `.value` is the plain string "MEAN", hence the `str` annotation.
    name: str = MetricName.MEAN.value
    type: MetricType = MetricType.COLUMN
    # Nullable float field named after the metric.
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        # NOTE(review): no `spark` attribute is defined on this class or its bases
        # in the visible code, so this lookup would raise AttributeError — confirm intent.
        print(type(cls.spark))

    @staticmethod
    def evaluate(*args, **kwargs):
        # Placeholder: prints the positional arguments; keyword arguments are ignored.
        print(*args) 

In [29]:
class ColumnDuplicity(MetricBase, metaclass=CombineMeta):
    """Metric class named "DUPLICITY" with type COLUMN.

    Because the metaclass routes calls to ``evaluate``, ``ColumnDuplicity(...)``
    runs ``ColumnDuplicity.evaluate(...)`` instead of creating an instance.
    """
    
    # `.value` is the plain string "DUPLICITY", hence the `str` annotation.
    name: str = MetricName.DUPLICITY.value
    type: MetricType = MetricType.COLUMN
    # Nullable float field named after the metric.
    # NOTE(review): `metrics_schema` elsewhere in this notebook declares DUPLICITY
    # as IntegerType — confirm which type is intended.
    schema: T.StructField = T.StructField(name, T.FloatType(), True)
    
    @classmethod
    def calculate(cls):
        # NOTE(review): no `spark` attribute is defined on this class or its bases
        # in the visible code, so this lookup would raise AttributeError — confirm intent.
        print(type(cls.spark))

    @staticmethod
    def evaluate(*args, **kwargs):
        # Placeholder: prints the positional arguments; keyword arguments are ignored.
        print(*args) 

In [30]:
all_metrics = MetricBase.all_metrics()
all_metrics

['MEAN', 'DUPLICITY']

In [31]:
# getattr(ColumnMean, "type")
ColumnMean.type

<MetricType.COLUMN: 'COLUMN'>

In [32]:
# Schema fields for the metrics table, keyed by column name.
# NOTE(review): DUPLICITY is IntegerType here while ColumnDuplicity.schema declares
# FloatType — confirm which one is intended.
# NOTE(review): "COLUNA" is typed FloatType; if it is meant to hold a column name it
# would need StringType — confirm.
metrics_schema = {
    "COLUNA": T.StructField("COLUNA", T.FloatType(), True),
    MetricName.MEAN.value: T.StructField(MetricName.MEAN.value, T.FloatType(), True),
    MetricName.DUPLICITY.value: T.StructField(MetricName.DUPLICITY.value, T.IntegerType(), True),
}

In [33]:
# Register an empty DataFrame carrying the metrics schema as temp view `tb_metrics`,
# then display it to check the column layout.
spark.createDataFrame([], schema=T.StructType(list(metrics_schema.values()))).createOrReplaceTempView("tb_metrics")
spark.table("tb_metrics").show()

+------+----+---------+
|COLUNA|MEAN|DUPLICITY|
+------+----+---------+
+------+----+---------+



## Config

In [34]:
file_path = "mrm.yaml"

# Read the metrics configuration. The encoding is passed explicitly so the file is
# decoded the same way on every platform instead of relying on the locale default.
with open(file_path, encoding="utf-8") as f:
    config = yaml.safe_load(f)

In [35]:
config

{'reference': {'database': 'workspace_db',
  'table': 'tb_spec_dataset',
  'train_data': {'start': 202001, 'end': 202212}},
 'metrics': {'table': ['volumetria'],
  'keys': {'names': ['pk_1', 'pk_2'],
   'together': ['duplicidade'],
   'individual': {'pk_1': ['ausencia'], 'pk_2': ['ausencia']}},
  'features': {'numerigcal': {'col_a': ['media', 'moda', 'variancia'],
    'col_b': ['media'],
    'col_c': ['moda']},
   'categorical': {'col_d': ['ausencia']}},
  'target': {'target_name': ['distribuicao']}}}

## Measures

In [36]:
# Record type with one field per metric name held in `all_metrics`.
Measures = namedtuple("Measures", all_metrics)

In [37]:
Measures._fields

('MEAN', 'DUPLICITY')

In [38]:
m1 = Measures(2, 0)

In [39]:
m1.MEAN

2

In [40]:
m1._asdict()

{'MEAN': 2, 'DUPLICITY': 0}

## Data

In [43]:
df.show(5, truncate=False)

+------------+-----------+------------+-----------+------+------+-------+
|sepal_length|sepal_width|petal_length|petal_width|target|tipo  |suit   |
+------------+-----------+------------+-----------+------+------+-------+
|5.1         |3.5        |1.4         |0.2        |0     |medium|Club   |
|4.9         |3.0        |1.4         |0.2        |0     |low   |Diamond|
|4.7         |3.2        |1.3         |0.2        |0     |low   |Club   |
|4.6         |3.1        |1.5         |0.2        |0     |low   |Club   |
|5.0         |3.6        |1.4         |0.2        |0     |low   |Heart  |
+------------+-----------+------------+-----------+------+------+-------+
only showing top 5 rows



In [None]:
Tenho um dicionário de entrada de uma função Python e preciso que a saída seja no seguinte formato:

input = {
    'metrics': {
        'table': ['volumetria'],
        'keys': {
            'names': ['id'],
            'together': ['duplicidade'],
            'individual': {
                'id': ['ausencia']
            }
        },
        'features': {
            'numerigcal': {
                'col_a': ['media'],
                'col_b': ['media'],
                'col_c': ['media']
            },
            'categorical': {
                'col_d': ['ausencia']
            }
        },
        'target': {
            'target_name': ['ausencia']
        }
    }
}

output = {
    'ausencia': ['id', 'col_d', 'target_name'],
    'duplicidade': ['together'],
    'media': ['col_a', 'col_b', 'col_c']
}


Preciso agrupar individualmente os valores de cada lista com os valores das chaves em uma lista.
Pensei em fazer algo como uma função aninhada (recursiva) que trata os valores do tipo lista e, se o valor for um dicionário, chama a própria função.

In [45]:
# Sample input shaped like the `metrics` section of the YAML config; passed to
# `process_input` further down.
x = {
    'metrics': {
        'table': ['volumetria'],
        'keys': {
            'names': ['id'],
            'together': ['duplicidade'],
            'individual': {
                'id': ['ausencia']
            }
        },
        'features': {
            'numerigcal': {
                'col_a': ['media'],
                'col_b': ['media'],
                'col_c': ['media']
            },
            'categorical': {
                'col_d': ['ausencia']
            }
        },
        'target': {
            'target_name': ['ausencia']
        }
    }
}

In [None]:
# Hand-written target result for the sample input `x`.
# NOTE(review): the recorded result of `process_input(x)` further down also contains
# 'volumetria' and 'id' entries and sorts each list, so it does not equal this dict;
# the name `output` is also reassigned by that later cell.
output = {
    'ausencia': ['id', 'col_d', 'target_name'],
    'duplicidade': ['together'],
    'media': ['col_a', 'col_b', 'col_c']
}

In [46]:
def process_dict(d, output=None):
    """Invert a nested ``key -> [items]`` mapping into ``item -> [keys]``.

    Walks ``d`` recursively. For every entry whose value is a list, each list
    item becomes a key of ``output`` and the entry's key is appended to that
    item's list. Entries whose value is a dict are descended into; values of
    any other type are ignored.

    Args:
        d: Mapping to walk; values may be lists, nested dicts, or anything else.
        output: Optional accumulator dict, updated in place. A new dict is
            created when omitted.

    Returns:
        The accumulator (the same object as ``output`` when one was passed).
    """
    if output is None:
        output = {}

    for key, value in d.items():
        if isinstance(value, dict):
            process_dict(value, output)
        elif isinstance(value, list):
            for item in value:
                output.setdefault(item, []).append(key)

    return output

def process_input(input_dict):
    """Group the keys found under ``input_dict['metrics']`` by list item.

    Args:
        input_dict: Mapping that may contain a ``'metrics'`` entry holding a
            nested dict of lists.

    Returns:
        A dict mapping each list item to the sorted list of keys it appeared
        under. Empty when ``'metrics'`` is missing or empty.
    """
    # `or {}` also covers a 'metrics' key that is present but None (e.g. an empty
    # YAML section); `.get('metrics', {})` alone would return None and then fail
    # on `.items()`.
    metrics_section = input_dict.get('metrics') or {}
    grouped = process_dict(metrics_section)

    # Sort each list for a stable result; build a new dict rather than rewriting
    # the one being iterated.
    return {item: sorted(keys) for item, keys in grouped.items()}

In [47]:
output = process_input(x)
print(output)

{'volumetria': ['table'], 'id': ['names'], 'duplicidade': ['together'], 'ausencia': ['col_d', 'id', 'target_name'], 'media': ['col_a', 'col_b', 'col_c']}
