In [1]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session, or reuse one that already exists,
# requesting 8g for the driver.
spark = (
    SparkSession.builder
    .appName("Recipes ML Model - Are you a dessert?")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Bare expression: show the session summary as the cell output.
spark

In [2]:
from pyspark.ml import Transformer
import pyspark.sql.functions as F
from pyspark.sql import DataFrame, Column

def scalarNAFillerFunction(
    df: DataFrame,
    inputCol: Column,
    outputCol: str,
    filler: float = 0.0
):
    """Copy a column under a new name and replace the nulls in the copy.

    Function-style prototype of the null-filling logic.

    Args:
        df: Frame to add the column to.
        inputCol: Column expression whose values are copied.
        outputCol: Name of the column that receives the copy.
        filler: Value substituted for nulls in ``outputCol``.

    Returns:
        ``df`` plus ``outputCol``; only ``outputCol`` is null-filled, the
        source column is left as it was.
    """
    with_copy = df.withColumn(outputCol, inputCol)
    # Restrict the fill to the new column so other columns keep their nulls.
    return with_copy.fillna(filler, subset=outputCol)
    

# Small demo frame; column "three" is null in the last two rows.
df = spark.createDataFrame(
    data=[
        [1, 2, 4, 1],
        [3, 6, 5, 4],
        [9, 4, None, 9],
        [11, 17, None, 3],
    ],
    schema=["one", "two", "three", "four"],
)

# Copy "three" into "five" and fill the nulls of the copy with -99.0.
scalarNAFillerFunction(df, F.col("three"), "five", -99.0).show()

#df.withColumn("five",col=F.col("three")).fillna(-99.0,subset="five").show()


+---+---+-----+----+----+
|one|two|three|four|five|
+---+---+-----+----+----+
|  1|  2|    4|   1|   4|
|  3|  6|    5|   4|   5|
|  9|  4| NULL|   9| -99|
| 11| 17| NULL|   3| -99|
+---+---+-----+----+----+



In [3]:
from pyspark.ml.param import Param, Params, TypeConverters

# Stand-alone Param built with the dummy parent placeholder; values given
# for it go through TypeConverters.toFloat.
filler = Param(
    Params._dummy(),
    "filler",
    "Value we want to replace our null values with.",
    typeConverter=TypeConverters.toFloat,
)

# Bare expression: show the Param's repr as the cell output.
filler

Param(parent='undefined', name='filler', doc='Value we want to replace our null values with.')

In [4]:
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark import keyword_only

class ScalarNAFiller(Transformer, HasInputCol, HasOutputCol):
    """Transformer that copies ``inputCol`` into ``outputCol`` (cast to
    double) and replaces the nulls of the copy with ``filler``.

    Params:
        inputCol: Name of the column to read. Must be set before
            ``transform`` is called.
        outputCol: Name of the column to write.
        filler: Replacement for null values, converted with
            ``TypeConverters.toFloat``. Its default is ``None``, meaning
            "not configured"; ``transform`` then raises ``ValueError``.
    """

    # Params are declared at class level, with the dummy parent as placeholder.
    filler = Param(
        parent=Params._dummy(),
        name="filler",
        doc="Value we want to replace our null values with.",
        typeConverter=TypeConverters.toFloat,
    )

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, filler=None):
        """Create the transformer; only explicitly passed keywords are set."""
        super().__init__()
        # A None default lets getFiller() answer "unset" instead of failing.
        self._setDefault(filler=None)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    # setParams has to be defined; it returns the instance itself.
    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, filler=None):
        """Set the explicitly passed params and return ``self``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setFiller(self, new_filler):
        """Set ``filler`` and return ``self``."""
        return self.setParams(filler=new_filler)

    def setInputCol(self, new_inputCol):
        """Set ``inputCol`` and return ``self``."""
        return self.setParams(inputCol=new_inputCol)

    def setOutputCol(self, new_outputCol):
        """Set ``outputCol`` and return ``self``."""
        return self.setParams(outputCol=new_outputCol)

    def getFiller(self):
        """Return the configured filler, or the default (``None``) if unset."""
        return self.getOrDefault(self.filler)

    def _transform(self, dataset: DataFrame):
        """Add ``outputCol`` to ``dataset`` with nulls replaced by ``filler``.

        Raises:
            ValueError: If ``inputCol`` has not been set, or if no filler
                value is configured.
        """
        if not self.isSet("inputCol"):
            raise ValueError("No input column set for the ScalarNAFiller transformer.")

        na_filler = self.getFiller()
        if na_filler is None:
            # Fail here with a clear message rather than letting
            # DataFrame.fillna reject None with an unrelated type error.
            raise ValueError("No filler value set for the ScalarNAFiller transformer.")

        input_column = dataset[self.getInputCol()]
        output_column = self.getOutputCol()

        return dataset.withColumn(
            output_column,
            input_column.cast("double")
        ).fillna(
            na_filler,
            subset=output_column
        )

# Same null-filling as the function version, now as a Transformer instance.
a = ScalarNAFiller(
    inputCol="three",
    outputCol="five",
    filler=-99,
)

# Show the input frame first, then the transformed frame, for comparison.
df.show()
a.transform(df).show()
    

+---+---+-----+----+
|one|two|three|four|
+---+---+-----+----+
|  1|  2|    4|   1|
|  3|  6|    5|   4|
|  9|  4| NULL|   9|
| 11| 17| NULL|   3|
+---+---+-----+----+

+---+---+-----+----+-----+
|one|two|three|four| five|
+---+---+-----+----+-----+
|  1|  2|    4|   1|  4.0|
|  3|  6|    5|   4|  5.0|
|  9|  4| NULL|   9|-99.0|
| 11| 17| NULL|   3|-99.0|
+---+---+-----+----+-----+



In [5]:
# setFiller changes `a` itself (and returns it), so the new filler is still
# in effect when getFiller() is read afterwards.
a.setFiller(17)
a.transform(df).show()
a.getFiller()

+---+---+-----+----+----+
|one|two|three|four|five|
+---+---+-----+----+----+
|  1|  2|    4|   1| 4.0|
|  3|  6|    5|   4| 5.0|
|  9|  4| NULL|   9|17.0|
| 11| 17| NULL|   3|17.0|
+---+---+-----+----+----+



17.0

In [6]:
# Override the filler for this single transform call only; getFiller()
# afterwards still reports the value stored on `a`.
a.transform(
    df,
    params={a.filler: 40},
).show()
a.getFiller()

+---+---+-----+----+----+
|one|two|three|four|five|
+---+---+-----+----+----+
|  1|  2|    4|   1| 4.0|
|  3|  6|    5|   4| 5.0|
|  9|  4| NULL|   9|40.0|
| 11| 17| NULL|   3|40.0|
+---+---+-----+----+----+



17.0

In [7]:
# Inspect the Param through the instance: the repr's parent is the
# transformer's uid rather than the 'undefined' placeholder.
a.filler

Param(parent='ScalarNAFiller_6c1a06ed4dc9', name='filler', doc='Value we want to replace our null values with.')

# ExtremeValueCapper is an estimator

Create the model class as a `Model`, which is a `Transformer`.

Any value below the floor is replaced by the floor (and, symmetrically, any value above the cap is replaced by the cap).



In [8]:
def model_transform(
    df: DataFrame,
    inputCol: Column,
    outputCol: str,
    cap: float,
    floor: float,
):
    """Write a bounded copy of ``inputCol`` into ``outputCol``.

    Args:
        df: Frame to add the column to.
        inputCol: Column expression to bound.
        outputCol: Name of the resulting column.
        cap: Values greater than this are replaced by ``cap``.
        floor: Values less than this are replaced by ``floor``.

    Returns:
        ``df`` plus ``outputCol``. The cap test is evaluated first; rows
        matching neither test keep the input value.
    """
    above_cap = inputCol > cap
    below_floor = inputCol < floor

    bounded = (
        F.when(above_cap, cap)
        .when(below_floor, floor)
        .otherwise(inputCol)
    )
    return df.withColumn(outputCol, bounded)

# Bound column "four" between floor 0.0 and cap 1.0.
model_transform(
    df,
    inputCol=F.col("four"),
    outputCol="new_column",
    cap=1.0,
    floor=0.0,
).show()

+---+---+-----+----+----------+
|one|two|three|four|new_column|
+---+---+-----+----+----------+
|  1|  2|    4|   1|       1.0|
|  3|  6|    5|   4|       1.0|
|  9|  4| NULL|   9|       1.0|
| 11| 17| NULL|   3|       1.0|
+---+---+-----+----+----------+



In [9]:
def estimator_fit(
    df: DataFrame,
    inputCol: Column,
    outputCol: str,
    boundary: float
):
    """Derive cap and floor from the data, then apply them to the same frame.

    The bounds are ``mean ± boundary * stddev`` of ``inputCol``, computed in
    a single aggregation. They are printed before the bounded frame is
    returned.

    Args:
        df: Frame used both to compute the statistics and to transform.
        inputCol: Column expression to summarise and bound.
        outputCol: Name of the resulting column.
        boundary: Number of standard deviations allowed on each side of the
            mean.

    Returns:
        The result of ``model_transform`` with the computed bounds.
    """
    stats_row = df.agg(F.mean(inputCol), F.stddev(inputCol)).head()
    avg, stddev = stats_row

    spread = boundary * stddev
    cap = avg + spread
    floor = avg - spread

    print(cap, floor)
    return model_transform(df, inputCol, outputCol, cap, floor)


# Bound column "one" to one standard deviation either side of its mean.
estimator_fit(
    df,
    inputCol=F.col("one"),
    outputCol="new_column",
    boundary=1.0,
).show()

10.760952285695232 1.2390477143047667
+---+---+-----+----+------------------+
|one|two|three|four|        new_column|
+---+---+-----+----+------------------+
|  1|  2|    4|   1|1.2390477143047667|
|  3|  6|    5|   4|               3.0|
|  9|  4| NULL|   9|               9.0|
| 11| 17| NULL|   3|10.760952285695232|
+---+---+-----+----+------------------+



In [56]:
class _ExtremeValueCapperParams(HasInputCol, HasOutputCol):
    """Param mixin for the extreme-value capper: ``inputCol``, ``outputCol``
    and ``boundary`` (default 0.0)."""

    boundary = Param(
        Params._dummy(),
        "boundary",
        "Multiple of standard deviation for the cap and floor. Default = 0.0",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self, *args):
        """Initialise the parent Params classes, then register the default boundary."""
        super().__init__(*args)
        self._setDefault(boundary=0.0)

    # Getter only: the boundary is not meant to be settable from here because
    # this mixin is also used by the model; only the estimator should offer
    # a setBoundary.
    def getBoundary(self):
        """Return the boundary that was set, or its default."""
        boundary_param = self.boundary
        return self.getOrDefault(boundary_param)

    # No transform is defined on this class, so it cannot be used to
    # transform data by itself.


# Instantiate the params mixin on its own; the cell output is its repr.
_ExtremeValueCapperParams()

_ExtremeValueCapperParams_eb73005c60b7

In [101]:

from pyspark.ml import Model

class ExtremeValueCapperModel(Model, _ExtremeValueCapperParams):
    """Model that writes a bounded copy of ``inputCol`` into ``outputCol``.

    Values greater than ``cap`` are replaced by ``cap``; otherwise values
    less than ``floor`` are replaced by ``floor``; all other values are
    copied unchanged.

    Params:
        inputCol: Name of the column to read. Must be set before
            ``transform`` is called.
        outputCol: Name of the column to write.
        cap: Upper bound, converted with ``TypeConverters.toFloat``.
        floor: Lower bound, converted with ``TypeConverters.toFloat``.
    """

    cap = Param(
        Params._dummy(),
        "cap",
        "Upper bound of the values inputCol can take."
        " Values Will be capped to this value.",
        typeConverter=TypeConverters.toFloat
    )

    floor = Param(
        Params._dummy(),
        "floor",
        "Lower bound of the values inputCol can take."
        " Values Will be floored to this value.",
        typeConverter=TypeConverters.toFloat
    )

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, cap=None, floor=None):
        """Create the model; only explicitly passed keywords are set."""
        super().__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, cap=None, floor=None):
        """Set the explicitly passed params and return ``self``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setCap(self, new_cap):
        """Set ``cap`` and return ``self``."""
        return self.setParams(cap=new_cap)

    def setFloor(self, new_floor):
        """Set ``floor`` and return ``self``."""
        return self.setParams(floor=new_floor)

    def setInputCol(self, new_inputCol):
        """Set ``inputCol`` and return ``self``."""
        return self.setParams(inputCol=new_inputCol)

    def setOutputCol(self, new_outputCol):
        """Set ``outputCol`` and return ``self``."""
        return self.setParams(outputCol=new_outputCol)

    def getCap(self):
        """Return the configured cap."""
        return self.getOrDefault(self.cap)

    def getFloor(self):
        """Return the configured floor."""
        return self.getOrDefault(self.floor)

    def _transform(self, dataset):
        """Add the bounded ``outputCol`` to ``dataset``.

        Raises:
            ValueError: If ``inputCol`` has not been set.
        """
        if not self.isSet("inputCol"):
            raise ValueError("No input column set for the ExtremeValueCapper model.")

        # Up to this point the column is only a name; here it becomes a
        # Column object. It is resolved against the frame being transformed
        # (`dataset`), not against a notebook-level variable.
        input_column = dataset[self.getInputCol()]
        output_column = self.getOutputCol()

        cap_value = self.getCap()
        floor_value = self.getFloor()

        return dataset.withColumn(
            output_column,
            F.when(input_column > cap_value, cap_value).
            when(input_column < floor_value, floor_value).
            otherwise(input_column)
        )
        

# With cap == floor == 0.9, every value of "one" in the demo frame exceeds
# the cap and is replaced by 0.9.
capper_demo_output = ExtremeValueCapperModel(
    inputCol="one",
    outputCol="new_column",
    cap=0.9,
    floor=0.9,
).transform(df)
capper_demo_output.show()
    

+---+---+-----+----+----------+
|one|two|three|four|new_column|
+---+---+-----+----+----------+
|  1|  2|    4|   1|       0.9|
|  3|  6|    5|   4|       0.9|
|  9|  4| NULL|   9|       0.9|
| 11| 17| NULL|   3|       0.9|
+---+---+-----+----+----------+

