# Custom Transformations

In [9]:
import pyspark.sql.functions as F
from pyspark.ml import Pipeline, Transformer
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import math
# Reuse the active Spark session if one exists; otherwise create one
# registered under the application name "asheesh".
spark = (
    SparkSession.builder
    .appName("asheesh")
    .getOrCreate()
)

In [2]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

# Three-row toy frame: an integer id plus a dense 3-element feature vector.
df = spark.createDataFrame(
    [
        (0, Vectors.dense([1.0, 0.5, -1.0])),
        (1, Vectors.dense([2.0, 1.0, 1.0])),
        (2, Vectors.dense([4.0, 10.0, 2.0])),
    ],
    schema=["id", "features"],
)

In [3]:
class Negative_Entropy(Transformer):
    """Append a shifted ``-s * log(s)`` score computed from a vector column.

    For every row, each element ``v`` of the vector stored in ``inputCol`` is
    shifted to ``s = v + offset`` and the row score ``sum(-s * log(s))`` is
    written to ``outputCol`` as a ``FloatType`` column.

    Parameters
    ----------
    inputCol : str
        Name of the vector column to read (default ``"features"``).
    outputCol : str
        Name of the column to add (default ``"negative_entropy"``).
    offset : float
        Constant added to every element before the logarithm is taken
        (default ``1.05``). The logarithm is only defined while
        ``v + offset > 0``, so the offset must exceed the magnitude of the
        most negative element expected in the input.
    """

    def __init__(self, inputCol="features", outputCol="negative_entropy", offset=1.05):
        super(Negative_Entropy, self).__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol
        self.offset = offset

    def _transform(self, df: DataFrame) -> DataFrame:
        """Return ``df`` with the score column ``outputCol`` added.

        Rows whose input vector is null get a null score.

        Raises
        ------
        ValueError
            Inside the UDF (surfaced when the column is evaluated) if any
            element satisfies ``element + offset <= 0``.
        """
        # Bind to a local so the UDF closure captures a plain float rather
        # than the whole transformer instance.
        offset = float(self.offset)

        def _score(vector):
            # Propagate nulls instead of failing on ``None.toArray()``.
            if vector is None:
                return None
            shifted_values = [value + offset for value in vector.toArray().tolist()]
            for shifted in shifted_values:
                if shifted <= 0.0:
                    raise ValueError(
                        "Negative_Entropy: element %r + offset %r is not positive; "
                        "log is undefined" % (shifted - offset, offset)
                    )
            # Start from 0.0 so that an empty vector still yields a Python
            # float, matching the declared FloatType.
            return sum((-s * math.log(s) for s in shifted_values), 0.0)

        score_udf = F.udf(_score, FloatType())
        return df.withColumn(self.outputCol, score_udf(df[self.inputCol]))

In [4]:
# Rescale every "features" vector to unit L2 norm (p=2.0) and store the
# result in a new "normFeatures" column.
normalizer = Normalizer(
    inputCol="features",
    outputCol="normFeatures",
    p=2.0,
)
df_norm = normalizer.transform(df)
df_norm.show()

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

+---+--------------+--------------------+
| id|      features|        normFeatures|
+---+--------------+--------------------+
|  0|[1.0,0.5,-1.0]|[0.66666666666666...|
|  1| [2.0,1.0,1.0]|[0.81649658092772...|
|  2|[4.0,10.0,2.0]|[0.36514837167011...|
+---+--------------+--------------------+



In [5]:
# Apply the custom transformer to the normalised vectors produced above.
negative_entropy = Negative_Entropy(
    inputCol="normFeatures",
    outputCol="negative_entropy",
)
df_entropy = negative_entropy.transform(df_norm)
df_entropy.show()

+---+--------------+--------------------+----------------+
| id|      features|        normFeatures|negative_entropy|
+---+--------------+--------------------+----------------+
|  0|[1.0,0.5,-1.0]|[0.66666666666666...|      -1.0089868|
|  1| [2.0,1.0,1.0]|[0.81649658092772...|      -2.2650192|
|  2|[4.0,10.0,2.0]|[0.36514837167011...|      -2.0729015|
+---+--------------+--------------------+----------------+



## Using Pipeline

In [6]:
# Chain both stages: normalisation first, then the custom score, which reads
# the normaliser's output column.
pipeline = Pipeline(
    stages=[
        normalizer,
        negative_entropy,
    ]
)


In [7]:
# Fit the pipeline on the raw frame; `a` holds the fitted pipeline model.
a = pipeline.fit(df)

In [8]:
# Run the fitted pipeline on the raw frame and display the result; it adds
# the same "normFeatures" and "negative_entropy" columns as the manual
# two-step run earlier in the notebook.
a.transform(df).show()

+---+--------------+--------------------+----------------+
| id|      features|        normFeatures|negative_entropy|
+---+--------------+--------------------+----------------+
|  0|[1.0,0.5,-1.0]|[0.66666666666666...|      -1.0089868|
|  1| [2.0,1.0,1.0]|[0.81649658092772...|      -2.2650192|
|  2|[4.0,10.0,2.0]|[0.36514837167011...|      -2.0729015|
+---+--------------+--------------------+----------------+

