# Transformers

In [1]:
from pyspark.sql import SparkSession
# Build the SparkSession used by every cell below (getOrCreate reuses a
# session that is already running instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("asheesh")
    .getOrCreate()
)

22/03/28 18:12:26 WARN Utils: Your hostname, a-Lenovo-Legion-Y530-15ICH resolves to a loopback address: 127.0.1.1; using 192.168.1.4 instead (on interface wlp7s0)
22/03/28 18:12:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/28 18:12:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/28 18:12:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/28 18:12:27 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### 1) StringIndexer

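StringIndexer encodes a string column of labels to a column of label indices. By default the indices are ordered by label frequency, so the most frequent label gets index 0 (in the example below `a` → 0.0, `c` → 1.0, `b` → 2.0).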

 id | category | categoryIndex
----|----------|---------------
 0  | a        | 0.0
 1  | b        | 2.0
 2  | c        | 1.0
 3  | a        | 0.0
 4  | a        | 0.0
 5  | c        | 1.0

In [2]:
from pyspark.ml.feature import StringIndexer

# Six (id, category) rows: "a" occurs three times, "c" twice and "b" once.
category_rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
df = spark.createDataFrame(category_rows, ["id", "category"])

# Fit the indexer on the data, then append the learned index as "categoryIndex".
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)
indexed.show()

                                                                                

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



### 2) IndexToString

Symmetrically to StringIndexer, IndexToString maps a column of label indices back to a column containing the original labels as strings. A common use case is to produce indices from labels with StringIndexer, train a model with those indices and retrieve the original labels from the column of predicted indices with IndexToString.

 id | categoryIndex | originalCategory
----|---------------|-----------------
 0  | 0.0           | a
 1  | 2.0           | b
 2  | 1.0           | c
 3  | 0.0           | a
 4  | 0.0           | a
 5  | 1.0           | c

In [3]:
from pyspark.ml.feature import IndexToString, StringIndexer

# Reverse the StringIndexer step: map the numeric indices in `indexed`
# back to their string labels.
converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)

summary_template = ("Transformed indexed column '%s' back to original string column '%s' using "
                    "labels in metadata")
column_names = (converter.getInputCol(), converter.getOutputCol())
print(summary_template % column_names)

restored_columns = ["id", "categoryIndex", "originalCategory"]
converted.select(*restored_columns).show()


Transformed indexed column 'categoryIndex' back to original string column 'originalCategory' using labels in metadata
+---+-------------+----------------+
| id|categoryIndex|originalCategory|
+---+-------------+----------------+
|  0|          0.0|               a|
|  1|          2.0|               b|
|  2|          1.0|               c|
|  3|          0.0|               a|
|  4|          0.0|               a|
|  5|          1.0|               c|
+---+-------------+----------------+



### 3) Normalizer

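Normalizer is a Transformer which transforms a dataset of Vector rows, normalizing each Vector to have unit norm. The parameter `p` selects which p-norm is used (`p = 2` by default); the cells below show `p = 1` and `p = 2`.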

In [5]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

# Three example rows, each an id plus a dense 3-element feature vector.
feature_rows = [
    (0, Vectors.dense([1.0, 0.5, -1.0])),
    (1, Vectors.dense([2.0, 1.0, 1.0])),
    (2, Vectors.dense([4.0, 10.0, 2.0])),
]
dataFrame = spark.createDataFrame(feature_rows, ["id", "features"])

# p=1.0 selects the $L^1$ norm: each vector is divided by the sum of the
# absolute values of its components.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
print("Normalized using L^1 norm")
l1NormData.show()

Normalized using L^1 norm
+---+--------------+------------------+
| id|      features|      normFeatures|
+---+--------------+------------------+
|  0|[1.0,0.5,-1.0]|    [0.4,0.2,-0.4]|
|  1| [2.0,1.0,1.0]|   [0.5,0.25,0.25]|
|  2|[4.0,10.0,2.0]|[0.25,0.625,0.125]|
+---+--------------+------------------+



In [8]:
# L2 normalization: p=2.0 rescales every vector to unit Euclidean length.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)
# Keep the result under its own name so the L^1 result computed earlier is
# not silently overwritten with L^2 data, and label the output correctly.
l2NormData = normalizer.transform(dataFrame)
print("Normalized using L^2 norm")
l2NormData.show(truncate=False)

Normalized using L^1 norm
+---+--------------+-----------------------------------------------------------+
|id |features      |normFeatures                                               |
+---+--------------+-----------------------------------------------------------+
|0  |[1.0,0.5,-1.0]|[0.6666666666666666,0.3333333333333333,-0.6666666666666666]|
|1  |[2.0,1.0,1.0] |[0.8164965809277261,0.4082482904638631,0.4082482904638631] |
|2  |[4.0,10.0,2.0]|[0.3651483716701107,0.9128709291752769,0.18257418583505536]|
+---+--------------+-----------------------------------------------------------+



### 4) StandardScaler

StandardScaler transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation and/or zero mean. It takes parameters:

    a) withStd: True by default. Scales the data to unit standard deviation.
    b) withMean: False by default. Centers the data with mean before scaling. It will build a dense output, so take care when applying to sparse input.


In [14]:
from pyspark.ml.feature import StandardScaler
from pyspark.mllib.random import RandomRDDs
from pyspark.ml.feature import *

# Named explicitly so this cell does not depend on the wildcard import for VectorAssembler.
from pyspark.ml.feature import VectorAssembler


def generate_random_uniform_df(nrows=10, ncols=10, numPartitions=10, seed=None):
    """Return a DataFrame filled with i.i.d. samples drawn from U(0.0, 1.0).

    Args:
        nrows: number of rows to generate.
        ncols: number of values (columns) per row.
        numPartitions: number of partitions of the generated RDD.
        seed: optional random seed. Pass a fixed value to get the same data
            on every run; ``None`` (the default) keeps the unseeded behaviour.

    Returns:
        A DataFrame with one column per vector component, named
        automatically by ``toDF()``.
    """
    random_vectors = RandomRDDs.uniformVectorRDD(
        spark.sparkContext, nrows, ncols, numPartitions, seed=seed)
    # Each generated vector becomes a plain Python list so toDF() can infer a row schema.
    return random_vectors.map(lambda vector: vector.tolist()).toDF()


SEED = 42  # fixed so the table shown below is reproducible across runs

random_df = generate_random_uniform_df(nrows=1000, ncols=3, numPartitions=100, seed=SEED)

# Pack the individual numeric columns into the single vector column that
# StandardScaler reads from.
vectorAssembler = VectorAssembler(inputCols=random_df.columns, outputCol="rawFeatures")
assembled_df = vectorAssembler.transform(random_df)

scaler = StandardScaler(inputCol="rawFeatures", outputCol="scaledFeatures")

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(assembled_df)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(assembled_df)
scaledData.select("rawFeatures", "scaledFeatures").show(truncate=False)

+--------------------------------------------------------------+------------------------------------------------------------+
|rawFeatures                                                   |scaledFeatures                                              |
+--------------------------------------------------------------+------------------------------------------------------------+
|[0.5286404020372594,0.30841392119746824,0.9018507460856643]   |[1.7683005623340662,1.0875889439264128,3.109501342010325]   |
|[0.5139061715002577,0.4338516311180267,0.097096322497359]     |[1.71901460529459,1.5299317082586987,0.3347795036154757]    |
|[0.3035091275197125,0.9916321370006804,0.7904799776736032]    |[1.0152371230014354,3.496885433889147,2.725504815599368]    |
|[0.5538877734705204,0.7039511562083257,0.1786264357281525]    |[1.852752944200532,2.4824090027576715,0.6158880990296943]   |
|[0.7302572874882337,0.20408634026822958,0.3847373028643034]   |[2.4427084406291364,0.7196888078862519,1.3265400785778

### 5) Tokenizer

In [16]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Three sample sentences; the last one is comma-separated with no spaces.
sentence_rows = [
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat"),
]
sentenceDataFrame = spark.createDataFrame(sentence_rows, ["id", "sentence"])

# Tokenizer lowercases the text and splits on whitespace only, so the
# comma-separated sentence comes out as a single token.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words").show(truncate=False)

+-----------------------------------+------------------------------------------+
|sentence                           |words                                     |
+-----------------------------------+------------------------------------------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |
+-----------------------------------+------------------------------------------+



In [18]:
# Using a regex: the pattern \W matches any non-word character, so commas
# act as separators as well as spaces.
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")

# `tokenized` (from the plain Tokenizer) is deliberately not recomputed here:
# it already exists and this cell does not use it.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words").show(truncate=False)

+-----------------------------------+------------------------------------------+
|sentence                           |words                                     |
+-----------------------------------+------------------------------------------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
|Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |
+-----------------------------------+------------------------------------------+



### 6) StopWordsRemover

Stop words are words which should be excluded from the input, typically because the words appear frequently and don’t carry as much meaning.

 id | raw                          | filtered
----|------------------------------|---------------------
 0  | [I, saw, the, red, balloon]  | [saw, red, balloon]
 1  | [Mary, had, a, little, lamb] | [Mary, little, lamb]

In [25]:
from pyspark.ml.feature import StopWordsRemover

# `tokenized` is the whitespace-Tokenizer output, so the comma-separated
# sentence is a single token and passes through the filter unchanged.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filtered_words = remover.transform(tokenized)
remover_df = filtered_words.select("sentence", "filtered")
remover_df.show(truncate=False)

+-----------------------------------+-------------------------------------+
|sentence                           |filtered                             |
+-----------------------------------+-------------------------------------+
|Hi I heard about Spark             |[hi, heard, spark]                   |
|I wish Java could use case classes |[wish, java, use, case, classes]     |
|Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]|
+-----------------------------------+-------------------------------------+



### 7) n-gram

In [33]:
from pyspark.ml.feature import NGram

# Pre-tokenized sentences: each row is an id plus its list of words.
word_rows = [
    (0, ["Hi", "I", "heard", "about", "Spark"]),
    (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
    (2, ["Logistic", "regression", "models", "are", "neat"]),
]
wordDataFrame = spark.createDataFrame(word_rows, ["id", "words"])

# n=2 yields bigrams: every pair of adjacent words joined by a space.
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("words", "ngrams").show(truncate=False)

+------------------------------------------+------------------------------------------------------------------+
|words                                     |ngrams                                                            |
+------------------------------------------+------------------------------------------------------------------+
|[Hi, I, heard, about, Spark]              |[Hi I, I heard, heard about, about Spark]                         |
|[I, wish, Java, could, use, case, classes]|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic, regression, models, are, neat] |[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------+------------------------------------------------------------------+

