# Ch25 - Preprocessing and Feature Engineering

In [2]:
# Retail data: read every by-day CSV with a header row and an inferred schema,
# reduce to 5 partitions, then keep only rows that have a Description.
sales = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../pyspark-training/data/The-Definitive-Guide/retail-data/by-day/*.csv")
    .coalesce(5)
    .where("Description IS NOT NULL")
)

# Small sample datasets used by the transformer examples further down.
fakeIntDF = spark.read.parquet(
    "../pyspark-training/data/The-Definitive-Guide/simple-ml-integers"
)
simpleDF = spark.read.json(
    "../pyspark-training/data/The-Definitive-Guide/simple-ml"
)
scaleDF = spark.read.parquet(
    "../pyspark-training/data/The-Definitive-Guide/simple-ml-scaling"
)

In [3]:
# Mark the retail frame for caching and print a preview of its rows.
sales.cache().show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

## High-Level Transformers
### RFormula

In [4]:
from pyspark.ml.feature import RFormula

In [5]:
# Formula: model `lab` from every other column plus the color:value1 and
# color:value2 interaction terms.
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
fittedRF = supervised.fit(simpleDF)
# The transformed frame gains the `features` and `label` columns.
fittedRF.transform(simpleDF).show()

+-----+----+------+------------------+--------------------+-----+
|color| lab|value1|            value2|            features|label|
+-----+----+------+------------------+--------------------+-----+
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue| bad|    12|14.386294994851129|(10,[2,3,6,9],[12...|  0.0|
|green|good|    15| 38.97187133755819|(10,[1,2,3,5,8],[...|  1.0|
|green|good|    12|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
|green| bad|    16|14.386294994851129|(10,[1,2,3,5,8],[...|  0.0|
|  red|good|    35|14.386294994851129|(10,[0,2,3,4,7],[...|  1.0|
|  red| bad|     1| 38.97187133755819|(10,[0,2,3,4,7],[...|  0.0|
|  red| bad|     2|14.386294994851129|(10,[0,2,3,4,7],[...|  0.0|
|  red| bad|    16|14.386294994851129|(10,[0,2,3,4,7],[...|  0.0|
|  red|good|    45| 38.97187133755819|(10,[0,2,3,4,7],[...|  1.0|
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| ba

### SQL Transformers

In [6]:
from pyspark.ml.feature import SQLTransformer

In [8]:
# Per-customer aggregate expressed as a SQL statement; __THIS__ stands for the
# DataFrame handed to transform().
basicTransformation = SQLTransformer(
    statement="""
        SELECT sum(Quantity), count(*), CustomerID
        FROM __THIS__
        GROUP BY CustomerID
    """
)
basicTransformation.transform(sales).show()

+-------------+--------+----------+
|sum(Quantity)|count(1)|CustomerID|
+-------------+--------+----------+
|          197|      36|   15311.0|
|          301|      21|   16539.0|
|           32|       1|   15100.0|
|          449|      20|   12583.0|
|          112|       2|   15291.0|
|          260|      14|   13767.0|
|          165|       9|   17760.0|
|           72|      23|   17905.0|
|           60|       2|   17924.0|
|           53|       7|   17420.0|
|           93|       6|   16928.0|
|           86|       5|   14496.0|
|          108|      11|   13576.0|
|          544|      11|   13408.0|
|         1004|       6|   13694.0|
|         2704|    1130|      null|
|          173|      58|   17908.0|
|           24|       4|   17572.0|
|           39|       5|   16552.0|
|          102|      15|   17377.0|
+-------------+--------+----------+
only showing top 20 rows



### VectorAssembler

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# Combine the three integer columns into a single vector column and preview
# the first 3 rows. No output column is named, so the generated default is used.
va = VectorAssembler(inputCols=["int1", "int2", "int3"])
va.transform(fakeIntDF).show(n=3)

+----+----+----+------------------------------------+
|int1|int2|int3|VectorAssembler_6f59846f2c54__output|
+----+----+----+------------------------------------+
|   1|   2|   3|                       [1.0,2.0,3.0]|
|   4|   5|   6|                       [4.0,5.0,6.0]|
|   7|   8|   9|                       [7.0,8.0,9.0]|
+----+----+----+------------------------------------+



## Working with Continuous Features
### Bucketing

In [16]:
# Twenty rows (ids 0 through 19) with `id` cast to double; this is the input
# for the bucketing examples in this section.
contDF = spark.range(20).selectExpr("cast(id as double)")

In [14]:
from pyspark.ml.feature import Bucketizer

In [17]:
# Ascending split points; each `id` is mapped to the index of the interval it
# falls into (e.g. 5.0 lands in bucket 1, so the lower bound is inclusive).
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer(splits=bucketBorders, inputCol="id")
bucketer.transform(contDF).show()

+----+-------------------------------+
|  id|Bucketizer_8ea7704365f0__output|
+----+-------------------------------+
| 0.0|                            0.0|
| 1.0|                            0.0|
| 2.0|                            0.0|
| 3.0|                            0.0|
| 4.0|                            0.0|
| 5.0|                            1.0|
| 6.0|                            1.0|
| 7.0|                            1.0|
| 8.0|                            1.0|
| 9.0|                            1.0|
|10.0|                            2.0|
|11.0|                            2.0|
|12.0|                            2.0|
|13.0|                            2.0|
|14.0|                            2.0|
|15.0|                            2.0|
|16.0|                            2.0|
|17.0|                            2.0|
|18.0|                            2.0|
|19.0|                            2.0|
+----+-------------------------------+



In [18]:
from pyspark.ml.feature import QuantileDiscretizer

In [24]:
# NOTE(review): this quantile-based bucketing example is disabled and the
# notebook does not record why. Re-enabling it would rebind `bucketer` from the
# Bucketizer cell. TODO: restore it (ideally under its own name) or remove the cell.
# bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
# fittedBucketer = bucketer.fit(contDF)
# fittedBucketer.transform(contDF).show()

### StandardScaler

In [25]:
from pyspark.ml.feature import StandardScaler

In [26]:
# Fit the scaler on scaleDF, then apply the fitted model to that same frame.
scaler = StandardScaler(inputCol="features")
scalerModel = scaler.fit(scaleDF)
scalerModel.transform(scaleDF).show()

+---+--------------+-----------------------------------+
| id|      features|StandardScaler_cabf1725b926__output|
+---+--------------+-----------------------------------+
|  0|[1.0,0.1,-1.0]|               [1.19522860933439...|
|  1| [2.0,1.1,1.0]|               [2.39045721866878...|
|  0|[1.0,0.1,-1.0]|               [1.19522860933439...|
|  1| [2.0,1.1,1.0]|               [2.39045721866878...|
|  1|[3.0,10.1,3.0]|               [3.58568582800318...|
+---+--------------+-----------------------------------+



### MinMaxScaler

In [27]:
from pyspark.ml.feature import MinMaxScaler

In [29]:
# Learn per-feature min/max from scaleDF and rescale the same frame; with the
# default settings the printed values fall between 0.0 and 1.0.
minMax = MinMaxScaler(inputCol="features")
minMaxModel = minMax.fit(scaleDF)
minMaxModel.transform(scaleDF).show()

+---+--------------+---------------------------------+
| id|      features|MinMaxScaler_00afcdc279ef__output|
+---+--------------+---------------------------------+
|  0|[1.0,0.1,-1.0]|                    [0.0,0.0,0.0]|
|  1| [2.0,1.1,1.0]|                    [0.5,0.1,0.5]|
|  0|[1.0,0.1,-1.0]|                    [0.0,0.0,0.0]|
|  1| [2.0,1.1,1.0]|                    [0.5,0.1,0.5]|
|  1|[3.0,10.1,3.0]|                    [1.0,1.0,1.0]|
+---+--------------+---------------------------------+



### MaxAbsScaler

In [30]:
from pyspark.ml.feature import MaxAbsScaler

In [31]:
# Fit on scaleDF and scale the same frame by each feature's maximum absolute value.
maScaler = MaxAbsScaler(inputCol="features")
maScalerModel = maScaler.fit(scaleDF)
maScalerModel.transform(scaleDF).show()

+---+--------------+---------------------------------+
| id|      features|MaxAbsScaler_2e6fdf33a809__output|
+---+--------------+---------------------------------+
|  0|[1.0,0.1,-1.0]|             [0.33333333333333...|
|  1| [2.0,1.1,1.0]|             [0.66666666666666...|
|  0|[1.0,0.1,-1.0]|             [0.33333333333333...|
|  1| [2.0,1.1,1.0]|             [0.66666666666666...|
|  1|[3.0,10.1,3.0]|                    [1.0,1.0,1.0]|
+---+--------------+---------------------------------+



## Working with Categorical Features
### StringIndexer

In [32]:
from pyspark.ml.feature import StringIndexer

In [38]:
# Encode the string column `lab` as a numeric index column named `labeling`.
lblIndxr = StringIndexer(inputCol="lab", outputCol="labeling")
lblIndxrModel = lblIndxr.fit(simpleDF)
# idxRes is reused by the IndexToString example.
idxRes = lblIndxrModel.transform(simpleDF)
idxRes.show()

+-----+----+------+------------------+--------+
|color| lab|value1|            value2|labeling|
+-----+----+------+------------------+--------+
|green|good|     1|14.386294994851129|     1.0|
| blue| bad|     8|14.386294994851129|     0.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     1.0|
|green|good|    12|14.386294994851129|     1.0|
|green| bad|    16|14.386294994851129|     0.0|
|  red|good|    35|14.386294994851129|     1.0|
|  red| bad|     1| 38.97187133755819|     0.0|
|  red| bad|     2|14.386294994851129|     0.0|
|  red| bad|    16|14.386294994851129|     0.0|
|  red|good|    45| 38.97187133755819|     1.0|
|green|good|     1|14.386294994851129|     1.0|
| blue| bad|     8|14.386294994851129|     0.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     1.0|
|green|good|    12|14.386294994851129|     1.0|
|green| bad|    16|14.386294994851129|     0.0|
|  red|good|    35|14.386294994851129|  

### Converting Indexed Values Back to Text

In [35]:
from pyspark.ml.feature import IndexToString

In [40]:
# Turn the numeric `labeling` index back into its string value. No labels are
# passed explicitly; presumably they come from the column metadata written by
# StringIndexer.
IndexToString(inputCol="labeling").transform(idxRes).show()

+-----+----+------+------------------+--------+----------------------------------+
|color| lab|value1|            value2|labeling|IndexToString_7ca09f65b4c6__output|
+-----+----+------+------------------+--------+----------------------------------+
|green|good|     1|14.386294994851129|     1.0|                              good|
| blue| bad|     8|14.386294994851129|     0.0|                               bad|
| blue| bad|    12|14.386294994851129|     0.0|                               bad|
|green|good|    15| 38.97187133755819|     1.0|                              good|
|green|good|    12|14.386294994851129|     1.0|                              good|
|green| bad|    16|14.386294994851129|     0.0|                               bad|
|  red|good|    35|14.386294994851129|     1.0|                              good|
|  red| bad|     1| 38.97187133755819|     0.0|                               bad|
|  red| bad|     2|14.386294994851129|     0.0|                               bad|
|  r

### One-Hot Encoding

In [43]:
from pyspark.ml.feature import OneHotEncoder

In [46]:
# Index the `color` strings first; the encoder works on the numeric index.
# Note: this rebinds `lblIndxr` from the StringIndexer section.
lblIndxr = StringIndexer(inputCol="color", outputCol="colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))

# NOTE(review): transform() is called directly on the encoder, which matches
# the Spark 2.x transformer API that produced the output below. On Spark 3.x
# OneHotEncoder is an estimator and needs fit() first; confirm the target version.
ohe = OneHotEncoder(inputCol="colorInd")
ohe.transform(colorLab).show()

+-----+--------+----------------------------------+
|color|colorInd|OneHotEncoder_e9b35ae4530c__output|
+-----+--------+----------------------------------+
|green|     1.0|                     (2,[1],[1.0])|
| blue|     2.0|                         (2,[],[])|
| blue|     2.0|                         (2,[],[])|
|green|     1.0|                     (2,[1],[1.0])|
|green|     1.0|                     (2,[1],[1.0])|
|green|     1.0|                     (2,[1],[1.0])|
|  red|     0.0|                     (2,[0],[1.0])|
|  red|     0.0|                     (2,[0],[1.0])|
|  red|     0.0|                     (2,[0],[1.0])|
|  red|     0.0|                     (2,[0],[1.0])|
|  red|     0.0|                     (2,[0],[1.0])|
|green|     1.0|                     (2,[1],[1.0])|
| blue|     2.0|                         (2,[],[])|
| blue|     2.0|                         (2,[],[])|
|green|     1.0|                     (2,[1],[1.0])|
|green|     1.0|                     (2,[1],[1.0])|
|green|     

## Text Data Transformers 
### Tokenizing Text

In [47]:
from pyspark.ml.feature import Tokenizer

In [48]:
# Split each product description into lowercase tokens (see the output:
# "WHITE METAL LANTERN" becomes [white, metal, lantern]).
tkn = Tokenizer(inputCol="Description", outputCol="DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(n=20, truncate=False)

+-----------------------------------+------------------------------------------+
|Description                        |DescOut                                   |
+-----------------------------------+------------------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |[white, hanging, heart, t-light, holder]  |
|WHITE METAL LANTERN                |[white, metal, lantern]                   |
|CREAM CUPID HEARTS COAT HANGER     |[cream, cupid, hearts, coat, hanger]      |
|KNITTED UNION FLAG HOT WATER BOTTLE|[knitted, union, flag, hot, water, bottle]|
|RED WOOLLY HOTTIE WHITE HEART.     |[red, woolly, hottie, white, heart.]      |
|SET 7 BABUSHKA NESTING BOXES       |[set, 7, babushka, nesting, boxes]        |
|GLASS STAR FROSTED T-LIGHT HOLDER  |[glass, star, frosted, t-light, holder]   |
|HAND WARMER UNION JACK             |[hand, warmer, union, jack]               |
|HAND WARMER RED POLKA DOT          |[hand, warmer, red, polka, dot]           |
|ASSORTED COLOUR BIRD ORNAME

### Removing Common Words

In [53]:
from pyspark.ml.feature import StopWordsRemover

In [61]:
# Remove common English words from the token arrays in `DescOut`, writing the
# filtered tokens to `DescOut2`.
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = (
    StopWordsRemover()
    .setStopWords(englishStopWords)
    .setInputCol("DescOut")
    .setOutputCol("DescOut2")
)
# This cell rebinds `tokenized` to its own output. Dropping any existing
# `DescOut2` first keeps the cell re-runnable: without it, a second run fails
# because the output column already exists. drop() ignores a column that is
# not present, so the first run is unaffected.
tokenized = stops.transform(tokenized.drop("DescOut2"))
tokenized.show()

+--------------------+--------------------+--------------------+
|         Description|             DescOut|            DescOut2|
+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|[white, hanging, ...|[white, hanging, ...|
| WHITE METAL LANTERN|[white, metal, la...|[white, metal, la...|
|CREAM CUPID HEART...|[cream, cupid, he...|[cream, cupid, he...|
|KNITTED UNION FLA...|[knitted, union, ...|[knitted, union, ...|
|RED WOOLLY HOTTIE...|[red, woolly, hot...|[red, woolly, hot...|
|SET 7 BABUSHKA NE...|[set, 7, babushka...|[set, 7, babushka...|
|GLASS STAR FROSTE...|[glass, star, fro...|[glass, star, fro...|
|HAND WARMER UNION...|[hand, warmer, un...|[hand, warmer, un...|
|HAND WARMER RED P...|[hand, warmer, re...|[hand, warmer, re...|
|ASSORTED COLOUR B...|[assorted, colour...|[assorted, colour...|
|POPPY'S PLAYHOUSE...|[poppy's, playhou...|[poppy's, playhou...|
|POPPY'S PLAYHOUSE...|[poppy's, playhou...|[poppy's, playhou...|
|FELTCRAFT PRINCES...|[fe

### Converting Words into Numerical Representations

In [49]:
from pyspark.ml.feature import CountVectorizer

In [62]:
# Build a vocabulary from the stop-word-filtered tokens and count term
# occurrences per row. minDF=2 keeps only terms seen in at least two rows;
# minTF=1 is the per-row term-frequency threshold.
cv = CountVectorizer(
    inputCol="DescOut2",
    outputCol="countVec",
    minTF=1,
    minDF=2,
)
fittedCV = cv.fit(tokenized)
fittedCV.transform(tokenized).show()

+--------------------+--------------------+--------------------+--------------------+
|         Description|             DescOut|            DescOut2|            countVec|
+--------------------+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|[white, hanging, ...|[white, hanging, ...|(932,[3,9,16,22,2...|
| WHITE METAL LANTERN|[white, metal, la...|[white, metal, la...|(932,[9,17,196],[...|
|CREAM CUPID HEART...|[cream, cupid, he...|[cream, cupid, he...|(932,[46,90,127,1...|
|KNITTED UNION FLA...|[knitted, union, ...|[knitted, union, ...|(932,[12,13,14,56...|
|RED WOOLLY HOTTIE...|[red, woolly, hot...|[red, woolly, hot...|(932,[0,9,198,204...|
|SET 7 BABUSHKA NE...|[set, 7, babushka...|[set, 7, babushka...|(932,[1,38,63,288...|
|GLASS STAR FROSTE...|[glass, star, fro...|[glass, star, fro...|(932,[16,24,26,43...|
|HAND WARMER UNION...|[hand, warmer, un...|[hand, warmer, un...|(932,[20,23,56,10...|
|HAND WARMER RED P...|[hand, warmer, re...|[hand, warm

### TF-IDF
Term frequency - inverse document frequency

In [65]:
# Small TF-IDF input: ten token arrays that contain the word 'red'.
# NOTE(review): limit() is applied without an ordering, so which ten rows are
# picked is presumably not guaranteed to be stable across runs.
tfIdfIn = (
    tokenized
    .where("array_contains(DescOut, 'red')")
    .select("DescOut")
    .limit(10)
)
tfIdfIn.show(n=10, truncate=False)

+------------------------------------+
|DescOut                             |
+------------------------------------+
|[red, woolly, hottie, white, heart.]|
|[hand, warmer, red, polka, dot]     |
|[red, coat, rack, paris, fashion]   |
|[alarm, clock, bakelike, red]       |
|[set/2, red, retrospot, tea, towels]|
|[red, toadstool, led, night, light] |
|[hand, warmer, red, polka, dot]     |
|[edwardian, parasol, red]           |
|[red, woolly, hottie, white, heart.]|
|[edwardian, parasol, red]           |
+------------------------------------+



In [66]:
from pyspark.ml.feature import HashingTF, IDF
# Hash the tokens into term-frequency vectors, then weight them by inverse
# document frequency (terms must appear in at least 2 documents to get a weight).
tf = HashingTF(inputCol="DescOut", outputCol="TFOut")
idf = IDF(inputCol="TFOut", outputCol="IDFOut", minDocFreq=2)
# The same TF frame is used both to fit the IDF model and as its input.
tfOut = tf.transform(tfIdfIn)
idf.fit(tfOut).transform(tfOut).show(n=10, truncate=False)

+------------------------------------+------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
|DescOut                             |TFOut                                                             |IDFOut                                                                                                                        |
+------------------------------------+------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
|[red, woolly, hottie, white, heart.]|(262144,[30600,57341,81060,100086,195459],[1.0,1.0,1.0,1.0,1.0])  |(262144,[30600,57341,81060,100086,195459],[1.2992829841302609,1.2992829841302609,1.2992829841302609,1.2992829841302609,0.0])  |
|[hand, warmer, red, polka, dot]     |(262144,[58545,167356,178320,1

## Feature Manipulation
### PCA

In [68]:
from pyspark.ml.feature import PCA
# Project the 3-dimensional `features` vectors onto 2 principal components.
pca = PCA(k=2, inputCol="features")
pcaModel = pca.fit(scaleDF)
pcaModel.transform(scaleDF).show(n=20, truncate=False)

+---+--------------+------------------------------------------+
|id |features      |PCA_7065d5216223__output                  |
+---+--------------+------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.07137194992484153,-0.45266548881478463]|
|1  |[2.0,1.1,1.0] |[-1.6804946984073725,1.2593401322219144]  |
|0  |[1.0,0.1,-1.0]|[0.07137194992484153,-0.45266548881478463]|
|1  |[2.0,1.1,1.0] |[-1.6804946984073725,1.2593401322219144]  |
|1  |[3.0,10.1,3.0]|[-10.872398139848944,0.030962697060149758]|
+---+--------------+------------------------------------------+



### Polynomial Expansion

In [71]:
from pyspark.ml.feature import PolynomialExpansion
# Expand `features` with all degree-2 polynomial terms; no fitting is needed,
# the expansion is applied directly with transform().
pe = PolynomialExpansion(degree=2, inputCol="features")
pe.transform(scaleDF).show(n=5, truncate=False)

+---+--------------+-----------------------------------------------------------------------------------+
|id |features      |PolynomialExpansion_5578bbcd1530__output                                           |
+---+--------------+-----------------------------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[1.0,1.0,0.1,0.1,0.010000000000000002,-1.0,-1.0,-0.1,1.0]                          |
|1  |[2.0,1.1,1.0] |[2.0,4.0,1.1,2.2,1.2100000000000002,1.0,2.0,1.1,1.0]                               |
|0  |[1.0,0.1,-1.0]|[1.0,1.0,0.1,0.1,0.010000000000000002,-1.0,-1.0,-0.1,1.0]                          |
|1  |[2.0,1.1,1.0] |[2.0,4.0,1.1,2.2,1.2100000000000002,1.0,2.0,1.1,1.0]                               |
|1  |[3.0,10.1,3.0]|[3.0,9.0,10.1,30.299999999999997,102.00999999999999,3.0,9.0,30.299999999999997,9.0]|
+---+--------------+-----------------------------------------------------------------------------------+



## Feature Selection
### ChiSqSelector

In [73]:
from pyspark.ml.feature import ChiSqSelector

In [78]:
# NOTE(review): this cell rebinds `tkn`, `tokenized`, `cv` and `fittedCV` from
# the text-transformer sections. After it runs, `tokenized` no longer has the
# `DescOut2` column, so re-running those earlier cells needs their own
# Tokenizer/StopWordsRemover cells to be run again first.

# Tokenizer: keep only rows with a customer, because CustomerId is the label below.
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description", "CustomerId")).where("CustomerId IS NOT NULL")

# Count Vectorizer: vocabulary over the raw tokens (terms in at least 2 rows).
cv = CountVectorizer().setInputCol("DescOut").setOutputCol("countVec").setMinTF(1).setMinDF(2)
fittedCV = cv.fit(tokenized)

# ChiSqSelector
# `tokenized` is already restricted to non-null customers and the vectorizer
# only appends `countVec`, so the null filter is not repeated here.
prechi = fittedCV.transform(tokenized)
# Keep the 2 count-vector features selected by the chi-squared test against the label.
chisq = ChiSqSelector().setFeaturesCol("countVec").setLabelCol("CustomerId").setNumTopFeatures(2)
chisq.fit(prechi).transform(prechi).drop("customerID", "Description").show()

+--------------------+--------------------+----------------------------------+
|             DescOut|            countVec|ChiSqSelector_00ea411af33c__output|
+--------------------+--------------------+----------------------------------+
|[white, hanging, ...|(709,[4,8,16,23,2...|                         (2,[],[])|
|[white, metal, la...|(709,[8,25,159],[...|                         (2,[],[])|
|[cream, cupid, he...|(709,[49,97,109,1...|                         (2,[],[])|
|[knitted, union, ...|(709,[10,11,12,52...|                         (2,[],[])|
|[red, woolly, hot...|(709,[0,8,171,174...|                         (2,[],[])|
|[set, 7, babushka...|(709,[1,37,58,220...|                         (2,[],[])|
|[glass, star, fro...|(709,[16,24,31,42...|                         (2,[],[])|
|[hand, warmer, un...|(709,[17,18,52,90...|                         (2,[],[])|
|[hand, warmer, re...|(709,[0,17,18,298...|                         (2,[],[])|
|[assorted, colour...|(709,[53,61,88,24...|         