In [1]:
from pyspark.sql import SparkSession
spark = SparkSession \
   .builder \
   .master("local[2]") \
   .appName("Preprocessing and Feature Engineering") \
   .getOrCreate()

In [2]:
from pyspark.sql import Row
sc = spark.sparkContext
df = sc.parallelize([
    Row(name="Alice", age=30, height=None),
    Row(name="Bob", age=None, height=165),
    Row(name=None, age=25,  height=180)
]).toDF()

In [3]:
df.show()

+----+------+-----+
| age|height| name|
+----+------+-----+
|  30|  null|Alice|
|null|   165|  Bob|
|  25|   180| null|
+----+------+-----+



In [4]:
df.replace(30, 26).show()

+----+------+-----+
| age|height| name|
+----+------+-----+
|  26|  null|Alice|
|null|   165|  Bob|
|  25|   180| null|
+----+------+-----+



In [5]:
df.na.replace(25, 29).show()

+----+------+-----+
| age|height| name|
+----+------+-----+
|  30|  null|Alice|
|null|   165|  Bob|
|  29|   180| null|
+----+------+-----+



In [6]:
df1 = sc.parallelize([
    Row(name="Alice", age=30, height=170),
    Row(name="Bob", age=28, height=165),
    Row(name="Jilly", age=25,  height=180)
]).toDF()
df1.show()

+---+------+-----+
|age|height| name|
+---+------+-----+
| 30|   170|Alice|
| 28|   165|  Bob|
| 25|   180|Jilly|
+---+------+-----+



In [7]:
df1.na.replace(30, 26).show()

+---+------+-----+
|age|height| name|
+---+------+-----+
| 26|   170|Alice|
| 28|   165|  Bob|
| 25|   180|Jilly|
+---+------+-----+



In [8]:
df.join(df1, "name").show()

+-----+----+------+---+------+
| name| age|height|age|height|
+-----+----+------+---+------+
|  Bob|null|   165| 28|   165|
|Alice|  30|  null| 30|   170|
+-----+----+------+---+------+



In [9]:
df1.join(df, "name").show()

+-----+---+------+----+------+
| name|age|height| age|height|
+-----+---+------+----+------+
|  Bob| 28|   165|null|   165|
|Alice| 30|   170|  30|  null|
+-----+---+------+----+------+



In [10]:
df2 = df
df2.join(df1, "name").show()

+-----+----+------+---+------+
| name| age|height|age|height|
+-----+----+------+---+------+
|  Bob|null|   165| 28|   165|
|Alice|  30|  null| 30|   170|
+-----+----+------+---+------+



In [11]:
df.join(df, "name").show()

+-----+----+------+----+------+
| name| age|height| age|height|
+-----+----+------+----+------+
|  Bob|null|   165|null|   165|
|Alice|  30|  null|  30|  null|
+-----+----+------+----+------+



In [12]:
df.select((df["height"]-1).alias("new_height")).show()

+----------+
|new_height|
+----------+
|      null|
|       164|
|       179|
+----------+



In [13]:
from pyspark.sql.functions import coalesce, col
df.select(coalesce(col('name'), col('age'))).show()

+-------------------+
|coalesce(name, age)|
+-------------------+
|              Alice|
|                Bob|
|                 25|
+-------------------+



In [14]:
df.select(coalesce(col('name'))).show()

+--------------+
|coalesce(name)|
+--------------+
|         Alice|
|           Bob|
|          null|
+--------------+



In [15]:
df = df.fillna(0)

In [16]:
df.show()

+---+------+-----+
|age|height| name|
+---+------+-----+
| 30|     0|Alice|
|  0|   165|  Bob|
| 25|   180| null|
+---+------+-----+



In [17]:
df = df.fillna("John")

In [18]:
df.show()

+---+------+-----+
|age|height| name|
+---+------+-----+
| 30|     0|Alice|
|  0|   165|  Bob|
| 25|   180| John|
+---+------+-----+



In [19]:
from pyspark.sql.functions import create_map, col
df.select(create_map(col("name"), col("age"))).alias("complex_map").show()

+--------------+
|map(name, age)|
+--------------+
| [Alice -> 30]|
|    [Bob -> 0]|
|  [John -> 25]|
+--------------+



In [20]:
from pyspark.sql.functions import explode
df.select(create_map(col("name"), col("age")).alias("complex_map")).selectExpr("explode(complex_map)").show()

+-----+-----+
|  key|value|
+-----+-----+
|Alice|   30|
|  Bob|    0|
| John|   25|
+-----+-----+



## Formating model according to use case
   - In the case of most classification and regression algorithms, you want to get your data into a column of type Double to represent the label and a column of type Vector(either dense or sparse) to represent the features.
   - In the case of recommendation, you want to get your data into a column of users, a column of items (say movies or books) and a column of ratings.
   - In the case of unsupervised learning, a column of type Vector (either dense or sparse) is needed to represent the features.
   - In the case of graph analytics, you will want a dataframe of vertices and a dataframe of edges.
### transformers -> pyspark.ml.feature
   - Tokenizer is an example of transformer. It tokenizes a string, splitting on a given character

In [21]:
sales = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("../data/retail-data/by-day/*.csv")\
  .where("Description IS NOT NULL")
sales.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
|   580538|    21544|SKULLS  WATER TRA...|      48|2011-12-05 08:38:00|     0.85|   14075.0|United Kingdom|
|   580538|    23126|FELTCRA

In [22]:
sales = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("../data/retail-data/by-day/*.csv")\
  .coalesce(5)\
  .where("Description IS NOT NULL")
sales.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
|   580538|    21544|SKULLS  WATER TRA...|      48|2011-12-05 08:38:00|     0.85|   14075.0|United Kingdom|
|   580538|    23126|FELTCRA

In [23]:
fakeIntDF = spark.read.parquet("../data/simple-ml-integers")
simpleDF = spark.read.json("../data/simple-ml")
scaleDF = spark.read.parquet("../data/simple-ml-scaling/")

In [24]:
sales.cache()
sales.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
|   580538|    21544|SKULLS  WATER TRA...|      48|2011-12-05 08:38:00|     0.85|   14075.0|United Kingdom|
|   580538|    23126|FELTCRA

### Estimator
 - An estimator is necessary when a transformation you would like to perform must be initialized with data or information about the input column.

In [25]:
from pyspark.ml.feature import Tokenizer, StandardScaler
ss = StandardScaler(inputCol="features")
ss.fit(scaleDF).transform(scaleDF).show(truncate=False)

+---+--------------+------------------------------------------------------------+
|id |features      |StandardScaler_2b31baeb0d76__output                         |
+---+--------------+------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[1.1952286093343936,0.02337622911060922,-0.5976143046671968]|
|1  |[2.0,1.1,1.0] |[2.390457218668787,0.2571385202167014,0.5976143046671968]   |
|0  |[1.0,0.1,-1.0]|[1.1952286093343936,0.02337622911060922,-0.5976143046671968]|
|1  |[2.0,1.1,1.0] |[2.390457218668787,0.2571385202167014,0.5976143046671968]   |
|1  |[3.0,10.1,3.0]|[3.5856858280031805,2.3609991401715313,1.7928429140015902]  |
+---+--------------+------------------------------------------------------------+



In [26]:
scaleDF.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  1|[3.0,10.1,3.0]|
+---+--------------+



In [31]:
Tokenizer(inputCol="Description").transform(sales.select("Description")).show(truncate=False)

+-----------------------------------+------------------------------------------+
|Description                        |Tokenizer_d90ddc64c435__output            |
+-----------------------------------+------------------------------------------+
|RABBIT NIGHT LIGHT                 |[rabbit, night, light]                    |
|DOUGHNUT LIP GLOSS                 |[doughnut, lip, gloss]                    |
|12 MESSAGE CARDS WITH ENVELOPES    |[12, message, cards, with, envelopes]     |
|BLUE HARMONICA IN BOX              |[blue, harmonica, in, box]                |
|GUMBALL COAT RACK                  |[gumball, coat, rack]                     |
|SKULLS  WATER TRANSFER TATTOOS     |[skulls, , water, transfer, tattoos]      |
|FELTCRAFT GIRL AMELIE KIT          |[feltcraft, girl, amelie, kit]            |
|CAMOUFLAGE LED TORCH               |[camouflage, led, torch]                  |
|WHITE SKULL HOT WATER BOTTLE       |[white, skull, hot, water, bottle]        |
|ENGLISH ROSE HOT WATER BOTT

In [34]:
from pyspark.sql.functions import split, col
sales.select(split(col("Description"), " ")).show(truncate=False)

+------------------------------------------+
|split(Description,  )                     |
+------------------------------------------+
|[RABBIT, NIGHT, LIGHT]                    |
|[DOUGHNUT, LIP, GLOSS, ]                  |
|[12, MESSAGE, CARDS, WITH, ENVELOPES]     |
|[BLUE, HARMONICA, IN, BOX, ]              |
|[GUMBALL, COAT, RACK]                     |
|[SKULLS, , WATER, TRANSFER, TATTOOS, ]    |
|[FELTCRAFT, GIRL, AMELIE, KIT]            |
|[CAMOUFLAGE, LED, TORCH]                  |
|[WHITE, SKULL, HOT, WATER, BOTTLE, ]      |
|[ENGLISH, ROSE, HOT, WATER, BOTTLE]       |
|[HOT, WATER, BOTTLE, KEEP, CALM]          |
|[SCOTTIE, DOG, HOT, WATER, BOTTLE]        |
|[ROSE, CARAVAN, DOORSTOP]                 |
|[GINGHAM, HEART, , DOORSTOP, RED]         |
|[STORAGE, TIN, VINTAGE, LEAF]             |
|[SET, OF, 4, KNICK, KNACK, TINS, POPPIES] |
|[POPCORN, HOLDER]                         |
|[GROW, A, FLYTRAP, OR, SUNFLOWER, IN, TIN]|
|[AIRLINE, BAG, VINTAGE, WORLD, CHAMPION, ]|
|[AIRLINE,

### High-Level Transformers
 - High level transformers, Rformula allows to concisely specify a number of transformations is one

In [27]:
simpleDF.show(5)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|green|good|    15| 38.97187133755819|
|green|good|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 5 rows



In [28]:
from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~.+ color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show(3, False)

+-----+----+------+------------------+--------------------------------------------------------------------+-----+
|color|lab |value1|value2            |features                                                            |label|
+-----+----+------+------------------+--------------------------------------------------------------------+-----+
|green|good|1     |14.386294994851129|(10,[1,2,3,5,8],[1.0,1.0,14.386294994851129,1.0,14.386294994851129])|1.0  |
|blue |bad |8     |14.386294994851129|(10,[2,3,6,9],[8.0,14.386294994851129,8.0,14.386294994851129])      |0.0  |
|blue |bad |12    |14.386294994851129|(10,[2,3,6,9],[12.0,14.386294994851129,12.0,14.386294994851129])    |0.0  |
+-----+----+------+------------------+--------------------------------------------------------------------+-----+
only showing top 3 rows



In [32]:
from pyspark.ml.feature import SQLTransformer
basicTransformation = SQLTransformer() \
    .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
    ORDER BY CustomerID
    """)
basicTransformation.transform(sales).show()

+-------------+--------+----------+
|sum(Quantity)|count(1)|CustomerID|
+-------------+--------+----------+
|       283171|  133626|      null|
|            0|       2|   12346.0|
|         2458|     182|   12347.0|
|         2341|      31|   12348.0|
|          631|      73|   12349.0|
|          197|      17|   12350.0|
|          470|      95|   12352.0|
|           20|       4|   12353.0|
|          530|      58|   12354.0|
|          240|      13|   12355.0|
|         1591|      59|   12356.0|
|         2708|     131|   12357.0|
|          248|      19|   12358.0|
|         1612|     254|   12359.0|
|         1165|     129|   12360.0|
|           91|      10|   12361.0|
|         2212|     274|   12362.0|
|          408|      23|   12363.0|
|         1506|      85|   12364.0|
|          173|      23|   12365.0|
+-------------+--------+----------+
only showing top 20 rows



### VectorAssembler
 - it will be used in nearly every single pipeline you generate. It helps concatenate all features into one big vector you can then pass into an estimator.

In [33]:
fakeIntDF.show(3)

+----+----+----+
|int1|int2|int3|
+----+----+----+
|   7|   8|   9|
|   4|   5|   6|
|   1|   2|   3|
+----+----+----+



In [34]:
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show(5)

+----+----+----+------------------------------------+
|int1|int2|int3|VectorAssembler_bf236edce483__output|
+----+----+----+------------------------------------+
|   7|   8|   9|                       [7.0,8.0,9.0]|
|   4|   5|   6|                       [4.0,5.0,6.0]|
|   1|   2|   3|                       [1.0,2.0,3.0]|
+----+----+----+------------------------------------+



### Working with continuous variables
 - Continuous features are values from -Infinite to +/Infinite
    - Convert continuous features into categorical features vis a process called bucketing.
    - can be scaled and normalized features.
    - Transformers will only work on Double Types.

#### The following three conditions are to be met in bucketing split
  - The minimum value in your splits array must be less than the minimum value in dataFrame.
  - The maximum value in your splits array must be greater than the maximum value in dataFrame.
  - You need to specify at a minimum three values in the split array, which creates two buckets.