### Imports

In [1]:
from pyspark.ml.feature import StringIndexer, VectorIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

### Create a Spark Local Session

In [2]:
spark_session = SparkSession.builder.appName('PySparkLocalTest').getOrCreate()

In [3]:
spark_session

### Read Raw Data using PySpark Local

In [4]:
schema = StructType([StructField('id', IntegerType(), True), 
                     StructField('name', StringType(), True),
                     StructField('age', IntegerType(), True),
                     StructField('sex', StringType(), True),
                     StructField('weight', DoubleType(), True),
                     StructField('eye_color', StringType(), True)
                    ])

In [5]:
df = spark_session.read.csv('./DATA/raw.csv', header=True, schema=schema)

In [6]:
df.show()

+---+-----+---+---+------+---------+
| id| name|age|sex|weight|eye_color|
+---+-----+---+---+------+---------+
|102|bella| 34|  F|178.23|     blue|
|103| cara| 19|  F|149.21|    green|
|104|danny| 44|  M|200.33|     blue|
|105| emma| 28|  F|188.32|    black|
+---+-----+---+---+------+---------+



In [7]:
sex_indexer = StringIndexer(inputCol='sex', outputCol='indexed_sex')
df = sex_indexer.fit(df).transform(df)

In [8]:
df.show()

+---+-----+---+---+------+---------+-----------+
| id| name|age|sex|weight|eye_color|indexed_sex|
+---+-----+---+---+------+---------+-----------+
|102|bella| 34|  F|178.23|     blue|        0.0|
|103| cara| 19|  F|149.21|    green|        0.0|
|104|danny| 44|  M|200.33|     blue|        1.0|
|105| emma| 28|  F|188.32|    black|        0.0|
+---+-----+---+---+------+---------+-----------+



In [9]:
sex_encoder = OneHotEncoder(inputCol='indexed_sex', outputCol='sex_vector')

In [10]:
df = sex_encoder.transform(df)
df.show()

+---+-----+---+---+------+---------+-----------+-------------+
| id| name|age|sex|weight|eye_color|indexed_sex|   sex_vector|
+---+-----+---+---+------+---------+-----------+-------------+
|102|bella| 34|  F|178.23|     blue|        0.0|(1,[0],[1.0])|
|103| cara| 19|  F|149.21|    green|        0.0|(1,[0],[1.0])|
|104|danny| 44|  M|200.33|     blue|        1.0|    (1,[],[])|
|105| emma| 28|  F|188.32|    black|        0.0|(1,[0],[1.0])|
+---+-----+---+---+------+---------+-----------+-------------+



#### eye color transform

In [11]:
eye_color_indexer = StringIndexer(inputCol='eye_color', outputCol='indexed_eye_color')
df = eye_color_indexer.fit(df).transform(df)

<p> One-hot encoding transforms the values indexed column into a binary vector 
where at maximum one value may be 1. Since there are three values, the vector 
is of length 2 and the mapping is as follows: </p>

<ul> 
    <li> 0 => 10 
    <li> 1 => 01
    <li> 2 => 00 
</ul>

In [12]:
eye_color_encoder = OneHotEncoder(inputCol='indexed_eye_color', outputCol='eye_color_vector')

In [13]:
df = eye_color_encoder.transform(df)
df.show()

+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+
| id| name|age|sex|weight|eye_color|indexed_sex|   sex_vector|indexed_eye_color|eye_color_vector|
+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+
|102|bella| 34|  F|178.23|     blue|        0.0|(1,[0],[1.0])|              0.0|   (2,[0],[1.0])|
|103| cara| 19|  F|149.21|    green|        0.0|(1,[0],[1.0])|              1.0|   (2,[1],[1.0])|
|104|danny| 44|  M|200.33|     blue|        1.0|    (1,[],[])|              0.0|   (2,[0],[1.0])|
|105| emma| 28|  F|188.32|    black|        0.0|(1,[0],[1.0])|              2.0|       (2,[],[])|
+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+



In [14]:
# vector assembler will bring all the features to a 1D vector for us to save easily into CSV format
assembler = VectorAssembler(inputCols=['age',
                                       'weight',
                                       'sex_vector',
                                       'eye_color_vector'], 
                            outputCol='features')

In [15]:
df = assembler.transform(df)

In [16]:
df.show()

+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+--------------------+
| id| name|age|sex|weight|eye_color|indexed_sex|   sex_vector|indexed_eye_color|eye_color_vector|            features|
+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+--------------------+
|102|bella| 34|  F|178.23|     blue|        0.0|(1,[0],[1.0])|              0.0|   (2,[0],[1.0])|[34.0,178.23,1.0,...|
|103| cara| 19|  F|149.21|    green|        0.0|(1,[0],[1.0])|              1.0|   (2,[1],[1.0])|[19.0,149.21,1.0,...|
|104|danny| 44|  M|200.33|     blue|        1.0|    (1,[],[])|              0.0|   (2,[0],[1.0])|[44.0,200.33,0.0,...|
|105| emma| 28|  F|188.32|    black|        0.0|(1,[0],[1.0])|              2.0|       (2,[],[])|[28.0,188.32,1.0,...|
+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+--------------------+



In [17]:
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(df)

In [18]:
# normalize each feature to have unit standard deviation
df = scaler_model.transform(df)
df.show()

+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+--------------------+--------------------+
| id| name|age|sex|weight|eye_color|indexed_sex|   sex_vector|indexed_eye_color|eye_color_vector|            features|      scaledFeatures|
+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+--------------------+--------------------+
|102|bella| 34|  F|178.23|     blue|        0.0|(1,[0],[1.0])|              0.0|   (2,[0],[1.0])|[34.0,178.23,1.0,...|[0.6,0.5676838810...|
|103| cara| 19|  F|149.21|    green|        0.0|(1,[0],[1.0])|              1.0|   (2,[1],[1.0])|[19.0,149.21,1.0,...|[0.0,0.0,1.0,0.0,...|
|104|danny| 44|  M|200.33|     blue|        1.0|    (1,[],[])|              0.0|   (2,[0],[1.0])|[44.0,200.33,0.0,...|[1.0,1.0,0.0,1.0,...|
|105| emma| 28|  F|188.32|    black|        0.0|(1,[0],[1.0])|              2.0|       (2,[],[])|[28.0,188.32,1.0,...|[0.36,0.765062597...|
+---+-----+---+---+-

In [19]:
df.select('scaledFeatures').show(truncate=False)

+-------------------------------------+
|scaledFeatures                       |
+-------------------------------------+
|[0.6,0.5676838810641623,1.0,1.0,0.0] |
|[0.0,0.0,1.0,0.0,1.0]                |
|[1.0,1.0,0.0,1.0,0.0]                |
|[0.36,0.7650625978090764,1.0,0.0,0.0]|
+-------------------------------------+



### Combine above Steps into a Featurization Pipeline

In [20]:
df = spark_session.read.csv('./DATA/raw.csv', header=True, schema=schema)

In [21]:
sex_indexer = StringIndexer(inputCol='sex', outputCol='indexed_sex')
sex_encoder = OneHotEncoder(inputCol='indexed_sex', outputCol='sex_vector')
eye_color_indexer = StringIndexer(inputCol='eye_color', outputCol='indexed_eye_color')
eye_color_encoder = OneHotEncoder(inputCol='indexed_eye_color', outputCol='eye_color_vector')
assembler = VectorAssembler(inputCols=['age',
                                       'weight',
                                       'sex_vector',
                                       'eye_color_vector'], 
                            outputCol='features')
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')

In [22]:
# The pipeline comprises of the steps added above
pipeline = Pipeline(stages=[sex_indexer, sex_encoder, eye_color_indexer, eye_color_encoder, assembler, scaler])
    
# This step trains the feature transformers
model = pipeline.fit(df)
    
# This step transforms the dataset with information obtained from the previous fit
df = model.transform(df)

In [23]:
df.show()

+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+--------------------+--------------------+
| id| name|age|sex|weight|eye_color|indexed_sex|   sex_vector|indexed_eye_color|eye_color_vector|            features|      scaledFeatures|
+---+-----+---+---+------+---------+-----------+-------------+-----------------+----------------+--------------------+--------------------+
|102|bella| 34|  F|178.23|     blue|        0.0|(1,[0],[1.0])|              0.0|   (2,[0],[1.0])|[34.0,178.23,1.0,...|[0.6,0.5676838810...|
|103| cara| 19|  F|149.21|    green|        0.0|(1,[0],[1.0])|              1.0|   (2,[1],[1.0])|[19.0,149.21,1.0,...|[0.0,0.0,1.0,0.0,...|
|104|danny| 44|  M|200.33|     blue|        1.0|    (1,[],[])|              0.0|   (2,[0],[1.0])|[44.0,200.33,0.0,...|[1.0,1.0,0.0,1.0,...|
|105| emma| 28|  F|188.32|    black|        0.0|(1,[0],[1.0])|              2.0|       (2,[],[])|[28.0,188.32,1.0,...|[0.36,0.765062597...|
+---+-----+---+---+-

In [24]:
df.select('scaledFeatures').show(truncate=False)

+-------------------------------------+
|scaledFeatures                       |
+-------------------------------------+
|[0.6,0.5676838810641623,1.0,1.0,0.0] |
|[0.0,0.0,1.0,0.0,1.0]                |
|[1.0,1.0,0.0,1.0,0.0]                |
|[0.36,0.7650625978090764,1.0,0.0,0.0]|
+-------------------------------------+



In [25]:
age_udf = udf(lambda x: x[0].item(), DoubleType())
weight_udf = udf(lambda x: x[1].item(), DoubleType())
sex_udf = udf(lambda x: x[2].item(), DoubleType())
blue_eye_udf = udf(lambda x: x[3].item(), DoubleType())
black_eye_udf = udf(lambda x: x[3].item(), DoubleType())

In [26]:
df = df.select(age_udf('scaledFeatures').alias('age'), 
               weight_udf('scaledFeatures').alias('weight'),
               sex_udf('scaledFeatures').alias('sex'),
               blue_eye_udf('scaledFeatures').alias('is_blue_eye'),
               black_eye_udf('scaledFeatures').alias('is_black_eye'),
              )

In [27]:
df.show()

+----+------------------+---+-----------+------------+
| age|            weight|sex|is_blue_eye|is_black_eye|
+----+------------------+---+-----------+------------+
| 0.6|0.5676838810641623|1.0|        1.0|         1.0|
| 0.0|               0.0|1.0|        0.0|         0.0|
| 1.0|               1.0|0.0|        1.0|         1.0|
|0.36|0.7650625978090764|1.0|        0.0|         0.0|
+----+------------------+---+-----------+------------+



In [28]:
df.write.format('csv') \
        .option('header', True) \
        .mode('overwrite') \
        .option('sep', ',') \
        .save('file:////home/ec2-user/SageMaker/SageMaker-Studio-Examples/11.Processing/DATA/transformed.csv')