# Build Spark Pipelines
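Let's build a simple Spark ML pipeline step by step: index and one-hot encode a categorical column, rescale a numeric column, and finally chain all of the stages together in a single `Pipeline`.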


In [1]:
# initialize Spark Session
import os
import sys

# Put the parent of the current working directory on the import path (only
# once) so that the `init_spark` import below can be resolved from there.
top_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
already_importable = top_dir in sys.path
if not already_importable:
    sys.path.append(top_dir)

from init_spark import init_spark

spark = init_spark()
# Bare last expression: the notebook displays the session object.
spark

Initializing Spark...
Spark found in :  /Users/sujee/spark
Spark config:
	 spark.app.name=TestApp
	spark.master=local[*]
	executor.memory=2g
	spark.sql.warehouse.dir=/var/folders/lp/qm_skljd2hl4xtps5vw0tdgm0000gn/T/tmp16jzebqk
	some_property=some_value
Spark UI running on port 4040


In [2]:
import time
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

## Step 1: Load the 'simplified' Prosper data
Load the CSV file and inspect it.

In [3]:
%%time

prosper = spark.read. \
          option("header", "true"). \
          option("inferSchema", "true").  \
          csv("/data/prosper-loan/prosper-loan-data-simplified.csv")

print("read {:,} records".format(prosper.count()))


read 200 records
CPU times: user 2.65 ms, sys: 1.48 ms, total: 4.13 ms
Wall time: 3.39 s


In [4]:
# Print the inferred column names/types, then preview the first rows.
prosper.printSchema()
prosper.show()

root
 |-- LoanStatus: integer (nullable = true)
 |-- ListingCategory: string (nullable = true)
 |-- EmploymentStatus: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- StatedMonthlyIncome: double (nullable = true)
 |-- IncomeVerifiable: boolean (nullable = true)

+----------+---------------+----------------+-----------+-------------------+----------------+
|LoanStatus|ListingCategory|EmploymentStatus|CreditScore|StatedMonthlyIncome|IncomeVerifiable|
+----------+---------------+----------------+-----------+-------------------+----------------+
|         1|           Debt|        Employed|        740|        10416.66667|            true|
|         0|        Unknown|       Part-time|        760|                0.0|           false|
|         1|           Debt|       Full-time|        680|             5000.0|            true|
|         1|          Other|       Full-time|        800|             5000.0|            true|
|         1|           Debt|       Full-time|  

## Step 2: Extract a few columns
Extract 
- LoanStatus
- EmploymentStatus
- CreditScore

In [5]:
# Narrow the frame down to the three columns worked on in the steps below.
prosper2 = prosper.select(
    "LoanStatus",
    "EmploymentStatus",
    "CreditScore",
)
prosper2.show()

+----------+----------------+-----------+
|LoanStatus|EmploymentStatus|CreditScore|
+----------+----------------+-----------+
|         1|        Employed|        740|
|         0|       Part-time|        760|
|         1|       Full-time|        680|
|         1|       Full-time|        800|
|         1|       Full-time|        600|
|         1|        Employed|        760|
|         1|       Full-time|        820|
|         1|       Full-time|        780|
|         1|   Self-employed|        680|
|         1|       Full-time|        820|
|         1|       Full-time|        700|
|         1|       Full-time|        600|
|         1|            null|        700|
|         1|           Other|        800|
|         0|       Full-time|       null|
|         1|       Full-time|        640|
|         1|        Employed|        660|
|         1|       Full-time|        740|
|         1|       Full-time|        680|
|         1|       Full-time|        680|
+----------+----------------+-----

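## Step 3: Sanity check data and clean it
Use `describe` to get summary statistics for each column; a `count` lower than the total number of rows means that column contains missing values.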

In [6]:
# Summary statistics (count, mean, stddev, min, max) for every column.
prosper2.describe().show()

+-------+-------------------+----------------+-----------------+
|summary|         LoanStatus|EmploymentStatus|      CreditScore|
+-------+-------------------+----------------+-----------------+
|  count|                200|             199|              199|
|   mean|               0.68|            null|679.7989949748744|
| stddev|0.46764673348231944|            null|65.93507519066739|
|    min|                  0|        Employed|              520|
|    max|                  1|   Self-employed|              840|
+-------+-------------------+----------------+-----------------+



In [7]:
# Drop any NA values using `dataframe.na.drop()`: rows with a null in
# EmploymentStatus or CreditScore disappear from the result.
prosper_clean = prosper2.na.drop()
prosper_clean.show()
# Row count after cleaning, to compare with the original number of records.
print(prosper_clean.count())

+----------+----------------+-----------+
|LoanStatus|EmploymentStatus|CreditScore|
+----------+----------------+-----------+
|         1|        Employed|        740|
|         0|       Part-time|        760|
|         1|       Full-time|        680|
|         1|       Full-time|        800|
|         1|       Full-time|        600|
|         1|        Employed|        760|
|         1|       Full-time|        820|
|         1|       Full-time|        780|
|         1|   Self-employed|        680|
|         1|       Full-time|        820|
|         1|       Full-time|        700|
|         1|       Full-time|        600|
|         1|           Other|        800|
|         1|       Full-time|        640|
|         1|        Employed|        660|
|         1|       Full-time|        740|
|         1|       Full-time|        680|
|         1|       Full-time|        680|
|         0|       Full-time|        580|
|         1|       Full-time|        540|
+----------+----------------+-----

## Step 4: StringIndexer for 'EmploymentStatus'

In [8]:
# Map each distinct EmploymentStatus string to a numeric index stored in a
# new EmploymentStatusIndex column.
strIndexer_employment = StringIndexer(
    inputCol="EmploymentStatus",
    outputCol="EmploymentStatusIndex",
)
# fit() learns the string -> index mapping from the data; transform() then
# appends the index column to the same data.
indexed1 = (
    strIndexer_employment
    .fit(prosper_clean)
    .transform(prosper_clean)
)
indexed1.show()

+----------+----------------+-----------+---------------------+
|LoanStatus|EmploymentStatus|CreditScore|EmploymentStatusIndex|
+----------+----------------+-----------+---------------------+
|         1|        Employed|        740|                  1.0|
|         0|       Part-time|        760|                  3.0|
|         1|       Full-time|        680|                  0.0|
|         1|       Full-time|        800|                  0.0|
|         1|       Full-time|        600|                  0.0|
|         1|        Employed|        760|                  1.0|
|         1|       Full-time|        820|                  0.0|
|         1|       Full-time|        780|                  0.0|
|         1|   Self-employed|        680|                  2.0|
|         1|       Full-time|        820|                  0.0|
|         1|       Full-time|        700|                  0.0|
|         1|       Full-time|        600|                  0.0|
|         1|           Other|        800

## Step 5: One Hot Encoding for 'EmploymentStatus'
Now that we have turned `EmploymentStatus` into a number, let's turn it into a vector

In [9]:
# Turn the numeric EmploymentStatusIndex into a sparse one-hot vector column.
encoder_employment = OneHotEncoder(
    inputCol="EmploymentStatusIndex",
    outputCol="EmploymentStatusVector",
)
# In Spark 2.x OneHotEncoder is a plain Transformer (it has no fit()), while
# in Spark 3.x it is an Estimator that must be fitted before it can transform
# (calling .transform() on it directly raises AttributeError).  Support both
# so this cell runs regardless of the installed Spark version.
if hasattr(encoder_employment, "fit"):
    encoded1 = encoder_employment.fit(indexed1).transform(indexed1)
else:
    encoded1 = encoder_employment.transform(indexed1)
encoded1.show()

+----------+----------------+-----------+---------------------+----------------------+
|LoanStatus|EmploymentStatus|CreditScore|EmploymentStatusIndex|EmploymentStatusVector|
+----------+----------------+-----------+---------------------+----------------------+
|         1|        Employed|        740|                  1.0|         (6,[1],[1.0])|
|         0|       Part-time|        760|                  3.0|         (6,[3],[1.0])|
|         1|       Full-time|        680|                  0.0|         (6,[0],[1.0])|
|         1|       Full-time|        800|                  0.0|         (6,[0],[1.0])|
|         1|       Full-time|        600|                  0.0|         (6,[0],[1.0])|
|         1|        Employed|        760|                  1.0|         (6,[1],[1.0])|
|         1|       Full-time|        820|                  0.0|         (6,[0],[1.0])|
|         1|       Full-time|        780|                  0.0|         (6,[0],[1.0])|
|         1|   Self-employed|        680|  

## Step 6: Create a pipeline
Now we will build a pipeline that performs the indexing and the encoding together, instead of calling each stage by hand.

In [10]:
# Re-create both stages so the pipeline starts from fresh, unfitted objects.
strIndexer_employment = StringIndexer(inputCol="EmploymentStatus", outputCol="EmploymentStatusIndex")
# The encoder reads whatever column the indexer writes, so the two stages
# stay wired together even if the indexer's output column is renamed.
encoder_employment = OneHotEncoder(inputCol=strIndexer_employment.getOutputCol(), outputCol="EmploymentStatusVector")
# `stages` is documented as a list of pipeline stages; pass a list (not a
# tuple) to match the API and the final pipeline built later in the notebook.
pipeline = Pipeline(stages=[strIndexer_employment, encoder_employment])


In [11]:
# Fit the whole pipeline on the cleaned data; the result is a PipelineModel.
model1 = pipeline.fit(prosper_clean)
# Bare last expression: the notebook displays the fitted model's repr.
model1

PipelineModel_43978345a07904c300e7

In [12]:
# Apply the fitted pipeline: one call adds both the EmploymentStatusIndex and
# the EmploymentStatusVector columns.
prosper4 = model1.transform(prosper_clean)
prosper4.show()

+----------+----------------+-----------+---------------------+----------------------+
|LoanStatus|EmploymentStatus|CreditScore|EmploymentStatusIndex|EmploymentStatusVector|
+----------+----------------+-----------+---------------------+----------------------+
|         1|        Employed|        740|                  1.0|         (6,[1],[1.0])|
|         0|       Part-time|        760|                  3.0|         (6,[3],[1.0])|
|         1|       Full-time|        680|                  0.0|         (6,[0],[1.0])|
|         1|       Full-time|        800|                  0.0|         (6,[0],[1.0])|
|         1|       Full-time|        600|                  0.0|         (6,[0],[1.0])|
|         1|        Employed|        760|                  1.0|         (6,[1],[1.0])|
|         1|       Full-time|        820|                  0.0|         (6,[0],[1.0])|
|         1|       Full-time|        780|                  0.0|         (6,[0],[1.0])|
|         1|   Self-employed|        680|  

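## Step 7: Normalize CreditScore
FICO credit scores range from 300 to 850.  Let's rescale `CreditScore` to the range 0 to 100 using `MinMaxScaler`.  Note that `MinMaxScaler` maps the smallest and largest values actually present in the data (not the theoretical FICO limits) to 0 and 100.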

In [13]:
from pyspark.ml.feature import VectorAssembler

# Wrap the scalar CreditScore column in a one-element vector column named
# CreditScoreVec (the scaler in the next cell reads that column).
vector_assembler = VectorAssembler(
    inputCols=["CreditScore"],
    outputCol="CreditScoreVec",
)
prosper_credit_score_vectorized = vector_assembler.transform(prosper_clean)
prosper_credit_score_vectorized.show()

+----------+----------------+-----------+--------------+
|LoanStatus|EmploymentStatus|CreditScore|CreditScoreVec|
+----------+----------------+-----------+--------------+
|         1|        Employed|        740|       [740.0]|
|         0|       Part-time|        760|       [760.0]|
|         1|       Full-time|        680|       [680.0]|
|         1|       Full-time|        800|       [800.0]|
|         1|       Full-time|        600|       [600.0]|
|         1|        Employed|        760|       [760.0]|
|         1|       Full-time|        820|       [820.0]|
|         1|       Full-time|        780|       [780.0]|
|         1|   Self-employed|        680|       [680.0]|
|         1|       Full-time|        820|       [820.0]|
|         1|       Full-time|        700|       [700.0]|
|         1|       Full-time|        600|       [600.0]|
|         1|           Other|        800|       [800.0]|
|         1|       Full-time|        640|       [640.0]|
|         1|        Employed|  

In [14]:
from pyspark.ml.feature import MinMaxScaler

# Rescale CreditScoreVec linearly so that the values land in [0, 100].
scaler_credit_score = MinMaxScaler(
    min=0,
    max=100,
    inputCol="CreditScoreVec",
    outputCol="CreditScoreVecNormalized",
)

# fit() produces the scaler model from the data; transform() then appends the
# normalized column.
scaler_credit_score_model = scaler_credit_score.fit(prosper_credit_score_vectorized)
# NOTE: the variable name is spelled `propser5_scaled` (sic); later cells
# refer to it by this exact name, so the spelling is kept.
propser5_scaled = scaler_credit_score_model.transform(
    prosper_credit_score_vectorized
)
propser5_scaled.show()

+----------+----------------+-----------+--------------+------------------------+
|LoanStatus|EmploymentStatus|CreditScore|CreditScoreVec|CreditScoreVecNormalized|
+----------+----------------+-----------+--------------+------------------------+
|         1|        Employed|        740|       [740.0]|                 [68.75]|
|         0|       Part-time|        760|       [760.0]|                  [75.0]|
|         1|       Full-time|        680|       [680.0]|                  [50.0]|
|         1|       Full-time|        800|       [800.0]|                  [87.5]|
|         1|       Full-time|        600|       [600.0]|                  [25.0]|
|         1|        Employed|        760|       [760.0]|                  [75.0]|
|         1|       Full-time|        820|       [820.0]|                 [93.75]|
|         1|       Full-time|        780|       [780.0]|                 [81.25]|
|         1|   Self-employed|        680|       [680.0]|                  [50.0]|
|         1|    

In [15]:
# Lowest credit scores first: check what the smallest score is scaled to.
propser5_scaled.sort("CreditScore", ascending=True).show()

+----------+----------------+-----------+--------------+------------------------+
|LoanStatus|EmploymentStatus|CreditScore|CreditScoreVec|CreditScoreVecNormalized|
+----------+----------------+-----------+--------------+------------------------+
|         0|       Full-time|        520|       [520.0]|                   [0.0]|
|         0|       Full-time|        520|       [520.0]|                   [0.0]|
|         1|       Full-time|        520|       [520.0]|                   [0.0]|
|         1|       Full-time|        540|       [540.0]|                  [6.25]|
|         1|       Full-time|        540|       [540.0]|                  [6.25]|
|         0|       Full-time|        540|       [540.0]|                  [6.25]|
|         0|       Full-time|        540|       [540.0]|                  [6.25]|
|         0|       Full-time|        540|       [540.0]|                  [6.25]|
|         1|       Full-time|        540|       [540.0]|                  [6.25]|
|         1|    

In [16]:
# Highest credit scores first: check what the largest score is scaled to.
propser5_scaled.sort("CreditScore", ascending=False).show()

+----------+----------------+-----------+--------------+------------------------+
|LoanStatus|EmploymentStatus|CreditScore|CreditScoreVec|CreditScoreVecNormalized|
+----------+----------------+-----------+--------------+------------------------+
|         1|       Full-time|        840|       [840.0]|                 [100.0]|
|         1|       Full-time|        820|       [820.0]|                 [93.75]|
|         1|       Full-time|        820|       [820.0]|                 [93.75]|
|         1|       Full-time|        820|       [820.0]|                 [93.75]|
|         1|           Other|        800|       [800.0]|                  [87.5]|
|         1|       Full-time|        800|       [800.0]|                  [87.5]|
|         1|        Employed|        800|       [800.0]|                  [87.5]|
|         1|       Full-time|        800|       [800.0]|                  [87.5]|
|         1|       Full-time|        800|       [800.0]|                  [87.5]|
|         1|    

## Step 8: Final Pipeline Code
Let's put everything together and build the complete pipeline from scratch in a single cell.


In [17]:
%%time

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler

prosper = spark.read. \
          option("header", "true"). \
          option("inferSchema", "true").  \
          csv("/data/prosper-loan/prosper-loan-data-simplified.csv")

prosper2 = prosper.select("LoanStatus", "EmploymentStatus", "CreditScore")
prosper_clean = prosper2.na.drop()


# transformers
strIndexer_employment = StringIndexer(inputCol="EmploymentStatus", outputCol="EmpIndex")
encoder_employment = OneHotEncoder(inputCol="EmpIndex", outputCol="EmpVector")
vector_assembler= VectorAssembler(inputCols=["CreditScore"], outputCol="CreditScoreVec")
scaler_credit_score = MinMaxScaler(min=0, max=100, inputCol="CreditScoreVec", outputCol="CreditScoreNormalized")

pipeline2 = Pipeline(stages=[strIndexer_employment, encoder_employment, vector_assembler, scaler_credit_score])
model2 = pipeline2.fit(prosper_clean)
prosper_final = model2.transform(prosper_clean)
prosper_final.show()

+----------+----------------+-----------+--------+-------------+--------------+---------------------+
|LoanStatus|EmploymentStatus|CreditScore|EmpIndex|    EmpVector|CreditScoreVec|CreditScoreNormalized|
+----------+----------------+-----------+--------+-------------+--------------+---------------------+
|         1|        Employed|        740|     1.0|(6,[1],[1.0])|       [740.0]|              [68.75]|
|         0|       Part-time|        760|     3.0|(6,[3],[1.0])|       [760.0]|               [75.0]|
|         1|       Full-time|        680|     0.0|(6,[0],[1.0])|       [680.0]|               [50.0]|
|         1|       Full-time|        800|     0.0|(6,[0],[1.0])|       [800.0]|               [87.5]|
|         1|       Full-time|        600|     0.0|(6,[0],[1.0])|       [600.0]|               [25.0]|
|         1|        Employed|        760|     1.0|(6,[1],[1.0])|       [760.0]|               [75.0]|
|         1|       Full-time|        820|     0.0|(6,[0],[1.0])|       [820.0]|   