# Build Spark Pipelines
Let's build a simple Spark ML pipeline


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

# Print the Spark UI address. The port is whatever follows the LAST ':' in
# sc.uiWebUrl; splitting from the right stays correct even when the host part
# itself contains colons (e.g. an IPv6 literal).
print('Spark UI running on http://YOURIPADDRESS:' + sc.uiWebUrl.rsplit(':', 1)[-1])

## Step 1: Load the 'simplified' Prosper data
Then inspect it.

In [None]:
# Read the simplified Prosper loan CSV: the first line is treated as the
# header and column types are inferred from the data.
prosper = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/data/prosper-loan/prosper-loan-data-simplified.csv")
)

# Inspect the result: sample rows, schema, then the total row count.
prosper.show()
prosper.printSchema()
print(prosper.count())

## Step 2: Extract a few columns
Extract the following columns:
- LoanStatus
- EmploymentStatus
- CreditScore

In [None]:
# Exercise: replace each "???" with one of the three column names to keep:
# LoanStatus, EmploymentStatus, CreditScore.
prosper2 = prosper.select("???", "???", "???")
prosper2.show()

## Step 3: Sanity check data and clean it
Use `describe` to get summary statistics for each column.

In [None]:
# Summarize the selected columns with describe() as a sanity check before cleaning.
prosper2.describe().show()

In [None]:
# Drop rows with NA values, using `dataframe.na.drop()`.
# Exercise: replace the two ??? placeholders so the call matches the pattern above.
prosper_clean = prosper2.???.???()
prosper_clean.show()
# Row count after cleaning; compare it with the count printed right after loading the data.
print(prosper_clean.count())

## Step 4: StringIndexer for 'EmploymentStatus'

In [None]:
# Exercise: fill in the two column names.
# Hint : inputCol = EmploymentStatus,   outputCol = EmploymentStatusIndex
strIndexer_employment = StringIndexer(inputCol="???", outputCol="???")
# Fit the indexer on the cleaned data, then use the fitted model to add the
# index column to that same data.
indexed1 = strIndexer_employment.fit(prosper_clean).transform(prosper_clean)
indexed1.show()

## Step 5: One Hot Encoding for 'EmploymentStatus'
Now that we have turned `EmploymentStatus` into a number, let's turn it into a vector

In [None]:
# Exercise: replace ??? with the name of the StringIndexer's output column.
# Hint : inputCol is the outputCol of the StringIndexer from the previous step,
#        available as strIndexer_employment.getOutputCol()
encoder_employment = OneHotEncoder(inputCol=???, outputCol="EmploymentStatusVector")
# NOTE(review): calling transform() directly assumes OneHotEncoder is a plain
# transformer. In Spark 3.x it is an estimator and needs fit() first --
# confirm the Spark version this lab targets.
encoded1 = encoder_employment.transform(indexed1)
encoded1.show()

## Step 6: Create a pipeline
Now we will build a pipeline that does the indexing and the encoding in one go.

In [None]:
## Hint : complete the following pipeline as follows
##   StringINdexer : inputCol='EmploymnetStatus' --> outputCol='EmpIndex'
##   OneHotEncoder : inputCol=strIndexer_employment.getOutputCol(), outputCol="EmpVector"

strIndexer_employment = StringIndexer(inputCol="???", outputCol="???")
encoder_employment = OneHotEncoder(inputCol=???, outputCol="???")
pipeline = Pipeline(stages=(strIndexer_employment, encoder_employment))


In [None]:
# Fit the pipeline on the cleaned data.
model1 = pipeline.fit(prosper_clean)
# Bare expression as the last line of the cell: the notebook displays the fitted model.
model1

In [None]:
# Apply the fitted pipeline to the cleaned data and display the result.
prosper4 = model1.transform(prosper_clean)
prosper4.show()

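## Step 7: Normalize CreditScore with MinMaxScaler
FICO credit scores range from 300 to 850.  Let's rescale `CreditScore` to the range 0 to 100. Note that `MinMaxScaler` maps the minimum and maximum values observed in the data (not the theoretical FICO limits) to 0 and 100.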

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the CreditScore column into a vector column named CreditScoreVec;
# the scaler in the next cell reads that column as its input.
vector_assembler = VectorAssembler(
    outputCol="CreditScoreVec",
    inputCols=["CreditScore"],
)
prosper_credit_score_vectorized = vector_assembler.transform(prosper_clean)
prosper_credit_score_vectorized.show()

In [None]:
from pyspark.ml.feature import MinMaxScaler

## Exercise: replace the two ??? placeholders.
## Hint : set min=0 ,  max=100
scaler_credit_score = MinMaxScaler(min=???, max=???, inputCol="CreditScoreVec", outputCol="CreditScoreVecNormalized")

## Fit the scaler on the vectorized data, then apply the fitted model to that same data.
scaler_credit_score_model = scaler_credit_score.fit(prosper_credit_score_vectorized)
## NOTE(review): 'propser5_scaled' looks like a misspelling of 'prosper5_scaled';
## the name is left unchanged because the following cells refer to it.
propser5_scaled = scaler_credit_score_model.transform(prosper_credit_score_vectorized)
propser5_scaled.show()

In [None]:
## Lowest credit scores first
propser5_scaled.orderBy("CreditScore", ascending=True).show()

In [None]:
## Highest credit scores first
propser5_scaled.orderBy("CreditScore", ascending=False).show()

## Step 8: Final Pipeline Code
Let's build the complete pipeline from scratch.


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler

# Reload the simplified Prosper loan CSV (header row present, column types inferred).
prosper = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/data/prosper-loan/prosper-loan-data-simplified.csv")
)

# Keep only the three columns used by the pipeline, then drop rows with NA values.
prosper2 = prosper.select("LoanStatus", "EmploymentStatus", "CreditScore")
prosper_clean = prosper2.dropna()



## Exercise: fill in the column names for the indexer and the encoder.
## Hint
##       StringIndexer (inputCol='EmploymentStatus',  outputCol='EmpIndex')
##       OneHotEncoder(inputCol='EmpIndex',  outputCol='EmpVector')

strIndexer_employment = StringIndexer(inputCol="???", outputCol="???")
encoder_employment = OneHotEncoder(inputCol="???", outputCol="???")
## CreditScore is first assembled into the CreditScoreVec vector column, which
## the scaler (configured with min=0, max=100) then reads as its input.
vector_assembler= VectorAssembler(inputCols=["CreditScore"], outputCol="CreditScoreVec")
scaler_credit_score = MinMaxScaler(min=0, max=100, inputCol="CreditScoreVec", outputCol="CreditScoreNormalized")

## All four stages in a single pipeline, in order: index, encode, assemble, scale.
pipeline2 = Pipeline(stages=[strIndexer_employment, encoder_employment, vector_assembler, scaler_credit_score])

## Exercise: replace ??? with the DataFrame to fit on.
## Hint : fit 'prosper_clean' data
model2 = pipeline2.fit(???)

## Exercise: replace ??? with the DataFrame to transform.
## Hint : transform 'prosper_clean' data
prosper_final = model2.transform(???)
prosper_final.show()