In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.6.tgz
!tar xvf spark-2.4.4-bin-hadoop2.6.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.6"
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="PySpark_dataframe")

# Flights Application Pipeline

Predict the duration of each flight.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Obtain the SparkSession used by the DataFrame and ML code below, requesting
# master 'local[2]' and the application name 'ML_pipeline'.
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('ML_pipeline')
    .getOrCreate()
)

In [None]:
# Load the flights CSV (comma-separated, header row, inferred column types,
# the literal string 'NA' read as null) and clean it in one chain.
flights = (
    spark.read.csv(
        'flights.csv',
        sep=',',
        header=True,
        inferSchema=True,
        nullValue='NA',
    )
    .drop('flight')               # remove the 'flight' column
    .filter('delay IS NOT NULL')  # remove records with a missing 'delay'
    .dropna()                     # remove records with a null in any column
)

# Import the Spark SQL round function (it operates on columns)
from pyspark.sql.functions import round

# Add 'km' as 'mile' * 1.60934 rounded to 0 decimals, then drop 'mile'
flights_km = flights.withColumn('km', round(flights['mile'] * 1.60934, 0)).drop('mile')

# Add 'label': 1 when delay >= 15, otherwise 0.
# The comparison yields a boolean column, which cast('integer') turns into 0/1.
flights_km = flights_km.withColumn(
    'label',
    (flights_km['delay'] >= 15).cast('integer'),
)

An ML pipeline combines the individual processing and modelling steps into a single workflow.

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.regression import LinearRegression

# Split the rows into training and testing sets with weights 0.8 / 0.2;
# the seed makes the split repeatable.
flights_train, flights_test = flights_km.randomSplit([0.8, 0.2], seed=17)

# Map the 'org' strings to numeric index values
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode the indexed 'org' values and the 'dow' column
onehot = OneHotEncoderEstimator(inputCols=['org_idx', 'dow'],
                                outputCols=['org_dummy', 'dow_dummy'])

# Gather the predictors into a single 'features' vector column
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy', 'dow_dummy'],
    outputCol='features',
)

# Linear regression with 'duration' as the target column
regression = LinearRegression(labelCol='duration')

In [None]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Chain the four stages in the order they must run
pipeline = Pipeline(stages=[
    indexer,
    onehot,
    assembler,
    regression,
])

# Fit all stages on the training data; `pipeline` now refers to the fitted model
pipeline = pipeline.fit(flights_train)

# Apply the fitted pipeline to the testing data
predictions = pipeline.transform(flights_test)

In [None]:
# List the columns of the scored test set: the input columns plus the ones
# added by the pipeline stages (org_idx, the dummy vectors, features, prediction).
predictions.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- org: string (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: integer (nullable = true)
 |-- km: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- org_idx: double (nullable = false)
 |-- org_dummy: vector (nullable = true)
 |-- dow_dummy: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [None]:
# Compare the first 10 actual durations with the model's predictions
predictions.select('duration', 'prediction').show(10)

+--------+------------------+
|duration|        prediction|
+--------+------------------+
|     240|228.07305939731083|
|     160|150.30548893923202|
|     130| 131.8149836232325|
|     275| 264.8129555949399|
|      85| 92.38342409393233|
|      80| 76.29817085052741|
|     200| 202.2081505873696|
|     130|130.61036826812415|
|     315|318.91628671206485|
|     380| 360.3060773927724|
+--------+------------------+
only showing top 10 rows



To evaluate the regression analysis, calculate the root mean square error using the **RegressionEvaluator**. Here is the Python code for evaluating the model.

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the 'prediction' column against 'duration'. RMSE is the evaluator's
# default metric; it is named explicitly here so the intent is visible.
evaluator = RegressionEvaluator(labelCol='duration', metricName='rmse')
RMSE = evaluator.evaluate(predictions)
print("Root Mean Squared Error = ", RMSE, sep="")

Root Mean Squared Error = 11.052743259520884


# SMS Spam Pipeline

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Column names and types for the SMS file: integer id, message text, integer label
schema = (
    StructType()
    .add("id", IntegerType())
    .add("text", StringType())
    .add("label", IntegerType())
)

# The file is semicolon-delimited and has no header row, so the schema is supplied explicitly
sms = spark.read.csv('sms.csv', schema=schema, sep=';', header=False)

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

# Split each message into tokens (Tokenizer lowercases the text and splits on whitespace)
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Drop stop words from the token lists
remover = StopWordsRemover(inputCol='words', outputCol='terms')

# Hash the remaining terms into term-frequency vectors, then weight them by IDF
hasher = HashingTF(inputCol='terms', outputCol="hash")
idf = IDF(inputCol='hash', outputCol="features")

# Logistic regression with default settings as the final stage of the pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[
    tokenizer,
    remover,
    hasher,
    idf,
    logistic,
])

In [None]:
# Split the data into training and testing sets (the seed makes the split repeatable)
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit the pipeline on the training data. The fitted model gets its own name so
# that `pipeline` stays the unfitted Pipeline built in the previous cell;
# rebinding `pipeline` to the fitted model made this cell fail when re-run on
# its own, because a fitted PipelineModel has no fit() method.
sms_model = pipeline.fit(sms_train)

# Make predictions on the testing data
prediction = sms_model.transform(sms_test)

# Confusion matrix: number of rows for every (label, prediction) pair
prediction.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   40|
|    0|       0.0|  989|
|    1|       1.0|  131|
|    0|       1.0|    1|
+-----+----------+-----+



# Cross validation
Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.

![alt text](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator


# An empty grid builds a single parameter combination, so cross-validation
# only evaluates the pipeline exactly as configured below.
params = ParamGridBuilder().build()

# Index the 'org' strings
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode the indexed 'org' values
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx'],
    outputCols=['org_dummy'],
)

# Combine 'km' and the one-hot encoded origin into the 'features' vector
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy'],
    outputCol='features',
)

# Linear regression predicting 'duration'
regression = LinearRegression(labelCol='duration')

# Evaluator that scores predictions against 'duration'
evaluator = RegressionEvaluator(labelCol='duration')

# Pipeline of the four stages, wrapped in a 15-fold cross-validator
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=15,
)

In [None]:
# Run the cross-validation on the training split. `cv` is rebound to the object
# returned by fit(); the cells below read avgMetrics from it and use it to score data.
cv = cv.fit(flights_train)

What's the average RMSE across the folds?

In [None]:
# Averaged evaluator metric per parameter combination (the grid here holds a single combination)
cv.avgMetrics

[11.032067161730204]

When the fitted cross-validator is used for prediction, it applies the best model found during cross-validation.

In [None]:
# Score the held-out testing split with the fitted cross-validator
predictions = cv.transform(flights_test)

In [None]:
# Calculate the RMSE of the test-set predictions against the 'duration' column
evaluator.evaluate(predictions)

11.054736661521357

# GridSearch
Grid search is a form of hyperparameter tuning: a model is trained and evaluated for every combination of the candidate hyperparameter values in order to find the best-performing one. This matters because a model's performance depends heavily on the hyperparameter values chosen.

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Grid over two LinearRegression parameters:
# 4 regParam values x 3 elasticNetParam values
params = (
    ParamGridBuilder()
    .addGrid(regression.regParam, [0.01, 0.1, 1.0, 10])
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
print('Number of models to be tested: ', len(params))

Number of models to be tested:  12


In [None]:
# Cross-validate the pipeline over the parameter grid using 5 folds
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
)

In [None]:
# Run the grid search on the training split. `cv` is rebound to the object
# returned by fit(); the cells below read avgMetrics and bestModel from it.
cv = cv.fit(flights_train)

In [None]:
# Averaged evaluator metric for each parameter combination in the grid
cv.avgMetrics

[11.035019891450107,
 11.03535610235059,
 11.036110194912034,
 11.03703576433622,
 11.068571233974426,
 11.143937053106576,
 11.161169236438601,
 11.502636787627265,
 11.681957968842847,
 14.529195004960945,
 17.022892594020245,
 19.146032337823037]

In [None]:
# Score the held-out testing split with the fitted cross-validator
predictions = cv.transform(flights_test)

In [None]:
# Calculate the RMSE of the test-set predictions against the 'duration' column
evaluator.evaluate(predictions)

11.05490085418882

In [None]:
# The best-scoring fitted pipeline found by the cross-validator
best_model = cv.bestModel

# Show the stages that make up the best model
print(best_model.stages)

# Show the parameters of its final stage, the LinearRegression model
print(best_model.stages[-1].extractParamMap())

# Score the testing data with the best model, then compute the RMSE
predictions = best_model.transform(flights_test)
evaluator.evaluate(predictions)

[StringIndexer_343b67596672, OneHotEncoderEstimator_d02ad1f33cd4, VectorAssembler_e907c122c0b7, LinearRegression_17670ee0b15c]
{Param(parent='LinearRegression_17670ee0b15c', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'): 2, Param(parent='LinearRegression_17670ee0b15c', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.0, Param(parent='LinearRegression_17670ee0b15c', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0.'): 1.35, Param(parent='LinearRegression_17670ee0b15c', name='featuresCol', doc='features column name'): 'features', Param(parent='LinearRegression_17670ee0b15c', name='fitIntercept', doc='whether to fit an intercept term'): True, Param(parent='LinearRegression_17670ee0b15c', name='labelCol', doc='label column name'): 'duration', Param(parent='LinearRegression_17670ee0b15c', name='loss', doc='Th

11.05490085418882

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

# Split each message into tokens (Tokenizer lowercases the text and splits on whitespace)
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='words',
)

# Drop stop words from the tokenizer's output
remover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol='terms',
)

# Hash the remaining terms into term-frequency vectors, then weight them by IDF
hasher = HashingTF(
    inputCol=remover.getOutputCol(),
    outputCol="hash",
)
idf = IDF(
    inputCol=hasher.getOutputCol(),
    outputCol="features",
)

# Logistic regression as the final stage; all five stages form the pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

In [None]:
# Parameter grid covering both the hashing stage and the classifier:
# 3 numFeatures x 2 binary x 4 regParam x 3 elasticNetParam combinations
params = (
    ParamGridBuilder()
    # hashing trick parameters
    .addGrid(hasher.numFeatures, [1024, 4096, 16384])
    .addGrid(hasher.binary, [True, False])
    # logistic regression parameters
    .addGrid(logistic.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(logistic.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Binary-classification evaluator that reads the 'rawPrediction' column
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# Cross-validate the SMS pipeline over the parameter grid using 5 folds
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
)

In [None]:
# Run the grid search on the SMS training split. `cv` is rebound to the object
# returned by fit(), which the next cell uses to score the testing split.
cv = cv.fit(sms_train)

In [None]:
# Score the held-out SMS testing split with the fitted cross-validator
predictions = cv.transform(sms_test)

In [None]:
# Evaluate the test-set predictions with the binary-classification evaluator defined above
evaluator.evaluate(predictions)

0.9730462519936114